diff --git a/src/swrenderer/drawers/r_draw_wall32.h b/src/swrenderer/drawers/r_draw_wall32.h index a85b96805..1e4da006c 100644 --- a/src/swrenderer/drawers/r_draw_wall32.h +++ b/src/swrenderer/drawers/r_draw_wall32.h @@ -74,15 +74,54 @@ namespace swrenderer __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); - for (int index = 0; index < count; index++) + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) { - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + int offset = index * pitch * 2; + uint32_t desttmp[2]; + desttmp[0] = dest[offset]; + desttmp[1] = dest[offset + pitch]; // Sample + unsigned int ifgcolor[2]; + { int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int ifgcolor = source[sample_index]; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ifgcolor), _mm_setzero_si128()); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + frac += fracstep; + } + { + int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; + unsigned int sampleout = source[sample_index]; + ifgcolor[1] = sampleout; + frac += fracstep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + __m128i outcolor = fgcolor; + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + + _mm_storel_epi64((__m128i*)desttmp, outcolor); + dest[offset] = desttmp[0]; + dest[offset + pitch] = desttmp[1]; + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index * pitch; + + // Sample + unsigned int ifgcolor[2]; + int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); @@ -92,7 +131,6 @@ namespace swrenderer outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); dest[offset] = _mm_cvtsi128_si32(outcolor); - frac += fracstep; } } else @@ -123,12 +161,17 @@ namespace swrenderer __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); - for (int index = 0; index < count; index++) + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) { - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + int offset = index * pitch * 2; + uint32_t desttmp[2]; + desttmp[0] = dest[offset]; + desttmp[1] = dest[offset + pitch]; // Sample + unsigned int ifgcolor[2]; + { unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; unsigned int y0 = frac_y0 >> FRACBITS; @@ -149,8 +192,80 @@ namespace swrenderer unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int ifgcolor = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ifgcolor), _mm_setzero_si128()); + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + frac += fracstep; + } + { + unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; + unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; + unsigned int y0 = frac_y0 >> FRACBITS; + unsigned int y1 = frac_y1 >> FRACBITS; + + unsigned int p00 = source[y0]; + unsigned int p01 = source[y1]; + unsigned int p10 = source2[y0]; + unsigned int p11 = source2[y1]; + + unsigned int inv_b = texturefracx; + unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; + unsigned int a = 16 - inv_a; + unsigned int b = 16 - inv_b; + + unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[1] = sampleout; + frac += fracstep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + __m128i outcolor = fgcolor; + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + + _mm_storel_epi64((__m128i*)desttmp, outcolor); + dest[offset] = desttmp[0]; + dest[offset + pitch] = desttmp[1]; + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index * pitch; + + // Sample + unsigned int ifgcolor[2]; + unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; + unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; + unsigned int y0 = frac_y0 >> FRACBITS; + unsigned int y1 = frac_y1 >> FRACBITS; + + unsigned int p00 = source[y0]; + unsigned int p01 = source[y1]; + unsigned int p10 = source2[y0]; + unsigned int p11 = source2[y1]; + + unsigned int inv_b = texturefracx; + unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; + unsigned int a = 16 - inv_a; + unsigned int b = 16 - inv_b; + + unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); @@ -160,7 +275,6 @@ namespace swrenderer outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); dest[offset] = _mm_cvtsi128_si32(outcolor); - frac += fracstep; } } } @@ -201,23 +315,82 @@ namespace swrenderer __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); - for (int index = 0; index < count; index++) + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) { - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + int offset = index * pitch * 2; + uint32_t desttmp[2]; + desttmp[0] = dest[offset]; + desttmp[1] = dest[offset + pitch]; // Sample + unsigned int ifgcolor[2]; + { int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int ifgcolor = source[sample_index]; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ifgcolor), _mm_setzero_si128()); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + frac += fracstep; + } + { + int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; + unsigned int sampleout = source[sample_index]; + ifgcolor[1] = sampleout; + frac += fracstep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - int blue0 = BPART(ifgcolor); - int green0 = GPART(ifgcolor); - int red0 = RPART(ifgcolor); - + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - __m128i intensity = _mm_set_epi16(0, intensity0, intensity0, intensity0, 0, intensity0, intensity0, intensity0); + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + __m128i outcolor = fgcolor; + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + + _mm_storel_epi64((__m128i*)desttmp, outcolor); + dest[offset] = desttmp[0]; + dest[offset + pitch] = desttmp[1]; + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index * pitch; + + // Sample + unsigned int ifgcolor[2]; + int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); fgcolor = _mm_mullo_epi16(fgcolor, mlight); @@ -229,7 +402,6 @@ namespace swrenderer outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); dest[offset] = _mm_cvtsi128_si32(outcolor); - frac += fracstep; } } else @@ -265,12 +437,17 @@ namespace swrenderer __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); - for (int index = 0; index < count; index++) + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) { - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + int offset = index * pitch * 2; + uint32_t desttmp[2]; + desttmp[0] = dest[offset]; + desttmp[1] = dest[offset + pitch]; // Sample + unsigned int ifgcolor[2]; + { unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; unsigned int y0 = frac_y0 >> FRACBITS; @@ -291,16 +468,108 @@ namespace swrenderer unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int ifgcolor = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ifgcolor), _mm_setzero_si128()); + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + frac += fracstep; + } + { + unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; + unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; + unsigned int y0 = frac_y0 >> FRACBITS; + unsigned int y1 = frac_y1 >> FRACBITS; + + unsigned int p00 = source[y0]; + unsigned int p01 = source[y1]; + unsigned int p10 = source2[y0]; + unsigned int p11 = source2[y1]; + + unsigned int inv_b = texturefracx; + unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; + unsigned int a = 16 - inv_a; + unsigned int b = 16 - inv_b; + + unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[1] = sampleout; + frac += fracstep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - int blue0 = BPART(ifgcolor); - int green0 = GPART(ifgcolor); - int red0 = RPART(ifgcolor); - + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - __m128i intensity = _mm_set_epi16(0, intensity0, intensity0, intensity0, 0, intensity0, intensity0, intensity0); + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + __m128i outcolor = fgcolor; + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + + _mm_storel_epi64((__m128i*)desttmp, outcolor); + dest[offset] = desttmp[0]; + dest[offset + pitch] = desttmp[1]; + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index * pitch; + + // Sample + unsigned int ifgcolor[2]; + unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; + unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; + unsigned int y0 = frac_y0 >> FRACBITS; + unsigned int y1 = frac_y1 >> FRACBITS; + + unsigned int p00 = source[y0]; + unsigned int p01 = source[y1]; + unsigned int p10 = source2[y0]; + unsigned int p11 = source2[y1]; + + unsigned int inv_b = texturefracx; + unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; + unsigned int a = 16 - inv_a; + unsigned int b = 16 - inv_b; + + unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); fgcolor = _mm_mullo_epi16(fgcolor, mlight); @@ -312,7 +581,6 @@ namespace swrenderer outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); dest[offset] = _mm_cvtsi128_si32(outcolor); - frac += fracstep; } } } @@ -364,15 +632,63 @@ namespace swrenderer __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); - for (int index = 0; index < count; index++) + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) { + int offset = index * pitch * 2; + uint32_t desttmp[2]; + desttmp[0] = dest[offset]; + desttmp[1] = dest[offset + pitch]; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + { + int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + frac += fracstep; + } + { + int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; + unsigned int sampleout = source[sample_index]; + ifgcolor[1] = sampleout; + frac += fracstep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + fgcolor = _mm_mullo_epi16(fgcolor, alpha); + bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); + __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)desttmp, outcolor); + dest[offset] = desttmp[0]; + dest[offset + pitch] = desttmp[1]; + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; int offset = index * pitch; __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); // Sample + unsigned int ifgcolor[2]; int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int ifgcolor = source[sample_index]; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ifgcolor), _mm_setzero_si128()); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); @@ -389,7 +705,6 @@ namespace swrenderer outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); dest[offset] = _mm_cvtsi128_si32(outcolor); - frac += fracstep; } } else @@ -420,12 +735,18 @@ namespace swrenderer __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); - for (int index = 0; index < count; index++) + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) { - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + int offset = index * pitch * 2; + uint32_t desttmp[2]; + desttmp[0] = dest[offset]; + desttmp[1] = dest[offset + pitch]; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); // Sample + unsigned int ifgcolor[2]; + { unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; unsigned int y0 = frac_y0 >> FRACBITS; @@ -446,8 +767,88 @@ namespace swrenderer unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int ifgcolor = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ifgcolor), _mm_setzero_si128()); + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + frac += fracstep; + } + { + unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; + unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; + unsigned int y0 = frac_y0 >> FRACBITS; + unsigned int y1 = frac_y1 >> FRACBITS; + + unsigned int p00 = source[y0]; + unsigned int p01 = source[y1]; + unsigned int p10 = source2[y0]; + unsigned int p11 = source2[y1]; + + unsigned int inv_b = texturefracx; + unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; + unsigned int a = 16 - inv_a; + unsigned int b = 16 - inv_b; + + unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[1] = sampleout; + frac += fracstep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + fgcolor = _mm_mullo_epi16(fgcolor, alpha); + bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); + __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)desttmp, outcolor); + dest[offset] = desttmp[0]; + dest[offset + pitch] = desttmp[1]; + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index * pitch; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; + unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; + unsigned int y0 = frac_y0 >> FRACBITS; + unsigned int y1 = frac_y1 >> FRACBITS; + + unsigned int p00 = source[y0]; + unsigned int p01 = source[y1]; + unsigned int p10 = source2[y0]; + unsigned int p11 = source2[y1]; + + unsigned int inv_b = texturefracx; + unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; + unsigned int a = 16 - inv_a; + unsigned int b = 16 - inv_b; + + unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); @@ -464,7 +865,6 @@ namespace swrenderer outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); dest[offset] = _mm_cvtsi128_si32(outcolor); - frac += fracstep; } } } @@ -505,23 +905,91 @@ namespace swrenderer __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); - for (int index = 0; index < count; index++) + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) { + int offset = index * pitch * 2; + uint32_t desttmp[2]; + desttmp[0] = dest[offset]; + desttmp[1] = dest[offset + pitch]; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + { + int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + frac += fracstep; + } + { + int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; + unsigned int sampleout = source[sample_index]; + ifgcolor[1] = sampleout; + frac += fracstep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + fgcolor = _mm_mullo_epi16(fgcolor, alpha); + bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); + __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)desttmp, outcolor); + dest[offset] = desttmp[0]; + dest[offset + pitch] = desttmp[1]; + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; int offset = index * pitch; __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); // Sample + unsigned int ifgcolor[2]; int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int ifgcolor = source[sample_index]; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ifgcolor), _mm_setzero_si128()); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - int blue0 = BPART(ifgcolor); - int green0 = GPART(ifgcolor); - int red0 = RPART(ifgcolor); - + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - __m128i intensity = _mm_set_epi16(0, intensity0, intensity0, intensity0, 0, intensity0, intensity0, intensity0); + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); fgcolor = _mm_mullo_epi16(fgcolor, mlight); @@ -540,7 +1008,6 @@ namespace swrenderer outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); dest[offset] = _mm_cvtsi128_si32(outcolor); - frac += fracstep; } } else @@ -576,12 +1043,18 @@ namespace swrenderer __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); - for (int index = 0; index < count; index++) + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) { - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + int offset = index * pitch * 2; + uint32_t desttmp[2]; + desttmp[0] = dest[offset]; + desttmp[1] = dest[offset + pitch]; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); // Sample + unsigned int ifgcolor[2]; + { unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; unsigned int y0 = frac_y0 >> FRACBITS; @@ -602,16 +1075,116 @@ namespace swrenderer unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int ifgcolor = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ifgcolor), _mm_setzero_si128()); + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + frac += fracstep; + } + { + unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; + unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; + unsigned int y0 = frac_y0 >> FRACBITS; + unsigned int y1 = frac_y1 >> FRACBITS; + + unsigned int p00 = source[y0]; + unsigned int p01 = source[y1]; + unsigned int p10 = source2[y0]; + unsigned int p11 = source2[y1]; + + unsigned int inv_b = texturefracx; + unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; + unsigned int a = 16 - inv_a; + unsigned int b = 16 - inv_b; + + unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[1] = sampleout; + frac += fracstep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - int blue0 = BPART(ifgcolor); - int green0 = GPART(ifgcolor); - int red0 = RPART(ifgcolor); - + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - __m128i intensity = _mm_set_epi16(0, intensity0, intensity0, intensity0, 0, intensity0, intensity0, intensity0); + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + fgcolor = _mm_mullo_epi16(fgcolor, alpha); + bgcolor = _mm_mullo_epi16(bgcolor, inv_alpha); + __m128i outcolor = _mm_srli_epi16(_mm_add_epi16(fgcolor, bgcolor), 8); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)desttmp, outcolor); + dest[offset] = desttmp[0]; + dest[offset + pitch] = desttmp[1]; + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index * pitch; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; + unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; + unsigned int y0 = frac_y0 >> FRACBITS; + unsigned int y1 = frac_y1 >> FRACBITS; + + unsigned int p00 = source[y0]; + unsigned int p01 = source[y1]; + unsigned int p10 = source2[y0]; + unsigned int p11 = source2[y1]; + + unsigned int inv_b = texturefracx; + unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; + unsigned int a = 16 - inv_a; + unsigned int b = 16 - inv_b; + + unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); fgcolor = _mm_mullo_epi16(fgcolor, mlight); @@ -630,7 +1203,6 @@ namespace swrenderer outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); dest[offset] = _mm_cvtsi128_si32(outcolor); - frac += fracstep; } } } @@ -682,15 +1254,78 @@ namespace swrenderer __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); - for (int index = 0; index < count; index++) + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) { + int offset = index * pitch * 2; + uint32_t desttmp[2]; + desttmp[0] = dest[offset]; + desttmp[1] = dest[offset + pitch]; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + { + int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + frac += fracstep; + } + { + int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; + unsigned int sampleout = source[sample_index]; + ifgcolor[1] = sampleout; + frac += fracstep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + + __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); + bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + + __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); + fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_srai_epi16(_mm_add_epi32(fg_lo, bg_lo), 8); + __m128i out_hi = _mm_srai_epi16(_mm_add_epi32(fg_hi, bg_hi), 8); + __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)desttmp, outcolor); + dest[offset] = desttmp[0]; + dest[offset + pitch] = desttmp[1]; + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; int offset = index * pitch; __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); // Sample + unsigned int ifgcolor[2]; int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int ifgcolor = source[sample_index]; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ifgcolor), _mm_setzero_si128()); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); @@ -722,7 +1357,6 @@ namespace swrenderer outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); dest[offset] = _mm_cvtsi128_si32(outcolor); - frac += fracstep; } } else @@ -753,12 +1387,18 @@ namespace swrenderer __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); - for (int index = 0; index < count; index++) + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) { - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + int offset = index * pitch * 2; + uint32_t desttmp[2]; + desttmp[0] = dest[offset]; + desttmp[1] = dest[offset + pitch]; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); // Sample + unsigned int ifgcolor[2]; + { unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; unsigned int y0 = frac_y0 >> FRACBITS; @@ -779,8 +1419,103 @@ namespace swrenderer unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int ifgcolor = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ifgcolor), _mm_setzero_si128()); + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + frac += fracstep; + } + { + unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; + unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; + unsigned int y0 = frac_y0 >> FRACBITS; + unsigned int y1 = frac_y1 >> FRACBITS; + + unsigned int p00 = source[y0]; + unsigned int p01 = source[y1]; + unsigned int p10 = source2[y0]; + unsigned int p11 = source2[y1]; + + unsigned int inv_b = texturefracx; + unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; + unsigned int a = 16 - inv_a; + unsigned int b = 16 - inv_b; + + unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[1] = sampleout; + frac += fracstep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + + __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); + bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + + __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); + fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_srai_epi16(_mm_add_epi32(fg_lo, bg_lo), 8); + __m128i out_hi = _mm_srai_epi16(_mm_add_epi32(fg_hi, bg_hi), 8); + __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)desttmp, outcolor); + dest[offset] = desttmp[0]; + dest[offset + pitch] = desttmp[1]; + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index * pitch; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; + unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; + unsigned int y0 = frac_y0 >> FRACBITS; + unsigned int y1 = frac_y1 >> FRACBITS; + + unsigned int p00 = source[y0]; + unsigned int p01 = source[y1]; + unsigned int p10 = source2[y0]; + unsigned int p11 = source2[y1]; + + unsigned int inv_b = texturefracx; + unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; + unsigned int a = 16 - inv_a; + unsigned int b = 16 - inv_b; + + unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); @@ -812,7 +1547,6 @@ namespace swrenderer outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); dest[offset] = _mm_cvtsi128_si32(outcolor); - frac += fracstep; } } } @@ -853,23 +1587,106 @@ namespace swrenderer __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); - for (int index = 0; index < count; index++) + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) { + int offset = index * pitch * 2; + uint32_t desttmp[2]; + desttmp[0] = dest[offset]; + desttmp[1] = dest[offset + pitch]; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + { + int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + frac += fracstep; + } + { + int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; + unsigned int sampleout = source[sample_index]; + ifgcolor[1] = sampleout; + frac += fracstep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + + __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); + bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + + __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); + fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_srai_epi16(_mm_add_epi32(fg_lo, bg_lo), 8); + __m128i out_hi = _mm_srai_epi16(_mm_add_epi32(fg_hi, bg_hi), 8); + __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)desttmp, outcolor); + dest[offset] = desttmp[0]; + dest[offset + pitch] = desttmp[1]; + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; int offset = index * pitch; __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); // Sample + unsigned int ifgcolor[2]; int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int ifgcolor = source[sample_index]; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ifgcolor), _mm_setzero_si128()); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - int blue0 = BPART(ifgcolor); - int green0 = GPART(ifgcolor); - int red0 = RPART(ifgcolor); - + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - __m128i intensity = _mm_set_epi16(0, intensity0, intensity0, intensity0, 0, intensity0, intensity0, intensity0); + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); fgcolor = _mm_mullo_epi16(fgcolor, mlight); @@ -903,7 +1720,6 @@ namespace swrenderer outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); dest[offset] = _mm_cvtsi128_si32(outcolor); - frac += fracstep; } } else @@ -939,12 +1755,18 @@ namespace swrenderer __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); - for (int index = 0; index < count; index++) + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) { - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + int offset = index * pitch * 2; + uint32_t desttmp[2]; + desttmp[0] = dest[offset]; + desttmp[1] = dest[offset + pitch]; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); // Sample + unsigned int ifgcolor[2]; + { unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; unsigned int y0 = frac_y0 >> FRACBITS; @@ -965,16 +1787,131 @@ namespace swrenderer unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int ifgcolor = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ifgcolor), _mm_setzero_si128()); + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + frac += fracstep; + } + { + unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; + unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; + unsigned int y0 = frac_y0 >> FRACBITS; + unsigned int y1 = frac_y1 >> FRACBITS; + + unsigned int p00 = source[y0]; + unsigned int p01 = source[y1]; + unsigned int p10 = source2[y0]; + unsigned int p11 = source2[y1]; + + unsigned int inv_b = texturefracx; + unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; + unsigned int a = 16 - inv_a; + unsigned int b = 16 - inv_b; + + unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[1] = sampleout; + frac += fracstep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - int blue0 = BPART(ifgcolor); - int green0 = GPART(ifgcolor); - int red0 = RPART(ifgcolor); - + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - __m128i intensity = _mm_set_epi16(0, intensity0, intensity0, intensity0, 0, intensity0, intensity0, intensity0); + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + + __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); + bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + + __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); + fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_srai_epi16(_mm_add_epi32(fg_lo, bg_lo), 8); + __m128i out_hi = _mm_srai_epi16(_mm_add_epi32(fg_hi, bg_hi), 8); + __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)desttmp, outcolor); + dest[offset] = desttmp[0]; + dest[offset + pitch] = desttmp[1]; + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index * pitch; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; + unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; + unsigned int y0 = frac_y0 >> FRACBITS; + unsigned int y1 = frac_y1 >> FRACBITS; + + unsigned int p00 = source[y0]; + unsigned int p01 = source[y1]; + unsigned int p10 = source2[y0]; + unsigned int p11 = source2[y1]; + + unsigned int inv_b = texturefracx; + unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; + unsigned int a = 16 - inv_a; + unsigned int b = 16 - inv_b; + + unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); fgcolor = _mm_mullo_epi16(fgcolor, mlight); @@ -1008,7 +1945,6 @@ namespace swrenderer outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); dest[offset] = _mm_cvtsi128_si32(outcolor); - frac += fracstep; } } } @@ -1060,15 +1996,78 @@ namespace swrenderer __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); - for (int index = 0; index < count; index++) + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) { + int offset = index * pitch * 2; + uint32_t desttmp[2]; + desttmp[0] = dest[offset]; + desttmp[1] = dest[offset + pitch]; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + { + int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + frac += fracstep; + } + { + int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; + unsigned int sampleout = source[sample_index]; + ifgcolor[1] = sampleout; + frac += fracstep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + + __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); + bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + + __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); + fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(fg_lo, bg_lo), 8); + __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(fg_hi, bg_hi), 8); + __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)desttmp, outcolor); + dest[offset] = desttmp[0]; + dest[offset + pitch] = desttmp[1]; + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; int offset = index * pitch; __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); // Sample + unsigned int ifgcolor[2]; int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int ifgcolor = source[sample_index]; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ifgcolor), _mm_setzero_si128()); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); @@ -1100,7 +2099,6 @@ namespace swrenderer outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); dest[offset] = _mm_cvtsi128_si32(outcolor); - frac += fracstep; } } else @@ -1131,12 +2129,18 @@ namespace swrenderer __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); - for (int index = 0; index < count; index++) + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) { - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + int offset = index * pitch * 2; + uint32_t desttmp[2]; + desttmp[0] = dest[offset]; + desttmp[1] = dest[offset + pitch]; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); // Sample + unsigned int ifgcolor[2]; + { unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; unsigned int y0 = frac_y0 >> FRACBITS; @@ -1157,8 +2161,103 @@ namespace swrenderer unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int ifgcolor = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ifgcolor), _mm_setzero_si128()); + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + frac += fracstep; + } + { + unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; + unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; + unsigned int y0 = frac_y0 >> FRACBITS; + unsigned int y1 = frac_y1 >> FRACBITS; + + unsigned int p00 = source[y0]; + unsigned int p01 = source[y1]; + unsigned int p10 = source2[y0]; + unsigned int p11 = source2[y1]; + + unsigned int inv_b = texturefracx; + unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; + unsigned int a = 16 - inv_a; + unsigned int b = 16 - inv_b; + + unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[1] = sampleout; + frac += fracstep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + + __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); + bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + + __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); + fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(fg_lo, bg_lo), 8); + __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(fg_hi, bg_hi), 8); + __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)desttmp, outcolor); + dest[offset] = desttmp[0]; + dest[offset + pitch] = desttmp[1]; + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index * pitch; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; + unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; + unsigned int y0 = frac_y0 >> FRACBITS; + unsigned int y1 = frac_y1 >> FRACBITS; + + unsigned int p00 = source[y0]; + unsigned int p01 = source[y1]; + unsigned int p10 = source2[y0]; + unsigned int p11 = source2[y1]; + + unsigned int inv_b = texturefracx; + unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; + unsigned int a = 16 - inv_a; + unsigned int b = 16 - inv_b; + + unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); @@ -1190,7 +2289,6 @@ namespace swrenderer outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); dest[offset] = _mm_cvtsi128_si32(outcolor); - frac += fracstep; } } } @@ -1231,23 +2329,106 @@ namespace swrenderer __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); - for (int index = 0; index < count; index++) + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) { + int offset = index * pitch * 2; + uint32_t desttmp[2]; + desttmp[0] = dest[offset]; + desttmp[1] = dest[offset + pitch]; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + { + int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + frac += fracstep; + } + { + int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; + unsigned int sampleout = source[sample_index]; + ifgcolor[1] = sampleout; + frac += fracstep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + + __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); + bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + + __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); + fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(fg_lo, bg_lo), 8); + __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(fg_hi, bg_hi), 8); + __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)desttmp, outcolor); + dest[offset] = desttmp[0]; + dest[offset + pitch] = desttmp[1]; + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; int offset = index * pitch; __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); // Sample + unsigned int ifgcolor[2]; int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int ifgcolor = source[sample_index]; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ifgcolor), _mm_setzero_si128()); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - int blue0 = BPART(ifgcolor); - int green0 = GPART(ifgcolor); - int red0 = RPART(ifgcolor); - + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - __m128i intensity = _mm_set_epi16(0, intensity0, intensity0, intensity0, 0, intensity0, intensity0, intensity0); + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); fgcolor = _mm_mullo_epi16(fgcolor, mlight); @@ -1281,7 +2462,6 @@ namespace swrenderer outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); dest[offset] = _mm_cvtsi128_si32(outcolor); - frac += fracstep; } } else @@ -1317,12 +2497,18 @@ namespace swrenderer __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); - for (int index = 0; index < count; index++) + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) { - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + int offset = index * pitch * 2; + uint32_t desttmp[2]; + desttmp[0] = dest[offset]; + desttmp[1] = dest[offset + pitch]; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); // Sample + unsigned int ifgcolor[2]; + { unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; unsigned int y0 = frac_y0 >> FRACBITS; @@ -1343,16 +2529,131 @@ namespace swrenderer unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int ifgcolor = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ifgcolor), _mm_setzero_si128()); + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + frac += fracstep; + } + { + unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; + unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; + unsigned int y0 = frac_y0 >> FRACBITS; + unsigned int y1 = frac_y1 >> FRACBITS; + + unsigned int p00 = source[y0]; + unsigned int p01 = source[y1]; + unsigned int p10 = source2[y0]; + unsigned int p11 = source2[y1]; + + unsigned int inv_b = texturefracx; + unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; + unsigned int a = 16 - inv_a; + unsigned int b = 16 - inv_b; + + unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[1] = sampleout; + frac += fracstep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - int blue0 = BPART(ifgcolor); - int green0 = GPART(ifgcolor); - int red0 = RPART(ifgcolor); - + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - __m128i intensity = _mm_set_epi16(0, intensity0, intensity0, intensity0, 0, intensity0, intensity0, intensity0); + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + + __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); + bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + + __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); + fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(fg_lo, bg_lo), 8); + __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(fg_hi, bg_hi), 8); + __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)desttmp, outcolor); + dest[offset] = desttmp[0]; + dest[offset + pitch] = desttmp[1]; + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index * pitch; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; + unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; + unsigned int y0 = frac_y0 >> FRACBITS; + unsigned int y1 = frac_y1 >> FRACBITS; + + unsigned int p00 = source[y0]; + unsigned int p01 = source[y1]; + unsigned int p10 = source2[y0]; + unsigned int p11 = source2[y1]; + + unsigned int inv_b = texturefracx; + unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; + unsigned int a = 16 - inv_a; + unsigned int b = 16 - inv_b; + + unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); fgcolor = _mm_mullo_epi16(fgcolor, mlight); @@ -1386,7 +2687,6 @@ namespace swrenderer outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); dest[offset] = _mm_cvtsi128_si32(outcolor); - frac += fracstep; } } } @@ -1438,15 +2738,78 @@ namespace swrenderer __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); - for (int index = 0; index < count; index++) + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) { + int offset = index * pitch * 2; + uint32_t desttmp[2]; + desttmp[0] = dest[offset]; + desttmp[1] = dest[offset + pitch]; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + { + int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + frac += fracstep; + } + { + int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; + unsigned int sampleout = source[sample_index]; + ifgcolor[1] = sampleout; + frac += fracstep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + + __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); + bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + + __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); + fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(bg_lo, fg_lo), 8); + __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(bg_hi, fg_hi), 8); + __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)desttmp, outcolor); + dest[offset] = desttmp[0]; + dest[offset + pitch] = desttmp[1]; + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; int offset = index * pitch; __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); // Sample + unsigned int ifgcolor[2]; int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int ifgcolor = source[sample_index]; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ifgcolor), _mm_setzero_si128()); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); @@ -1478,7 +2841,6 @@ namespace swrenderer outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); dest[offset] = _mm_cvtsi128_si32(outcolor); - frac += fracstep; } } else @@ -1509,12 +2871,18 @@ namespace swrenderer __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); - for (int index = 0; index < count; index++) + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) { - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + int offset = index * pitch * 2; + uint32_t desttmp[2]; + desttmp[0] = dest[offset]; + desttmp[1] = dest[offset + pitch]; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); // Sample + unsigned int ifgcolor[2]; + { unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; unsigned int y0 = frac_y0 >> FRACBITS; @@ -1535,8 +2903,103 @@ namespace swrenderer unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int ifgcolor = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ifgcolor), _mm_setzero_si128()); + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + frac += fracstep; + } + { + unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; + unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; + unsigned int y0 = frac_y0 >> FRACBITS; + unsigned int y1 = frac_y1 >> FRACBITS; + + unsigned int p00 = source[y0]; + unsigned int p01 = source[y1]; + unsigned int p10 = source2[y0]; + unsigned int p11 = source2[y1]; + + unsigned int inv_b = texturefracx; + unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; + unsigned int a = 16 - inv_a; + unsigned int b = 16 - inv_b; + + unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[1] = sampleout; + frac += fracstep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); + + // Blend + __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + + __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); + bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + + __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); + fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(bg_lo, fg_lo), 8); + __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(bg_hi, fg_hi), 8); + __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)desttmp, outcolor); + dest[offset] = desttmp[0]; + dest[offset + pitch] = desttmp[1]; + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index * pitch; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; + unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; + unsigned int y0 = frac_y0 >> FRACBITS; + unsigned int y1 = frac_y1 >> FRACBITS; + + unsigned int p00 = source[y0]; + unsigned int p01 = source[y1]; + unsigned int p10 = source2[y0]; + unsigned int p11 = source2[y1]; + + unsigned int inv_b = texturefracx; + unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; + unsigned int a = 16 - inv_a; + unsigned int b = 16 - inv_b; + + unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, mlight), 8); @@ -1568,7 +3031,6 @@ namespace swrenderer outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); dest[offset] = _mm_cvtsi128_si32(outcolor); - frac += fracstep; } } } @@ -1609,23 +3071,106 @@ namespace swrenderer __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); - for (int index = 0; index < count; index++) + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) { + int offset = index * pitch * 2; + uint32_t desttmp[2]; + desttmp[0] = dest[offset]; + desttmp[1] = dest[offset + pitch]; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + { + int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + frac += fracstep; + } + { + int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; + unsigned int sampleout = source[sample_index]; + ifgcolor[1] = sampleout; + frac += fracstep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + + __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); + bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + + __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); + fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(bg_lo, fg_lo), 8); + __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(bg_hi, fg_hi), 8); + __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)desttmp, outcolor); + dest[offset] = desttmp[0]; + dest[offset + pitch] = desttmp[1]; + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; int offset = index * pitch; __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); // Sample + unsigned int ifgcolor[2]; int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int ifgcolor = source[sample_index]; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ifgcolor), _mm_setzero_si128()); + unsigned int sampleout = source[sample_index]; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - int blue0 = BPART(ifgcolor); - int green0 = GPART(ifgcolor); - int red0 = RPART(ifgcolor); - + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - __m128i intensity = _mm_set_epi16(0, intensity0, intensity0, intensity0, 0, intensity0, intensity0, intensity0); + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); fgcolor = _mm_mullo_epi16(fgcolor, mlight); @@ -1659,7 +3204,6 @@ namespace swrenderer outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); dest[offset] = _mm_cvtsi128_si32(outcolor); - frac += fracstep; } } else @@ -1695,12 +3239,18 @@ namespace swrenderer __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); - for (int index = 0; index < count; index++) + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) { - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + int offset = index * pitch * 2; + uint32_t desttmp[2]; + desttmp[0] = dest[offset]; + desttmp[1] = dest[offset + pitch]; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); // Sample + unsigned int ifgcolor[2]; + { unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; unsigned int y0 = frac_y0 >> FRACBITS; @@ -1721,16 +3271,131 @@ namespace swrenderer unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int ifgcolor = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ifgcolor), _mm_setzero_si128()); + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + frac += fracstep; + } + { + unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; + unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; + unsigned int y0 = frac_y0 >> FRACBITS; + unsigned int y1 = frac_y1 >> FRACBITS; + + unsigned int p00 = source[y0]; + unsigned int p01 = source[y1]; + unsigned int p10 = source2[y0]; + unsigned int p11 = source2[y1]; + + unsigned int inv_b = texturefracx; + unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; + unsigned int a = 16 - inv_a; + unsigned int b = 16 - inv_b; + + unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[1] = sampleout; + frac += fracstep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade - int blue0 = BPART(ifgcolor); - int green0 = GPART(ifgcolor); - int red0 = RPART(ifgcolor); - + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - __m128i intensity = _mm_set_epi16(0, intensity0, intensity0, intensity0, 0, intensity0, intensity0, intensity0); + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); + + fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); + fgcolor = _mm_mullo_epi16(fgcolor, mlight); + fgcolor = _mm_srli_epi16(_mm_add_epi16(shade_fade, fgcolor), 8); + fgcolor = _mm_srli_epi16(_mm_mullo_epi16(fgcolor, shade_light), 8); + + // Blend + __m128i alpha = _mm_shufflelo_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_shufflehi_epi16(fgcolor, _MM_SHUFFLE(3,3,3,3)); + alpha = _mm_add_epi16(alpha, _mm_srli_epi16(alpha, 7)); // 255 -> 256 + __m128i inv_alpha = _mm_sub_epi16(_mm_set1_epi16(256), alpha); + + __m128i bgalpha = _mm_mullo_epi16(destalpha, alpha); + bgalpha = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(bgalpha, _mm_slli_epi16(inv_alpha, 8)), _mm_set1_epi16(128)), 8); + + __m128i fgalpha = _mm_mullo_epi16(srcalpha, alpha); + fgalpha = _mm_srli_epi16(_mm_add_epi16(fgalpha, _mm_set1_epi16(128)), 8); + + fgcolor = _mm_mullo_epi16(fgcolor, fgalpha); + bgcolor = _mm_mullo_epi16(bgcolor, bgalpha); + + __m128i fg_lo = _mm_unpacklo_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi16(bgcolor, _mm_setzero_si128()); + __m128i fg_hi = _mm_unpackhi_epi16(fgcolor, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi16(bgcolor, _mm_setzero_si128()); + + __m128i out_lo = _mm_srai_epi16(_mm_sub_epi32(bg_lo, fg_lo), 8); + __m128i out_hi = _mm_srai_epi16(_mm_sub_epi32(bg_hi, fg_hi), 8); + __m128i outcolor = _mm_packs_epi32(fg_lo, fg_hi); + outcolor = _mm_packus_epi16(outcolor, _mm_setzero_si128()); + outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); + + _mm_storel_epi64((__m128i*)desttmp, outcolor); + dest[offset] = desttmp[0]; + dest[offset + pitch] = desttmp[1]; + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index * pitch; + __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + + // Sample + unsigned int ifgcolor[2]; + unsigned int frac_y0 = (frac >> FRACBITS) * textureheight; + unsigned int frac_y1 = ((frac + one) >> FRACBITS) * textureheight; + unsigned int y0 = frac_y0 >> FRACBITS; + unsigned int y1 = frac_y1 >> FRACBITS; + + unsigned int p00 = source[y0]; + unsigned int p01 = source[y1]; + unsigned int p10 = source2[y0]; + unsigned int p11 = source2[y1]; + + unsigned int inv_b = texturefracx; + unsigned int inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; + unsigned int a = 16 - inv_a; + unsigned int b = 16 - inv_b; + + unsigned int sred = (RPART(p00) * (a * b) + RPART(p01) * (inv_a * b) + RPART(p10) * (a * inv_b) + RPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sgreen = (GPART(p00) * (a * b) + GPART(p01) * (inv_a * b) + GPART(p10) * (a * inv_b) + GPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; + unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; + + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); + int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); fgcolor = _mm_mullo_epi16(fgcolor, mlight); @@ -1764,7 +3429,6 @@ namespace swrenderer outcolor = _mm_or_si128(outcolor, _mm_set1_epi32(0xff000000)); dest[offset] = _mm_cvtsi128_si32(outcolor); - frac += fracstep; } } } diff --git a/src/swrenderer/drawers/r_draw_wall32.php b/src/swrenderer/drawers/r_draw_wall32.php index d51c52455..81abbfe69 100644 --- a/src/swrenderer/drawers/r_draw_wall32.php +++ b/src/swrenderer/drawers/r_draw_wall32.php @@ -123,13 +123,56 @@ namespace swrenderer __m128i srcalpha = _mm_set1_epi16(args.SrcAlpha()); __m128i destalpha = _mm_set1_epi16(args.DestAlpha()); - for (int index = 0; index < count; index++) + int ssecount = count / 2; + for (int index = 0; index < ssecount; index++) { - int offset = index * pitch; - __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + int offset = index * pitch * 2; + uint32_t desttmp[2]; + desttmp[0] = dest[offset]; + desttmp[1] = dest[offset + pitch]; + + __m128i bgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)desttmp), _mm_setzero_si128()); + // Sample + unsigned int ifgcolor[2]; + { + ifgcolor[0] = sampleout; + frac += fracstep; + } + { + + ifgcolor[1] = sampleout; + frac += fracstep; + } + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); + + // Shade + + + // Blend + + + _mm_storel_epi64((__m128i*)desttmp, outcolor); + dest[offset] = desttmp[0]; + dest[offset + pitch] = desttmp[1]; + } + + if (ssecount * 2 != count) + { + int index = ssecount * 2; + int offset = index * pitch; + + __m128i bgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(dest[offset]), _mm_setzero_si128()); + + + // Sample + unsigned int ifgcolor[2]; + + ifgcolor[0] = sampleout; + ifgcolor[1] = 0; + __m128i fgcolor = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ifgcolor), _mm_setzero_si128()); // Shade @@ -138,7 +181,6 @@ namespace swrenderer dest[offset] = _mm_cvtsi128_si32(outcolor); - frac += fracstep; } int sample_index = ((frac >> FRACBITS) * textureheight) >> FRACBITS; - unsigned int ifgcolor = source[sample_index]; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ifgcolor), _mm_setzero_si128()); + unsigned int sampleout = source[sample_index]; @@ -172,8 +213,7 @@ namespace swrenderer unsigned int sblue = (BPART(p00) * (a * b) + BPART(p01) * (inv_a * b) + BPART(p10) * (a * inv_b) + BPART(p11) * (inv_a * inv_b) + 127) >> 8; unsigned int salpha = (APART(p00) * (a * b) + APART(p01) * (inv_a * b) + APART(p10) * (a * inv_b) + APART(p11) * (inv_a * inv_b) + 127) >> 8; - unsigned int ifgcolor = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - __m128i fgcolor = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ifgcolor), _mm_setzero_si128()); + unsigned int sampleout = (salpha << 24) | (sred << 16) | (sgreen << 8) | sblue; - int blue0 = BPART(ifgcolor); - int green0 = GPART(ifgcolor); - int red0 = RPART(ifgcolor); - + int blue0 = BPART(ifgcolor[0]); + int green0 = GPART(ifgcolor[0]); + int red0 = RPART(ifgcolor[0]); int intensity0 = ((red0 * 77 + green0 * 143 + blue0 * 37) >> 8) * desaturate; - __m128i intensity = _mm_set_epi16(0, intensity0, intensity0, intensity0, 0, intensity0, intensity0, intensity0); + + int blue1 = BPART(ifgcolor[1]); + int green1 = GPART(ifgcolor[1]); + int red1 = RPART(ifgcolor[1]); + int intensity1 = ((red1 * 77 + green1 * 143 + blue1 * 37) >> 8) * desaturate; + + __m128i intensity = _mm_set_epi16(0, intensity1, intensity1, intensity1, 0, intensity0, intensity0, intensity0); fgcolor = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(fgcolor, inv_desaturate), intensity), 8); fgcolor = _mm_mullo_epi16(fgcolor, mlight);