From 6c037fa24971df781b5581a42cf58651bcb71954 Mon Sep 17 00:00:00 2001 From: Magnus Norddahl Date: Sun, 26 Jun 2016 21:23:32 +0200 Subject: [PATCH] Throwing templates at the code redundancy problem in drawers --- src/r_draw_rgba.cpp | 630 ++++++++++++++++++++-------- src/r_draw_rgba.h | 88 ++-- src/r_draw_rgba_sse.h | 923 +---------------------------------------- src/r_drawt_rgba.cpp | 5 + src/r_drawt_rgba_sse.h | 10 + src/r_segs.cpp | 3 +- 6 files changed, 551 insertions(+), 1108 deletions(-) diff --git a/src/r_draw_rgba.cpp b/src/r_draw_rgba.cpp index bfabdfbbb..fbb2c12c5 100644 --- a/src/r_draw_rgba.cpp +++ b/src/r_draw_rgba.cpp @@ -67,8 +67,13 @@ CVAR(Bool, r_mipmap, true, CVAR_ARCHIVE | CVAR_GLOBALCONFIG); #ifndef NO_SSE +#ifdef _MSC_VER +#pragma warning(disable: 4101) // warning C4101: unreferenced local variable +#endif + // Generate SSE drawers: #define VecCommand(name) name##_SSE_Command +#define VEC_SHADE_VARS SSE_SHADE_VARS #define VEC_SHADE_SIMPLE_INIT SSE_SHADE_SIMPLE_INIT #define VEC_SHADE_SIMPLE_INIT4 SSE_SHADE_SIMPLE_INIT4 #define VEC_SHADE_SIMPLE SSE_SHADE_SIMPLE @@ -1552,8 +1557,446 @@ public: return (--count) != 0; } }; + +#ifdef NO_SSE + struct NearestSampler + { + FORCEINLINE static uint32_t Sample1(DrawerWall4Command &cmd, LoopIterator &loop, int index) + { + return cmd._bufplce[index][loop.sample_index(index)]; + } + }; + struct LinearSampler + { + FORCEINLINE static uint32_t Sample1(DrawerWall4Command &cmd, LoopIterator &loop, int index) + { + return SampleBgra::sample_bilinear(cmd._bufplce[index], cmd._bufplce2[index], cmd._buftexturefracx[index], loop.vplce[index], loop.half[index], loop.height[index]); + } + }; +#else + struct NearestSampler + { + FORCEINLINE static __m128i Sample4(DrawerWall4Command &cmd, LoopIterator &loop) + { + return _mm_set_epi32(cmd._bufplce[3][loop.sample_index(3)], cmd._bufplce[2][loop.sample_index(2)], cmd._bufplce[1][loop.sample_index(1)], cmd._bufplce[0][loop.sample_index(0)]); + } + }; + + struct LinearSampler + { + FORCEINLINE static __m128i Sample4(DrawerWall4Command &cmd, LoopIterator &loop) + { + __m128i fg; + VEC_SAMPLE_BILINEAR4_COLUMN(fg, cmd._bufplce, cmd._bufplce2, cmd._buftexturefracx, loop.vplce, loop.half, loop.height); + return fg; + } + }; +#endif + +#ifdef NO_SSE + template + struct Copy + { + Copy(DrawerWall4Command &cmd, LoopIterator &loop) + { + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + for (int i = 0; i < 4; i++) + { + uint32_t fg = LightBgra::shade_bgra(Sampler::Sample1(cmd, loop, i), cmd._light[i], cmd._shade_constants); + loop.dest[i] = BlendBgra::copy(fg); + } + } + }; + + template + struct Mask + { + Mask(DrawerWall4Command &cmd, LoopIterator &loop) + { + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + for (int i = 0; i < 4; i++) + { + uint32_t fg = LightBgra::shade_bgra(Sampler::Sample1(cmd, loop, i), cmd._light[i], cmd._shade_constants); + loop.dest[i] = BlendBgra::alpha_blend(fg, loop.dest[i]); + } + } + }; + + template + struct TMaskAdd + { + TMaskAdd(DrawerWall4Command &cmd, LoopIterator &loop) + { + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + for (int i = 0; i < 4; i++) + { + uint32_t fg = LightBgra::shade_bgra(Sampler::Sample1(cmd, loop, i), cmd._light[i], cmd._shade_constants); + loop.dest[i] = BlendBgra::add(fg, loop.dest[i], cmd._srcalpha, calc_blend_bgalpha(fg, cmd._destalpha)); + } + } + }; + + template + struct TMaskSub + { + TMaskSub(DrawerWall4Command &cmd, LoopIterator &loop) + { + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + for (int i = 0; i < 4; i++) + { + uint32_t fg = LightBgra::shade_bgra(Sampler::Sample1(cmd, loop, i), cmd._light[i], cmd._shade_constants); + loop.dest[i] = BlendBgra::sub(fg, loop.dest[i], cmd._srcalpha, calc_blend_bgalpha(fg, cmd._destalpha)); + } + } + }; + + template + struct TMaskRevSub + { + TMaskRevSub(DrawerWall4Command &cmd, LoopIterator &loop) + { + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + for (int i = 0; i < 4; i++) + { + uint32_t fg = LightBgra::shade_bgra(Sampler::Sample1(cmd, loop, i), cmd._light[i], cmd._shade_constants); + loop.dest[i] = BlendBgra::revsub(fg, loop.dest[i], cmd._srcalpha, calc_blend_bgalpha(fg, cmd._destalpha)); + } + } + }; + + typedef Copy CopyNearestSimple; + typedef Copy CopyLinearSimple; + typedef Copy CopyNearest; + typedef Copy CopyLinear; + typedef Mask MaskNearestSimple; + typedef Mask MaskLinearSimple; + typedef Mask MaskNearest; + typedef Mask MaskLinear; + typedef TMaskAdd TMaskAddNearestSimple; + typedef TMaskAdd TMaskAddLinearSimple; + typedef TMaskAdd TMaskAddNearest; + typedef TMaskAdd TMaskAddLinear; + typedef TMaskSub TMaskSubNearestSimple; + typedef TMaskSub TMaskSubLinearSimple; + typedef TMaskSub TMaskSubNearest; + typedef TMaskSub TMaskSubLinear; + typedef TMaskRevSub TMaskRevSubNearestSimple; + typedef TMaskRevSub TMaskRevSubLinearSimple; + typedef TMaskRevSub TMaskRevSubNearest; + typedef TMaskRevSub TMaskRevSubLinear; +#else + template + struct CopySimple + { + VEC_SHADE_VARS(); + CopySimple(DrawerWall4Command &cmd, LoopIterator &loop) + { + VEC_SHADE_SIMPLE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0]); + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + __m128i fg = Sampler::Sample4(cmd, loop); + VEC_SHADE_SIMPLE(fg); + _mm_storeu_si128((__m128i*)loop.dest, fg); + } + }; + + template + struct Copy + { + VEC_SHADE_VARS(); + Copy(DrawerWall4Command &cmd, LoopIterator &loop) + { + VEC_SHADE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0], cmd._shade_constants); + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + __m128i fg = Sampler::Sample4(cmd, loop); + VEC_SHADE(fg, cmd._shade_constants); + _mm_storeu_si128((__m128i*)loop.dest, fg); + } + }; + + template + struct MaskSimple + { + VEC_SHADE_VARS(); + MaskSimple(DrawerWall4Command &cmd, LoopIterator &loop) + { + VEC_SHADE_SIMPLE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0]); + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + __m128i fg = Sampler::Sample4(cmd, loop); + __m128i bg = _mm_loadu_si128((const __m128i*)loop.dest); + VEC_SHADE_SIMPLE(fg); + VEC_ALPHA_BLEND(fg, bg); + _mm_storeu_si128((__m128i*)loop.dest, fg); + } + }; + + template + struct Mask + { + VEC_SHADE_VARS(); + Mask(DrawerWall4Command &cmd, LoopIterator &loop) + { + VEC_SHADE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0], cmd._shade_constants); + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + __m128i fg = Sampler::Sample4(cmd, loop); + __m128i bg = _mm_loadu_si128((const __m128i*)loop.dest); + VEC_SHADE(fg, cmd._shade_constants); + VEC_ALPHA_BLEND(fg, bg); + _mm_storeu_si128((__m128i*)loop.dest, fg); + } + }; + + template + struct TMaskAddSimple + { + VEC_SHADE_VARS(); + VEC_CALC_BLEND_ALPHA_VARS(); + TMaskAddSimple(DrawerWall4Command &cmd, LoopIterator &loop) + { + VEC_SHADE_SIMPLE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0]); + VEC_CALC_BLEND_ALPHA_INIT(cmd._srcalpha, cmd._destalpha); + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + __m128i fg = Sampler::Sample4(cmd, loop); + __m128i bg = _mm_loadu_si128((const __m128i*)loop.dest); + + VEC_CALC_BLEND_ALPHA(fg); + VEC_SHADE_SIMPLE(fg); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + __m128i out_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8); + __m128i out_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8); + __m128i out = _mm_packus_epi16(out_lo, out_hi); + + _mm_storeu_si128((__m128i*)loop.dest, out); + } + }; + + template + struct TMaskAdd + { + VEC_SHADE_VARS(); + VEC_CALC_BLEND_ALPHA_VARS(); + TMaskAdd(DrawerWall4Command &cmd, LoopIterator &loop) + { + VEC_SHADE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0], cmd._shade_constants); + VEC_CALC_BLEND_ALPHA_INIT(cmd._srcalpha, cmd._destalpha); + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + __m128i fg = Sampler::Sample4(cmd, loop); + __m128i bg = _mm_loadu_si128((const __m128i*)loop.dest); + + VEC_CALC_BLEND_ALPHA(fg); + VEC_SHADE_SIMPLE(fg); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + __m128i out_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8); + __m128i out_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8); + __m128i out = _mm_packus_epi16(out_lo, out_hi); + + _mm_storeu_si128((__m128i*)loop.dest, out); + } + }; + + template + struct TMaskSubSimple + { + VEC_SHADE_VARS(); + VEC_CALC_BLEND_ALPHA_VARS(); + TMaskSubSimple(DrawerWall4Command &cmd, LoopIterator &loop) + { + VEC_SHADE_SIMPLE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0]); + VEC_CALC_BLEND_ALPHA_INIT(cmd._srcalpha, cmd._destalpha); + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + __m128i fg = Sampler::Sample4(cmd, loop); + __m128i bg = _mm_loadu_si128((const __m128i*)loop.dest); + + VEC_CALC_BLEND_ALPHA(fg); + VEC_SHADE_SIMPLE(fg); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + __m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_hi, bg_alpha_hi), _mm_mullo_epi16(fg_hi, fg_alpha_hi)), 8); + __m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_lo, bg_alpha_lo), _mm_mullo_epi16(fg_lo, fg_alpha_lo)), 8); + __m128i out = _mm_packus_epi16(out_lo, out_hi); + + _mm_storeu_si128((__m128i*)loop.dest, out); + } + }; + + template + struct TMaskSub + { + VEC_SHADE_VARS(); + VEC_CALC_BLEND_ALPHA_VARS(); + TMaskSub(DrawerWall4Command &cmd, LoopIterator &loop) + { + VEC_SHADE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0], cmd._shade_constants); + VEC_CALC_BLEND_ALPHA_INIT(cmd._srcalpha, cmd._destalpha); + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + __m128i fg = Sampler::Sample4(cmd, loop); + __m128i bg = _mm_loadu_si128((const __m128i*)loop.dest); + + VEC_CALC_BLEND_ALPHA(fg); + VEC_SHADE_SIMPLE(fg); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + __m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_hi, bg_alpha_hi), _mm_mullo_epi16(fg_hi, fg_alpha_hi)), 8); + __m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_lo, bg_alpha_lo), _mm_mullo_epi16(fg_lo, fg_alpha_lo)), 8); + __m128i out = _mm_packus_epi16(out_lo, out_hi); + + _mm_storeu_si128((__m128i*)loop.dest, out); + } + }; + + template + struct TMaskRevSubSimple + { + VEC_SHADE_VARS(); + VEC_CALC_BLEND_ALPHA_VARS(); + TMaskRevSubSimple(DrawerWall4Command &cmd, LoopIterator &loop) + { + VEC_SHADE_SIMPLE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0]); + VEC_CALC_BLEND_ALPHA_INIT(cmd._srcalpha, cmd._destalpha); + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + __m128i fg = Sampler::Sample4(cmd, loop); + __m128i bg = _mm_loadu_si128((const __m128i*)loop.dest); + + VEC_CALC_BLEND_ALPHA(fg); + VEC_SHADE_SIMPLE(fg); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + __m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8); + __m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8); + __m128i out = _mm_packus_epi16(out_lo, out_hi); + + _mm_storeu_si128((__m128i*)loop.dest, out); + } + }; + + template + struct TMaskRevSub + { + VEC_SHADE_VARS(); + VEC_CALC_BLEND_ALPHA_VARS(); + TMaskRevSub(DrawerWall4Command &cmd, LoopIterator &loop) + { + VEC_SHADE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0], cmd._shade_constants); + VEC_CALC_BLEND_ALPHA_INIT(cmd._srcalpha, cmd._destalpha); + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + __m128i fg = Sampler::Sample4(cmd, loop); + __m128i bg = _mm_loadu_si128((const __m128i*)loop.dest); + + VEC_CALC_BLEND_ALPHA(fg); + VEC_SHADE_SIMPLE(fg); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + __m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8); + __m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8); + __m128i out = _mm_packus_epi16(out_lo, out_hi); + + _mm_storeu_si128((__m128i*)loop.dest, out); + } + }; + + typedef CopySimple CopyNearestSimple; + typedef CopySimple CopyLinearSimple; + typedef Copy CopyNearest; + typedef Copy CopyLinear; + typedef MaskSimple MaskNearestSimple; + typedef MaskSimple MaskLinearSimple; + typedef Mask MaskNearest; + typedef Mask MaskLinear; + typedef TMaskAddSimple TMaskAddNearestSimple; + typedef TMaskAddSimple TMaskAddLinearSimple; + typedef TMaskAdd TMaskAddNearest; + typedef TMaskAdd TMaskAddLinear; + typedef TMaskSubSimple TMaskSubNearestSimple; + typedef TMaskSubSimple TMaskSubLinearSimple; + typedef TMaskSub TMaskSubNearest; + typedef TMaskSub TMaskSubLinear; + typedef TMaskRevSubSimple TMaskRevSubNearestSimple; + typedef TMaskRevSubSimple TMaskRevSubLinearSimple; + typedef TMaskRevSub TMaskRevSubNearest; + typedef TMaskRevSub TMaskRevSubLinear; +#endif }; +typedef DrawerBlendCommand Vlinec4NearestSimpleRGBACommand; +typedef DrawerBlendCommand Vlinec4NearestRGBACommand; +typedef DrawerBlendCommand Vlinec4LinearSimpleRGBACommand; +typedef DrawerBlendCommand Vlinec4LinearRGBACommand; +typedef DrawerBlendCommand Mvlinec4NearestSimpleRGBACommand; +typedef DrawerBlendCommand Mvlinec4NearestRGBACommand; +typedef DrawerBlendCommand Mvlinec4LinearSimpleRGBACommand; +typedef DrawerBlendCommand Mvlinec4LinearRGBACommand; +typedef DrawerBlendCommand Tmvline4AddNearestSimpleRGBACommand; +typedef DrawerBlendCommand Tmvline4AddNearestRGBACommand; +typedef DrawerBlendCommand Tmvline4AddLinearSimpleRGBACommand; +typedef DrawerBlendCommand Tmvline4AddLinearRGBACommand; +typedef DrawerBlendCommand Tmvline4AddClampNearestSimpleRGBACommand; +typedef DrawerBlendCommand Tmvline4AddClampNearestRGBACommand; +typedef DrawerBlendCommand Tmvline4AddClampLinearSimpleRGBACommand; +typedef DrawerBlendCommand Tmvline4AddClampLinearRGBACommand; +typedef DrawerBlendCommand Tmvline4SubClampNearestSimpleRGBACommand; +typedef DrawerBlendCommand Tmvline4SubClampNearestRGBACommand; +typedef DrawerBlendCommand Tmvline4SubClampLinearSimpleRGBACommand; +typedef DrawerBlendCommand Tmvline4SubClampLinearRGBACommand; +typedef DrawerBlendCommand Tmvline4RevSubClampNearestSimpleRGBACommand; +typedef DrawerBlendCommand Tmvline4RevSubClampNearestRGBACommand; +typedef DrawerBlendCommand Tmvline4RevSubClampLinearSimpleRGBACommand; +typedef DrawerBlendCommand Tmvline4RevSubClampLinearRGBACommand; + class Vlinec1RGBACommand : public DrawerWall1Command { public: @@ -1581,39 +2024,6 @@ public: } }; -class Vlinec4RGBACommand : public DrawerWall4Command -{ -public: - void Execute(DrawerThread *thread) override - { - LoopIterator loop(this, thread); - if (!loop) return; - - if (_bufplce2[0] == nullptr) - { - do - { - for (int i = 0; i < 4; i++) - { - uint32_t fg = LightBgra::shade_bgra(_bufplce[i][loop.sample_index(i)], _light[i], _shade_constants); - loop.dest[i] = BlendBgra::copy(fg); - } - } while (loop.next()); - } - else - { - do - { - for (int i = 0; i < 4; i++) - { - uint32_t fg = LightBgra::shade_bgra(SampleBgra::sample_bilinear(_bufplce[i], _bufplce2[i], _buftexturefracx[i], loop.sample_index(i), loop.half[i], loop.height[i]), _light[i], _shade_constants); - loop.dest[i] = BlendBgra::copy(fg); - } - } while (loop.next()); - } - } -}; - class Mvlinec1RGBACommand : public DrawerWall1Command { public: @@ -1641,39 +2051,6 @@ public: } }; -class Mvlinec4RGBACommand : public DrawerWall4Command -{ -public: - void Execute(DrawerThread *thread) override - { - LoopIterator loop(this, thread); - if (!loop) return; - - if (_bufplce2[0] == nullptr) - { - do - { - for (int i = 0; i < 4; i++) - { - uint32_t fg = LightBgra::shade_bgra(_bufplce[i][loop.sample_index(i)], _light[i], _shade_constants); - loop.dest[i] = BlendBgra::alpha_blend(fg, loop.dest[i]); - } - } while (loop.next()); - } - else - { - do - { - for (int i = 0; i < 4; i++) - { - uint32_t fg = LightBgra::shade_bgra(SampleBgra::sample_bilinear(_bufplce[i], _bufplce2[i], _buftexturefracx[i], loop.sample_index(i), loop.half[i], loop.height[i]), _light[i], _shade_constants); - loop.dest[i] = BlendBgra::alpha_blend(fg, loop.dest[i]); - } - } while (loop.next()); - } - } -}; - class Tmvline1AddRGBACommand : public DrawerWall1Command { public: @@ -1689,24 +2066,6 @@ public: } }; -class Tmvline4AddRGBACommand : public DrawerWall4Command -{ -public: - void Execute(DrawerThread *thread) override - { - LoopIterator loop(this, thread); - if (!loop) return; - do - { - for (int i = 0; i < 4; i++) - { - uint32_t fg = LightBgra::shade_bgra(_bufplce[i][loop.sample_index(i)], _light[i], _shade_constants); - loop.dest[i] = BlendBgra::add(fg, loop.dest[i], _srcalpha, calc_blend_bgalpha(fg, _destalpha)); - } - } while (loop.next()); - } -}; - class Tmvline1AddClampRGBACommand : public DrawerWall1Command { public: @@ -1722,24 +2081,6 @@ public: } }; -class Tmvline4AddClampRGBACommand : public DrawerWall4Command -{ -public: - void Execute(DrawerThread *thread) override - { - LoopIterator loop(this, thread); - if (!loop) return; - do - { - for (int i = 0; i < 4; i++) - { - uint32_t fg = LightBgra::shade_bgra(_bufplce[i][loop.sample_index(i)], _light[i], _shade_constants); - loop.dest[i] = BlendBgra::add(fg, loop.dest[i], _srcalpha, calc_blend_bgalpha(fg, _destalpha)); - } - } while (loop.next()); - } -}; - class Tmvline1SubClampRGBACommand : public DrawerWall1Command { public: @@ -1755,24 +2096,6 @@ public: } }; -class Tmvline4SubClampRGBACommand : public DrawerWall4Command -{ -public: - void Execute(DrawerThread *thread) override - { - LoopIterator loop(this, thread); - if (!loop) return; - do - { - for (int i = 0; i < 4; i++) - { - uint32_t fg = LightBgra::shade_bgra(_bufplce[i][loop.sample_index(i)], _light[i], _shade_constants); - loop.dest[i] = BlendBgra::sub(fg, loop.dest[i], _srcalpha, calc_blend_bgalpha(fg, _destalpha)); - } - } while (loop.next()); - } -}; - class Tmvline1RevSubClampRGBACommand : public DrawerWall1Command { public: @@ -1788,24 +2111,6 @@ public: } }; -class Tmvline4RevSubClampRGBACommand : public DrawerWall4Command -{ -public: - void Execute(DrawerThread *thread) override - { - LoopIterator loop(this, thread); - if (!loop) return; - do - { - for (int i = 0; i < 4; i++) - { - uint32_t fg = LightBgra::shade_bgra(_bufplce[i][loop.sample_index(i)], _light[i], _shade_constants); - loop.dest[i] = BlendBgra::revsub(fg, loop.dest[i], _srcalpha, calc_blend_bgalpha(fg, _destalpha)); - } - } while (loop.next()); - } -}; - ///////////////////////////////////////////////////////////////////////////// class DrawFogBoundaryLineRGBACommand : public DrawerCommand @@ -2355,13 +2660,22 @@ DWORD vlinec1_rgba() return dc_texturefrac + dc_count * dc_iscale; } +template +void queue_wallcommand() +{ + if (bufplce2[0] == nullptr && dc_shade_constants.simple_shade) + DrawerCommandQueue::QueueCommand(); + else if (bufplce2[0] == nullptr) + DrawerCommandQueue::QueueCommand(); + else if (dc_shade_constants.simple_shade) + DrawerCommandQueue::QueueCommand(); + else + DrawerCommandQueue::QueueCommand(); +} + void vlinec4_rgba() { -#ifdef NO_SSE - DrawerCommandQueue::QueueCommand(); -#else - DrawerCommandQueue::QueueCommand(); -#endif + queue_wallcommand(); for (int i = 0; i < 4; i++) vplce[i] += vince[i] * dc_count; } @@ -2374,11 +2688,7 @@ DWORD mvlinec1_rgba() void mvlinec4_rgba() { -#ifdef NO_SSE - DrawerCommandQueue::QueueCommand(); -#else - DrawerCommandQueue::QueueCommand(); -#endif + queue_wallcommand(); for (int i = 0; i < 4; i++) vplce[i] += vince[i] * dc_count; } @@ -2391,11 +2701,7 @@ fixed_t tmvline1_add_rgba() void tmvline4_add_rgba() { -#ifdef NO_SSE - DrawerCommandQueue::QueueCommand(); -#else - DrawerCommandQueue::QueueCommand(); -#endif + queue_wallcommand(); for (int i = 0; i < 4; i++) vplce[i] += vince[i] * dc_count; } @@ -2408,11 +2714,7 @@ fixed_t tmvline1_addclamp_rgba() void tmvline4_addclamp_rgba() { -#ifdef NO_SSE - DrawerCommandQueue::QueueCommand(); -#else - DrawerCommandQueue::QueueCommand(); -#endif + queue_wallcommand(); for (int i = 0; i < 4; i++) vplce[i] += vince[i] * dc_count; } @@ -2425,11 +2727,7 @@ fixed_t tmvline1_subclamp_rgba() void tmvline4_subclamp_rgba() { -#ifdef NO_SSE - DrawerCommandQueue::QueueCommand(); -#else - DrawerCommandQueue::QueueCommand(); -#endif + queue_wallcommand(); for (int i = 0; i < 4; i++) vplce[i] += vince[i] * dc_count; } @@ -2442,11 +2740,7 @@ fixed_t tmvline1_revsubclamp_rgba() void tmvline4_revsubclamp_rgba() { -#ifdef NO_SSE - DrawerCommandQueue::QueueCommand(); -#else - DrawerCommandQueue::QueueCommand(); -#endif + queue_wallcommand(); for (int i = 0; i < 4; i++) vplce[i] += vince[i] * dc_count; } diff --git a/src/r_draw_rgba.h b/src/r_draw_rgba.h index 4961fa6dc..53572c88b 100644 --- a/src/r_draw_rgba.h +++ b/src/r_draw_rgba.h @@ -286,6 +286,22 @@ public: void Execute(DrawerThread *thread) override; }; +template +class DrawerBlendCommand : public CommandType +{ +public: + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + BlendMode blend(*this, loop); + do + { + blend.Blend(*this, loop); + } while (loop.next()); + } +}; + ///////////////////////////////////////////////////////////////////////////// // Pixel shading inline functions: @@ -624,7 +640,7 @@ public: __m128i ab_invab = _mm_load_si128(SampleBgra::samplertable + inv_b * 32 + inv_a * 2); \ __m128i ainvb_invainvb = _mm_load_si128(SampleBgra::samplertable + inv_b * 32 + inv_a * 2 + 1); \ \ - __m128i gather = _mm_set_epi32(col1[i][y1], col1[i][y0], col0[i][y1], col1[i][y0]); \ + __m128i gather = _mm_set_epi32(col1[i][y1], col1[i][y0], col0[i][y1], col0[i][y0]); \ __m128i p0 = _mm_unpacklo_epi8(gather, _mm_setzero_si128()); \ __m128i p1 = _mm_unpackhi_epi8(gather, _mm_setzero_si128()); \ \ @@ -635,6 +651,26 @@ public: } \ } +#define VEC_SAMPLE_MIP_NEAREST4_COLUMN(fg, col0, col1, mipfrac, texturefracy, height0, height1) { \ + uint32_t y0[4], y1[4]; \ + for (int i = 0; i < 4; i++) \ + { \ + y0[i] = (texturefracy[i] >> FRACBITS) * height0[i]; \ + y1[i] = (texturefracy[i] >> FRACBITS) * height1[i]; \ + } \ + __m128i p0 = _mm_set_epi32(col0[y0[3]], col0[y0[2]], col0[y0[1]], col0[y0[0]]); \ + __m128i p1 = _mm_set_epi32(col1[y1[3]], col1[y1[2]], col1[y1[1]], col1[y1[0]]); \ + __m128i t = _mm_loadu_si128((const __m128i*)mipfrac); \ + __m128i inv_t = _mm_sub_epi32(_mm_set1_epi32(256), mipfrac); \ + __m128i p0_lo = _mm_unpacklo_epi8(p0, _mm_setzero_si128()); \ + __m128i p0_hi = _mm_unpackhi_epi8(p0, _mm_setzero_si128()); \ + __m128i p1_lo = _mm_unpacklo_epi8(p1, _mm_setzero_si128()); \ + __m128i p1_hi = _mm_unpackhi_epi8(p1, _mm_setzero_si128()); \ + __m128i fg_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(p0_lo, t), _mm_mullo_epi16(p1_lo, inv_t)), 8); \ + __m128i fg_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(p0_hi, t), _mm_mullo_epi16(p1_hi, inv_t)), 8); \ + fg = _mm_packus_epi16(fg_lo, fg_hi); \ +} + #define VEC_SAMPLE_BILINEAR4_SPAN(fg, texture, xfrac, yfrac, xstep, ystep, xbits, ybits) { \ int xshift = (32 - xbits); \ int yshift = (32 - ybits); \ @@ -844,12 +880,14 @@ FORCEINLINE uint32_t calc_blend_bgalpha(uint32_t fg, uint32_t dest_alpha) return (dest_alpha * alpha + 256 * inv_alpha + 128) >> 8; } +#define VEC_CALC_BLEND_ALPHA_VARS() __m128i msrc_alpha, mdest_alpha, m256, m255, m128; + #define VEC_CALC_BLEND_ALPHA_INIT(src_alpha, dest_alpha) \ - __m128i msrc_alpha = _mm_set1_epi16(src_alpha); \ - __m128i mdest_alpha = _mm_set1_epi16(dest_alpha * 255 / 256); \ - __m128i m256 = _mm_set1_epi16(256); \ - __m128i m255 = _mm_set1_epi16(255); \ - __m128i m128 = _mm_set1_epi16(128); + msrc_alpha = _mm_set1_epi16(src_alpha); \ + mdest_alpha = _mm_set1_epi16(dest_alpha * 255 / 256); \ + m256 = _mm_set1_epi16(256); \ + m255 = _mm_set1_epi16(255); \ + m128 = _mm_set1_epi16(128); // Calculates the final alpha values to be used when combined with the source texture alpha channel #define VEC_CALC_BLEND_ALPHA(fg) \ @@ -866,15 +904,17 @@ FORCEINLINE uint32_t calc_blend_bgalpha(uint32_t fg, uint32_t dest_alpha) fg_alpha_lo = msrc_alpha; \ } +#define SSE_SHADE_VARS() __m128i mlight_hi, mlight_lo, color, fade, fade_amount_hi, fade_amount_lo, inv_desaturate; + // Calculate constants for a simple shade #define SSE_SHADE_SIMPLE_INIT(light) \ - __m128i mlight_hi = _mm_set_epi16(256, light, light, light, 256, light, light, light); \ - __m128i mlight_lo = mlight_hi; + mlight_hi = _mm_set_epi16(256, light, light, light, 256, light, light, light); \ + mlight_lo = mlight_hi; // Calculate constants for a simple shade with different light levels for each pixel #define SSE_SHADE_SIMPLE_INIT4(light3, light2, light1, light0) \ - __m128i mlight_hi = _mm_set_epi16(256, light1, light1, light1, 256, light0, light0, light0); \ - __m128i mlight_lo = _mm_set_epi16(256, light3, light3, light3, 256, light2, light2, light2); + mlight_hi = _mm_set_epi16(256, light1, light1, light1, 256, light0, light0, light0); \ + mlight_lo = _mm_set_epi16(256, light3, light3, light3, 256, light2, light2, light2); // Simple shade 4 pixels #define SSE_SHADE_SIMPLE(fg) { \ @@ -889,31 +929,31 @@ FORCEINLINE uint32_t calc_blend_bgalpha(uint32_t fg, uint32_t dest_alpha) // Calculate constants for a complex shade #define SSE_SHADE_INIT(light, shade_constants) \ - __m128i mlight_hi = _mm_set_epi16(256, light, light, light, 256, light, light, light); \ - __m128i mlight_lo = mlight_hi; \ - __m128i color = _mm_set_epi16( \ + mlight_hi = _mm_set_epi16(256, light, light, light, 256, light, light, light); \ + mlight_lo = mlight_hi; \ + color = _mm_set_epi16( \ 256, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, \ 256, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); \ - __m128i fade = _mm_set_epi16( \ + fade = _mm_set_epi16( \ 0, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, \ 0, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); \ - __m128i fade_amount_hi = _mm_mullo_epi16(fade, _mm_subs_epu16(_mm_set1_epi16(256), mlight_hi)); \ - __m128i fade_amount_lo = fade_amount_hi; \ - __m128i inv_desaturate = _mm_set1_epi16(256 - shade_constants.desaturate); \ + fade_amount_hi = _mm_mullo_epi16(fade, _mm_subs_epu16(_mm_set1_epi16(256), mlight_hi)); \ + fade_amount_lo = fade_amount_hi; \ + inv_desaturate = _mm_set1_epi16(256 - shade_constants.desaturate); \ // Calculate constants for a complex shade with different light levels for each pixel #define SSE_SHADE_INIT4(light3, light2, light1, light0, shade_constants) \ - __m128i mlight_hi = _mm_set_epi16(256, light1, light1, light1, 256, light0, light0, light0); \ - __m128i mlight_lo = _mm_set_epi16(256, light3, light3, light3, 256, light2, light2, light2); \ - __m128i color = _mm_set_epi16( \ + mlight_hi = _mm_set_epi16(256, light1, light1, light1, 256, light0, light0, light0); \ + mlight_lo = _mm_set_epi16(256, light3, light3, light3, 256, light2, light2, light2); \ + color = _mm_set_epi16( \ 256, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, \ 256, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); \ - __m128i fade = _mm_set_epi16( \ + fade = _mm_set_epi16( \ 0, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, \ 0, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); \ - __m128i fade_amount_hi = _mm_mullo_epi16(fade, _mm_subs_epu16(_mm_set1_epi16(256), mlight_hi)); \ - __m128i fade_amount_lo = _mm_mullo_epi16(fade, _mm_subs_epu16(_mm_set1_epi16(256), mlight_lo)); \ - __m128i inv_desaturate = _mm_set1_epi16(256 - shade_constants.desaturate); \ + fade_amount_hi = _mm_mullo_epi16(fade, _mm_subs_epu16(_mm_set1_epi16(256), mlight_hi)); \ + fade_amount_lo = _mm_mullo_epi16(fade, _mm_subs_epu16(_mm_set1_epi16(256), mlight_lo)); \ + inv_desaturate = _mm_set1_epi16(256 - shade_constants.desaturate); \ // Complex shade 4 pixels #define SSE_SHADE(fg, shade_constants) { \ diff --git a/src/r_draw_rgba_sse.h b/src/r_draw_rgba_sse.h index ae8d3bf42..4ee557693 100644 --- a/src/r_draw_rgba_sse.h +++ b/src/r_draw_rgba_sse.h @@ -84,6 +84,7 @@ public: if (shade_constants.simple_shade) { + VEC_SHADE_VARS(); VEC_SHADE_SIMPLE_INIT(light); while (sse_count--) @@ -121,6 +122,7 @@ public: } else { + VEC_SHADE_VARS(); VEC_SHADE_INIT(light, shade_constants); while (sse_count--) @@ -184,6 +186,7 @@ public: if (shade_constants.simple_shade) { + VEC_SHADE_VARS(); VEC_SHADE_SIMPLE_INIT(light); while (sse_count--) @@ -217,6 +220,7 @@ public: } else { + VEC_SHADE_VARS(); VEC_SHADE_INIT(light, shade_constants); while (sse_count--) @@ -277,6 +281,7 @@ public: if (shade_constants.simple_shade) { + VEC_SHADE_VARS(); VEC_SHADE_SIMPLE_INIT(light); while (sse_count--) { @@ -289,6 +294,7 @@ public: } else { + VEC_SHADE_VARS(); VEC_SHADE_INIT(light, shade_constants); while (sse_count--) { @@ -317,6 +323,7 @@ public: if (shade_constants.simple_shade) { + VEC_SHADE_VARS(); VEC_SHADE_SIMPLE_INIT(light); while (sse_count--) { @@ -331,6 +338,7 @@ public: } else { + VEC_SHADE_VARS(); VEC_SHADE_INIT(light, shade_constants); while (sse_count--) { @@ -357,918 +365,3 @@ public: } } }; - -class VecCommand(Vlinec4RGBA) : public DrawerCommand -{ - BYTE * RESTRICT _dest; - int _count; - int _pitch; - ShadeConstants _shade_constants; - fixed_t palookuplight[4]; - DWORD vplce[4]; - DWORD vince[4]; - const uint32 * RESTRICT bufplce[4]; - const uint32_t * RESTRICT bufplce2[4]; - uint32_t buftexturefracx[4]; - uint32_t bufheight[4]; - -public: - VecCommand(Vlinec4RGBA)() - { - _dest = dc_dest; - _count = dc_count; - _pitch = dc_pitch; - _shade_constants = dc_shade_constants; - for (int i = 0; i < 4; i++) - { - palookuplight[i] = ::palookuplight[i]; - vplce[i] = ::vplce[i]; - vince[i] = ::vince[i]; - bufplce[i] = (const uint32 *)::bufplce[i]; - bufplce2[i] = (const uint32_t *)::bufplce2[i]; - buftexturefracx[i] = ::buftexturefracx[i]; - bufheight[i] = ::bufheight[i]; - } - } - - void Execute(DrawerThread *thread) override - { - int count = thread->count_for_thread(_dest_y, _count); - if (count <= 0) - return; - - uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest); - int pitch = _pitch * thread->num_cores; - - uint32_t height[4]; - uint32_t half[4]; - for (int i = 0; i < 4; i++) - { - height[i] = bufheight[i]; - half[i] = (0x80000000 + height[i] - 1) / height[i]; - } - - uint32_t light0 = LightBgra::calc_light_multiplier(palookuplight[0]); - uint32_t light1 = LightBgra::calc_light_multiplier(palookuplight[1]); - uint32_t light2 = LightBgra::calc_light_multiplier(palookuplight[2]); - uint32_t light3 = LightBgra::calc_light_multiplier(palookuplight[3]); - - ShadeConstants shade_constants = _shade_constants; - - DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] }; - DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] }; - int skipped = thread->skipped_by_thread(_dest_y); - for (int i = 0; i < 4; i++) - { - local_vplce[i] += local_vince[i] * skipped; - local_vince[i] *= thread->num_cores; - } - - if (bufplce2[0] == nullptr) - { - if (shade_constants.simple_shade) - { - VEC_SHADE_SIMPLE_INIT4(light3, light2, light1, light0); - do - { - DWORD place0 = local_vplce[0]; - DWORD place1 = local_vplce[1]; - DWORD place2 = local_vplce[2]; - DWORD place3 = local_vplce[3]; - - uint32_t p0 = bufplce[0][((place0 >> FRACBITS) * height[0]) >> FRACBITS]; - uint32_t p1 = bufplce[1][((place1 >> FRACBITS) * height[1]) >> FRACBITS]; - uint32_t p2 = bufplce[2][((place2 >> FRACBITS) * height[2]) >> FRACBITS]; - uint32_t p3 = bufplce[3][((place3 >> FRACBITS) * height[3]) >> FRACBITS]; - - local_vplce[0] = place0 + local_vince[0]; - local_vplce[1] = place1 + local_vince[1]; - local_vplce[2] = place2 + local_vince[2]; - local_vplce[3] = place3 + local_vince[3]; - - __m128i fg = _mm_set_epi32(p3, p2, p1, p0); - VEC_SHADE_SIMPLE(fg); - _mm_storeu_si128((__m128i*)dest, fg); - dest += pitch; - } while (--count); - } - else - { - VEC_SHADE_INIT4(light3, light2, light1, light0, shade_constants); - do - { - DWORD place0 = local_vplce[0]; - DWORD place1 = local_vplce[1]; - DWORD place2 = local_vplce[2]; - DWORD place3 = local_vplce[3]; - - uint32_t p0 = bufplce[0][((place0 >> FRACBITS) * height[0]) >> FRACBITS]; - uint32_t p1 = bufplce[1][((place1 >> FRACBITS) * height[1]) >> FRACBITS]; - uint32_t p2 = bufplce[2][((place2 >> FRACBITS) * height[2]) >> FRACBITS]; - uint32_t p3 = bufplce[3][((place3 >> FRACBITS) * height[3]) >> FRACBITS]; - - local_vplce[0] = place0 + local_vince[0]; - local_vplce[1] = place1 + local_vince[1]; - local_vplce[2] = place2 + local_vince[2]; - local_vplce[3] = place3 + local_vince[3]; - - __m128i fg = _mm_set_epi32(p3, p2, p1, p0); - VEC_SHADE(fg, shade_constants); - _mm_storeu_si128((__m128i*)dest, fg); - dest += pitch; - } while (--count); - } - } - else - { - if (shade_constants.simple_shade) - { - VEC_SHADE_SIMPLE_INIT4(light3, light2, light1, light0); - do - { - __m128i fg; - VEC_SAMPLE_BILINEAR4_COLUMN(fg, bufplce, bufplce2, buftexturefracx, local_vplce, half, height); - - local_vplce[0] = local_vplce[0] + local_vince[0]; - local_vplce[1] = local_vplce[1] + local_vince[1]; - local_vplce[2] = local_vplce[2] + local_vince[2]; - local_vplce[3] = local_vplce[3] + local_vince[3]; - - VEC_SHADE_SIMPLE(fg); - _mm_storeu_si128((__m128i*)dest, fg); - dest += pitch; - } while (--count); - } - else - { - VEC_SHADE_INIT4(light3, light2, light1, light0, shade_constants); - do - { - __m128i fg; - VEC_SAMPLE_BILINEAR4_COLUMN(fg, bufplce, bufplce2, buftexturefracx, local_vplce, half, height); - - local_vplce[0] = local_vplce[0] + local_vince[0]; - local_vplce[1] = local_vplce[1] + local_vince[1]; - local_vplce[2] = local_vplce[2] + local_vince[2]; - local_vplce[3] = local_vplce[3] + local_vince[3]; - - VEC_SHADE(fg, shade_constants); - _mm_storeu_si128((__m128i*)dest, fg); - dest += pitch; - } while (--count); - } - } - } -}; - -class VecCommand(Mvlinec4RGBA) : public DrawerCommand -{ - BYTE * RESTRICT _dest; - int _count; - int _pitch; - ShadeConstants _shade_constants; - uint32_t _mvlinemax; - fixed_t palookuplight[4]; - DWORD vplce[4]; - DWORD vince[4]; - const uint32 * RESTRICT bufplce[4]; - const uint32 * RESTRICT bufplce2[4]; - uint32_t buftexturefracx[4]; - uint32_t bufheight[4]; - -public: - VecCommand(Mvlinec4RGBA)() - { - _dest = dc_dest; - _count = dc_count; - _pitch = dc_pitch; - _shade_constants = dc_shade_constants; - for (int i = 0; i < 4; i++) - { - palookuplight[i] = ::palookuplight[i]; - vplce[i] = ::vplce[i]; - vince[i] = ::vince[i]; - bufplce[i] = (const uint32 *)::bufplce[i]; - bufplce2[i] = (const uint32_t *)::bufplce2[i]; - buftexturefracx[i] = ::buftexturefracx[i]; - bufheight[i] = ::bufheight[i]; - } - } - - void Execute(DrawerThread *thread) override - { - int count = thread->count_for_thread(_dest_y, _count); - if (count <= 0) - return; - - uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest); - int pitch = _pitch * thread->num_cores; - uint32_t height[4]; - uint32_t half[4]; - for (int i = 0; i < 4; i++) - { - height[i] = bufheight[i]; - half[i] = (0x80000000 + height[i] - 1) / height[i]; - } - - uint32_t light0 = LightBgra::calc_light_multiplier(palookuplight[0]); - uint32_t light1 = LightBgra::calc_light_multiplier(palookuplight[1]); - uint32_t light2 = LightBgra::calc_light_multiplier(palookuplight[2]); - uint32_t light3 = LightBgra::calc_light_multiplier(palookuplight[3]); - - ShadeConstants shade_constants = _shade_constants; - - DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] }; - DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] }; - int skipped = thread->skipped_by_thread(_dest_y); - for (int i = 0; i < 4; i++) - { - local_vplce[i] += local_vince[i] * skipped; - local_vince[i] *= thread->num_cores; - } - - if (bufplce2[0] == nullptr) - { - if (shade_constants.simple_shade) - { - VEC_SHADE_SIMPLE_INIT4(light3, light2, light1, light0); - do - { - DWORD place0 = local_vplce[0]; - DWORD place1 = local_vplce[1]; - DWORD place2 = local_vplce[2]; - DWORD place3 = local_vplce[3]; - - uint32_t pix0 = bufplce[0][((place0 >> FRACBITS) * height[0]) >> FRACBITS]; - uint32_t pix1 = bufplce[1][((place1 >> FRACBITS) * height[1]) >> FRACBITS]; - uint32_t pix2 = bufplce[2][((place2 >> FRACBITS) * height[2]) >> FRACBITS]; - uint32_t pix3 = bufplce[3][((place3 >> FRACBITS) * height[3]) >> FRACBITS]; - - local_vplce[0] = place0 + local_vince[0]; - local_vplce[1] = place1 + local_vince[1]; - local_vplce[2] = place2 + local_vince[2]; - local_vplce[3] = place3 + local_vince[3]; - - __m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0); - __m128i bg = _mm_loadu_si128((const __m128i*)dest); - VEC_SHADE_SIMPLE(fg); - VEC_ALPHA_BLEND(fg, bg); - _mm_storeu_si128((__m128i*)dest, fg); - dest += pitch; - } while (--count); - } - else - { - VEC_SHADE_INIT4(light3, light2, light1, light0, shade_constants); - do - { - DWORD place0 = local_vplce[0]; - DWORD place1 = local_vplce[1]; - DWORD place2 = local_vplce[2]; - DWORD place3 = local_vplce[3]; - - uint32_t pix0 = bufplce[0][((place0 >> FRACBITS) * height[0]) >> FRACBITS]; - uint32_t pix1 = bufplce[1][((place1 >> FRACBITS) * height[1]) >> FRACBITS]; - uint32_t pix2 = bufplce[2][((place2 >> FRACBITS) * height[2]) >> FRACBITS]; - uint32_t pix3 = bufplce[3][((place3 >> FRACBITS) * height[3]) >> FRACBITS]; - - local_vplce[0] = place0 + local_vince[0]; - local_vplce[1] = place1 + local_vince[1]; - local_vplce[2] = place2 + local_vince[2]; - local_vplce[3] = place3 + local_vince[3]; - - __m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0); - __m128i bg = _mm_loadu_si128((const __m128i*)dest); - VEC_SHADE(fg, shade_constants); - VEC_ALPHA_BLEND(fg, bg); - _mm_storeu_si128((__m128i*)dest, fg); - dest += pitch; - } while (--count); - } - } - else - { - if (shade_constants.simple_shade) - { - VEC_SHADE_SIMPLE_INIT4(light3, light2, light1, light0); - do - { - __m128i fg; - VEC_SAMPLE_BILINEAR4_COLUMN(fg, bufplce, bufplce2, buftexturefracx, local_vplce, half, height); - - local_vplce[0] = local_vplce[0] + local_vince[0]; - local_vplce[1] = local_vplce[1] + local_vince[1]; - local_vplce[2] = local_vplce[2] + local_vince[2]; - local_vplce[3] = local_vplce[3] + local_vince[3]; - - __m128i bg = _mm_loadu_si128((const __m128i*)dest); - VEC_SHADE_SIMPLE(fg); - VEC_ALPHA_BLEND(fg, bg); - _mm_storeu_si128((__m128i*)dest, fg); - dest += pitch; - } while (--count); - } - else - { - VEC_SHADE_INIT4(light3, light2, light1, light0, shade_constants); - do - { - __m128i fg; - VEC_SAMPLE_BILINEAR4_COLUMN(fg, bufplce, bufplce2, buftexturefracx, local_vplce, half, height); - - local_vplce[0] = local_vplce[0] + local_vince[0]; - local_vplce[1] = local_vplce[1] + local_vince[1]; - local_vplce[2] = local_vplce[2] + local_vince[2]; - local_vplce[3] = local_vplce[3] + local_vince[3]; - - __m128i bg = _mm_loadu_si128((const __m128i*)dest); - VEC_SHADE(fg, shade_constants); - VEC_ALPHA_BLEND(fg, bg); - _mm_storeu_si128((__m128i*)dest, fg); - dest += pitch; - } while (--count); - } - } - } -}; - -class VecCommand(Tmvline4AddRGBA) : public DrawerCommand -{ - BYTE * RESTRICT _dest; - int _count; - int _pitch; - ShadeConstants _shade_constants; - fixed_t _srcalpha; - fixed_t _destalpha; - fixed_t palookuplight[4]; - DWORD vplce[4]; - DWORD vince[4]; - const uint32 * RESTRICT bufplce[4]; - uint32_t bufheight[4]; - -public: - VecCommand(Tmvline4AddRGBA)() - { - _dest = dc_dest; - _count = dc_count; - _pitch = dc_pitch; - _shade_constants = dc_shade_constants; - _srcalpha = dc_srcalpha; - _destalpha = dc_destalpha; - for (int i = 0; i < 4; i++) - { - palookuplight[i] = ::palookuplight[i]; - vplce[i] = ::vplce[i]; - vince[i] = ::vince[i]; - bufplce[i] = (const uint32 *)::bufplce[i]; - bufheight[i] = ::bufheight[i]; - } - } - - void Execute(DrawerThread *thread) override - { - int count = thread->count_for_thread(_dest_y, _count); - if (count <= 0) - return; - - uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest); - int pitch = _pitch * thread->num_cores; - - uint32_t height[4]; - uint32_t half[4]; - for (int i = 0; i < 4; i++) - { - height[i] = bufheight[i]; - half[i] = (0x80000000 + height[i] - 1) / height[i]; - } - - uint32_t light[4]; - light[0] = LightBgra::calc_light_multiplier(palookuplight[0]); - light[1] = LightBgra::calc_light_multiplier(palookuplight[1]); - light[2] = LightBgra::calc_light_multiplier(palookuplight[2]); - light[3] = LightBgra::calc_light_multiplier(palookuplight[3]); - - ShadeConstants shade_constants = _shade_constants; - - uint32_t src_alpha = _srcalpha >> (FRACBITS - 8); - uint32_t dest_alpha = _destalpha >> (FRACBITS - 8); - - DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] }; - DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] }; - int skipped = thread->skipped_by_thread(_dest_y); - for (int i = 0; i < 4; i++) - { - local_vplce[i] += local_vince[i] * skipped; - local_vince[i] *= thread->num_cores; - } - - if (shade_constants.simple_shade) - { - VEC_SHADE_SIMPLE_INIT4(light[3], light[2], light[1], light[0]); - VEC_CALC_BLEND_ALPHA_INIT(src_alpha, dest_alpha); - - do - { - uint32_t pix0 = bufplce[0][((local_vplce[0] >> FRACBITS) * height[0]) >> FRACBITS]; - uint32_t pix1 = bufplce[1][((local_vplce[1] >> FRACBITS) * height[1]) >> FRACBITS]; - uint32_t pix2 = bufplce[2][((local_vplce[2] >> FRACBITS) * height[2]) >> FRACBITS]; - uint32_t pix3 = bufplce[3][((local_vplce[3] >> FRACBITS) * height[3]) >> FRACBITS]; - - local_vplce[0] = local_vplce[0] + local_vince[0]; - local_vplce[1] = local_vplce[1] + local_vince[1]; - local_vplce[2] = local_vplce[2] + local_vince[2]; - local_vplce[3] = local_vplce[3] + local_vince[3]; - - __m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0); - - VEC_CALC_BLEND_ALPHA(fg); - VEC_SHADE_SIMPLE(fg); - - __m128i bg = _mm_loadu_si128((const __m128i*)dest); - - __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); - __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); - - __m128i out_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8); - __m128i out_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8); - __m128i out = _mm_packus_epi16(out_lo, out_hi); - - _mm_storeu_si128((__m128i*)dest, out); - dest += pitch; - } while (--count); - } - else - { - VEC_SHADE_INIT4(light[3], light[2], light[1], light[0], shade_constants); - VEC_CALC_BLEND_ALPHA_INIT(src_alpha, dest_alpha); - - do - { - uint32_t pix0 = bufplce[0][((local_vplce[0] >> FRACBITS) * height[0]) >> FRACBITS]; - uint32_t pix1 = bufplce[1][((local_vplce[1] >> FRACBITS) * height[1]) >> FRACBITS]; - uint32_t pix2 = bufplce[2][((local_vplce[2] >> FRACBITS) * height[2]) >> FRACBITS]; - uint32_t pix3 = bufplce[3][((local_vplce[3] >> FRACBITS) * height[3]) >> FRACBITS]; - - local_vplce[0] = local_vplce[0] + local_vince[0]; - local_vplce[1] = local_vplce[1] + local_vince[1]; - local_vplce[2] = local_vplce[2] + local_vince[2]; - local_vplce[3] = local_vplce[3] + local_vince[3]; - - __m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0); - VEC_CALC_BLEND_ALPHA(fg); - VEC_SHADE(fg, shade_constants); - - __m128i bg = _mm_loadu_si128((const __m128i*)dest); - - __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); - __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); - - __m128i out_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8); - __m128i out_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8); - __m128i out = _mm_packus_epi16(out_lo, out_hi); - - _mm_storeu_si128((__m128i*)dest, out); - dest += pitch; - } while (--count); - } - } -}; - -class VecCommand(Tmvline4AddClampRGBA) : public DrawerCommand -{ - BYTE * RESTRICT _dest; - int _count; - int _pitch; - ShadeConstants _shade_constants; - fixed_t _srcalpha; - fixed_t _destalpha; - fixed_t palookuplight[4]; - DWORD vplce[4]; - DWORD vince[4]; - const uint32 *RESTRICT bufplce[4]; - uint32_t bufheight[4]; - -public: - VecCommand(Tmvline4AddClampRGBA)() - { - _dest = dc_dest; - _count = dc_count; - _pitch = dc_pitch; - _shade_constants = dc_shade_constants; - _srcalpha = dc_srcalpha; - _destalpha = dc_destalpha; - for (int i = 0; i < 4; i++) - { - palookuplight[i] = ::palookuplight[i]; - vplce[i] = ::vplce[i]; - vince[i] = ::vince[i]; - bufplce[i] = (const uint32 *)::bufplce[i]; - bufheight[i] = ::bufheight[i]; - } - } - - void Execute(DrawerThread *thread) override - { - int count = thread->count_for_thread(_dest_y, _count); - if (count <= 0) - return; - - uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest); - int pitch = _pitch * thread->num_cores; - - uint32_t height[4]; - uint32_t half[4]; - for (int i = 0; i < 4; i++) - { - height[i] = bufheight[i]; - half[i] = (0x80000000 + height[i] - 1) / height[i]; - } - - uint32_t light[4]; - light[0] = LightBgra::calc_light_multiplier(palookuplight[0]); - light[1] = LightBgra::calc_light_multiplier(palookuplight[1]); - light[2] = LightBgra::calc_light_multiplier(palookuplight[2]); - light[3] = LightBgra::calc_light_multiplier(palookuplight[3]); - - ShadeConstants shade_constants = _shade_constants; - - uint32_t src_alpha = _srcalpha >> (FRACBITS - 8); - uint32_t dest_alpha = _destalpha >> (FRACBITS - 8); - - DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] }; - DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] }; - int skipped = thread->skipped_by_thread(_dest_y); - for (int i = 0; i < 4; i++) - { - local_vplce[i] += local_vince[i] * skipped; - local_vince[i] *= thread->num_cores; - } - - if (shade_constants.simple_shade) - { - VEC_SHADE_SIMPLE_INIT4(light[3], light[2], light[1], light[0]); - VEC_CALC_BLEND_ALPHA_INIT(src_alpha, dest_alpha); - - do - { - uint32_t pix0 = bufplce[0][((local_vplce[0] >> FRACBITS) * height[0]) >> FRACBITS]; - uint32_t pix1 = bufplce[1][((local_vplce[1] >> FRACBITS) * height[1]) >> FRACBITS]; - uint32_t pix2 = bufplce[2][((local_vplce[2] >> FRACBITS) * height[2]) >> FRACBITS]; - uint32_t pix3 = bufplce[3][((local_vplce[3] >> FRACBITS) * height[3]) >> FRACBITS]; - - local_vplce[0] = local_vplce[0] + local_vince[0]; - local_vplce[1] = local_vplce[1] + local_vince[1]; - local_vplce[2] = local_vplce[2] + local_vince[2]; - local_vplce[3] = local_vplce[3] + local_vince[3]; - - __m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0); - VEC_CALC_BLEND_ALPHA(fg); - VEC_SHADE_SIMPLE(fg); - - __m128i bg = _mm_loadu_si128((const __m128i*)dest); - - __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); - __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); - - __m128i out_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8); - __m128i out_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8); - __m128i out = _mm_packus_epi16(out_lo, out_hi); - - _mm_storeu_si128((__m128i*)dest, out); - dest += pitch; - } while (--count); - } - else - { - VEC_SHADE_INIT4(light[3], light[2], light[1], light[0], shade_constants); - VEC_CALC_BLEND_ALPHA_INIT(src_alpha, dest_alpha); - - do - { - uint32_t pix0 = bufplce[0][((local_vplce[0] >> FRACBITS) * height[0]) >> FRACBITS]; - uint32_t pix1 = bufplce[1][((local_vplce[1] >> FRACBITS) * height[1]) >> FRACBITS]; - uint32_t pix2 = bufplce[2][((local_vplce[2] >> FRACBITS) * height[2]) >> FRACBITS]; - uint32_t pix3 = bufplce[3][((local_vplce[3] >> FRACBITS) * height[3]) >> FRACBITS]; - - local_vplce[0] = local_vplce[0] + local_vince[0]; - local_vplce[1] = local_vplce[1] + local_vince[1]; - local_vplce[2] = local_vplce[2] + local_vince[2]; - local_vplce[3] = local_vplce[3] + local_vince[3]; - - __m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0); - VEC_CALC_BLEND_ALPHA(fg); - VEC_SHADE(fg, shade_constants); - - __m128i bg = _mm_loadu_si128((const __m128i*)dest); - - __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); - __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); - - __m128i out_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8); - __m128i out_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8); - __m128i out = _mm_packus_epi16(out_lo, out_hi); - - _mm_storeu_si128((__m128i*)dest, out); - dest += pitch; - } while (--count); - } - } -}; - -class VecCommand(Tmvline4SubClampRGBA) : public DrawerCommand -{ - BYTE * RESTRICT _dest; - int _count; - int _pitch; - ShadeConstants _shade_constants; - fixed_t _srcalpha; - fixed_t _destalpha; - fixed_t palookuplight[4]; - DWORD vplce[4]; - DWORD vince[4]; - const uint32 *RESTRICT bufplce[4]; - uint32_t bufheight[4]; - -public: - VecCommand(Tmvline4SubClampRGBA)() - { - _dest = dc_dest; - _count = dc_count; - _pitch = dc_pitch; - _shade_constants = dc_shade_constants; - _srcalpha = dc_srcalpha; - _destalpha = dc_destalpha; - for (int i = 0; i < 4; i++) - { - palookuplight[i] = ::palookuplight[i]; - vplce[i] = ::vplce[i]; - vince[i] = ::vince[i]; - bufplce[i] = (const uint32 *)::bufplce[i]; - bufheight[i] = ::bufheight[i]; - } - } - - void Execute(DrawerThread *thread) override - { - int count = thread->count_for_thread(_dest_y, _count); - if (count <= 0) - return; - - uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest); - int pitch = _pitch * thread->num_cores; - - uint32_t height[4]; - uint32_t half[4]; - for (int i = 0; i < 4; i++) - { - height[i] = bufheight[i]; - half[i] = (0x80000000 + height[i] - 1) / height[i]; - } - - uint32_t light[4]; - light[0] = LightBgra::calc_light_multiplier(palookuplight[0]); - light[1] = LightBgra::calc_light_multiplier(palookuplight[1]); - light[2] = LightBgra::calc_light_multiplier(palookuplight[2]); - light[3] = LightBgra::calc_light_multiplier(palookuplight[3]); - - ShadeConstants shade_constants = _shade_constants; - - uint32_t src_alpha = _srcalpha >> (FRACBITS - 8); - uint32_t dest_alpha = _destalpha >> (FRACBITS - 8); - - DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] }; - DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] }; - int skipped = thread->skipped_by_thread(_dest_y); - for (int i = 0; i < 4; i++) - { - local_vplce[i] += local_vince[i] * skipped; - local_vince[i] *= thread->num_cores; - } - - if (shade_constants.simple_shade) - { - VEC_SHADE_SIMPLE_INIT4(light[3], light[2], light[1], light[0]); - VEC_CALC_BLEND_ALPHA_INIT(src_alpha, dest_alpha); - - do - { - uint32_t pix0 = bufplce[0][((local_vplce[0] >> FRACBITS) * height[0]) >> FRACBITS]; - uint32_t pix1 = bufplce[1][((local_vplce[1] >> FRACBITS) * height[1]) >> FRACBITS]; - uint32_t pix2 = bufplce[2][((local_vplce[2] >> FRACBITS) * height[2]) >> FRACBITS]; - uint32_t pix3 = bufplce[3][((local_vplce[3] >> FRACBITS) * height[3]) >> FRACBITS]; - - local_vplce[0] = local_vplce[0] + local_vince[0]; - local_vplce[1] = local_vplce[1] + local_vince[1]; - local_vplce[2] = local_vplce[2] + local_vince[2]; - local_vplce[3] = local_vplce[3] + local_vince[3]; - - __m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0); - VEC_CALC_BLEND_ALPHA(fg); - VEC_SHADE_SIMPLE(fg); - - __m128i bg = _mm_loadu_si128((const __m128i*)dest); - - __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); - __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); - - __m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_hi, bg_alpha_hi), _mm_mullo_epi16(fg_hi, fg_alpha_hi)), 8); - __m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_lo, bg_alpha_lo), _mm_mullo_epi16(fg_lo, fg_alpha_lo)), 8); - __m128i out = _mm_packus_epi16(out_lo, out_hi); - - _mm_storeu_si128((__m128i*)dest, out); - dest += pitch; - } while (--count); - } - else - { - VEC_SHADE_INIT4(light[3], light[2], light[1], light[0], shade_constants); - VEC_CALC_BLEND_ALPHA_INIT(src_alpha, dest_alpha); - - do - { - uint32_t pix0 = bufplce[0][((local_vplce[0] >> FRACBITS) * height[0]) >> FRACBITS]; - uint32_t pix1 = bufplce[1][((local_vplce[1] >> FRACBITS) * height[1]) >> FRACBITS]; - uint32_t pix2 = bufplce[2][((local_vplce[2] >> FRACBITS) * height[2]) >> FRACBITS]; - uint32_t pix3 = bufplce[3][((local_vplce[3] >> FRACBITS) * height[3]) >> FRACBITS]; - - local_vplce[0] = local_vplce[0] + local_vince[0]; - local_vplce[1] = local_vplce[1] + local_vince[1]; - local_vplce[2] = local_vplce[2] + local_vince[2]; - local_vplce[3] = local_vplce[3] + local_vince[3]; - - __m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0); - VEC_CALC_BLEND_ALPHA(fg); - VEC_SHADE(fg, shade_constants); - - __m128i bg = _mm_loadu_si128((const __m128i*)dest); - - __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); - __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); - - __m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_hi, bg_alpha_hi), _mm_mullo_epi16(fg_hi, fg_alpha_hi)), 8); - __m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_lo, bg_alpha_lo), _mm_mullo_epi16(fg_lo, fg_alpha_lo)), 8); - __m128i out = _mm_packus_epi16(out_lo, out_hi); - - _mm_storeu_si128((__m128i*)dest, out); - dest += pitch; - } while (--count); - } - } -}; - -class VecCommand(Tmvline4RevSubClampRGBA) : public DrawerCommand -{ - BYTE * RESTRICT _dest; - int _count; - int _pitch; - ShadeConstants _shade_constants; - fixed_t _srcalpha; - fixed_t _destalpha; - fixed_t palookuplight[4]; - DWORD vplce[4]; - DWORD vince[4]; - const uint32 *RESTRICT bufplce[4]; - uint32_t bufheight[4]; - -public: - VecCommand(Tmvline4RevSubClampRGBA)() - { - _dest = dc_dest; - _count = dc_count; - _pitch = dc_pitch; - _shade_constants = dc_shade_constants; - _srcalpha = dc_srcalpha; - _destalpha = dc_destalpha; - for (int i = 0; i < 4; i++) - { - palookuplight[i] = ::palookuplight[i]; - vplce[i] = ::vplce[i]; - vince[i] = ::vince[i]; - bufplce[i] = (const uint32 *)::bufplce[i]; - bufheight[i] = ::bufheight[4]; - } - } - - void Execute(DrawerThread *thread) override - { - int count = thread->count_for_thread(_dest_y, _count); - if (count <= 0) - return; - - uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest); - int pitch = _pitch * thread->num_cores; - - uint32_t height[4]; - uint32_t half[4]; - for (int i = 0; i < 4; i++) - { - height[i] = bufheight[i]; - half[i] = (0x80000000 + height[i] - 1) / height[i]; - } - - uint32_t light[4]; - light[0] = LightBgra::calc_light_multiplier(palookuplight[0]); - light[1] = LightBgra::calc_light_multiplier(palookuplight[1]); - light[2] = LightBgra::calc_light_multiplier(palookuplight[2]); - light[3] = LightBgra::calc_light_multiplier(palookuplight[3]); - - ShadeConstants shade_constants = _shade_constants; - - uint32_t src_alpha = _srcalpha >> (FRACBITS - 8); - uint32_t dest_alpha = _destalpha >> (FRACBITS - 8); - - DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] }; - DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] }; - int skipped = thread->skipped_by_thread(_dest_y); - for (int i = 0; i < 4; i++) - { - local_vplce[i] += local_vince[i] * skipped; - local_vince[i] *= thread->num_cores; - } - - if (shade_constants.simple_shade) - { - VEC_SHADE_SIMPLE_INIT4(light[3], light[2], light[1], light[0]); - VEC_CALC_BLEND_ALPHA_INIT(src_alpha, dest_alpha); - - do - { - uint32_t pix0 = bufplce[0][((local_vplce[0] >> FRACBITS) * height[0]) >> FRACBITS]; - uint32_t pix1 = bufplce[1][((local_vplce[1] >> FRACBITS) * height[1]) >> FRACBITS]; - uint32_t pix2 = bufplce[2][((local_vplce[2] >> FRACBITS) * height[2]) >> FRACBITS]; - uint32_t pix3 = bufplce[3][((local_vplce[3] >> FRACBITS) * height[3]) >> FRACBITS]; - - local_vplce[0] = local_vplce[0] + local_vince[0]; - local_vplce[1] = local_vplce[1] + local_vince[1]; - local_vplce[2] = local_vplce[2] + local_vince[2]; - local_vplce[3] = local_vplce[3] + local_vince[3]; - - __m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0); - VEC_CALC_BLEND_ALPHA(fg); - VEC_SHADE_SIMPLE(fg); - - __m128i bg = _mm_loadu_si128((const __m128i*)dest); - - __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); - __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); - - __m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8); - __m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8); - __m128i out = _mm_packus_epi16(out_lo, out_hi); - - _mm_storeu_si128((__m128i*)dest, out); - dest += pitch; - } while (--count); - } - else - { - VEC_SHADE_INIT4(light[3], light[2], light[1], light[0], shade_constants); - VEC_CALC_BLEND_ALPHA_INIT(src_alpha, dest_alpha); - - do - { - uint32_t pix0 = bufplce[0][((local_vplce[0] >> FRACBITS) * height[0]) >> FRACBITS]; - uint32_t pix1 = bufplce[1][((local_vplce[1] >> FRACBITS) * height[1]) >> FRACBITS]; - uint32_t pix2 = bufplce[2][((local_vplce[2] >> FRACBITS) * height[2]) >> FRACBITS]; - uint32_t pix3 = bufplce[3][((local_vplce[3] >> FRACBITS) * height[3]) >> FRACBITS]; - - local_vplce[0] = local_vplce[0] + local_vince[0]; - local_vplce[1] = local_vplce[1] + local_vince[1]; - local_vplce[2] = local_vplce[2] + local_vince[2]; - local_vplce[3] = local_vplce[3] + local_vince[3]; - - __m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0); - VEC_CALC_BLEND_ALPHA(fg); - VEC_SHADE(fg, shade_constants); - - __m128i bg = _mm_loadu_si128((const __m128i*)dest); - - __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); - __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); - __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); - __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); - - __m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8); - __m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8); - __m128i out = _mm_packus_epi16(out_lo, out_hi); - - _mm_storeu_si128((__m128i*)dest, out); - dest += pitch; - } while (--count); - } - } -}; diff --git a/src/r_drawt_rgba.cpp b/src/r_drawt_rgba.cpp index 82932b1f2..45bd5c029 100644 --- a/src/r_drawt_rgba.cpp +++ b/src/r_drawt_rgba.cpp @@ -53,8 +53,13 @@ extern unsigned int *horizspan[4]; #ifndef NO_SSE +#ifdef _MSC_VER +#pragma warning(disable: 4101) // warning C4101: unreferenced local variable +#endif + // Generate SSE drawers: #define VecCommand(name) name##_SSE_Command +#define VEC_SHADE_VARS SSE_SHADE_VARS #define VEC_SHADE_SIMPLE_INIT SSE_SHADE_SIMPLE_INIT #define VEC_SHADE_SIMPLE_INIT4 SSE_SHADE_SIMPLE_INIT4 #define VEC_SHADE_SIMPLE SSE_SHADE_SIMPLE diff --git a/src/r_drawt_rgba_sse.h b/src/r_drawt_rgba_sse.h index 64a77e288..7a02f2282 100644 --- a/src/r_drawt_rgba_sse.h +++ b/src/r_drawt_rgba_sse.h @@ -60,6 +60,7 @@ public: if (shade_constants.simple_shade) { + VEC_SHADE_VARS(); VEC_SHADE_SIMPLE_INIT(light); if (count & 1) { @@ -110,6 +111,7 @@ public: } else { + VEC_SHADE_VARS(); VEC_SHADE_INIT(light, shade_constants); if (count & 1) { @@ -218,6 +220,7 @@ public: if (shade_constants.simple_shade) { + VEC_SHADE_VARS(); VEC_SHADE_SIMPLE_INIT(light); __m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha); @@ -254,6 +257,7 @@ public: } else { + VEC_SHADE_VARS(); VEC_SHADE_INIT(light, shade_constants); __m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha); @@ -421,6 +425,7 @@ public: if (shade_constants.simple_shade) { + VEC_SHADE_VARS(); VEC_SHADE_SIMPLE_INIT(light); __m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha); @@ -457,6 +462,7 @@ public: } else { + VEC_SHADE_VARS(); VEC_SHADE_INIT(light, shade_constants); __m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha); @@ -547,6 +553,7 @@ public: if (shade_constants.simple_shade) { + VEC_SHADE_VARS(); VEC_SHADE_SIMPLE_INIT(light); __m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha); @@ -583,6 +590,7 @@ public: } else { + VEC_SHADE_VARS(); VEC_SHADE_INIT(light, shade_constants); __m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha); @@ -673,6 +681,7 @@ public: if (shade_constants.simple_shade) { + VEC_SHADE_VARS(); VEC_SHADE_SIMPLE_INIT(light); __m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha); @@ -709,6 +718,7 @@ public: } else { + VEC_SHADE_VARS(); VEC_SHADE_INIT(light, shade_constants); __m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha); diff --git a/src/r_segs.cpp b/src/r_segs.cpp index 630d64da0..870d74894 100644 --- a/src/r_segs.cpp +++ b/src/r_segs.cpp @@ -1146,15 +1146,16 @@ WallscanSampler::WallscanSampler(int y1, float swal, double yrepeat, fixed_t xof int mip_height = texture->GetHeight(); if (r_mipmap && texture->Mipmapped()) { + uint32_t xpos = (uint32_t)((((uint64_t)xoffset) << FRACBITS) / mip_width); int level = (int)MAX(magnitude - 1.0, 0.0); while (level != 0) { mipmap_offset += mip_width * mip_height; - xoffset >>= 1; level >>= 1; mip_width = MAX(mip_width >> 1, 1); mip_height = MAX(mip_height >> 1, 1); } + xoffset = (xpos >> FRACBITS) * mip_width; } const uint32_t *pixels = texture->GetPixelsBgra() + mipmap_offset;