mirror of
https://github.com/ZDoom/gzdoom.git
synced 2024-11-28 23:02:07 +00:00
Throwing templates at the code redundancy problem in drawers
This commit is contained in:
parent
928e8e0d43
commit
6c037fa249
6 changed files with 551 additions and 1108 deletions
|
@ -67,8 +67,13 @@ CVAR(Bool, r_mipmap, true, CVAR_ARCHIVE | CVAR_GLOBALCONFIG);
|
|||
|
||||
#ifndef NO_SSE
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning(disable: 4101) // warning C4101: unreferenced local variable
|
||||
#endif
|
||||
|
||||
// Generate SSE drawers:
|
||||
#define VecCommand(name) name##_SSE_Command
|
||||
#define VEC_SHADE_VARS SSE_SHADE_VARS
|
||||
#define VEC_SHADE_SIMPLE_INIT SSE_SHADE_SIMPLE_INIT
|
||||
#define VEC_SHADE_SIMPLE_INIT4 SSE_SHADE_SIMPLE_INIT4
|
||||
#define VEC_SHADE_SIMPLE SSE_SHADE_SIMPLE
|
||||
|
@ -1552,8 +1557,446 @@ public:
|
|||
return (--count) != 0;
|
||||
}
|
||||
};
|
||||
|
||||
#ifdef NO_SSE
|
||||
struct NearestSampler
|
||||
{
|
||||
FORCEINLINE static uint32_t Sample1(DrawerWall4Command &cmd, LoopIterator &loop, int index)
|
||||
{
|
||||
return cmd._bufplce[index][loop.sample_index(index)];
|
||||
}
|
||||
};
|
||||
struct LinearSampler
|
||||
{
|
||||
FORCEINLINE static uint32_t Sample1(DrawerWall4Command &cmd, LoopIterator &loop, int index)
|
||||
{
|
||||
return SampleBgra::sample_bilinear(cmd._bufplce[index], cmd._bufplce2[index], cmd._buftexturefracx[index], loop.vplce[index], loop.half[index], loop.height[index]);
|
||||
}
|
||||
};
|
||||
#else
|
||||
struct NearestSampler
|
||||
{
|
||||
FORCEINLINE static __m128i Sample4(DrawerWall4Command &cmd, LoopIterator &loop)
|
||||
{
|
||||
return _mm_set_epi32(cmd._bufplce[3][loop.sample_index(3)], cmd._bufplce[2][loop.sample_index(2)], cmd._bufplce[1][loop.sample_index(1)], cmd._bufplce[0][loop.sample_index(0)]);
|
||||
}
|
||||
};
|
||||
|
||||
struct LinearSampler
|
||||
{
|
||||
FORCEINLINE static __m128i Sample4(DrawerWall4Command &cmd, LoopIterator &loop)
|
||||
{
|
||||
__m128i fg;
|
||||
VEC_SAMPLE_BILINEAR4_COLUMN(fg, cmd._bufplce, cmd._bufplce2, cmd._buftexturefracx, loop.vplce, loop.half, loop.height);
|
||||
return fg;
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
#ifdef NO_SSE
|
||||
template<typename Sampler>
|
||||
struct Copy
|
||||
{
|
||||
Copy(DrawerWall4Command &cmd, LoopIterator &loop)
|
||||
{
|
||||
}
|
||||
void Blend(DrawerWall4Command &cmd, LoopIterator &loop)
|
||||
{
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
uint32_t fg = LightBgra::shade_bgra(Sampler::Sample1(cmd, loop, i), cmd._light[i], cmd._shade_constants);
|
||||
loop.dest[i] = BlendBgra::copy(fg);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Sampler>
|
||||
struct Mask
|
||||
{
|
||||
Mask(DrawerWall4Command &cmd, LoopIterator &loop)
|
||||
{
|
||||
}
|
||||
void Blend(DrawerWall4Command &cmd, LoopIterator &loop)
|
||||
{
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
uint32_t fg = LightBgra::shade_bgra(Sampler::Sample1(cmd, loop, i), cmd._light[i], cmd._shade_constants);
|
||||
loop.dest[i] = BlendBgra::alpha_blend(fg, loop.dest[i]);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Sampler>
|
||||
struct TMaskAdd
|
||||
{
|
||||
TMaskAdd(DrawerWall4Command &cmd, LoopIterator &loop)
|
||||
{
|
||||
}
|
||||
void Blend(DrawerWall4Command &cmd, LoopIterator &loop)
|
||||
{
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
uint32_t fg = LightBgra::shade_bgra(Sampler::Sample1(cmd, loop, i), cmd._light[i], cmd._shade_constants);
|
||||
loop.dest[i] = BlendBgra::add(fg, loop.dest[i], cmd._srcalpha, calc_blend_bgalpha(fg, cmd._destalpha));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Sampler>
|
||||
struct TMaskSub
|
||||
{
|
||||
TMaskSub(DrawerWall4Command &cmd, LoopIterator &loop)
|
||||
{
|
||||
}
|
||||
void Blend(DrawerWall4Command &cmd, LoopIterator &loop)
|
||||
{
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
uint32_t fg = LightBgra::shade_bgra(Sampler::Sample1(cmd, loop, i), cmd._light[i], cmd._shade_constants);
|
||||
loop.dest[i] = BlendBgra::sub(fg, loop.dest[i], cmd._srcalpha, calc_blend_bgalpha(fg, cmd._destalpha));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Sampler>
|
||||
struct TMaskRevSub
|
||||
{
|
||||
TMaskRevSub(DrawerWall4Command &cmd, LoopIterator &loop)
|
||||
{
|
||||
}
|
||||
void Blend(DrawerWall4Command &cmd, LoopIterator &loop)
|
||||
{
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
uint32_t fg = LightBgra::shade_bgra(Sampler::Sample1(cmd, loop, i), cmd._light[i], cmd._shade_constants);
|
||||
loop.dest[i] = BlendBgra::revsub(fg, loop.dest[i], cmd._srcalpha, calc_blend_bgalpha(fg, cmd._destalpha));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
typedef Copy<NearestSampler> CopyNearestSimple;
|
||||
typedef Copy<LinearSampler> CopyLinearSimple;
|
||||
typedef Copy<NearestSampler> CopyNearest;
|
||||
typedef Copy<LinearSampler> CopyLinear;
|
||||
typedef Mask<NearestSampler> MaskNearestSimple;
|
||||
typedef Mask<LinearSampler> MaskLinearSimple;
|
||||
typedef Mask<NearestSampler> MaskNearest;
|
||||
typedef Mask<LinearSampler> MaskLinear;
|
||||
typedef TMaskAdd<NearestSampler> TMaskAddNearestSimple;
|
||||
typedef TMaskAdd<LinearSampler> TMaskAddLinearSimple;
|
||||
typedef TMaskAdd<NearestSampler> TMaskAddNearest;
|
||||
typedef TMaskAdd<LinearSampler> TMaskAddLinear;
|
||||
typedef TMaskSub<NearestSampler> TMaskSubNearestSimple;
|
||||
typedef TMaskSub<LinearSampler> TMaskSubLinearSimple;
|
||||
typedef TMaskSub<NearestSampler> TMaskSubNearest;
|
||||
typedef TMaskSub<LinearSampler> TMaskSubLinear;
|
||||
typedef TMaskRevSub<NearestSampler> TMaskRevSubNearestSimple;
|
||||
typedef TMaskRevSub<LinearSampler> TMaskRevSubLinearSimple;
|
||||
typedef TMaskRevSub<NearestSampler> TMaskRevSubNearest;
|
||||
typedef TMaskRevSub<LinearSampler> TMaskRevSubLinear;
|
||||
#else
|
||||
template<typename Sampler>
|
||||
struct CopySimple
|
||||
{
|
||||
VEC_SHADE_VARS();
|
||||
CopySimple(DrawerWall4Command &cmd, LoopIterator &loop)
|
||||
{
|
||||
VEC_SHADE_SIMPLE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0]);
|
||||
}
|
||||
void Blend(DrawerWall4Command &cmd, LoopIterator &loop)
|
||||
{
|
||||
__m128i fg = Sampler::Sample4(cmd, loop);
|
||||
VEC_SHADE_SIMPLE(fg);
|
||||
_mm_storeu_si128((__m128i*)loop.dest, fg);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Sampler>
|
||||
struct Copy
|
||||
{
|
||||
VEC_SHADE_VARS();
|
||||
Copy(DrawerWall4Command &cmd, LoopIterator &loop)
|
||||
{
|
||||
VEC_SHADE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0], cmd._shade_constants);
|
||||
}
|
||||
void Blend(DrawerWall4Command &cmd, LoopIterator &loop)
|
||||
{
|
||||
__m128i fg = Sampler::Sample4(cmd, loop);
|
||||
VEC_SHADE(fg, cmd._shade_constants);
|
||||
_mm_storeu_si128((__m128i*)loop.dest, fg);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Sampler>
|
||||
struct MaskSimple
|
||||
{
|
||||
VEC_SHADE_VARS();
|
||||
MaskSimple(DrawerWall4Command &cmd, LoopIterator &loop)
|
||||
{
|
||||
VEC_SHADE_SIMPLE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0]);
|
||||
}
|
||||
void Blend(DrawerWall4Command &cmd, LoopIterator &loop)
|
||||
{
|
||||
__m128i fg = Sampler::Sample4(cmd, loop);
|
||||
__m128i bg = _mm_loadu_si128((const __m128i*)loop.dest);
|
||||
VEC_SHADE_SIMPLE(fg);
|
||||
VEC_ALPHA_BLEND(fg, bg);
|
||||
_mm_storeu_si128((__m128i*)loop.dest, fg);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Sampler>
|
||||
struct Mask
|
||||
{
|
||||
VEC_SHADE_VARS();
|
||||
Mask(DrawerWall4Command &cmd, LoopIterator &loop)
|
||||
{
|
||||
VEC_SHADE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0], cmd._shade_constants);
|
||||
}
|
||||
void Blend(DrawerWall4Command &cmd, LoopIterator &loop)
|
||||
{
|
||||
__m128i fg = Sampler::Sample4(cmd, loop);
|
||||
__m128i bg = _mm_loadu_si128((const __m128i*)loop.dest);
|
||||
VEC_SHADE(fg, cmd._shade_constants);
|
||||
VEC_ALPHA_BLEND(fg, bg);
|
||||
_mm_storeu_si128((__m128i*)loop.dest, fg);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Sampler>
|
||||
struct TMaskAddSimple
|
||||
{
|
||||
VEC_SHADE_VARS();
|
||||
VEC_CALC_BLEND_ALPHA_VARS();
|
||||
TMaskAddSimple(DrawerWall4Command &cmd, LoopIterator &loop)
|
||||
{
|
||||
VEC_SHADE_SIMPLE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0]);
|
||||
VEC_CALC_BLEND_ALPHA_INIT(cmd._srcalpha, cmd._destalpha);
|
||||
}
|
||||
void Blend(DrawerWall4Command &cmd, LoopIterator &loop)
|
||||
{
|
||||
__m128i fg = Sampler::Sample4(cmd, loop);
|
||||
__m128i bg = _mm_loadu_si128((const __m128i*)loop.dest);
|
||||
|
||||
VEC_CALC_BLEND_ALPHA(fg);
|
||||
VEC_SHADE_SIMPLE(fg);
|
||||
|
||||
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
|
||||
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
|
||||
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
|
||||
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
|
||||
|
||||
__m128i out_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8);
|
||||
__m128i out_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8);
|
||||
__m128i out = _mm_packus_epi16(out_lo, out_hi);
|
||||
|
||||
_mm_storeu_si128((__m128i*)loop.dest, out);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Sampler>
|
||||
struct TMaskAdd
|
||||
{
|
||||
VEC_SHADE_VARS();
|
||||
VEC_CALC_BLEND_ALPHA_VARS();
|
||||
TMaskAdd(DrawerWall4Command &cmd, LoopIterator &loop)
|
||||
{
|
||||
VEC_SHADE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0], cmd._shade_constants);
|
||||
VEC_CALC_BLEND_ALPHA_INIT(cmd._srcalpha, cmd._destalpha);
|
||||
}
|
||||
void Blend(DrawerWall4Command &cmd, LoopIterator &loop)
|
||||
{
|
||||
__m128i fg = Sampler::Sample4(cmd, loop);
|
||||
__m128i bg = _mm_loadu_si128((const __m128i*)loop.dest);
|
||||
|
||||
VEC_CALC_BLEND_ALPHA(fg);
|
||||
VEC_SHADE_SIMPLE(fg);
|
||||
|
||||
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
|
||||
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
|
||||
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
|
||||
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
|
||||
|
||||
__m128i out_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8);
|
||||
__m128i out_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8);
|
||||
__m128i out = _mm_packus_epi16(out_lo, out_hi);
|
||||
|
||||
_mm_storeu_si128((__m128i*)loop.dest, out);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Sampler>
|
||||
struct TMaskSubSimple
|
||||
{
|
||||
VEC_SHADE_VARS();
|
||||
VEC_CALC_BLEND_ALPHA_VARS();
|
||||
TMaskSubSimple(DrawerWall4Command &cmd, LoopIterator &loop)
|
||||
{
|
||||
VEC_SHADE_SIMPLE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0]);
|
||||
VEC_CALC_BLEND_ALPHA_INIT(cmd._srcalpha, cmd._destalpha);
|
||||
}
|
||||
void Blend(DrawerWall4Command &cmd, LoopIterator &loop)
|
||||
{
|
||||
__m128i fg = Sampler::Sample4(cmd, loop);
|
||||
__m128i bg = _mm_loadu_si128((const __m128i*)loop.dest);
|
||||
|
||||
VEC_CALC_BLEND_ALPHA(fg);
|
||||
VEC_SHADE_SIMPLE(fg);
|
||||
|
||||
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
|
||||
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
|
||||
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
|
||||
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
|
||||
|
||||
__m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_hi, bg_alpha_hi), _mm_mullo_epi16(fg_hi, fg_alpha_hi)), 8);
|
||||
__m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_lo, bg_alpha_lo), _mm_mullo_epi16(fg_lo, fg_alpha_lo)), 8);
|
||||
__m128i out = _mm_packus_epi16(out_lo, out_hi);
|
||||
|
||||
_mm_storeu_si128((__m128i*)loop.dest, out);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Sampler>
|
||||
struct TMaskSub
|
||||
{
|
||||
VEC_SHADE_VARS();
|
||||
VEC_CALC_BLEND_ALPHA_VARS();
|
||||
TMaskSub(DrawerWall4Command &cmd, LoopIterator &loop)
|
||||
{
|
||||
VEC_SHADE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0], cmd._shade_constants);
|
||||
VEC_CALC_BLEND_ALPHA_INIT(cmd._srcalpha, cmd._destalpha);
|
||||
}
|
||||
void Blend(DrawerWall4Command &cmd, LoopIterator &loop)
|
||||
{
|
||||
__m128i fg = Sampler::Sample4(cmd, loop);
|
||||
__m128i bg = _mm_loadu_si128((const __m128i*)loop.dest);
|
||||
|
||||
VEC_CALC_BLEND_ALPHA(fg);
|
||||
VEC_SHADE_SIMPLE(fg);
|
||||
|
||||
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
|
||||
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
|
||||
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
|
||||
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
|
||||
|
||||
__m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_hi, bg_alpha_hi), _mm_mullo_epi16(fg_hi, fg_alpha_hi)), 8);
|
||||
__m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_lo, bg_alpha_lo), _mm_mullo_epi16(fg_lo, fg_alpha_lo)), 8);
|
||||
__m128i out = _mm_packus_epi16(out_lo, out_hi);
|
||||
|
||||
_mm_storeu_si128((__m128i*)loop.dest, out);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Sampler>
|
||||
struct TMaskRevSubSimple
|
||||
{
|
||||
VEC_SHADE_VARS();
|
||||
VEC_CALC_BLEND_ALPHA_VARS();
|
||||
TMaskRevSubSimple(DrawerWall4Command &cmd, LoopIterator &loop)
|
||||
{
|
||||
VEC_SHADE_SIMPLE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0]);
|
||||
VEC_CALC_BLEND_ALPHA_INIT(cmd._srcalpha, cmd._destalpha);
|
||||
}
|
||||
void Blend(DrawerWall4Command &cmd, LoopIterator &loop)
|
||||
{
|
||||
__m128i fg = Sampler::Sample4(cmd, loop);
|
||||
__m128i bg = _mm_loadu_si128((const __m128i*)loop.dest);
|
||||
|
||||
VEC_CALC_BLEND_ALPHA(fg);
|
||||
VEC_SHADE_SIMPLE(fg);
|
||||
|
||||
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
|
||||
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
|
||||
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
|
||||
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
|
||||
|
||||
__m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8);
|
||||
__m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8);
|
||||
__m128i out = _mm_packus_epi16(out_lo, out_hi);
|
||||
|
||||
_mm_storeu_si128((__m128i*)loop.dest, out);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Sampler>
|
||||
struct TMaskRevSub
|
||||
{
|
||||
VEC_SHADE_VARS();
|
||||
VEC_CALC_BLEND_ALPHA_VARS();
|
||||
TMaskRevSub(DrawerWall4Command &cmd, LoopIterator &loop)
|
||||
{
|
||||
VEC_SHADE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0], cmd._shade_constants);
|
||||
VEC_CALC_BLEND_ALPHA_INIT(cmd._srcalpha, cmd._destalpha);
|
||||
}
|
||||
void Blend(DrawerWall4Command &cmd, LoopIterator &loop)
|
||||
{
|
||||
__m128i fg = Sampler::Sample4(cmd, loop);
|
||||
__m128i bg = _mm_loadu_si128((const __m128i*)loop.dest);
|
||||
|
||||
VEC_CALC_BLEND_ALPHA(fg);
|
||||
VEC_SHADE_SIMPLE(fg);
|
||||
|
||||
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
|
||||
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
|
||||
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
|
||||
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
|
||||
|
||||
__m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8);
|
||||
__m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8);
|
||||
__m128i out = _mm_packus_epi16(out_lo, out_hi);
|
||||
|
||||
_mm_storeu_si128((__m128i*)loop.dest, out);
|
||||
}
|
||||
};
|
||||
|
||||
typedef CopySimple<NearestSampler> CopyNearestSimple;
|
||||
typedef CopySimple<LinearSampler> CopyLinearSimple;
|
||||
typedef Copy<NearestSampler> CopyNearest;
|
||||
typedef Copy<LinearSampler> CopyLinear;
|
||||
typedef MaskSimple<NearestSampler> MaskNearestSimple;
|
||||
typedef MaskSimple<LinearSampler> MaskLinearSimple;
|
||||
typedef Mask<NearestSampler> MaskNearest;
|
||||
typedef Mask<LinearSampler> MaskLinear;
|
||||
typedef TMaskAddSimple<NearestSampler> TMaskAddNearestSimple;
|
||||
typedef TMaskAddSimple<LinearSampler> TMaskAddLinearSimple;
|
||||
typedef TMaskAdd<NearestSampler> TMaskAddNearest;
|
||||
typedef TMaskAdd<LinearSampler> TMaskAddLinear;
|
||||
typedef TMaskSubSimple<NearestSampler> TMaskSubNearestSimple;
|
||||
typedef TMaskSubSimple<LinearSampler> TMaskSubLinearSimple;
|
||||
typedef TMaskSub<NearestSampler> TMaskSubNearest;
|
||||
typedef TMaskSub<LinearSampler> TMaskSubLinear;
|
||||
typedef TMaskRevSubSimple<NearestSampler> TMaskRevSubNearestSimple;
|
||||
typedef TMaskRevSubSimple<LinearSampler> TMaskRevSubLinearSimple;
|
||||
typedef TMaskRevSub<NearestSampler> TMaskRevSubNearest;
|
||||
typedef TMaskRevSub<LinearSampler> TMaskRevSubLinear;
|
||||
#endif
|
||||
};
|
||||
|
||||
// Concrete 4-column wall drawer commands: one DrawerBlendCommand instantiation per
// (blend mode, sampler, shade variant) combination.

// vlinec4: copy
using Vlinec4NearestSimpleRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::CopyNearestSimple>;
using Vlinec4NearestRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::CopyNearest>;
using Vlinec4LinearSimpleRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::CopyLinearSimple>;
using Vlinec4LinearRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::CopyLinear>;

// mvlinec4: mask
using Mvlinec4NearestSimpleRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::MaskNearestSimple>;
using Mvlinec4NearestRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::MaskNearest>;
using Mvlinec4LinearSimpleRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::MaskLinearSimple>;
using Mvlinec4LinearRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::MaskLinear>;

// tmvline4 add
using Tmvline4AddNearestSimpleRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::TMaskAddNearestSimple>;
using Tmvline4AddNearestRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::TMaskAddNearest>;
using Tmvline4AddLinearSimpleRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::TMaskAddLinearSimple>;
using Tmvline4AddLinearRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::TMaskAddLinear>;

// tmvline4 add-clamp (uses the same TMaskAdd blend modes as plain add)
using Tmvline4AddClampNearestSimpleRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::TMaskAddNearestSimple>;
using Tmvline4AddClampNearestRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::TMaskAddNearest>;
using Tmvline4AddClampLinearSimpleRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::TMaskAddLinearSimple>;
using Tmvline4AddClampLinearRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::TMaskAddLinear>;

// tmvline4 sub-clamp
using Tmvline4SubClampNearestSimpleRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::TMaskSubNearestSimple>;
using Tmvline4SubClampNearestRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::TMaskSubNearest>;
using Tmvline4SubClampLinearSimpleRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::TMaskSubLinearSimple>;
using Tmvline4SubClampLinearRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::TMaskSubLinear>;

// tmvline4 reverse-sub-clamp
using Tmvline4RevSubClampNearestSimpleRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::TMaskRevSubNearestSimple>;
using Tmvline4RevSubClampNearestRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::TMaskRevSubNearest>;
using Tmvline4RevSubClampLinearSimpleRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::TMaskRevSubLinearSimple>;
using Tmvline4RevSubClampLinearRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::TMaskRevSubLinear>;
|
||||
|
||||
class Vlinec1RGBACommand : public DrawerWall1Command
|
||||
{
|
||||
public:
|
||||
|
@ -1581,39 +2024,6 @@ public:
|
|||
}
|
||||
};
|
||||
|
||||
class Vlinec4RGBACommand : public DrawerWall4Command
|
||||
{
|
||||
public:
|
||||
void Execute(DrawerThread *thread) override
|
||||
{
|
||||
LoopIterator loop(this, thread);
|
||||
if (!loop) return;
|
||||
|
||||
if (_bufplce2[0] == nullptr)
|
||||
{
|
||||
do
|
||||
{
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
uint32_t fg = LightBgra::shade_bgra(_bufplce[i][loop.sample_index(i)], _light[i], _shade_constants);
|
||||
loop.dest[i] = BlendBgra::copy(fg);
|
||||
}
|
||||
} while (loop.next());
|
||||
}
|
||||
else
|
||||
{
|
||||
do
|
||||
{
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
uint32_t fg = LightBgra::shade_bgra(SampleBgra::sample_bilinear(_bufplce[i], _bufplce2[i], _buftexturefracx[i], loop.sample_index(i), loop.half[i], loop.height[i]), _light[i], _shade_constants);
|
||||
loop.dest[i] = BlendBgra::copy(fg);
|
||||
}
|
||||
} while (loop.next());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class Mvlinec1RGBACommand : public DrawerWall1Command
|
||||
{
|
||||
public:
|
||||
|
@ -1641,39 +2051,6 @@ public:
|
|||
}
|
||||
};
|
||||
|
||||
class Mvlinec4RGBACommand : public DrawerWall4Command
|
||||
{
|
||||
public:
|
||||
void Execute(DrawerThread *thread) override
|
||||
{
|
||||
LoopIterator loop(this, thread);
|
||||
if (!loop) return;
|
||||
|
||||
if (_bufplce2[0] == nullptr)
|
||||
{
|
||||
do
|
||||
{
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
uint32_t fg = LightBgra::shade_bgra(_bufplce[i][loop.sample_index(i)], _light[i], _shade_constants);
|
||||
loop.dest[i] = BlendBgra::alpha_blend(fg, loop.dest[i]);
|
||||
}
|
||||
} while (loop.next());
|
||||
}
|
||||
else
|
||||
{
|
||||
do
|
||||
{
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
uint32_t fg = LightBgra::shade_bgra(SampleBgra::sample_bilinear(_bufplce[i], _bufplce2[i], _buftexturefracx[i], loop.sample_index(i), loop.half[i], loop.height[i]), _light[i], _shade_constants);
|
||||
loop.dest[i] = BlendBgra::alpha_blend(fg, loop.dest[i]);
|
||||
}
|
||||
} while (loop.next());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class Tmvline1AddRGBACommand : public DrawerWall1Command
|
||||
{
|
||||
public:
|
||||
|
@ -1689,24 +2066,6 @@ public:
|
|||
}
|
||||
};
|
||||
|
||||
class Tmvline4AddRGBACommand : public DrawerWall4Command
|
||||
{
|
||||
public:
|
||||
void Execute(DrawerThread *thread) override
|
||||
{
|
||||
LoopIterator loop(this, thread);
|
||||
if (!loop) return;
|
||||
do
|
||||
{
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
uint32_t fg = LightBgra::shade_bgra(_bufplce[i][loop.sample_index(i)], _light[i], _shade_constants);
|
||||
loop.dest[i] = BlendBgra::add(fg, loop.dest[i], _srcalpha, calc_blend_bgalpha(fg, _destalpha));
|
||||
}
|
||||
} while (loop.next());
|
||||
}
|
||||
};
|
||||
|
||||
class Tmvline1AddClampRGBACommand : public DrawerWall1Command
|
||||
{
|
||||
public:
|
||||
|
@ -1722,24 +2081,6 @@ public:
|
|||
}
|
||||
};
|
||||
|
||||
class Tmvline4AddClampRGBACommand : public DrawerWall4Command
|
||||
{
|
||||
public:
|
||||
void Execute(DrawerThread *thread) override
|
||||
{
|
||||
LoopIterator loop(this, thread);
|
||||
if (!loop) return;
|
||||
do
|
||||
{
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
uint32_t fg = LightBgra::shade_bgra(_bufplce[i][loop.sample_index(i)], _light[i], _shade_constants);
|
||||
loop.dest[i] = BlendBgra::add(fg, loop.dest[i], _srcalpha, calc_blend_bgalpha(fg, _destalpha));
|
||||
}
|
||||
} while (loop.next());
|
||||
}
|
||||
};
|
||||
|
||||
class Tmvline1SubClampRGBACommand : public DrawerWall1Command
|
||||
{
|
||||
public:
|
||||
|
@ -1755,24 +2096,6 @@ public:
|
|||
}
|
||||
};
|
||||
|
||||
class Tmvline4SubClampRGBACommand : public DrawerWall4Command
|
||||
{
|
||||
public:
|
||||
void Execute(DrawerThread *thread) override
|
||||
{
|
||||
LoopIterator loop(this, thread);
|
||||
if (!loop) return;
|
||||
do
|
||||
{
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
uint32_t fg = LightBgra::shade_bgra(_bufplce[i][loop.sample_index(i)], _light[i], _shade_constants);
|
||||
loop.dest[i] = BlendBgra::sub(fg, loop.dest[i], _srcalpha, calc_blend_bgalpha(fg, _destalpha));
|
||||
}
|
||||
} while (loop.next());
|
||||
}
|
||||
};
|
||||
|
||||
class Tmvline1RevSubClampRGBACommand : public DrawerWall1Command
|
||||
{
|
||||
public:
|
||||
|
@ -1788,24 +2111,6 @@ public:
|
|||
}
|
||||
};
|
||||
|
||||
class Tmvline4RevSubClampRGBACommand : public DrawerWall4Command
|
||||
{
|
||||
public:
|
||||
void Execute(DrawerThread *thread) override
|
||||
{
|
||||
LoopIterator loop(this, thread);
|
||||
if (!loop) return;
|
||||
do
|
||||
{
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
uint32_t fg = LightBgra::shade_bgra(_bufplce[i][loop.sample_index(i)], _light[i], _shade_constants);
|
||||
loop.dest[i] = BlendBgra::revsub(fg, loop.dest[i], _srcalpha, calc_blend_bgalpha(fg, _destalpha));
|
||||
}
|
||||
} while (loop.next());
|
||||
}
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class DrawFogBoundaryLineRGBACommand : public DrawerCommand
|
||||
|
@ -2355,13 +2660,22 @@ DWORD vlinec1_rgba()
|
|||
return dc_texturefrac + dc_count * dc_iscale;
|
||||
}
|
||||
|
||||
template<typename NearestSimple, typename Nearest, typename LinearSimple, typename Linear>
|
||||
void queue_wallcommand()
|
||||
{
|
||||
if (bufplce2[0] == nullptr && dc_shade_constants.simple_shade)
|
||||
DrawerCommandQueue::QueueCommand<NearestSimple>();
|
||||
else if (bufplce2[0] == nullptr)
|
||||
DrawerCommandQueue::QueueCommand<Nearest>();
|
||||
else if (dc_shade_constants.simple_shade)
|
||||
DrawerCommandQueue::QueueCommand<LinearSimple>();
|
||||
else
|
||||
DrawerCommandQueue::QueueCommand<Linear>();
|
||||
}
|
||||
|
||||
void vlinec4_rgba()
|
||||
{
|
||||
#ifdef NO_SSE
|
||||
DrawerCommandQueue::QueueCommand<Vlinec4RGBACommand>();
|
||||
#else
|
||||
DrawerCommandQueue::QueueCommand<Vlinec4RGBA_SSE_Command>();
|
||||
#endif
|
||||
queue_wallcommand<Vlinec4NearestSimpleRGBACommand, Vlinec4NearestRGBACommand, Vlinec4LinearSimpleRGBACommand, Vlinec4LinearRGBACommand>();
|
||||
for (int i = 0; i < 4; i++)
|
||||
vplce[i] += vince[i] * dc_count;
|
||||
}
|
||||
|
@ -2374,11 +2688,7 @@ DWORD mvlinec1_rgba()
|
|||
|
||||
void mvlinec4_rgba()
|
||||
{
|
||||
#ifdef NO_SSE
|
||||
DrawerCommandQueue::QueueCommand<Mvlinec4RGBACommand>();
|
||||
#else
|
||||
DrawerCommandQueue::QueueCommand<Mvlinec4RGBA_SSE_Command>();
|
||||
#endif
|
||||
queue_wallcommand<Mvlinec4NearestSimpleRGBACommand, Mvlinec4NearestRGBACommand, Mvlinec4LinearSimpleRGBACommand, Mvlinec4LinearRGBACommand>();
|
||||
for (int i = 0; i < 4; i++)
|
||||
vplce[i] += vince[i] * dc_count;
|
||||
}
|
||||
|
@ -2391,11 +2701,7 @@ fixed_t tmvline1_add_rgba()
|
|||
|
||||
void tmvline4_add_rgba()
|
||||
{
|
||||
#ifdef NO_SSE
|
||||
DrawerCommandQueue::QueueCommand<Tmvline4AddRGBACommand>();
|
||||
#else
|
||||
DrawerCommandQueue::QueueCommand<Tmvline4AddRGBA_SSE_Command>();
|
||||
#endif
|
||||
queue_wallcommand<Tmvline4AddNearestSimpleRGBACommand, Tmvline4AddNearestRGBACommand, Tmvline4AddLinearSimpleRGBACommand, Tmvline4AddLinearRGBACommand>();
|
||||
for (int i = 0; i < 4; i++)
|
||||
vplce[i] += vince[i] * dc_count;
|
||||
}
|
||||
|
@ -2408,11 +2714,7 @@ fixed_t tmvline1_addclamp_rgba()
|
|||
|
||||
void tmvline4_addclamp_rgba()
|
||||
{
|
||||
#ifdef NO_SSE
|
||||
DrawerCommandQueue::QueueCommand<Tmvline4AddClampRGBACommand>();
|
||||
#else
|
||||
DrawerCommandQueue::QueueCommand<Tmvline4AddClampRGBA_SSE_Command>();
|
||||
#endif
|
||||
queue_wallcommand<Tmvline4AddClampNearestSimpleRGBACommand, Tmvline4AddClampNearestRGBACommand, Tmvline4AddClampLinearSimpleRGBACommand, Tmvline4AddClampLinearRGBACommand>();
|
||||
for (int i = 0; i < 4; i++)
|
||||
vplce[i] += vince[i] * dc_count;
|
||||
}
|
||||
|
@ -2425,11 +2727,7 @@ fixed_t tmvline1_subclamp_rgba()
|
|||
|
||||
void tmvline4_subclamp_rgba()
|
||||
{
|
||||
#ifdef NO_SSE
|
||||
DrawerCommandQueue::QueueCommand<Tmvline4SubClampRGBACommand>();
|
||||
#else
|
||||
DrawerCommandQueue::QueueCommand<Tmvline4SubClampRGBA_SSE_Command>();
|
||||
#endif
|
||||
queue_wallcommand<Tmvline4SubClampNearestSimpleRGBACommand, Tmvline4SubClampNearestRGBACommand, Tmvline4SubClampLinearSimpleRGBACommand, Tmvline4SubClampLinearRGBACommand>();
|
||||
for (int i = 0; i < 4; i++)
|
||||
vplce[i] += vince[i] * dc_count;
|
||||
}
|
||||
|
@ -2442,11 +2740,7 @@ fixed_t tmvline1_revsubclamp_rgba()
|
|||
|
||||
void tmvline4_revsubclamp_rgba()
|
||||
{
|
||||
#ifdef NO_SSE
|
||||
DrawerCommandQueue::QueueCommand<Tmvline4RevSubClampRGBACommand>();
|
||||
#else
|
||||
DrawerCommandQueue::QueueCommand<Tmvline4RevSubClampRGBA_SSE_Command>();
|
||||
#endif
|
||||
queue_wallcommand<Tmvline4RevSubClampNearestSimpleRGBACommand, Tmvline4RevSubClampNearestRGBACommand, Tmvline4RevSubClampLinearSimpleRGBACommand, Tmvline4RevSubClampLinearRGBACommand>();
|
||||
for (int i = 0; i < 4; i++)
|
||||
vplce[i] += vince[i] * dc_count;
|
||||
}
|
||||
|
|
|
@ -286,6 +286,22 @@ public:
|
|||
void Execute(DrawerThread *thread) override;
|
||||
};
|
||||
|
||||
template<typename CommandType, typename BlendMode>
|
||||
class DrawerBlendCommand : public CommandType
|
||||
{
|
||||
public:
|
||||
void Execute(DrawerThread *thread) override
|
||||
{
|
||||
LoopIterator loop(this, thread);
|
||||
if (!loop) return;
|
||||
BlendMode blend(*this, loop);
|
||||
do
|
||||
{
|
||||
blend.Blend(*this, loop);
|
||||
} while (loop.next());
|
||||
}
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
// Pixel shading inline functions:
|
||||
|
||||
|
@ -624,7 +640,7 @@ public:
|
|||
__m128i ab_invab = _mm_load_si128(SampleBgra::samplertable + inv_b * 32 + inv_a * 2); \
|
||||
__m128i ainvb_invainvb = _mm_load_si128(SampleBgra::samplertable + inv_b * 32 + inv_a * 2 + 1); \
|
||||
\
|
||||
__m128i gather = _mm_set_epi32(col1[i][y1], col1[i][y0], col0[i][y1], col1[i][y0]); \
|
||||
__m128i gather = _mm_set_epi32(col1[i][y1], col1[i][y0], col0[i][y1], col0[i][y0]); \
|
||||
__m128i p0 = _mm_unpacklo_epi8(gather, _mm_setzero_si128()); \
|
||||
__m128i p1 = _mm_unpackhi_epi8(gather, _mm_setzero_si128()); \
|
||||
\
|
||||
|
@ -635,6 +651,26 @@ public:
|
|||
} \
|
||||
}
|
||||
|
||||
// Samples four pixels from two sources (col0/col1) with nearest filtering and mixes them:
// fg = (p0 * t + p1 * (256 - t)) >> 8, where t is loaded from mipfrac.
// inv_t is computed from the loaded vector t; mipfrac itself is only an address.
// NOTE(review): col0/col1 are indexed with a single subscript here, while
//   VEC_SAMPLE_BILINEAR4_COLUMN indexes its column arguments as col[i][y] - confirm the
//   intended argument shape before using this macro.
// NOTE(review): t is loaded as four 32-bit values but multiplied lane-wise against 16-bit
//   channels with _mm_mullo_epi16 - verify the weights are laid out per channel.
#define VEC_SAMPLE_MIP_NEAREST4_COLUMN(fg, col0, col1, mipfrac, texturefracy, height0, height1) { \
	uint32_t y0[4], y1[4]; \
	for (int i = 0; i < 4; i++) \
	{ \
		y0[i] = (texturefracy[i] >> FRACBITS) * height0[i]; \
		y1[i] = (texturefracy[i] >> FRACBITS) * height1[i]; \
	} \
	__m128i p0 = _mm_set_epi32(col0[y0[3]], col0[y0[2]], col0[y0[1]], col0[y0[0]]); \
	__m128i p1 = _mm_set_epi32(col1[y1[3]], col1[y1[2]], col1[y1[1]], col1[y1[0]]); \
	__m128i t = _mm_loadu_si128((const __m128i*)mipfrac); \
	__m128i inv_t = _mm_sub_epi32(_mm_set1_epi32(256), t); \
	__m128i p0_lo = _mm_unpacklo_epi8(p0, _mm_setzero_si128()); \
	__m128i p0_hi = _mm_unpackhi_epi8(p0, _mm_setzero_si128()); \
	__m128i p1_lo = _mm_unpacklo_epi8(p1, _mm_setzero_si128()); \
	__m128i p1_hi = _mm_unpackhi_epi8(p1, _mm_setzero_si128()); \
	__m128i fg_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(p0_lo, t), _mm_mullo_epi16(p1_lo, inv_t)), 8); \
	__m128i fg_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(p0_hi, t), _mm_mullo_epi16(p1_hi, inv_t)), 8); \
	fg = _mm_packus_epi16(fg_lo, fg_hi); \
}
|
||||
|
||||
#define VEC_SAMPLE_BILINEAR4_SPAN(fg, texture, xfrac, yfrac, xstep, ystep, xbits, ybits) { \
|
||||
int xshift = (32 - xbits); \
|
||||
int yshift = (32 - ybits); \
|
||||
|
@ -844,12 +880,14 @@ FORCEINLINE uint32_t calc_blend_bgalpha(uint32_t fg, uint32_t dest_alpha)
|
|||
return (dest_alpha * alpha + 256 * inv_alpha + 128) >> 8;
|
||||
}
|
||||
|
||||
// Declares the __m128i locals that VEC_CALC_BLEND_ALPHA_INIT assigns.
// Expand this in the enclosing scope before using VEC_CALC_BLEND_ALPHA_INIT.
#define VEC_CALC_BLEND_ALPHA_VARS() __m128i msrc_alpha, mdest_alpha, m256, m255, m128;
|
||||
|
||||
// Assigns the blend constants used by the alpha-blend macros, one value broadcast to every
// 16-bit lane: the source alpha, the destination alpha scaled by 255/256, and 256, 255, 128.
// The variables must already be declared with VEC_CALC_BLEND_ALPHA_VARS() in the same scope;
// this macro only assigns, so it can be expanded after the declaration without redeclaring.
#define VEC_CALC_BLEND_ALPHA_INIT(src_alpha, dest_alpha) \
	msrc_alpha = _mm_set1_epi16(src_alpha); \
	mdest_alpha = _mm_set1_epi16((dest_alpha) * 255 / 256); \
	m256 = _mm_set1_epi16(256); \
	m255 = _mm_set1_epi16(255); \
	m128 = _mm_set1_epi16(128);
|
||||
|
||||
// Calculates the final alpha values to be used when combined with the source texture alpha channel
|
||||
#define VEC_CALC_BLEND_ALPHA(fg) \
|
||||
|
@ -866,15 +904,17 @@ FORCEINLINE uint32_t calc_blend_bgalpha(uint32_t fg, uint32_t dest_alpha)
|
|||
fg_alpha_lo = msrc_alpha; \
|
||||
}
|
||||
|
||||
// Declares the __m128i locals assigned by the SSE_SHADE_SIMPLE_INIT* and SSE_SHADE_INIT* macros.
// Expand this in the enclosing scope before using any of those INIT macros.
#define SSE_SHADE_VARS() __m128i mlight_hi, mlight_lo, color, fade, fade_amount_hi, fade_amount_lo, inv_desaturate;
|
||||
|
||||
// Calculate constants for a simple shade
|
||||
// Assigns mlight_hi/mlight_lo for a single light level: each 4-lane pixel group gets
// (256, light, light, light) in _mm_set_epi16 argument order.
// Requires SSE_SHADE_VARS() to have declared the variables in the same scope.
#define SSE_SHADE_SIMPLE_INIT(light) \
	mlight_hi = _mm_set_epi16(256, light, light, light, 256, light, light, light); \
	mlight_lo = mlight_hi;
|
||||
|
||||
// Calculate constants for a simple shade with different light levels for each pixel
|
||||
// Assigns per-pixel light levels: mlight_hi is built from light1/light0 and
// mlight_lo from light3/light2, each pixel group as (256, light, light, light).
// Requires SSE_SHADE_VARS() to have declared the variables in the same scope.
#define SSE_SHADE_SIMPLE_INIT4(light3, light2, light1, light0) \
	mlight_hi = _mm_set_epi16(256, light1, light1, light1, 256, light0, light0, light0); \
	mlight_lo = _mm_set_epi16(256, light3, light3, light3, 256, light2, light2, light2);
|
||||
|
||||
// Simple shade 4 pixels
|
||||
#define SSE_SHADE_SIMPLE(fg) { \
|
||||
|
@ -889,31 +929,31 @@ FORCEINLINE uint32_t calc_blend_bgalpha(uint32_t fg, uint32_t dest_alpha)
|
|||
|
||||
// Calculate constants for a complex shade
|
||||
// Assigns every constant needed by the complex shade for a single light level:
//   mlight_*        (256, light, light, light) per pixel group
//   color           (256, light_red, light_green, light_blue) per pixel group
//   fade            (0, fade_red, fade_green, fade_blue) per pixel group
//   fade_amount_*   fade * (256 - mlight), per 16-bit lane (saturating subtract, low 16 bits of the product)
//   inv_desaturate  256 - desaturate, broadcast to every 16-bit lane
// Requires SSE_SHADE_VARS() to have declared the variables in the same scope.
#define SSE_SHADE_INIT(light, shade_constants) \
	mlight_hi = _mm_set_epi16(256, light, light, light, 256, light, light, light); \
	mlight_lo = mlight_hi; \
	color = _mm_set_epi16( \
		256, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, \
		256, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); \
	fade = _mm_set_epi16( \
		0, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, \
		0, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); \
	fade_amount_hi = _mm_mullo_epi16(fade, _mm_subs_epu16(_mm_set1_epi16(256), mlight_hi)); \
	fade_amount_lo = fade_amount_hi; \
	inv_desaturate = _mm_set1_epi16(256 - shade_constants.desaturate);
|
||||
|
||||
// Calculate constants for a complex shade with different light levels for each pixel
|
||||
// Assigns every constant needed by the complex shade with a separate light level per pixel:
//   mlight_hi       built from light1/light0, mlight_lo from light3/light2
//   color           (256, light_red, light_green, light_blue) per pixel group
//   fade            (0, fade_red, fade_green, fade_blue) per pixel group
//   fade_amount_*   fade * (256 - mlight_*), per 16-bit lane (saturating subtract, low 16 bits of the product)
//   inv_desaturate  256 - desaturate, broadcast to every 16-bit lane
// Requires SSE_SHADE_VARS() to have declared the variables in the same scope.
#define SSE_SHADE_INIT4(light3, light2, light1, light0, shade_constants) \
	mlight_hi = _mm_set_epi16(256, light1, light1, light1, 256, light0, light0, light0); \
	mlight_lo = _mm_set_epi16(256, light3, light3, light3, 256, light2, light2, light2); \
	color = _mm_set_epi16( \
		256, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, \
		256, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); \
	fade = _mm_set_epi16( \
		0, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, \
		0, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); \
	fade_amount_hi = _mm_mullo_epi16(fade, _mm_subs_epu16(_mm_set1_epi16(256), mlight_hi)); \
	fade_amount_lo = _mm_mullo_epi16(fade, _mm_subs_epu16(_mm_set1_epi16(256), mlight_lo)); \
	inv_desaturate = _mm_set1_epi16(256 - shade_constants.desaturate);
|
||||
|
||||
// Complex shade 4 pixels
|
||||
#define SSE_SHADE(fg, shade_constants) { \
|
||||
|
|
|
@ -84,6 +84,7 @@ public:
|
|||
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
VEC_SHADE_VARS();
|
||||
VEC_SHADE_SIMPLE_INIT(light);
|
||||
|
||||
while (sse_count--)
|
||||
|
@ -121,6 +122,7 @@ public:
|
|||
}
|
||||
else
|
||||
{
|
||||
VEC_SHADE_VARS();
|
||||
VEC_SHADE_INIT(light, shade_constants);
|
||||
|
||||
while (sse_count--)
|
||||
|
@ -184,6 +186,7 @@ public:
|
|||
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
VEC_SHADE_VARS();
|
||||
VEC_SHADE_SIMPLE_INIT(light);
|
||||
|
||||
while (sse_count--)
|
||||
|
@ -217,6 +220,7 @@ public:
|
|||
}
|
||||
else
|
||||
{
|
||||
VEC_SHADE_VARS();
|
||||
VEC_SHADE_INIT(light, shade_constants);
|
||||
|
||||
while (sse_count--)
|
||||
|
@ -277,6 +281,7 @@ public:
|
|||
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
VEC_SHADE_VARS();
|
||||
VEC_SHADE_SIMPLE_INIT(light);
|
||||
while (sse_count--)
|
||||
{
|
||||
|
@ -289,6 +294,7 @@ public:
|
|||
}
|
||||
else
|
||||
{
|
||||
VEC_SHADE_VARS();
|
||||
VEC_SHADE_INIT(light, shade_constants);
|
||||
while (sse_count--)
|
||||
{
|
||||
|
@ -317,6 +323,7 @@ public:
|
|||
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
VEC_SHADE_VARS();
|
||||
VEC_SHADE_SIMPLE_INIT(light);
|
||||
while (sse_count--)
|
||||
{
|
||||
|
@ -331,6 +338,7 @@ public:
|
|||
}
|
||||
else
|
||||
{
|
||||
VEC_SHADE_VARS();
|
||||
VEC_SHADE_INIT(light, shade_constants);
|
||||
while (sse_count--)
|
||||
{
|
||||
|
@ -357,918 +365,3 @@ public:
|
|||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Draws four horizontally adjacent wall columns of 32-bit pixels at once, overwriting the
// destination (no blending).
//
// The constructor snapshots the global drawer state (dc_* values and the four-entry column
// arrays). Execute() renders the rows that belong to one DrawerThread: each iteration writes
// four pixels with one unaligned 128-bit store and then steps dest by _pitch * num_cores.
// When bufplce2[0] is null every column is point-sampled from bufplce; otherwise
// VEC_SAMPLE_BILINEAR4_COLUMN samples from bufplce/bufplce2. Shading uses the simple or the
// complex shade macros depending on shade_constants.simple_shade.
class VecCommand(Vlinec4RGBA) : public DrawerCommand
{
	BYTE * RESTRICT _dest;
	int _count;
	int _pitch;
	ShadeConstants _shade_constants;
	fixed_t palookuplight[4];
	DWORD vplce[4];
	DWORD vince[4];
	const uint32 * RESTRICT bufplce[4];
	const uint32_t * RESTRICT bufplce2[4];
	uint32_t buftexturefracx[4];
	uint32_t bufheight[4];

public:
	// Captures the current global drawer state so the command can run later.
	VecCommand(Vlinec4RGBA)()
	{
		_dest = dc_dest;
		_count = dc_count;
		_pitch = dc_pitch;
		_shade_constants = dc_shade_constants;
		for (int i = 0; i < 4; i++)
		{
			palookuplight[i] = ::palookuplight[i];
			vplce[i] = ::vplce[i];
			vince[i] = ::vince[i];
			bufplce[i] = (const uint32 *)::bufplce[i];
			bufplce2[i] = (const uint32_t *)::bufplce2[i];
			buftexturefracx[i] = ::buftexturefracx[i];
			bufheight[i] = ::bufheight[i];
		}
	}

	// Renders this command's rows for the given thread.
	// _dest_y is not declared in this class; presumably it comes from DrawerCommand.
	void Execute(DrawerThread *thread) override
	{
		int count = thread->count_for_thread(_dest_y, _count);
		if (count <= 0)
			return;

		uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest);
		int pitch = _pitch * thread->num_cores;

		// half[] is only consumed by the bilinear sampling macro.
		uint32_t height[4];
		uint32_t half[4];
		for (int i = 0; i < 4; i++)
		{
			height[i] = bufheight[i];
			half[i] = (0x80000000 + height[i] - 1) / height[i];
		}

		uint32_t light0 = LightBgra::calc_light_multiplier(palookuplight[0]);
		uint32_t light1 = LightBgra::calc_light_multiplier(palookuplight[1]);
		uint32_t light2 = LightBgra::calc_light_multiplier(palookuplight[2]);
		uint32_t light3 = LightBgra::calc_light_multiplier(palookuplight[3]);

		ShadeConstants shade_constants = _shade_constants;

		// Advance past the rows skipped by this thread, then step num_cores rows at a time.
		DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
		DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
		int skipped = thread->skipped_by_thread(_dest_y);
		for (int i = 0; i < 4; i++)
		{
			local_vplce[i] += local_vince[i] * skipped;
			local_vince[i] *= thread->num_cores;
		}

		if (bufplce2[0] == nullptr)
		{
			// Nearest (point) sampling.
			if (shade_constants.simple_shade)
			{
				VEC_SHADE_VARS();
				VEC_SHADE_SIMPLE_INIT4(light3, light2, light1, light0);
				do
				{
					uint32_t p0 = bufplce[0][((local_vplce[0] >> FRACBITS) * height[0]) >> FRACBITS];
					uint32_t p1 = bufplce[1][((local_vplce[1] >> FRACBITS) * height[1]) >> FRACBITS];
					uint32_t p2 = bufplce[2][((local_vplce[2] >> FRACBITS) * height[2]) >> FRACBITS];
					uint32_t p3 = bufplce[3][((local_vplce[3] >> FRACBITS) * height[3]) >> FRACBITS];

					for (int i = 0; i < 4; i++)
						local_vplce[i] += local_vince[i];

					__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
					VEC_SHADE_SIMPLE(fg);
					_mm_storeu_si128((__m128i*)dest, fg);
					dest += pitch;
				} while (--count);
			}
			else
			{
				VEC_SHADE_VARS();
				VEC_SHADE_INIT4(light3, light2, light1, light0, shade_constants);
				do
				{
					uint32_t p0 = bufplce[0][((local_vplce[0] >> FRACBITS) * height[0]) >> FRACBITS];
					uint32_t p1 = bufplce[1][((local_vplce[1] >> FRACBITS) * height[1]) >> FRACBITS];
					uint32_t p2 = bufplce[2][((local_vplce[2] >> FRACBITS) * height[2]) >> FRACBITS];
					uint32_t p3 = bufplce[3][((local_vplce[3] >> FRACBITS) * height[3]) >> FRACBITS];

					for (int i = 0; i < 4; i++)
						local_vplce[i] += local_vince[i];

					__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
					VEC_SHADE(fg, shade_constants);
					_mm_storeu_si128((__m128i*)dest, fg);
					dest += pitch;
				} while (--count);
			}
		}
		else
		{
			// Bilinear sampling.
			if (shade_constants.simple_shade)
			{
				VEC_SHADE_VARS();
				VEC_SHADE_SIMPLE_INIT4(light3, light2, light1, light0);
				do
				{
					__m128i fg;
					VEC_SAMPLE_BILINEAR4_COLUMN(fg, bufplce, bufplce2, buftexturefracx, local_vplce, half, height);

					for (int i = 0; i < 4; i++)
						local_vplce[i] += local_vince[i];

					VEC_SHADE_SIMPLE(fg);
					_mm_storeu_si128((__m128i*)dest, fg);
					dest += pitch;
				} while (--count);
			}
			else
			{
				VEC_SHADE_VARS();
				VEC_SHADE_INIT4(light3, light2, light1, light0, shade_constants);
				do
				{
					__m128i fg;
					VEC_SAMPLE_BILINEAR4_COLUMN(fg, bufplce, bufplce2, buftexturefracx, local_vplce, half, height);

					for (int i = 0; i < 4; i++)
						local_vplce[i] += local_vince[i];

					VEC_SHADE(fg, shade_constants);
					_mm_storeu_si128((__m128i*)dest, fg);
					dest += pitch;
				} while (--count);
			}
		}
	}
};
|
||||
|
||||
// Draws four horizontally adjacent wall columns of 32-bit pixels at once and combines them
// with the pixels already in the destination through VEC_ALPHA_BLEND.
//
// The constructor snapshots the global drawer state (dc_* values and the four-entry column
// arrays). Execute() renders the rows that belong to one DrawerThread: each iteration loads
// four destination pixels, shades and blends the four sampled pixels, stores the result and
// steps dest by _pitch * num_cores. When bufplce2[0] is null every column is point-sampled
// from bufplce; otherwise VEC_SAMPLE_BILINEAR4_COLUMN samples from bufplce/bufplce2.
class VecCommand(Mvlinec4RGBA) : public DrawerCommand
{
	BYTE * RESTRICT _dest;
	int _count;
	int _pitch;
	ShadeConstants _shade_constants;
	uint32_t _mvlinemax; // NOTE(review): never assigned or read inside this class.
	fixed_t palookuplight[4];
	DWORD vplce[4];
	DWORD vince[4];
	const uint32 * RESTRICT bufplce[4];
	const uint32 * RESTRICT bufplce2[4];
	uint32_t buftexturefracx[4];
	uint32_t bufheight[4];

public:
	// Captures the current global drawer state so the command can run later.
	VecCommand(Mvlinec4RGBA)()
	{
		_dest = dc_dest;
		_count = dc_count;
		_pitch = dc_pitch;
		_shade_constants = dc_shade_constants;
		for (int i = 0; i < 4; i++)
		{
			palookuplight[i] = ::palookuplight[i];
			vplce[i] = ::vplce[i];
			vince[i] = ::vince[i];
			bufplce[i] = (const uint32 *)::bufplce[i];
			bufplce2[i] = (const uint32_t *)::bufplce2[i];
			buftexturefracx[i] = ::buftexturefracx[i];
			bufheight[i] = ::bufheight[i];
		}
	}

	// Renders this command's rows for the given thread.
	// _dest_y is not declared in this class; presumably it comes from DrawerCommand.
	void Execute(DrawerThread *thread) override
	{
		int count = thread->count_for_thread(_dest_y, _count);
		if (count <= 0)
			return;

		uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest);
		int pitch = _pitch * thread->num_cores;

		// half[] is only consumed by the bilinear sampling macro.
		uint32_t height[4];
		uint32_t half[4];
		for (int i = 0; i < 4; i++)
		{
			height[i] = bufheight[i];
			half[i] = (0x80000000 + height[i] - 1) / height[i];
		}

		uint32_t light0 = LightBgra::calc_light_multiplier(palookuplight[0]);
		uint32_t light1 = LightBgra::calc_light_multiplier(palookuplight[1]);
		uint32_t light2 = LightBgra::calc_light_multiplier(palookuplight[2]);
		uint32_t light3 = LightBgra::calc_light_multiplier(palookuplight[3]);

		ShadeConstants shade_constants = _shade_constants;

		// Advance past the rows skipped by this thread, then step num_cores rows at a time.
		DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
		DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
		int skipped = thread->skipped_by_thread(_dest_y);
		for (int i = 0; i < 4; i++)
		{
			local_vplce[i] += local_vince[i] * skipped;
			local_vince[i] *= thread->num_cores;
		}

		if (bufplce2[0] == nullptr)
		{
			// Nearest (point) sampling.
			if (shade_constants.simple_shade)
			{
				VEC_SHADE_VARS();
				VEC_SHADE_SIMPLE_INIT4(light3, light2, light1, light0);
				do
				{
					uint32_t pix0 = bufplce[0][((local_vplce[0] >> FRACBITS) * height[0]) >> FRACBITS];
					uint32_t pix1 = bufplce[1][((local_vplce[1] >> FRACBITS) * height[1]) >> FRACBITS];
					uint32_t pix2 = bufplce[2][((local_vplce[2] >> FRACBITS) * height[2]) >> FRACBITS];
					uint32_t pix3 = bufplce[3][((local_vplce[3] >> FRACBITS) * height[3]) >> FRACBITS];

					for (int i = 0; i < 4; i++)
						local_vplce[i] += local_vince[i];

					__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
					__m128i bg = _mm_loadu_si128((const __m128i*)dest);
					VEC_SHADE_SIMPLE(fg);
					VEC_ALPHA_BLEND(fg, bg);
					_mm_storeu_si128((__m128i*)dest, fg);
					dest += pitch;
				} while (--count);
			}
			else
			{
				VEC_SHADE_VARS();
				VEC_SHADE_INIT4(light3, light2, light1, light0, shade_constants);
				do
				{
					uint32_t pix0 = bufplce[0][((local_vplce[0] >> FRACBITS) * height[0]) >> FRACBITS];
					uint32_t pix1 = bufplce[1][((local_vplce[1] >> FRACBITS) * height[1]) >> FRACBITS];
					uint32_t pix2 = bufplce[2][((local_vplce[2] >> FRACBITS) * height[2]) >> FRACBITS];
					uint32_t pix3 = bufplce[3][((local_vplce[3] >> FRACBITS) * height[3]) >> FRACBITS];

					for (int i = 0; i < 4; i++)
						local_vplce[i] += local_vince[i];

					__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
					__m128i bg = _mm_loadu_si128((const __m128i*)dest);
					VEC_SHADE(fg, shade_constants);
					VEC_ALPHA_BLEND(fg, bg);
					_mm_storeu_si128((__m128i*)dest, fg);
					dest += pitch;
				} while (--count);
			}
		}
		else
		{
			// Bilinear sampling.
			if (shade_constants.simple_shade)
			{
				VEC_SHADE_VARS();
				VEC_SHADE_SIMPLE_INIT4(light3, light2, light1, light0);
				do
				{
					__m128i fg;
					VEC_SAMPLE_BILINEAR4_COLUMN(fg, bufplce, bufplce2, buftexturefracx, local_vplce, half, height);

					for (int i = 0; i < 4; i++)
						local_vplce[i] += local_vince[i];

					__m128i bg = _mm_loadu_si128((const __m128i*)dest);
					VEC_SHADE_SIMPLE(fg);
					VEC_ALPHA_BLEND(fg, bg);
					_mm_storeu_si128((__m128i*)dest, fg);
					dest += pitch;
				} while (--count);
			}
			else
			{
				VEC_SHADE_VARS();
				VEC_SHADE_INIT4(light3, light2, light1, light0, shade_constants);
				do
				{
					__m128i fg;
					VEC_SAMPLE_BILINEAR4_COLUMN(fg, bufplce, bufplce2, buftexturefracx, local_vplce, half, height);

					for (int i = 0; i < 4; i++)
						local_vplce[i] += local_vince[i];

					__m128i bg = _mm_loadu_si128((const __m128i*)dest);
					VEC_SHADE(fg, shade_constants);
					VEC_ALPHA_BLEND(fg, bg);
					_mm_storeu_si128((__m128i*)dest, fg);
					dest += pitch;
				} while (--count);
			}
		}
	}
};
|
||||
|
||||
// Draws four horizontally adjacent wall columns of 32-bit pixels at once with additive
// translucency: out = (fg * fg_alpha + bg * bg_alpha) >> 8 per channel, using a saturating
// 16-bit add before the shift and unsigned saturation when packing back to bytes.
//
// The constructor snapshots the global drawer state (dc_* values and the four-entry column
// arrays). Execute() renders the rows that belong to one DrawerThread; columns are always
// point-sampled from bufplce. The per-pixel alpha factors (fg_alpha_*, bg_alpha_*) are
// produced by VEC_CALC_BLEND_ALPHA from the unshaded sample.
class VecCommand(Tmvline4AddRGBA) : public DrawerCommand
{
	BYTE * RESTRICT _dest;
	int _count;
	int _pitch;
	ShadeConstants _shade_constants;
	fixed_t _srcalpha;
	fixed_t _destalpha;
	fixed_t palookuplight[4];
	DWORD vplce[4];
	DWORD vince[4];
	const uint32 * RESTRICT bufplce[4];
	uint32_t bufheight[4];

public:
	// Captures the current global drawer state so the command can run later.
	VecCommand(Tmvline4AddRGBA)()
	{
		_dest = dc_dest;
		_count = dc_count;
		_pitch = dc_pitch;
		_shade_constants = dc_shade_constants;
		_srcalpha = dc_srcalpha;
		_destalpha = dc_destalpha;
		for (int i = 0; i < 4; i++)
		{
			palookuplight[i] = ::palookuplight[i];
			vplce[i] = ::vplce[i];
			vince[i] = ::vince[i];
			bufplce[i] = (const uint32 *)::bufplce[i];
			bufheight[i] = ::bufheight[i];
		}
	}

	// Renders this command's rows for the given thread.
	// _dest_y is not declared in this class; presumably it comes from DrawerCommand.
	void Execute(DrawerThread *thread) override
	{
		int count = thread->count_for_thread(_dest_y, _count);
		if (count <= 0)
			return;

		uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest);
		int pitch = _pitch * thread->num_cores;

		uint32_t height[4];
		for (int i = 0; i < 4; i++)
			height[i] = bufheight[i];

		uint32_t light[4];
		for (int i = 0; i < 4; i++)
			light[i] = LightBgra::calc_light_multiplier(palookuplight[i]);

		ShadeConstants shade_constants = _shade_constants;

		// Reduce the fixed-point alphas to 8 fractional bits.
		uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
		uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);

		// Advance past the rows skipped by this thread, then step num_cores rows at a time.
		DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
		DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
		int skipped = thread->skipped_by_thread(_dest_y);
		for (int i = 0; i < 4; i++)
		{
			local_vplce[i] += local_vince[i] * skipped;
			local_vince[i] *= thread->num_cores;
		}

		if (shade_constants.simple_shade)
		{
			VEC_SHADE_VARS();
			VEC_CALC_BLEND_ALPHA_VARS();
			VEC_SHADE_SIMPLE_INIT4(light[3], light[2], light[1], light[0]);
			VEC_CALC_BLEND_ALPHA_INIT(src_alpha, dest_alpha);

			do
			{
				uint32_t pix0 = bufplce[0][((local_vplce[0] >> FRACBITS) * height[0]) >> FRACBITS];
				uint32_t pix1 = bufplce[1][((local_vplce[1] >> FRACBITS) * height[1]) >> FRACBITS];
				uint32_t pix2 = bufplce[2][((local_vplce[2] >> FRACBITS) * height[2]) >> FRACBITS];
				uint32_t pix3 = bufplce[3][((local_vplce[3] >> FRACBITS) * height[3]) >> FRACBITS];

				for (int i = 0; i < 4; i++)
					local_vplce[i] += local_vince[i];

				__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);

				// Alpha factors are taken from the sample before it is shaded.
				VEC_CALC_BLEND_ALPHA(fg);
				VEC_SHADE_SIMPLE(fg);

				__m128i bg = _mm_loadu_si128((const __m128i*)dest);

				__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
				__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
				__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
				__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());

				__m128i out_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8);
				__m128i out_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8);
				__m128i out = _mm_packus_epi16(out_lo, out_hi);

				_mm_storeu_si128((__m128i*)dest, out);
				dest += pitch;
			} while (--count);
		}
		else
		{
			VEC_SHADE_VARS();
			VEC_CALC_BLEND_ALPHA_VARS();
			VEC_SHADE_INIT4(light[3], light[2], light[1], light[0], shade_constants);
			VEC_CALC_BLEND_ALPHA_INIT(src_alpha, dest_alpha);

			do
			{
				uint32_t pix0 = bufplce[0][((local_vplce[0] >> FRACBITS) * height[0]) >> FRACBITS];
				uint32_t pix1 = bufplce[1][((local_vplce[1] >> FRACBITS) * height[1]) >> FRACBITS];
				uint32_t pix2 = bufplce[2][((local_vplce[2] >> FRACBITS) * height[2]) >> FRACBITS];
				uint32_t pix3 = bufplce[3][((local_vplce[3] >> FRACBITS) * height[3]) >> FRACBITS];

				for (int i = 0; i < 4; i++)
					local_vplce[i] += local_vince[i];

				__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);

				// Alpha factors are taken from the sample before it is shaded.
				VEC_CALC_BLEND_ALPHA(fg);
				VEC_SHADE(fg, shade_constants);

				__m128i bg = _mm_loadu_si128((const __m128i*)dest);

				__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
				__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
				__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
				__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());

				__m128i out_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8);
				__m128i out_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8);
				__m128i out = _mm_packus_epi16(out_lo, out_hi);

				_mm_storeu_si128((__m128i*)dest, out);
				dest += pitch;
			} while (--count);
		}
	}
};
|
||||
|
||||
// Draws four horizontally adjacent wall columns of 32-bit pixels at once with clamped
// additive translucency: out = (fg * fg_alpha + bg * bg_alpha) >> 8 per channel, using a
// saturating 16-bit add before the shift and unsigned saturation when packing back to bytes.
//
// The constructor snapshots the global drawer state (dc_* values and the four-entry column
// arrays). Execute() renders the rows that belong to one DrawerThread; columns are always
// point-sampled from bufplce. The per-pixel alpha factors (fg_alpha_*, bg_alpha_*) are
// produced by VEC_CALC_BLEND_ALPHA from the unshaded sample.
//
// NOTE(review): the arithmetic here is the same as in the Tmvline4AddRGBA command.
class VecCommand(Tmvline4AddClampRGBA) : public DrawerCommand
{
	BYTE * RESTRICT _dest;
	int _count;
	int _pitch;
	ShadeConstants _shade_constants;
	fixed_t _srcalpha;
	fixed_t _destalpha;
	fixed_t palookuplight[4];
	DWORD vplce[4];
	DWORD vince[4];
	const uint32 *RESTRICT bufplce[4];
	uint32_t bufheight[4];

public:
	// Captures the current global drawer state so the command can run later.
	VecCommand(Tmvline4AddClampRGBA)()
	{
		_dest = dc_dest;
		_count = dc_count;
		_pitch = dc_pitch;
		_shade_constants = dc_shade_constants;
		_srcalpha = dc_srcalpha;
		_destalpha = dc_destalpha;
		for (int i = 0; i < 4; i++)
		{
			palookuplight[i] = ::palookuplight[i];
			vplce[i] = ::vplce[i];
			vince[i] = ::vince[i];
			bufplce[i] = (const uint32 *)::bufplce[i];
			bufheight[i] = ::bufheight[i];
		}
	}

	// Renders this command's rows for the given thread.
	// _dest_y is not declared in this class; presumably it comes from DrawerCommand.
	void Execute(DrawerThread *thread) override
	{
		int count = thread->count_for_thread(_dest_y, _count);
		if (count <= 0)
			return;

		uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest);
		int pitch = _pitch * thread->num_cores;

		uint32_t height[4];
		for (int i = 0; i < 4; i++)
			height[i] = bufheight[i];

		uint32_t light[4];
		for (int i = 0; i < 4; i++)
			light[i] = LightBgra::calc_light_multiplier(palookuplight[i]);

		ShadeConstants shade_constants = _shade_constants;

		// Reduce the fixed-point alphas to 8 fractional bits.
		uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
		uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);

		// Advance past the rows skipped by this thread, then step num_cores rows at a time.
		DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
		DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
		int skipped = thread->skipped_by_thread(_dest_y);
		for (int i = 0; i < 4; i++)
		{
			local_vplce[i] += local_vince[i] * skipped;
			local_vince[i] *= thread->num_cores;
		}

		if (shade_constants.simple_shade)
		{
			VEC_SHADE_VARS();
			VEC_CALC_BLEND_ALPHA_VARS();
			VEC_SHADE_SIMPLE_INIT4(light[3], light[2], light[1], light[0]);
			VEC_CALC_BLEND_ALPHA_INIT(src_alpha, dest_alpha);

			do
			{
				uint32_t pix0 = bufplce[0][((local_vplce[0] >> FRACBITS) * height[0]) >> FRACBITS];
				uint32_t pix1 = bufplce[1][((local_vplce[1] >> FRACBITS) * height[1]) >> FRACBITS];
				uint32_t pix2 = bufplce[2][((local_vplce[2] >> FRACBITS) * height[2]) >> FRACBITS];
				uint32_t pix3 = bufplce[3][((local_vplce[3] >> FRACBITS) * height[3]) >> FRACBITS];

				for (int i = 0; i < 4; i++)
					local_vplce[i] += local_vince[i];

				__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);

				// Alpha factors are taken from the sample before it is shaded.
				VEC_CALC_BLEND_ALPHA(fg);
				VEC_SHADE_SIMPLE(fg);

				__m128i bg = _mm_loadu_si128((const __m128i*)dest);

				__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
				__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
				__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
				__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());

				__m128i out_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8);
				__m128i out_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8);
				__m128i out = _mm_packus_epi16(out_lo, out_hi);

				_mm_storeu_si128((__m128i*)dest, out);
				dest += pitch;
			} while (--count);
		}
		else
		{
			VEC_SHADE_VARS();
			VEC_CALC_BLEND_ALPHA_VARS();
			VEC_SHADE_INIT4(light[3], light[2], light[1], light[0], shade_constants);
			VEC_CALC_BLEND_ALPHA_INIT(src_alpha, dest_alpha);

			do
			{
				uint32_t pix0 = bufplce[0][((local_vplce[0] >> FRACBITS) * height[0]) >> FRACBITS];
				uint32_t pix1 = bufplce[1][((local_vplce[1] >> FRACBITS) * height[1]) >> FRACBITS];
				uint32_t pix2 = bufplce[2][((local_vplce[2] >> FRACBITS) * height[2]) >> FRACBITS];
				uint32_t pix3 = bufplce[3][((local_vplce[3] >> FRACBITS) * height[3]) >> FRACBITS];

				for (int i = 0; i < 4; i++)
					local_vplce[i] += local_vince[i];

				__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);

				// Alpha factors are taken from the sample before it is shaded.
				VEC_CALC_BLEND_ALPHA(fg);
				VEC_SHADE(fg, shade_constants);

				__m128i bg = _mm_loadu_si128((const __m128i*)dest);

				__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
				__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
				__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
				__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());

				__m128i out_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8);
				__m128i out_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8);
				__m128i out = _mm_packus_epi16(out_lo, out_hi);

				_mm_storeu_si128((__m128i*)dest, out);
				dest += pitch;
			} while (--count);
		}
	}
};
|
||||
|
||||
class VecCommand(Tmvline4SubClampRGBA) : public DrawerCommand
|
||||
{
|
||||
BYTE * RESTRICT _dest;
|
||||
int _count;
|
||||
int _pitch;
|
||||
ShadeConstants _shade_constants;
|
||||
fixed_t _srcalpha;
|
||||
fixed_t _destalpha;
|
||||
fixed_t palookuplight[4];
|
||||
DWORD vplce[4];
|
||||
DWORD vince[4];
|
||||
const uint32 *RESTRICT bufplce[4];
|
||||
uint32_t bufheight[4];
|
||||
|
||||
public:
|
||||
VecCommand(Tmvline4SubClampRGBA)()
|
||||
{
|
||||
_dest = dc_dest;
|
||||
_count = dc_count;
|
||||
_pitch = dc_pitch;
|
||||
_shade_constants = dc_shade_constants;
|
||||
_srcalpha = dc_srcalpha;
|
||||
_destalpha = dc_destalpha;
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
palookuplight[i] = ::palookuplight[i];
|
||||
vplce[i] = ::vplce[i];
|
||||
vince[i] = ::vince[i];
|
||||
bufplce[i] = (const uint32 *)::bufplce[i];
|
||||
bufheight[i] = ::bufheight[i];
|
||||
}
|
||||
}
|
||||
|
||||
// Draws this thread's share of four adjacent wall columns with a
// subtractive, clamped blend.
//
// Each loop iteration fetches one texel per column with nearest sampling,
// shades the four texels, and writes
//     out = (bg * bg_alpha - fg * fg_alpha) >> 8
// per channel back to the 16 bytes at dest. The subtraction saturates at
// zero in 16-bit lanes before the result is packed back to 8 bits.
//
// thread supplies the row count, the starting destination pointer and the
// number of rows skipped for this thread; the loop then advances by
// num_cores rows per iteration.
//
// NOTE(review): _dest_y is not declared in the visible part of the class;
// presumably it is inherited from the base class - confirm.
// NOTE(review): fg_alpha_hi/lo and bg_alpha_hi/lo are not declared here;
// presumably VEC_CALC_BLEND_ALPHA introduces them - confirm against the
// macro definition.
void Execute(DrawerThread *thread) override
{
	int count = thread->count_for_thread(_dest_y, _count);
	if (count <= 0)
		return;

	uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest);
	// dest is stepped by _pitch * num_cores pixels after every iteration.
	int pitch = _pitch * thread->num_cores;

	uint32_t height[4] = { bufheight[0], bufheight[1], bufheight[2], bufheight[3] };

	uint32_t light[4];
	for (int i = 0; i < 4; i++)
		light[i] = LightBgra::calc_light_multiplier(palookuplight[i]);

	ShadeConstants shade_constants = _shade_constants;

	// Drop (FRACBITS - 8) low bits from the fixed-point alpha factors.
	uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
	uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);

	// Start past the rows this thread skips and step num_cores rows at a time.
	DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
	DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
	int skipped = thread->skipped_by_thread(_dest_y);
	for (int i = 0; i < 4; i++)
	{
		local_vplce[i] += local_vince[i] * skipped;
		local_vince[i] *= thread->num_cores;
	}

	if (shade_constants.simple_shade)
	{
		VEC_SHADE_SIMPLE_INIT4(light[3], light[2], light[1], light[0]);
		VEC_CALC_BLEND_ALPHA_INIT(src_alpha, dest_alpha);

		do
		{
			// Nearest-texel fetch for each column, then advance its position.
			uint32_t pix[4];
			for (int i = 0; i < 4; i++)
			{
				pix[i] = bufplce[i][((local_vplce[i] >> FRACBITS) * height[i]) >> FRACBITS];
				local_vplce[i] = local_vplce[i] + local_vince[i];
			}

			__m128i fg = _mm_set_epi32(pix[3], pix[2], pix[1], pix[0]);
			VEC_CALC_BLEND_ALPHA(fg);
			VEC_SHADE_SIMPLE(fg);

			__m128i bg = _mm_loadu_si128((const __m128i*)dest);

			// Widen the 8-bit channels to 16 bits so the products fit.
			__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
			__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
			__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
			__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());

			// bg * bg_alpha - fg * fg_alpha, saturating at zero.
			__m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_hi, bg_alpha_hi), _mm_mullo_epi16(fg_hi, fg_alpha_hi)), 8);
			__m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_lo, bg_alpha_lo), _mm_mullo_epi16(fg_lo, fg_alpha_lo)), 8);
			__m128i out = _mm_packus_epi16(out_lo, out_hi);

			_mm_storeu_si128((__m128i*)dest, out);
			dest += pitch;
		} while (--count);
	}
	else
	{
		VEC_SHADE_INIT4(light[3], light[2], light[1], light[0], shade_constants);
		VEC_CALC_BLEND_ALPHA_INIT(src_alpha, dest_alpha);

		do
		{
			// Nearest-texel fetch for each column, then advance its position.
			uint32_t pix[4];
			for (int i = 0; i < 4; i++)
			{
				pix[i] = bufplce[i][((local_vplce[i] >> FRACBITS) * height[i]) >> FRACBITS];
				local_vplce[i] = local_vplce[i] + local_vince[i];
			}

			__m128i fg = _mm_set_epi32(pix[3], pix[2], pix[1], pix[0]);
			VEC_CALC_BLEND_ALPHA(fg);
			VEC_SHADE(fg, shade_constants);

			__m128i bg = _mm_loadu_si128((const __m128i*)dest);

			// Widen the 8-bit channels to 16 bits so the products fit.
			__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
			__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
			__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
			__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());

			// bg * bg_alpha - fg * fg_alpha, saturating at zero.
			__m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_hi, bg_alpha_hi), _mm_mullo_epi16(fg_hi, fg_alpha_hi)), 8);
			__m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_lo, bg_alpha_lo), _mm_mullo_epi16(fg_lo, fg_alpha_lo)), 8);
			__m128i out = _mm_packus_epi16(out_lo, out_hi);

			_mm_storeu_si128((__m128i*)dest, out);
			dest += pitch;
		} while (--count);
	}
}
|
||||
};
|
||||
|
||||
// Drawer command that renders four adjacent wall columns with a
// reverse-subtractive, clamped blend:
//     out = (fg * fg_alpha - bg * bg_alpha) >> 8
// per channel, saturating at zero. The constructor snapshots the global
// draw state; Execute() draws the rows assigned to one DrawerThread.
class VecCommand(Tmvline4RevSubClampRGBA) : public DrawerCommand
{
	// Destination, row count and pitch copied from dc_dest/dc_count/dc_pitch.
	BYTE * RESTRICT _dest;
	int _count;
	int _pitch;
	// Shading and blend factors copied from the dc_* globals.
	ShadeConstants _shade_constants;
	fixed_t _srcalpha;
	fixed_t _destalpha;
	// Per-column state, one entry for each of the four columns.
	fixed_t palookuplight[4];
	DWORD vplce[4];
	DWORD vince[4];
	const uint32 *RESTRICT bufplce[4];
	uint32_t bufheight[4];

public:
	// Captures the current global draw state (dc_* settings and the four
	// per-column wall arrays) into this command.
	VecCommand(Tmvline4RevSubClampRGBA)()
	{
		_dest = dc_dest;
		_count = dc_count;
		_pitch = dc_pitch;
		_shade_constants = dc_shade_constants;
		_srcalpha = dc_srcalpha;
		_destalpha = dc_destalpha;
		for (int i = 0; i < 4; i++)
		{
			palookuplight[i] = ::palookuplight[i];
			vplce[i] = ::vplce[i];
			vince[i] = ::vince[i];
			bufplce[i] = (const uint32 *)::bufplce[i];
			// Each column takes its own height; index i, not a fixed slot.
			bufheight[i] = ::bufheight[i];
		}
	}

	// Draws this thread's share of the four columns.
	//
	// Each loop iteration fetches one texel per column with nearest
	// sampling, shades the four texels, blends them against the 16 bytes at
	// dest and stores the result. The loop advances by num_cores rows per
	// iteration, starting after the rows this thread skips.
	//
	// NOTE(review): _dest_y is not declared in this class; presumably it is
	// inherited from DrawerCommand - confirm.
	// NOTE(review): fg_alpha_hi/lo and bg_alpha_hi/lo are not declared here;
	// presumably VEC_CALC_BLEND_ALPHA introduces them - confirm against the
	// macro definition.
	void Execute(DrawerThread *thread) override
	{
		int count = thread->count_for_thread(_dest_y, _count);
		if (count <= 0)
			return;

		uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest);
		// dest is stepped by _pitch * num_cores pixels after every iteration.
		int pitch = _pitch * thread->num_cores;

		uint32_t height[4] = { bufheight[0], bufheight[1], bufheight[2], bufheight[3] };

		uint32_t light[4];
		for (int i = 0; i < 4; i++)
			light[i] = LightBgra::calc_light_multiplier(palookuplight[i]);

		ShadeConstants shade_constants = _shade_constants;

		// Drop (FRACBITS - 8) low bits from the fixed-point alpha factors.
		uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
		uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);

		// Start past the rows this thread skips and step num_cores rows at a time.
		DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
		DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
		int skipped = thread->skipped_by_thread(_dest_y);
		for (int i = 0; i < 4; i++)
		{
			local_vplce[i] += local_vince[i] * skipped;
			local_vince[i] *= thread->num_cores;
		}

		if (shade_constants.simple_shade)
		{
			VEC_SHADE_SIMPLE_INIT4(light[3], light[2], light[1], light[0]);
			VEC_CALC_BLEND_ALPHA_INIT(src_alpha, dest_alpha);

			do
			{
				// Nearest-texel fetch for each column, then advance its position.
				uint32_t pix[4];
				for (int i = 0; i < 4; i++)
				{
					pix[i] = bufplce[i][((local_vplce[i] >> FRACBITS) * height[i]) >> FRACBITS];
					local_vplce[i] = local_vplce[i] + local_vince[i];
				}

				__m128i fg = _mm_set_epi32(pix[3], pix[2], pix[1], pix[0]);
				VEC_CALC_BLEND_ALPHA(fg);
				VEC_SHADE_SIMPLE(fg);

				__m128i bg = _mm_loadu_si128((const __m128i*)dest);

				// Widen the 8-bit channels to 16 bits so the products fit.
				__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
				__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
				__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
				__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());

				// fg * fg_alpha - bg * bg_alpha, saturating at zero.
				__m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8);
				__m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8);
				__m128i out = _mm_packus_epi16(out_lo, out_hi);

				_mm_storeu_si128((__m128i*)dest, out);
				dest += pitch;
			} while (--count);
		}
		else
		{
			VEC_SHADE_INIT4(light[3], light[2], light[1], light[0], shade_constants);
			VEC_CALC_BLEND_ALPHA_INIT(src_alpha, dest_alpha);

			do
			{
				// Nearest-texel fetch for each column, then advance its position.
				uint32_t pix[4];
				for (int i = 0; i < 4; i++)
				{
					pix[i] = bufplce[i][((local_vplce[i] >> FRACBITS) * height[i]) >> FRACBITS];
					local_vplce[i] = local_vplce[i] + local_vince[i];
				}

				__m128i fg = _mm_set_epi32(pix[3], pix[2], pix[1], pix[0]);
				VEC_CALC_BLEND_ALPHA(fg);
				VEC_SHADE(fg, shade_constants);

				__m128i bg = _mm_loadu_si128((const __m128i*)dest);

				// Widen the 8-bit channels to 16 bits so the products fit.
				__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
				__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
				__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
				__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());

				// fg * fg_alpha - bg * bg_alpha, saturating at zero.
				__m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8);
				__m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8);
				__m128i out = _mm_packus_epi16(out_lo, out_hi);

				_mm_storeu_si128((__m128i*)dest, out);
				dest += pitch;
			} while (--count);
		}
	}
};
|
||||
|
|
|
@ -53,8 +53,13 @@ extern unsigned int *horizspan[4];
|
|||
|
||||
#ifndef NO_SSE
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning(disable: 4101) // warning C4101: unreferenced local variable
|
||||
#endif
|
||||
|
||||
// Generate SSE drawers:
|
||||
#define VecCommand(name) name##_SSE_Command
|
||||
#define VEC_SHADE_VARS SSE_SHADE_VARS
|
||||
#define VEC_SHADE_SIMPLE_INIT SSE_SHADE_SIMPLE_INIT
|
||||
#define VEC_SHADE_SIMPLE_INIT4 SSE_SHADE_SIMPLE_INIT4
|
||||
#define VEC_SHADE_SIMPLE SSE_SHADE_SIMPLE
|
||||
|
|
|
@ -60,6 +60,7 @@ public:
|
|||
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
VEC_SHADE_VARS();
|
||||
VEC_SHADE_SIMPLE_INIT(light);
|
||||
|
||||
if (count & 1) {
|
||||
|
@ -110,6 +111,7 @@ public:
|
|||
}
|
||||
else
|
||||
{
|
||||
VEC_SHADE_VARS();
|
||||
VEC_SHADE_INIT(light, shade_constants);
|
||||
|
||||
if (count & 1) {
|
||||
|
@ -218,6 +220,7 @@ public:
|
|||
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
VEC_SHADE_VARS();
|
||||
VEC_SHADE_SIMPLE_INIT(light);
|
||||
|
||||
__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
|
||||
|
@ -254,6 +257,7 @@ public:
|
|||
}
|
||||
else
|
||||
{
|
||||
VEC_SHADE_VARS();
|
||||
VEC_SHADE_INIT(light, shade_constants);
|
||||
|
||||
__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
|
||||
|
@ -421,6 +425,7 @@ public:
|
|||
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
VEC_SHADE_VARS();
|
||||
VEC_SHADE_SIMPLE_INIT(light);
|
||||
|
||||
__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
|
||||
|
@ -457,6 +462,7 @@ public:
|
|||
}
|
||||
else
|
||||
{
|
||||
VEC_SHADE_VARS();
|
||||
VEC_SHADE_INIT(light, shade_constants);
|
||||
|
||||
__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
|
||||
|
@ -547,6 +553,7 @@ public:
|
|||
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
VEC_SHADE_VARS();
|
||||
VEC_SHADE_SIMPLE_INIT(light);
|
||||
|
||||
__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
|
||||
|
@ -583,6 +590,7 @@ public:
|
|||
}
|
||||
else
|
||||
{
|
||||
VEC_SHADE_VARS();
|
||||
VEC_SHADE_INIT(light, shade_constants);
|
||||
|
||||
__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
|
||||
|
@ -673,6 +681,7 @@ public:
|
|||
|
||||
if (shade_constants.simple_shade)
|
||||
{
|
||||
VEC_SHADE_VARS();
|
||||
VEC_SHADE_SIMPLE_INIT(light);
|
||||
|
||||
__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
|
||||
|
@ -709,6 +718,7 @@ public:
|
|||
}
|
||||
else
|
||||
{
|
||||
VEC_SHADE_VARS();
|
||||
VEC_SHADE_INIT(light, shade_constants);
|
||||
|
||||
__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
|
||||
|
|
|
@ -1146,15 +1146,16 @@ WallscanSampler::WallscanSampler(int y1, float swal, double yrepeat, fixed_t xof
|
|||
int mip_height = texture->GetHeight();
|
||||
if (r_mipmap && texture->Mipmapped())
|
||||
{
|
||||
uint32_t xpos = (uint32_t)((((uint64_t)xoffset) << FRACBITS) / mip_width);
|
||||
int level = (int)MAX(magnitude - 1.0, 0.0);
|
||||
while (level != 0)
|
||||
{
|
||||
mipmap_offset += mip_width * mip_height;
|
||||
xoffset >>= 1;
|
||||
level >>= 1;
|
||||
mip_width = MAX(mip_width >> 1, 1);
|
||||
mip_height = MAX(mip_height >> 1, 1);
|
||||
}
|
||||
xoffset = (xpos >> FRACBITS) * mip_width;
|
||||
}
|
||||
|
||||
const uint32_t *pixels = texture->GetPixelsBgra() + mipmap_offset;
|
||||
|
|
Loading…
Reference in a new issue