Throwing templates at the code redundancy problem in drawers

This commit is contained in:
Magnus Norddahl 2016-06-26 21:23:32 +02:00
parent 928e8e0d43
commit 6c037fa249
6 changed files with 551 additions and 1108 deletions

View file

@ -67,8 +67,13 @@ CVAR(Bool, r_mipmap, true, CVAR_ARCHIVE | CVAR_GLOBALCONFIG);
#ifndef NO_SSE #ifndef NO_SSE
#ifdef _MSC_VER
#pragma warning(disable: 4101) // warning C4101: unreferenced local variable
#endif
// Generate SSE drawers: // Generate SSE drawers:
#define VecCommand(name) name##_SSE_Command #define VecCommand(name) name##_SSE_Command
#define VEC_SHADE_VARS SSE_SHADE_VARS
#define VEC_SHADE_SIMPLE_INIT SSE_SHADE_SIMPLE_INIT #define VEC_SHADE_SIMPLE_INIT SSE_SHADE_SIMPLE_INIT
#define VEC_SHADE_SIMPLE_INIT4 SSE_SHADE_SIMPLE_INIT4 #define VEC_SHADE_SIMPLE_INIT4 SSE_SHADE_SIMPLE_INIT4
#define VEC_SHADE_SIMPLE SSE_SHADE_SIMPLE #define VEC_SHADE_SIMPLE SSE_SHADE_SIMPLE
@ -1552,8 +1557,446 @@ public:
return (--count) != 0; return (--count) != 0;
} }
}; };
#ifdef NO_SSE
struct NearestSampler
{
	// Point sampling (non-SSE build): reads a single texel from source buffer `index`
	// at the offset the loop iterator reports for that buffer.
	FORCEINLINE static uint32_t Sample1(DrawerWall4Command &cmd, LoopIterator &loop, int index)
	{
		const auto texel_offset = loop.sample_index(index);
		return cmd._bufplce[index][texel_offset];
	}
};
struct LinearSampler
{
	// Filtered sampling (non-SSE build): forwards the two source buffers for `index`
	// together with the horizontal fraction and the iterator's vertical state to
	// SampleBgra::sample_bilinear and returns its result.
	FORCEINLINE static uint32_t Sample1(DrawerWall4Command &cmd, LoopIterator &loop, int index)
	{
		return SampleBgra::sample_bilinear(
			cmd._bufplce[index],
			cmd._bufplce2[index],
			cmd._buftexturefracx[index],
			loop.vplce[index],
			loop.half[index],
			loop.height[index]);
	}
};
#else
struct NearestSampler
{
	// Point sampling (SSE build): fetches one texel from each of the four source buffers
	// and packs them into a single vector. _mm_set_epi32 takes its arguments from the
	// highest lane to the lowest, so buffer 0 ends up in the lowest 32 bits.
	FORCEINLINE static __m128i Sample4(DrawerWall4Command &cmd, LoopIterator &loop)
	{
		const auto texel0 = cmd._bufplce[0][loop.sample_index(0)];
		const auto texel1 = cmd._bufplce[1][loop.sample_index(1)];
		const auto texel2 = cmd._bufplce[2][loop.sample_index(2)];
		const auto texel3 = cmd._bufplce[3][loop.sample_index(3)];
		return _mm_set_epi32(texel3, texel2, texel1, texel0);
	}
};
// Filtered sampling (SSE build): produces four source pixels in one __m128i.
struct LinearSampler
{
// Returns the value VEC_SAMPLE_BILINEAR4_COLUMN writes into `fg`, given the command's
// two sets of source buffers, the horizontal fractions and the iterator's vertical state.
FORCEINLINE static __m128i Sample4(DrawerWall4Command &cmd, LoopIterator &loop)
{
// Filled in by the macro below; it takes the destination variable as its first argument.
__m128i fg;
VEC_SAMPLE_BILINEAR4_COLUMN(fg, cmd._bufplce, cmd._bufplce2, cmd._buftexturefracx, loop.vplce, loop.half, loop.height);
return fg;
}
};
#endif
#ifdef NO_SSE
// Non-SSE blend mode: shades each sampled pixel and writes it to the destination
// through BlendBgra::copy. Sampler must provide Sample1(cmd, loop, index).
template<typename Sampler>
struct Copy
{
	// Nothing to precompute for the scalar path.
	Copy(DrawerWall4Command &cmd, LoopIterator &loop)
	{
	}

	// Handles one loop step for all four destination pixels.
	void Blend(DrawerWall4Command &cmd, LoopIterator &loop)
	{
		int column = 0;
		while (column < 4)
		{
			const uint32_t texel = Sampler::Sample1(cmd, loop, column);
			const uint32_t shaded = LightBgra::shade_bgra(texel, cmd._light[column], cmd._shade_constants);
			loop.dest[column] = BlendBgra::copy(shaded);
			++column;
		}
	}
};
// Non-SSE blend mode: shades each sampled pixel and combines it with the existing
// destination pixel through BlendBgra::alpha_blend.
// Sampler must provide Sample1(cmd, loop, index).
template<typename Sampler>
struct Mask
{
	// Nothing to precompute for the scalar path.
	Mask(DrawerWall4Command &cmd, LoopIterator &loop)
	{
	}

	// Handles one loop step for all four destination pixels.
	void Blend(DrawerWall4Command &cmd, LoopIterator &loop)
	{
		int column = 0;
		while (column < 4)
		{
			const uint32_t texel = Sampler::Sample1(cmd, loop, column);
			const uint32_t shaded = LightBgra::shade_bgra(texel, cmd._light[column], cmd._shade_constants);
			loop.dest[column] = BlendBgra::alpha_blend(shaded, loop.dest[column]);
			++column;
		}
	}
};
// Non-SSE blend mode: shades each sampled pixel and combines it with the destination
// through BlendBgra::add, using the command's source alpha and a per-pixel background
// alpha from calc_blend_bgalpha. Sampler must provide Sample1(cmd, loop, index).
template<typename Sampler>
struct TMaskAdd
{
	// Nothing to precompute for the scalar path.
	TMaskAdd(DrawerWall4Command &cmd, LoopIterator &loop)
	{
	}

	// Handles one loop step for all four destination pixels.
	void Blend(DrawerWall4Command &cmd, LoopIterator &loop)
	{
		int column = 0;
		while (column < 4)
		{
			const uint32_t texel = Sampler::Sample1(cmd, loop, column);
			const uint32_t shaded = LightBgra::shade_bgra(texel, cmd._light[column], cmd._shade_constants);
			const uint32_t bg_alpha = calc_blend_bgalpha(shaded, cmd._destalpha);
			loop.dest[column] = BlendBgra::add(shaded, loop.dest[column], cmd._srcalpha, bg_alpha);
			++column;
		}
	}
};
// Non-SSE blend mode: shades each sampled pixel and combines it with the destination
// through BlendBgra::sub, using the command's source alpha and a per-pixel background
// alpha from calc_blend_bgalpha. Sampler must provide Sample1(cmd, loop, index).
template<typename Sampler>
struct TMaskSub
{
	// Nothing to precompute for the scalar path.
	TMaskSub(DrawerWall4Command &cmd, LoopIterator &loop)
	{
	}

	// Handles one loop step for all four destination pixels.
	void Blend(DrawerWall4Command &cmd, LoopIterator &loop)
	{
		int column = 0;
		while (column < 4)
		{
			const uint32_t texel = Sampler::Sample1(cmd, loop, column);
			const uint32_t shaded = LightBgra::shade_bgra(texel, cmd._light[column], cmd._shade_constants);
			const uint32_t bg_alpha = calc_blend_bgalpha(shaded, cmd._destalpha);
			loop.dest[column] = BlendBgra::sub(shaded, loop.dest[column], cmd._srcalpha, bg_alpha);
			++column;
		}
	}
};
// Non-SSE blend mode: shades each sampled pixel and combines it with the destination
// through BlendBgra::revsub, using the command's source alpha and a per-pixel background
// alpha from calc_blend_bgalpha. Sampler must provide Sample1(cmd, loop, index).
template<typename Sampler>
struct TMaskRevSub
{
	// Nothing to precompute for the scalar path.
	TMaskRevSub(DrawerWall4Command &cmd, LoopIterator &loop)
	{
	}

	// Handles one loop step for all four destination pixels.
	void Blend(DrawerWall4Command &cmd, LoopIterator &loop)
	{
		int column = 0;
		while (column < 4)
		{
			const uint32_t texel = Sampler::Sample1(cmd, loop, column);
			const uint32_t shaded = LightBgra::shade_bgra(texel, cmd._light[column], cmd._shade_constants);
			const uint32_t bg_alpha = calc_blend_bgalpha(shaded, cmd._destalpha);
			loop.dest[column] = BlendBgra::revsub(shaded, loop.dest[column], cmd._srcalpha, bg_alpha);
			++column;
		}
	}
};
// Blend-mode aliases for the non-SSE build. The scalar path always shades through
// LightBgra::shade_bgra, so every "...Simple" alias names the same type as its
// general counterpart.

// Opaque copy
using CopyNearestSimple = Copy<NearestSampler>;
using CopyLinearSimple = Copy<LinearSampler>;
using CopyNearest = Copy<NearestSampler>;
using CopyLinear = Copy<LinearSampler>;

// Alpha-masked copy
using MaskNearestSimple = Mask<NearestSampler>;
using MaskLinearSimple = Mask<LinearSampler>;
using MaskNearest = Mask<NearestSampler>;
using MaskLinear = Mask<LinearSampler>;

// Additive
using TMaskAddNearestSimple = TMaskAdd<NearestSampler>;
using TMaskAddLinearSimple = TMaskAdd<LinearSampler>;
using TMaskAddNearest = TMaskAdd<NearestSampler>;
using TMaskAddLinear = TMaskAdd<LinearSampler>;

// Subtractive
using TMaskSubNearestSimple = TMaskSub<NearestSampler>;
using TMaskSubLinearSimple = TMaskSub<LinearSampler>;
using TMaskSubNearest = TMaskSub<NearestSampler>;
using TMaskSubLinear = TMaskSub<LinearSampler>;

// Reverse subtractive
using TMaskRevSubNearestSimple = TMaskRevSub<NearestSampler>;
using TMaskRevSubLinearSimple = TMaskRevSub<LinearSampler>;
using TMaskRevSubNearest = TMaskRevSub<NearestSampler>;
using TMaskRevSubLinear = TMaskRevSub<LinearSampler>;
#else
// SSE blend mode: simple-shades four sampled pixels and overwrites the destination.
// Sampler must provide Sample4(cmd, loop) returning the four source pixels in one __m128i.
template<typename Sampler>
struct CopySimple
{
// Member storage for the shade constants (VEC_SHADE_VARS maps to SSE_SHADE_VARS in this section).
VEC_SHADE_VARS();
// Loads the four per-pixel light levels into the shade constants once, at construction.
CopySimple(DrawerWall4Command &cmd, LoopIterator &loop)
{
VEC_SHADE_SIMPLE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0]);
}
// One loop step: sample, shade in place, then write 16 bytes to loop.dest (unaligned store).
void Blend(DrawerWall4Command &cmd, LoopIterator &loop)
{
__m128i fg = Sampler::Sample4(cmd, loop);
VEC_SHADE_SIMPLE(fg);
_mm_storeu_si128((__m128i*)loop.dest, fg);
}
};
// SSE blend mode: shades four sampled pixels using the command's full shade constants
// and overwrites the destination.
// Sampler must provide Sample4(cmd, loop) returning the four source pixels in one __m128i.
template<typename Sampler>
struct Copy
{
// Member storage for the shade constants (VEC_SHADE_VARS maps to SSE_SHADE_VARS in this section).
VEC_SHADE_VARS();
// Initialises the shade constants from the four light levels and cmd._shade_constants.
Copy(DrawerWall4Command &cmd, LoopIterator &loop)
{
VEC_SHADE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0], cmd._shade_constants);
}
// One loop step: sample, shade in place, then write 16 bytes to loop.dest (unaligned store).
void Blend(DrawerWall4Command &cmd, LoopIterator &loop)
{
__m128i fg = Sampler::Sample4(cmd, loop);
VEC_SHADE(fg, cmd._shade_constants);
_mm_storeu_si128((__m128i*)loop.dest, fg);
}
};
// SSE blend mode: simple-shades four sampled pixels, runs VEC_ALPHA_BLEND against the
// current destination pixels and stores the result.
// Sampler must provide Sample4(cmd, loop) returning the four source pixels in one __m128i.
template<typename Sampler>
struct MaskSimple
{
// Member storage for the shade constants (VEC_SHADE_VARS maps to SSE_SHADE_VARS in this section).
VEC_SHADE_VARS();
// Loads the four per-pixel light levels into the shade constants once, at construction.
MaskSimple(DrawerWall4Command &cmd, LoopIterator &loop)
{
VEC_SHADE_SIMPLE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0]);
}
// One loop step over four pixels.
void Blend(DrawerWall4Command &cmd, LoopIterator &loop)
{
__m128i fg = Sampler::Sample4(cmd, loop);
// Existing destination pixels (unaligned load).
__m128i bg = _mm_loadu_si128((const __m128i*)loop.dest);
VEC_SHADE_SIMPLE(fg);
// The blended result is expected back in fg, since fg is what gets stored below.
// NOTE(review): macro body is not visible here; presumably blends by fg's alpha, matching the scalar BlendBgra::alpha_blend path.
VEC_ALPHA_BLEND(fg, bg);
_mm_storeu_si128((__m128i*)loop.dest, fg);
}
};
// SSE blend mode: shades four sampled pixels using the command's full shade constants,
// runs VEC_ALPHA_BLEND against the current destination pixels and stores the result.
// Sampler must provide Sample4(cmd, loop) returning the four source pixels in one __m128i.
template<typename Sampler>
struct Mask
{
// Member storage for the shade constants (VEC_SHADE_VARS maps to SSE_SHADE_VARS in this section).
VEC_SHADE_VARS();
// Initialises the shade constants from the four light levels and cmd._shade_constants.
Mask(DrawerWall4Command &cmd, LoopIterator &loop)
{
VEC_SHADE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0], cmd._shade_constants);
}
// One loop step over four pixels.
void Blend(DrawerWall4Command &cmd, LoopIterator &loop)
{
__m128i fg = Sampler::Sample4(cmd, loop);
// Existing destination pixels (unaligned load).
__m128i bg = _mm_loadu_si128((const __m128i*)loop.dest);
VEC_SHADE(fg, cmd._shade_constants);
// The blended result is expected back in fg, since fg is what gets stored below.
// NOTE(review): macro body is not visible here; presumably blends by fg's alpha, matching the scalar BlendBgra::alpha_blend path.
VEC_ALPHA_BLEND(fg, bg);
_mm_storeu_si128((__m128i*)loop.dest, fg);
}
};
// SSE additive blend mode with simple shading:
// dest = (fg * fg_alpha + bg * bg_alpha) >> 8 per channel, for four pixels at a time.
// Sampler must provide Sample4(cmd, loop) returning the four source pixels in one __m128i.
template<typename Sampler>
struct TMaskAddSimple
{
// Member storage for the shade constants and for the blend-alpha constants.
VEC_SHADE_VARS();
VEC_CALC_BLEND_ALPHA_VARS();
// Precomputes the light levels and the source/destination alpha constants once.
TMaskAddSimple(DrawerWall4Command &cmd, LoopIterator &loop)
{
VEC_SHADE_SIMPLE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0]);
VEC_CALC_BLEND_ALPHA_INIT(cmd._srcalpha, cmd._destalpha);
}
// One loop step over four pixels.
void Blend(DrawerWall4Command &cmd, LoopIterator &loop)
{
__m128i fg = Sampler::Sample4(cmd, loop);
__m128i bg = _mm_loadu_si128((const __m128i*)loop.dest);
// Supplies fg_alpha_hi/lo and bg_alpha_hi/lo used below; invoked before shading modifies fg.
VEC_CALC_BLEND_ALPHA(fg);
VEC_SHADE_SIMPLE(fg);
// Widen the 8-bit channels to 16 bits: "lo" holds the lower two pixels, "hi" the upper two.
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
// Weighted sum with unsigned saturation, then drop the 8 fractional bits.
__m128i out_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8);
__m128i out_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8);
// Narrow back to 8-bit channels (unsigned saturation) and store unaligned.
__m128i out = _mm_packus_epi16(out_lo, out_hi);
_mm_storeu_si128((__m128i*)loop.dest, out);
}
};
template<typename Sampler>
struct TMaskAdd
{
VEC_SHADE_VARS();
VEC_CALC_BLEND_ALPHA_VARS();
TMaskAdd(DrawerWall4Command &cmd, LoopIterator &loop)
{
VEC_SHADE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0], cmd._shade_constants);
VEC_CALC_BLEND_ALPHA_INIT(cmd._srcalpha, cmd._destalpha);
}
void Blend(DrawerWall4Command &cmd, LoopIterator &loop)
{
__m128i fg = Sampler::Sample4(cmd, loop);
__m128i bg = _mm_loadu_si128((const __m128i*)loop.dest);
VEC_CALC_BLEND_ALPHA(fg);
VEC_SHADE_SIMPLE(fg);
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
__m128i out_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8);
__m128i out_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8);
__m128i out = _mm_packus_epi16(out_lo, out_hi);
_mm_storeu_si128((__m128i*)loop.dest, out);
}
};
// SSE subtractive blend mode with simple shading:
// dest = (bg * bg_alpha - fg * fg_alpha) >> 8 per channel (saturating at 0), four pixels at a time.
// Sampler must provide Sample4(cmd, loop) returning the four source pixels in one __m128i.
template<typename Sampler>
struct TMaskSubSimple
{
// Member storage for the shade constants and for the blend-alpha constants.
VEC_SHADE_VARS();
VEC_CALC_BLEND_ALPHA_VARS();
// Precomputes the light levels and the source/destination alpha constants once.
TMaskSubSimple(DrawerWall4Command &cmd, LoopIterator &loop)
{
VEC_SHADE_SIMPLE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0]);
VEC_CALC_BLEND_ALPHA_INIT(cmd._srcalpha, cmd._destalpha);
}
// One loop step over four pixels.
void Blend(DrawerWall4Command &cmd, LoopIterator &loop)
{
__m128i fg = Sampler::Sample4(cmd, loop);
__m128i bg = _mm_loadu_si128((const __m128i*)loop.dest);
// Supplies fg_alpha_hi/lo and bg_alpha_hi/lo used below; invoked before shading modifies fg.
VEC_CALC_BLEND_ALPHA(fg);
VEC_SHADE_SIMPLE(fg);
// Widen the 8-bit channels to 16 bits: "lo" holds the lower two pixels, "hi" the upper two.
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
// Background minus foreground with unsigned saturation, then drop the 8 fractional bits.
__m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_hi, bg_alpha_hi), _mm_mullo_epi16(fg_hi, fg_alpha_hi)), 8);
__m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_lo, bg_alpha_lo), _mm_mullo_epi16(fg_lo, fg_alpha_lo)), 8);
// Narrow back to 8-bit channels (unsigned saturation) and store unaligned.
__m128i out = _mm_packus_epi16(out_lo, out_hi);
_mm_storeu_si128((__m128i*)loop.dest, out);
}
};
template<typename Sampler>
struct TMaskSub
{
VEC_SHADE_VARS();
VEC_CALC_BLEND_ALPHA_VARS();
TMaskSub(DrawerWall4Command &cmd, LoopIterator &loop)
{
VEC_SHADE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0], cmd._shade_constants);
VEC_CALC_BLEND_ALPHA_INIT(cmd._srcalpha, cmd._destalpha);
}
void Blend(DrawerWall4Command &cmd, LoopIterator &loop)
{
__m128i fg = Sampler::Sample4(cmd, loop);
__m128i bg = _mm_loadu_si128((const __m128i*)loop.dest);
VEC_CALC_BLEND_ALPHA(fg);
VEC_SHADE_SIMPLE(fg);
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
__m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_hi, bg_alpha_hi), _mm_mullo_epi16(fg_hi, fg_alpha_hi)), 8);
__m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_lo, bg_alpha_lo), _mm_mullo_epi16(fg_lo, fg_alpha_lo)), 8);
__m128i out = _mm_packus_epi16(out_lo, out_hi);
_mm_storeu_si128((__m128i*)loop.dest, out);
}
};
// SSE reverse-subtractive blend mode with simple shading:
// dest = (fg * fg_alpha - bg * bg_alpha) >> 8 per channel (saturating at 0), four pixels at a time.
// Sampler must provide Sample4(cmd, loop) returning the four source pixels in one __m128i.
template<typename Sampler>
struct TMaskRevSubSimple
{
// Member storage for the shade constants and for the blend-alpha constants.
VEC_SHADE_VARS();
VEC_CALC_BLEND_ALPHA_VARS();
// Precomputes the light levels and the source/destination alpha constants once.
TMaskRevSubSimple(DrawerWall4Command &cmd, LoopIterator &loop)
{
VEC_SHADE_SIMPLE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0]);
VEC_CALC_BLEND_ALPHA_INIT(cmd._srcalpha, cmd._destalpha);
}
// One loop step over four pixels.
void Blend(DrawerWall4Command &cmd, LoopIterator &loop)
{
__m128i fg = Sampler::Sample4(cmd, loop);
__m128i bg = _mm_loadu_si128((const __m128i*)loop.dest);
// Supplies fg_alpha_hi/lo and bg_alpha_hi/lo used below; invoked before shading modifies fg.
VEC_CALC_BLEND_ALPHA(fg);
VEC_SHADE_SIMPLE(fg);
// Widen the 8-bit channels to 16 bits: "lo" holds the lower two pixels, "hi" the upper two.
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
// Foreground minus background with unsigned saturation, then drop the 8 fractional bits.
__m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8);
__m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8);
// Narrow back to 8-bit channels (unsigned saturation) and store unaligned.
__m128i out = _mm_packus_epi16(out_lo, out_hi);
_mm_storeu_si128((__m128i*)loop.dest, out);
}
};
template<typename Sampler>
struct TMaskRevSub
{
VEC_SHADE_VARS();
VEC_CALC_BLEND_ALPHA_VARS();
TMaskRevSub(DrawerWall4Command &cmd, LoopIterator &loop)
{
VEC_SHADE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0], cmd._shade_constants);
VEC_CALC_BLEND_ALPHA_INIT(cmd._srcalpha, cmd._destalpha);
}
void Blend(DrawerWall4Command &cmd, LoopIterator &loop)
{
__m128i fg = Sampler::Sample4(cmd, loop);
__m128i bg = _mm_loadu_si128((const __m128i*)loop.dest);
VEC_CALC_BLEND_ALPHA(fg);
VEC_SHADE_SIMPLE(fg);
__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());
__m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8);
__m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8);
__m128i out = _mm_packus_epi16(out_lo, out_hi);
_mm_storeu_si128((__m128i*)loop.dest, out);
}
};
// Blend-mode aliases for the SSE build. Each mode exists in a "...Simple" flavour
// (simple shade) and a general flavour, for both nearest and linear sampling.

// Opaque copy
using CopyNearestSimple = CopySimple<NearestSampler>;
using CopyLinearSimple = CopySimple<LinearSampler>;
using CopyNearest = Copy<NearestSampler>;
using CopyLinear = Copy<LinearSampler>;

// Alpha-masked copy
using MaskNearestSimple = MaskSimple<NearestSampler>;
using MaskLinearSimple = MaskSimple<LinearSampler>;
using MaskNearest = Mask<NearestSampler>;
using MaskLinear = Mask<LinearSampler>;

// Additive
using TMaskAddNearestSimple = TMaskAddSimple<NearestSampler>;
using TMaskAddLinearSimple = TMaskAddSimple<LinearSampler>;
using TMaskAddNearest = TMaskAdd<NearestSampler>;
using TMaskAddLinear = TMaskAdd<LinearSampler>;

// Subtractive
using TMaskSubNearestSimple = TMaskSubSimple<NearestSampler>;
using TMaskSubLinearSimple = TMaskSubSimple<LinearSampler>;
using TMaskSubNearest = TMaskSub<NearestSampler>;
using TMaskSubLinear = TMaskSub<LinearSampler>;

// Reverse subtractive
using TMaskRevSubNearestSimple = TMaskRevSubSimple<NearestSampler>;
using TMaskRevSubLinearSimple = TMaskRevSubSimple<LinearSampler>;
using TMaskRevSubNearest = TMaskRevSub<NearestSampler>;
using TMaskRevSubLinear = TMaskRevSub<LinearSampler>;
#endif
}; };
// Concrete four-column wall drawer commands: each pairs DrawerWall4Command with one of
// its nested blend modes. Every family lists NearestSimple, Nearest, LinearSimple, Linear.

// vlinec4
using Vlinec4NearestSimpleRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::CopyNearestSimple>;
using Vlinec4NearestRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::CopyNearest>;
using Vlinec4LinearSimpleRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::CopyLinearSimple>;
using Vlinec4LinearRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::CopyLinear>;

// mvlinec4
using Mvlinec4NearestSimpleRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::MaskNearestSimple>;
using Mvlinec4NearestRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::MaskNearest>;
using Mvlinec4LinearSimpleRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::MaskLinearSimple>;
using Mvlinec4LinearRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::MaskLinear>;

// tmvline4 add
using Tmvline4AddNearestSimpleRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::TMaskAddNearestSimple>;
using Tmvline4AddNearestRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::TMaskAddNearest>;
using Tmvline4AddLinearSimpleRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::TMaskAddLinearSimple>;
using Tmvline4AddLinearRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::TMaskAddLinear>;

// tmvline4 addclamp (uses the same TMaskAdd blend modes as the add family above)
using Tmvline4AddClampNearestSimpleRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::TMaskAddNearestSimple>;
using Tmvline4AddClampNearestRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::TMaskAddNearest>;
using Tmvline4AddClampLinearSimpleRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::TMaskAddLinearSimple>;
using Tmvline4AddClampLinearRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::TMaskAddLinear>;

// tmvline4 subclamp
using Tmvline4SubClampNearestSimpleRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::TMaskSubNearestSimple>;
using Tmvline4SubClampNearestRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::TMaskSubNearest>;
using Tmvline4SubClampLinearSimpleRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::TMaskSubLinearSimple>;
using Tmvline4SubClampLinearRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::TMaskSubLinear>;

// tmvline4 revsubclamp
using Tmvline4RevSubClampNearestSimpleRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::TMaskRevSubNearestSimple>;
using Tmvline4RevSubClampNearestRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::TMaskRevSubNearest>;
using Tmvline4RevSubClampLinearSimpleRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::TMaskRevSubLinearSimple>;
using Tmvline4RevSubClampLinearRGBACommand = DrawerBlendCommand<DrawerWall4Command, DrawerWall4Command::TMaskRevSubLinear>;
class Vlinec1RGBACommand : public DrawerWall1Command class Vlinec1RGBACommand : public DrawerWall1Command
{ {
public: public:
@ -1581,39 +2024,6 @@ public:
} }
}; };
// Scalar four-column wall drawer: shades each sampled pixel and writes it through
// BlendBgra::copy. Samples directly when no second source buffer is set, otherwise
// goes through SampleBgra::sample_bilinear.
class Vlinec4RGBACommand : public DrawerWall4Command
{
public:
	void Execute(DrawerThread *thread) override
	{
		LoopIterator loop(this, thread);
		if (!loop)
			return;

		const bool nearest = (_bufplce2[0] == nullptr);
		if (nearest)
		{
			do
			{
				for (int col = 0; col < 4; ++col)
				{
					const auto texel = _bufplce[col][loop.sample_index(col)];
					const uint32_t fg = LightBgra::shade_bgra(texel, _light[col], _shade_constants);
					loop.dest[col] = BlendBgra::copy(fg);
				}
			} while (loop.next());
			return;
		}

		do
		{
			for (int col = 0; col < 4; ++col)
			{
				// NOTE(review): the vertical coordinate passed here is loop.sample_index(col),
				// whereas LinearSampler passes loop.vplce[...] - confirm which is intended.
				const auto texel = SampleBgra::sample_bilinear(_bufplce[col], _bufplce2[col], _buftexturefracx[col], loop.sample_index(col), loop.half[col], loop.height[col]);
				const uint32_t fg = LightBgra::shade_bgra(texel, _light[col], _shade_constants);
				loop.dest[col] = BlendBgra::copy(fg);
			}
		} while (loop.next());
	}
};
class Mvlinec1RGBACommand : public DrawerWall1Command class Mvlinec1RGBACommand : public DrawerWall1Command
{ {
public: public:
@ -1641,39 +2051,6 @@ public:
} }
}; };
// Scalar four-column masked wall drawer: shades each sampled pixel and combines it with
// the destination through BlendBgra::alpha_blend. Samples directly when no second source
// buffer is set, otherwise goes through SampleBgra::sample_bilinear.
class Mvlinec4RGBACommand : public DrawerWall4Command
{
public:
	void Execute(DrawerThread *thread) override
	{
		LoopIterator loop(this, thread);
		if (!loop)
			return;

		const bool nearest = (_bufplce2[0] == nullptr);
		if (nearest)
		{
			do
			{
				for (int col = 0; col < 4; ++col)
				{
					const auto texel = _bufplce[col][loop.sample_index(col)];
					const uint32_t fg = LightBgra::shade_bgra(texel, _light[col], _shade_constants);
					loop.dest[col] = BlendBgra::alpha_blend(fg, loop.dest[col]);
				}
			} while (loop.next());
			return;
		}

		do
		{
			for (int col = 0; col < 4; ++col)
			{
				// NOTE(review): the vertical coordinate passed here is loop.sample_index(col),
				// whereas LinearSampler passes loop.vplce[...] - confirm which is intended.
				const auto texel = SampleBgra::sample_bilinear(_bufplce[col], _bufplce2[col], _buftexturefracx[col], loop.sample_index(col), loop.half[col], loop.height[col]);
				const uint32_t fg = LightBgra::shade_bgra(texel, _light[col], _shade_constants);
				loop.dest[col] = BlendBgra::alpha_blend(fg, loop.dest[col]);
			}
		} while (loop.next());
	}
};
class Tmvline1AddRGBACommand : public DrawerWall1Command class Tmvline1AddRGBACommand : public DrawerWall1Command
{ {
public: public:
@ -1689,24 +2066,6 @@ public:
} }
}; };
// Scalar four-column additive wall drawer: shades each directly sampled pixel and
// combines it with the destination through BlendBgra::add.
class Tmvline4AddRGBACommand : public DrawerWall4Command
{
public:
	void Execute(DrawerThread *thread) override
	{
		LoopIterator loop(this, thread);
		if (!loop)
			return;

		do
		{
			for (int col = 0; col < 4; ++col)
			{
				const uint32_t fg = LightBgra::shade_bgra(_bufplce[col][loop.sample_index(col)], _light[col], _shade_constants);
				const uint32_t bg_alpha = calc_blend_bgalpha(fg, _destalpha);
				loop.dest[col] = BlendBgra::add(fg, loop.dest[col], _srcalpha, bg_alpha);
			}
		} while (loop.next());
	}
};
class Tmvline1AddClampRGBACommand : public DrawerWall1Command class Tmvline1AddClampRGBACommand : public DrawerWall1Command
{ {
public: public:
@ -1722,24 +2081,6 @@ public:
} }
}; };
// Scalar four-column add-clamp wall drawer: shades each directly sampled pixel and
// combines it with the destination through BlendBgra::add.
class Tmvline4AddClampRGBACommand : public DrawerWall4Command
{
public:
	void Execute(DrawerThread *thread) override
	{
		LoopIterator loop(this, thread);
		if (!loop)
			return;

		do
		{
			for (int col = 0; col < 4; ++col)
			{
				const uint32_t fg = LightBgra::shade_bgra(_bufplce[col][loop.sample_index(col)], _light[col], _shade_constants);
				const uint32_t bg_alpha = calc_blend_bgalpha(fg, _destalpha);
				loop.dest[col] = BlendBgra::add(fg, loop.dest[col], _srcalpha, bg_alpha);
			}
		} while (loop.next());
	}
};
class Tmvline1SubClampRGBACommand : public DrawerWall1Command class Tmvline1SubClampRGBACommand : public DrawerWall1Command
{ {
public: public:
@ -1755,24 +2096,6 @@ public:
} }
}; };
// Scalar four-column sub-clamp wall drawer: shades each directly sampled pixel and
// combines it with the destination through BlendBgra::sub.
class Tmvline4SubClampRGBACommand : public DrawerWall4Command
{
public:
	void Execute(DrawerThread *thread) override
	{
		LoopIterator loop(this, thread);
		if (!loop)
			return;

		do
		{
			for (int col = 0; col < 4; ++col)
			{
				const uint32_t fg = LightBgra::shade_bgra(_bufplce[col][loop.sample_index(col)], _light[col], _shade_constants);
				const uint32_t bg_alpha = calc_blend_bgalpha(fg, _destalpha);
				loop.dest[col] = BlendBgra::sub(fg, loop.dest[col], _srcalpha, bg_alpha);
			}
		} while (loop.next());
	}
};
class Tmvline1RevSubClampRGBACommand : public DrawerWall1Command class Tmvline1RevSubClampRGBACommand : public DrawerWall1Command
{ {
public: public:
@ -1788,24 +2111,6 @@ public:
} }
}; };
// Scalar four-column revsub-clamp wall drawer: shades each directly sampled pixel and
// combines it with the destination through BlendBgra::revsub.
class Tmvline4RevSubClampRGBACommand : public DrawerWall4Command
{
public:
	void Execute(DrawerThread *thread) override
	{
		LoopIterator loop(this, thread);
		if (!loop)
			return;

		do
		{
			for (int col = 0; col < 4; ++col)
			{
				const uint32_t fg = LightBgra::shade_bgra(_bufplce[col][loop.sample_index(col)], _light[col], _shade_constants);
				const uint32_t bg_alpha = calc_blend_bgalpha(fg, _destalpha);
				loop.dest[col] = BlendBgra::revsub(fg, loop.dest[col], _srcalpha, bg_alpha);
			}
		} while (loop.next());
	}
};
///////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////
class DrawFogBoundaryLineRGBACommand : public DrawerCommand class DrawFogBoundaryLineRGBACommand : public DrawerCommand
@ -2355,13 +2660,22 @@ DWORD vlinec1_rgba()
return dc_texturefrac + dc_count * dc_iscale; return dc_texturefrac + dc_count * dc_iscale;
} }
// Queues exactly one of the four command types, chosen from the current draw state:
//  - nearest vs. linear: linear is used when bufplce2[0] is set
//  - simple vs. full shading: taken from dc_shade_constants.simple_shade
template<typename NearestSimple, typename Nearest, typename LinearSimple, typename Linear>
void queue_wallcommand()
{
	const bool use_nearest = (bufplce2[0] == nullptr);
	const bool use_simple_shade = dc_shade_constants.simple_shade;

	if (use_nearest)
	{
		if (use_simple_shade)
			DrawerCommandQueue::QueueCommand<NearestSimple>();
		else
			DrawerCommandQueue::QueueCommand<Nearest>();
	}
	else
	{
		if (use_simple_shade)
			DrawerCommandQueue::QueueCommand<LinearSimple>();
		else
			DrawerCommandQueue::QueueCommand<Linear>();
	}
}
void vlinec4_rgba() void vlinec4_rgba()
{ {
#ifdef NO_SSE queue_wallcommand<Vlinec4NearestSimpleRGBACommand, Vlinec4NearestRGBACommand, Vlinec4LinearSimpleRGBACommand, Vlinec4LinearRGBACommand>();
DrawerCommandQueue::QueueCommand<Vlinec4RGBACommand>();
#else
DrawerCommandQueue::QueueCommand<Vlinec4RGBA_SSE_Command>();
#endif
for (int i = 0; i < 4; i++) for (int i = 0; i < 4; i++)
vplce[i] += vince[i] * dc_count; vplce[i] += vince[i] * dc_count;
} }
@ -2374,11 +2688,7 @@ DWORD mvlinec1_rgba()
void mvlinec4_rgba() void mvlinec4_rgba()
{ {
#ifdef NO_SSE queue_wallcommand<Mvlinec4NearestSimpleRGBACommand, Mvlinec4NearestRGBACommand, Mvlinec4LinearSimpleRGBACommand, Mvlinec4LinearRGBACommand>();
DrawerCommandQueue::QueueCommand<Mvlinec4RGBACommand>();
#else
DrawerCommandQueue::QueueCommand<Mvlinec4RGBA_SSE_Command>();
#endif
for (int i = 0; i < 4; i++) for (int i = 0; i < 4; i++)
vplce[i] += vince[i] * dc_count; vplce[i] += vince[i] * dc_count;
} }
@ -2391,11 +2701,7 @@ fixed_t tmvline1_add_rgba()
void tmvline4_add_rgba() void tmvline4_add_rgba()
{ {
#ifdef NO_SSE queue_wallcommand<Tmvline4AddNearestSimpleRGBACommand, Tmvline4AddNearestRGBACommand, Tmvline4AddLinearSimpleRGBACommand, Tmvline4AddLinearRGBACommand>();
DrawerCommandQueue::QueueCommand<Tmvline4AddRGBACommand>();
#else
DrawerCommandQueue::QueueCommand<Tmvline4AddRGBA_SSE_Command>();
#endif
for (int i = 0; i < 4; i++) for (int i = 0; i < 4; i++)
vplce[i] += vince[i] * dc_count; vplce[i] += vince[i] * dc_count;
} }
@ -2408,11 +2714,7 @@ fixed_t tmvline1_addclamp_rgba()
void tmvline4_addclamp_rgba() void tmvline4_addclamp_rgba()
{ {
#ifdef NO_SSE queue_wallcommand<Tmvline4AddClampNearestSimpleRGBACommand, Tmvline4AddClampNearestRGBACommand, Tmvline4AddClampLinearSimpleRGBACommand, Tmvline4AddClampLinearRGBACommand>();
DrawerCommandQueue::QueueCommand<Tmvline4AddClampRGBACommand>();
#else
DrawerCommandQueue::QueueCommand<Tmvline4AddClampRGBA_SSE_Command>();
#endif
for (int i = 0; i < 4; i++) for (int i = 0; i < 4; i++)
vplce[i] += vince[i] * dc_count; vplce[i] += vince[i] * dc_count;
} }
@ -2425,11 +2727,7 @@ fixed_t tmvline1_subclamp_rgba()
void tmvline4_subclamp_rgba() void tmvline4_subclamp_rgba()
{ {
#ifdef NO_SSE queue_wallcommand<Tmvline4SubClampNearestSimpleRGBACommand, Tmvline4SubClampNearestRGBACommand, Tmvline4SubClampLinearSimpleRGBACommand, Tmvline4SubClampLinearRGBACommand>();
DrawerCommandQueue::QueueCommand<Tmvline4SubClampRGBACommand>();
#else
DrawerCommandQueue::QueueCommand<Tmvline4SubClampRGBA_SSE_Command>();
#endif
for (int i = 0; i < 4; i++) for (int i = 0; i < 4; i++)
vplce[i] += vince[i] * dc_count; vplce[i] += vince[i] * dc_count;
} }
@ -2442,11 +2740,7 @@ fixed_t tmvline1_revsubclamp_rgba()
void tmvline4_revsubclamp_rgba() void tmvline4_revsubclamp_rgba()
{ {
#ifdef NO_SSE queue_wallcommand<Tmvline4RevSubClampNearestSimpleRGBACommand, Tmvline4RevSubClampNearestRGBACommand, Tmvline4RevSubClampLinearSimpleRGBACommand, Tmvline4RevSubClampLinearRGBACommand>();
DrawerCommandQueue::QueueCommand<Tmvline4RevSubClampRGBACommand>();
#else
DrawerCommandQueue::QueueCommand<Tmvline4RevSubClampRGBA_SSE_Command>();
#endif
for (int i = 0; i < 4; i++) for (int i = 0; i < 4; i++)
vplce[i] += vince[i] * dc_count; vplce[i] += vince[i] * dc_count;
} }

View file

@ -286,6 +286,22 @@ public:
void Execute(DrawerThread *thread) override; void Execute(DrawerThread *thread) override;
}; };
// Generic drawer command that runs a blend mode over a command's loop.
//
// CommandType: the drawer command to derive from; it must expose a nested LoopIterator
//              type constructible from (command pointer, DrawerThread *), testable with
//              operator!, and advanced with next().
// BlendMode:   constructed once per Execute from (command, loop) and asked to Blend()
//              once per loop step.
template<typename CommandType, typename BlendMode>
class DrawerBlendCommand : public CommandType
{
public:
	void Execute(DrawerThread *thread) override
	{
		// LoopIterator lives in the dependent base class, so it has to be named through
		// CommandType; an unqualified use is not found by conforming two-phase lookup.
		typename CommandType::LoopIterator loop(this, thread);
		if (!loop)
			return;

		// Per-command constants are set up once, outside the loop.
		BlendMode blend(*this, loop);
		do
		{
			blend.Blend(*this, loop);
		} while (loop.next());
	}
};
///////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////
// Pixel shading inline functions: // Pixel shading inline functions:
@ -624,7 +640,7 @@ public:
__m128i ab_invab = _mm_load_si128(SampleBgra::samplertable + inv_b * 32 + inv_a * 2); \ __m128i ab_invab = _mm_load_si128(SampleBgra::samplertable + inv_b * 32 + inv_a * 2); \
__m128i ainvb_invainvb = _mm_load_si128(SampleBgra::samplertable + inv_b * 32 + inv_a * 2 + 1); \ __m128i ainvb_invainvb = _mm_load_si128(SampleBgra::samplertable + inv_b * 32 + inv_a * 2 + 1); \
\ \
__m128i gather = _mm_set_epi32(col1[i][y1], col1[i][y0], col0[i][y1], col1[i][y0]); \ __m128i gather = _mm_set_epi32(col1[i][y1], col1[i][y0], col0[i][y1], col0[i][y0]); \
__m128i p0 = _mm_unpacklo_epi8(gather, _mm_setzero_si128()); \ __m128i p0 = _mm_unpacklo_epi8(gather, _mm_setzero_si128()); \
__m128i p1 = _mm_unpackhi_epi8(gather, _mm_setzero_si128()); \ __m128i p1 = _mm_unpackhi_epi8(gather, _mm_setzero_si128()); \
\ \
@ -635,6 +651,26 @@ public:
} \ } \
} }
// Samples four pixels from each of two texture sources (col0 with height0, col1 with
// height1) and writes the weighted sum (p0 * t + p1 * inv_t) >> 8 into fg.
// NOTE(review): inv_t is computed from the raw `mipfrac` argument, which the preceding line
// treats as a pointer to load from - it looks like the loaded vector `t` was intended; confirm.
// NOTE(review): t is loaded as four 32-bit lanes but multiplied against 16-bit channel lanes;
// verify that the weights line up with the pixels they belong to.
// NOTE(review): col0/col1 are indexed directly with the per-pixel offsets, whereas
// VEC_SAMPLE_BILINEAR4_COLUMN indexes its column arguments as col[i][y]; confirm the expected argument shape.
#define VEC_SAMPLE_MIP_NEAREST4_COLUMN(fg, col0, col1, mipfrac, texturefracy, height0, height1) { \
uint32_t y0[4], y1[4]; \
for (int i = 0; i < 4; i++) \
{ \
y0[i] = (texturefracy[i] >> FRACBITS) * height0[i]; \
y1[i] = (texturefracy[i] >> FRACBITS) * height1[i]; \
} \
__m128i p0 = _mm_set_epi32(col0[y0[3]], col0[y0[2]], col0[y0[1]], col0[y0[0]]); \
__m128i p1 = _mm_set_epi32(col1[y1[3]], col1[y1[2]], col1[y1[1]], col1[y1[0]]); \
__m128i t = _mm_loadu_si128((const __m128i*)mipfrac); \
__m128i inv_t = _mm_sub_epi32(_mm_set1_epi32(256), mipfrac); \
__m128i p0_lo = _mm_unpacklo_epi8(p0, _mm_setzero_si128()); \
__m128i p0_hi = _mm_unpackhi_epi8(p0, _mm_setzero_si128()); \
__m128i p1_lo = _mm_unpacklo_epi8(p1, _mm_setzero_si128()); \
__m128i p1_hi = _mm_unpackhi_epi8(p1, _mm_setzero_si128()); \
__m128i fg_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(p0_lo, t), _mm_mullo_epi16(p1_lo, inv_t)), 8); \
__m128i fg_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(p0_hi, t), _mm_mullo_epi16(p1_hi, inv_t)), 8); \
fg = _mm_packus_epi16(fg_lo, fg_hi); \
}
#define VEC_SAMPLE_BILINEAR4_SPAN(fg, texture, xfrac, yfrac, xstep, ystep, xbits, ybits) { \ #define VEC_SAMPLE_BILINEAR4_SPAN(fg, texture, xfrac, yfrac, xstep, ystep, xbits, ybits) { \
int xshift = (32 - xbits); \ int xshift = (32 - xbits); \
int yshift = (32 - ybits); \ int yshift = (32 - ybits); \
@ -844,12 +880,14 @@ FORCEINLINE uint32_t calc_blend_bgalpha(uint32_t fg, uint32_t dest_alpha)
return (dest_alpha * alpha + 256 * inv_alpha + 128) >> 8; return (dest_alpha * alpha + 256 * inv_alpha + 128) >> 8;
} }
// Declares the variables that VEC_CALC_BLEND_ALPHA_INIT assigns. The expansion already
// ends in a semicolon.
#define VEC_CALC_BLEND_ALPHA_VARS() __m128i msrc_alpha, mdest_alpha, m256, m255, m128;
#define VEC_CALC_BLEND_ALPHA_INIT(src_alpha, dest_alpha) \ #define VEC_CALC_BLEND_ALPHA_INIT(src_alpha, dest_alpha) \
__m128i msrc_alpha = _mm_set1_epi16(src_alpha); \ msrc_alpha = _mm_set1_epi16(src_alpha); \
__m128i mdest_alpha = _mm_set1_epi16(dest_alpha * 255 / 256); \ mdest_alpha = _mm_set1_epi16(dest_alpha * 255 / 256); \
__m128i m256 = _mm_set1_epi16(256); \ m256 = _mm_set1_epi16(256); \
__m128i m255 = _mm_set1_epi16(255); \ m255 = _mm_set1_epi16(255); \
__m128i m128 = _mm_set1_epi16(128); m128 = _mm_set1_epi16(128);
// Calculates the final alpha values to be used when combined with the source texture alpha channel // Calculates the final alpha values to be used when combined with the source texture alpha channel
#define VEC_CALC_BLEND_ALPHA(fg) \ #define VEC_CALC_BLEND_ALPHA(fg) \
@ -866,15 +904,17 @@ FORCEINLINE uint32_t calc_blend_bgalpha(uint32_t fg, uint32_t dest_alpha)
fg_alpha_lo = msrc_alpha; \ fg_alpha_lo = msrc_alpha; \
} }
// Declares the variables that the SSE_SHADE_*INIT* macros assign. The expansion already
// ends in a semicolon.
#define SSE_SHADE_VARS() __m128i mlight_hi, mlight_lo, color, fade, fade_amount_hi, fade_amount_lo, inv_desaturate;
// Calculate constants for a simple shade // Calculate constants for a simple shade
#define SSE_SHADE_SIMPLE_INIT(light) \ #define SSE_SHADE_SIMPLE_INIT(light) \
__m128i mlight_hi = _mm_set_epi16(256, light, light, light, 256, light, light, light); \ mlight_hi = _mm_set_epi16(256, light, light, light, 256, light, light, light); \
__m128i mlight_lo = mlight_hi; mlight_lo = mlight_hi;
// Calculate constants for a simple shade with different light levels for each pixel // Calculate constants for a simple shade with different light levels for each pixel
#define SSE_SHADE_SIMPLE_INIT4(light3, light2, light1, light0) \ #define SSE_SHADE_SIMPLE_INIT4(light3, light2, light1, light0) \
__m128i mlight_hi = _mm_set_epi16(256, light1, light1, light1, 256, light0, light0, light0); \ mlight_hi = _mm_set_epi16(256, light1, light1, light1, 256, light0, light0, light0); \
__m128i mlight_lo = _mm_set_epi16(256, light3, light3, light3, 256, light2, light2, light2); mlight_lo = _mm_set_epi16(256, light3, light3, light3, 256, light2, light2, light2);
// Simple shade 4 pixels // Simple shade 4 pixels
#define SSE_SHADE_SIMPLE(fg) { \ #define SSE_SHADE_SIMPLE(fg) { \
@ -889,31 +929,31 @@ FORCEINLINE uint32_t calc_blend_bgalpha(uint32_t fg, uint32_t dest_alpha)
// Calculate constants for a complex shade // Calculate constants for a complex shade
#define SSE_SHADE_INIT(light, shade_constants) \ #define SSE_SHADE_INIT(light, shade_constants) \
__m128i mlight_hi = _mm_set_epi16(256, light, light, light, 256, light, light, light); \ mlight_hi = _mm_set_epi16(256, light, light, light, 256, light, light, light); \
__m128i mlight_lo = mlight_hi; \ mlight_lo = mlight_hi; \
__m128i color = _mm_set_epi16( \ color = _mm_set_epi16( \
256, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, \ 256, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, \
256, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); \ 256, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); \
__m128i fade = _mm_set_epi16( \ fade = _mm_set_epi16( \
0, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, \ 0, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, \
0, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); \ 0, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); \
__m128i fade_amount_hi = _mm_mullo_epi16(fade, _mm_subs_epu16(_mm_set1_epi16(256), mlight_hi)); \ fade_amount_hi = _mm_mullo_epi16(fade, _mm_subs_epu16(_mm_set1_epi16(256), mlight_hi)); \
__m128i fade_amount_lo = fade_amount_hi; \ fade_amount_lo = fade_amount_hi; \
__m128i inv_desaturate = _mm_set1_epi16(256 - shade_constants.desaturate); \ inv_desaturate = _mm_set1_epi16(256 - shade_constants.desaturate); \
// Calculate constants for a complex shade with different light levels for each pixel // Calculate constants for a complex shade with different light levels for each pixel
#define SSE_SHADE_INIT4(light3, light2, light1, light0, shade_constants) \ #define SSE_SHADE_INIT4(light3, light2, light1, light0, shade_constants) \
__m128i mlight_hi = _mm_set_epi16(256, light1, light1, light1, 256, light0, light0, light0); \ mlight_hi = _mm_set_epi16(256, light1, light1, light1, 256, light0, light0, light0); \
__m128i mlight_lo = _mm_set_epi16(256, light3, light3, light3, 256, light2, light2, light2); \ mlight_lo = _mm_set_epi16(256, light3, light3, light3, 256, light2, light2, light2); \
__m128i color = _mm_set_epi16( \ color = _mm_set_epi16( \
256, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, \ 256, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, \
256, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); \ 256, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); \
__m128i fade = _mm_set_epi16( \ fade = _mm_set_epi16( \
0, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, \ 0, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, \
0, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); \ 0, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); \
__m128i fade_amount_hi = _mm_mullo_epi16(fade, _mm_subs_epu16(_mm_set1_epi16(256), mlight_hi)); \ fade_amount_hi = _mm_mullo_epi16(fade, _mm_subs_epu16(_mm_set1_epi16(256), mlight_hi)); \
__m128i fade_amount_lo = _mm_mullo_epi16(fade, _mm_subs_epu16(_mm_set1_epi16(256), mlight_lo)); \ fade_amount_lo = _mm_mullo_epi16(fade, _mm_subs_epu16(_mm_set1_epi16(256), mlight_lo)); \
__m128i inv_desaturate = _mm_set1_epi16(256 - shade_constants.desaturate); \ inv_desaturate = _mm_set1_epi16(256 - shade_constants.desaturate); \
// Complex shade 4 pixels // Complex shade 4 pixels
#define SSE_SHADE(fg, shade_constants) { \ #define SSE_SHADE(fg, shade_constants) { \

View file

@ -84,6 +84,7 @@ public:
if (shade_constants.simple_shade) if (shade_constants.simple_shade)
{ {
VEC_SHADE_VARS();
VEC_SHADE_SIMPLE_INIT(light); VEC_SHADE_SIMPLE_INIT(light);
while (sse_count--) while (sse_count--)
@ -121,6 +122,7 @@ public:
} }
else else
{ {
VEC_SHADE_VARS();
VEC_SHADE_INIT(light, shade_constants); VEC_SHADE_INIT(light, shade_constants);
while (sse_count--) while (sse_count--)
@ -184,6 +186,7 @@ public:
if (shade_constants.simple_shade) if (shade_constants.simple_shade)
{ {
VEC_SHADE_VARS();
VEC_SHADE_SIMPLE_INIT(light); VEC_SHADE_SIMPLE_INIT(light);
while (sse_count--) while (sse_count--)
@ -217,6 +220,7 @@ public:
} }
else else
{ {
VEC_SHADE_VARS();
VEC_SHADE_INIT(light, shade_constants); VEC_SHADE_INIT(light, shade_constants);
while (sse_count--) while (sse_count--)
@ -277,6 +281,7 @@ public:
if (shade_constants.simple_shade) if (shade_constants.simple_shade)
{ {
VEC_SHADE_VARS();
VEC_SHADE_SIMPLE_INIT(light); VEC_SHADE_SIMPLE_INIT(light);
while (sse_count--) while (sse_count--)
{ {
@ -289,6 +294,7 @@ public:
} }
else else
{ {
VEC_SHADE_VARS();
VEC_SHADE_INIT(light, shade_constants); VEC_SHADE_INIT(light, shade_constants);
while (sse_count--) while (sse_count--)
{ {
@ -317,6 +323,7 @@ public:
if (shade_constants.simple_shade) if (shade_constants.simple_shade)
{ {
VEC_SHADE_VARS();
VEC_SHADE_SIMPLE_INIT(light); VEC_SHADE_SIMPLE_INIT(light);
while (sse_count--) while (sse_count--)
{ {
@ -331,6 +338,7 @@ public:
} }
else else
{ {
VEC_SHADE_VARS();
VEC_SHADE_INIT(light, shade_constants); VEC_SHADE_INIT(light, shade_constants);
while (sse_count--) while (sse_count--)
{ {
@ -357,918 +365,3 @@ public:
} }
} }
}; };
// Drawer command that renders four texture columns side by side: every row samples one texel
// per column, shades the four texels and writes them with a single unaligned 128-bit store.
// The drawer state is captured from the dc_* / column globals when the command is constructed.
class VecCommand(Vlinec4RGBA) : public DrawerCommand
{
	BYTE * RESTRICT _dest;
	int _count;
	int _pitch;
	ShadeConstants _shade_constants;
	fixed_t palookuplight[4];
	DWORD vplce[4];
	DWORD vince[4];
	const uint32 * RESTRICT bufplce[4];
	const uint32_t * RESTRICT bufplce2[4];
	uint32_t buftexturefracx[4];
	uint32_t bufheight[4];

public:
	// Snapshot the global drawer state so Execute() can run later without touching the globals.
	VecCommand(Vlinec4RGBA)()
	{
		for (int i = 0; i < 4; i++)
		{
			bufplce[i] = (const uint32 *)::bufplce[i];
			bufplce2[i] = (const uint32_t *)::bufplce2[i];
			buftexturefracx[i] = ::buftexturefracx[i];
			bufheight[i] = ::bufheight[i];
			vplce[i] = ::vplce[i];
			vince[i] = ::vince[i];
			palookuplight[i] = ::palookuplight[i];
		}
		_shade_constants = dc_shade_constants;
		_pitch = dc_pitch;
		_count = dc_count;
		_dest = dc_dest;
	}

	// Draws the rows that `thread` reports as its share of this command.
	// Nearest sampling is used when bufplce2[0] is null, bilinear sampling otherwise.
	void Execute(DrawerThread *thread) override
	{
		int rows_left = thread->count_for_thread(_dest_y, _count);
		if (rows_left <= 0)
			return;

		uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest);
		const int pitch = _pitch * thread->num_cores;

		uint32_t height[4];
		uint32_t half[4];
		for (int i = 0; i < 4; i++)
		{
			height[i] = bufheight[i];
			half[i] = (0x80000000 + height[i] - 1) / height[i];
		}

		uint32_t light[4];
		for (int i = 0; i < 4; i++)
			light[i] = LightBgra::calc_light_multiplier(palookuplight[i]);

		ShadeConstants shade_constants = _shade_constants;

		// Start each column at this thread's first row and step by num_cores rows at a time.
		const int rows_skipped = thread->skipped_by_thread(_dest_y);
		DWORD local_vplce[4];
		DWORD local_vince[4];
		for (int i = 0; i < 4; i++)
		{
			local_vplce[i] = vplce[i] + vince[i] * rows_skipped;
			local_vince[i] = vince[i] * thread->num_cores;
		}

		const bool nearest = (bufplce2[0] == nullptr);

		if (shade_constants.simple_shade)
		{
			VEC_SHADE_SIMPLE_INIT4(light[3], light[2], light[1], light[0]);
			if (nearest)
			{
				for (; rows_left > 0; --rows_left)
				{
					uint32_t texel[4];
					for (int i = 0; i < 4; i++)
					{
						texel[i] = bufplce[i][((local_vplce[i] >> FRACBITS) * height[i]) >> FRACBITS];
						local_vplce[i] += local_vince[i];
					}

					__m128i fg = _mm_set_epi32(texel[3], texel[2], texel[1], texel[0]);
					VEC_SHADE_SIMPLE(fg);
					_mm_storeu_si128((__m128i*)dest, fg);
					dest += pitch;
				}
			}
			else
			{
				for (; rows_left > 0; --rows_left)
				{
					__m128i fg;
					VEC_SAMPLE_BILINEAR4_COLUMN(fg, bufplce, bufplce2, buftexturefracx, local_vplce, half, height);
					for (int i = 0; i < 4; i++)
						local_vplce[i] += local_vince[i];

					VEC_SHADE_SIMPLE(fg);
					_mm_storeu_si128((__m128i*)dest, fg);
					dest += pitch;
				}
			}
		}
		else
		{
			VEC_SHADE_INIT4(light[3], light[2], light[1], light[0], shade_constants);
			if (nearest)
			{
				for (; rows_left > 0; --rows_left)
				{
					uint32_t texel[4];
					for (int i = 0; i < 4; i++)
					{
						texel[i] = bufplce[i][((local_vplce[i] >> FRACBITS) * height[i]) >> FRACBITS];
						local_vplce[i] += local_vince[i];
					}

					__m128i fg = _mm_set_epi32(texel[3], texel[2], texel[1], texel[0]);
					VEC_SHADE(fg, shade_constants);
					_mm_storeu_si128((__m128i*)dest, fg);
					dest += pitch;
				}
			}
			else
			{
				for (; rows_left > 0; --rows_left)
				{
					__m128i fg;
					VEC_SAMPLE_BILINEAR4_COLUMN(fg, bufplce, bufplce2, buftexturefracx, local_vplce, half, height);
					for (int i = 0; i < 4; i++)
						local_vplce[i] += local_vince[i];

					VEC_SHADE(fg, shade_constants);
					_mm_storeu_si128((__m128i*)dest, fg);
					dest += pitch;
				}
			}
		}
	}
};
// Drawer command that renders four texture columns side by side and alpha-blends them over the
// pixels already in the destination: every row samples one texel per column, shades the four
// texels, blends them with the existing destination pixels and stores the result.
class VecCommand(Mvlinec4RGBA) : public DrawerCommand
{
	BYTE * RESTRICT _dest;
	int _count;
	int _pitch;
	ShadeConstants _shade_constants;
	uint32_t _mvlinemax; // never assigned or read inside this class
	fixed_t palookuplight[4];
	DWORD vplce[4];
	DWORD vince[4];
	const uint32 * RESTRICT bufplce[4];
	const uint32 * RESTRICT bufplce2[4];
	uint32_t buftexturefracx[4];
	uint32_t bufheight[4];

public:
	// Snapshot the global drawer state so Execute() can run later without touching the globals.
	VecCommand(Mvlinec4RGBA)()
	{
		for (int i = 0; i < 4; i++)
		{
			bufplce[i] = (const uint32 *)::bufplce[i];
			bufplce2[i] = (const uint32_t *)::bufplce2[i];
			buftexturefracx[i] = ::buftexturefracx[i];
			bufheight[i] = ::bufheight[i];
			vplce[i] = ::vplce[i];
			vince[i] = ::vince[i];
			palookuplight[i] = ::palookuplight[i];
		}
		_shade_constants = dc_shade_constants;
		_pitch = dc_pitch;
		_count = dc_count;
		_dest = dc_dest;
	}

	// Draws the rows that `thread` reports as its share of this command.
	// Nearest sampling is used when bufplce2[0] is null, bilinear sampling otherwise.
	void Execute(DrawerThread *thread) override
	{
		int rows_left = thread->count_for_thread(_dest_y, _count);
		if (rows_left <= 0)
			return;

		uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest);
		const int pitch = _pitch * thread->num_cores;

		uint32_t height[4];
		uint32_t half[4];
		for (int i = 0; i < 4; i++)
		{
			height[i] = bufheight[i];
			half[i] = (0x80000000 + height[i] - 1) / height[i];
		}

		uint32_t light[4];
		for (int i = 0; i < 4; i++)
			light[i] = LightBgra::calc_light_multiplier(palookuplight[i]);

		ShadeConstants shade_constants = _shade_constants;

		// Start each column at this thread's first row and step by num_cores rows at a time.
		const int rows_skipped = thread->skipped_by_thread(_dest_y);
		DWORD local_vplce[4];
		DWORD local_vince[4];
		for (int i = 0; i < 4; i++)
		{
			local_vplce[i] = vplce[i] + vince[i] * rows_skipped;
			local_vince[i] = vince[i] * thread->num_cores;
		}

		const bool nearest = (bufplce2[0] == nullptr);

		if (shade_constants.simple_shade)
		{
			VEC_SHADE_SIMPLE_INIT4(light[3], light[2], light[1], light[0]);
			if (nearest)
			{
				for (; rows_left > 0; --rows_left)
				{
					uint32_t texel[4];
					for (int i = 0; i < 4; i++)
					{
						texel[i] = bufplce[i][((local_vplce[i] >> FRACBITS) * height[i]) >> FRACBITS];
						local_vplce[i] += local_vince[i];
					}

					__m128i fg = _mm_set_epi32(texel[3], texel[2], texel[1], texel[0]);
					__m128i bg = _mm_loadu_si128((const __m128i*)dest);
					VEC_SHADE_SIMPLE(fg);
					VEC_ALPHA_BLEND(fg, bg);
					_mm_storeu_si128((__m128i*)dest, fg);
					dest += pitch;
				}
			}
			else
			{
				for (; rows_left > 0; --rows_left)
				{
					__m128i fg;
					VEC_SAMPLE_BILINEAR4_COLUMN(fg, bufplce, bufplce2, buftexturefracx, local_vplce, half, height);
					for (int i = 0; i < 4; i++)
						local_vplce[i] += local_vince[i];

					__m128i bg = _mm_loadu_si128((const __m128i*)dest);
					VEC_SHADE_SIMPLE(fg);
					VEC_ALPHA_BLEND(fg, bg);
					_mm_storeu_si128((__m128i*)dest, fg);
					dest += pitch;
				}
			}
		}
		else
		{
			VEC_SHADE_INIT4(light[3], light[2], light[1], light[0], shade_constants);
			if (nearest)
			{
				for (; rows_left > 0; --rows_left)
				{
					uint32_t texel[4];
					for (int i = 0; i < 4; i++)
					{
						texel[i] = bufplce[i][((local_vplce[i] >> FRACBITS) * height[i]) >> FRACBITS];
						local_vplce[i] += local_vince[i];
					}

					__m128i fg = _mm_set_epi32(texel[3], texel[2], texel[1], texel[0]);
					__m128i bg = _mm_loadu_si128((const __m128i*)dest);
					VEC_SHADE(fg, shade_constants);
					VEC_ALPHA_BLEND(fg, bg);
					_mm_storeu_si128((__m128i*)dest, fg);
					dest += pitch;
				}
			}
			else
			{
				for (; rows_left > 0; --rows_left)
				{
					__m128i fg;
					VEC_SAMPLE_BILINEAR4_COLUMN(fg, bufplce, bufplce2, buftexturefracx, local_vplce, half, height);
					for (int i = 0; i < 4; i++)
						local_vplce[i] += local_vince[i];

					__m128i bg = _mm_loadu_si128((const __m128i*)dest);
					VEC_SHADE(fg, shade_constants);
					VEC_ALPHA_BLEND(fg, bg);
					_mm_storeu_si128((__m128i*)dest, fg);
					dest += pitch;
				}
			}
		}
	}
};
// Drawer command that renders four texture columns side by side with additive blending:
// each row computes (fg * fg_alpha + bg * bg_alpha) >> 8 per channel, where the sum saturates
// at 16 bits before the shift and the result is packed back to 8 bits with saturation.
class VecCommand(Tmvline4AddRGBA) : public DrawerCommand
{
	BYTE * RESTRICT _dest;
	int _count;
	int _pitch;
	ShadeConstants _shade_constants;
	fixed_t _srcalpha;
	fixed_t _destalpha;
	fixed_t palookuplight[4];
	DWORD vplce[4];
	DWORD vince[4];
	const uint32 * RESTRICT bufplce[4];
	uint32_t bufheight[4];

public:
	// Snapshot the global drawer state so Execute() can run later without touching the globals.
	VecCommand(Tmvline4AddRGBA)()
	{
		for (int i = 0; i < 4; i++)
		{
			bufplce[i] = (const uint32 *)::bufplce[i];
			bufheight[i] = ::bufheight[i];
			vplce[i] = ::vplce[i];
			vince[i] = ::vince[i];
			palookuplight[i] = ::palookuplight[i];
		}
		_destalpha = dc_destalpha;
		_srcalpha = dc_srcalpha;
		_shade_constants = dc_shade_constants;
		_pitch = dc_pitch;
		_count = dc_count;
		_dest = dc_dest;
	}

	// Draws the rows that `thread` reports as its share of this command, using nearest sampling.
	void Execute(DrawerThread *thread) override
	{
		int rows_left = thread->count_for_thread(_dest_y, _count);
		if (rows_left <= 0)
			return;

		uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest);
		const int pitch = _pitch * thread->num_cores;

		uint32_t height[4];
		uint32_t half[4]; // computed as in the other column drawers, but not read by this one
		for (int i = 0; i < 4; i++)
		{
			height[i] = bufheight[i];
			half[i] = (0x80000000 + height[i] - 1) / height[i];
		}

		uint32_t light[4];
		for (int i = 0; i < 4; i++)
			light[i] = LightBgra::calc_light_multiplier(palookuplight[i]);

		ShadeConstants shade_constants = _shade_constants;

		uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
		uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);

		// Start each column at this thread's first row and step by num_cores rows at a time.
		const int rows_skipped = thread->skipped_by_thread(_dest_y);
		DWORD local_vplce[4];
		DWORD local_vince[4];
		for (int i = 0; i < 4; i++)
		{
			local_vplce[i] = vplce[i] + vince[i] * rows_skipped;
			local_vince[i] = vince[i] * thread->num_cores;
		}

		if (shade_constants.simple_shade)
		{
			VEC_SHADE_SIMPLE_INIT4(light[3], light[2], light[1], light[0]);
			VEC_CALC_BLEND_ALPHA_INIT(src_alpha, dest_alpha);

			for (; rows_left > 0; --rows_left)
			{
				uint32_t texel[4];
				for (int i = 0; i < 4; i++)
				{
					texel[i] = bufplce[i][((local_vplce[i] >> FRACBITS) * height[i]) >> FRACBITS];
					local_vplce[i] += local_vince[i];
				}

				__m128i fg = _mm_set_epi32(texel[3], texel[2], texel[1], texel[0]);
				VEC_CALC_BLEND_ALPHA(fg);
				VEC_SHADE_SIMPLE(fg);

				const __m128i bg = _mm_loadu_si128((const __m128i*)dest);
				const __m128i zero = _mm_setzero_si128();

				// Widen to 16 bits per channel and weight each side by its alpha.
				const __m128i fg_term_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(fg, zero), fg_alpha_hi);
				const __m128i fg_term_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(fg, zero), fg_alpha_lo);
				const __m128i bg_term_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(bg, zero), bg_alpha_hi);
				const __m128i bg_term_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(bg, zero), bg_alpha_lo);

				const __m128i sum_hi = _mm_srli_epi16(_mm_adds_epu16(fg_term_hi, bg_term_hi), 8);
				const __m128i sum_lo = _mm_srli_epi16(_mm_adds_epu16(fg_term_lo, bg_term_lo), 8);

				_mm_storeu_si128((__m128i*)dest, _mm_packus_epi16(sum_lo, sum_hi));
				dest += pitch;
			}
		}
		else
		{
			VEC_SHADE_INIT4(light[3], light[2], light[1], light[0], shade_constants);
			VEC_CALC_BLEND_ALPHA_INIT(src_alpha, dest_alpha);

			for (; rows_left > 0; --rows_left)
			{
				uint32_t texel[4];
				for (int i = 0; i < 4; i++)
				{
					texel[i] = bufplce[i][((local_vplce[i] >> FRACBITS) * height[i]) >> FRACBITS];
					local_vplce[i] += local_vince[i];
				}

				__m128i fg = _mm_set_epi32(texel[3], texel[2], texel[1], texel[0]);
				VEC_CALC_BLEND_ALPHA(fg);
				VEC_SHADE(fg, shade_constants);

				const __m128i bg = _mm_loadu_si128((const __m128i*)dest);
				const __m128i zero = _mm_setzero_si128();

				// Widen to 16 bits per channel and weight each side by its alpha.
				const __m128i fg_term_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(fg, zero), fg_alpha_hi);
				const __m128i fg_term_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(fg, zero), fg_alpha_lo);
				const __m128i bg_term_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(bg, zero), bg_alpha_hi);
				const __m128i bg_term_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(bg, zero), bg_alpha_lo);

				const __m128i sum_hi = _mm_srli_epi16(_mm_adds_epu16(fg_term_hi, bg_term_hi), 8);
				const __m128i sum_lo = _mm_srli_epi16(_mm_adds_epu16(fg_term_lo, bg_term_lo), 8);

				_mm_storeu_si128((__m128i*)dest, _mm_packus_epi16(sum_lo, sum_hi));
				dest += pitch;
			}
		}
	}
};
// Drawer command that renders four texture columns side by side with clamped additive blending:
// each row computes (fg * fg_alpha + bg * bg_alpha) >> 8 per channel, where the sum saturates
// at 16 bits before the shift and the result is packed back to 8 bits with saturation.
class VecCommand(Tmvline4AddClampRGBA) : public DrawerCommand
{
	BYTE * RESTRICT _dest;
	int _count;
	int _pitch;
	ShadeConstants _shade_constants;
	fixed_t _srcalpha;
	fixed_t _destalpha;
	fixed_t palookuplight[4];
	DWORD vplce[4];
	DWORD vince[4];
	const uint32 *RESTRICT bufplce[4];
	uint32_t bufheight[4];

public:
	// Snapshot the global drawer state so Execute() can run later without touching the globals.
	VecCommand(Tmvline4AddClampRGBA)()
	{
		for (int i = 0; i < 4; i++)
		{
			bufplce[i] = (const uint32 *)::bufplce[i];
			bufheight[i] = ::bufheight[i];
			vplce[i] = ::vplce[i];
			vince[i] = ::vince[i];
			palookuplight[i] = ::palookuplight[i];
		}
		_destalpha = dc_destalpha;
		_srcalpha = dc_srcalpha;
		_shade_constants = dc_shade_constants;
		_pitch = dc_pitch;
		_count = dc_count;
		_dest = dc_dest;
	}

	// Draws the rows that `thread` reports as its share of this command, using nearest sampling.
	void Execute(DrawerThread *thread) override
	{
		int rows_left = thread->count_for_thread(_dest_y, _count);
		if (rows_left <= 0)
			return;

		uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest);
		const int pitch = _pitch * thread->num_cores;

		uint32_t height[4];
		uint32_t half[4]; // computed as in the other column drawers, but not read by this one
		for (int i = 0; i < 4; i++)
		{
			height[i] = bufheight[i];
			half[i] = (0x80000000 + height[i] - 1) / height[i];
		}

		uint32_t light[4];
		for (int i = 0; i < 4; i++)
			light[i] = LightBgra::calc_light_multiplier(palookuplight[i]);

		ShadeConstants shade_constants = _shade_constants;

		uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
		uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);

		// Start each column at this thread's first row and step by num_cores rows at a time.
		const int rows_skipped = thread->skipped_by_thread(_dest_y);
		DWORD local_vplce[4];
		DWORD local_vince[4];
		for (int i = 0; i < 4; i++)
		{
			local_vplce[i] = vplce[i] + vince[i] * rows_skipped;
			local_vince[i] = vince[i] * thread->num_cores;
		}

		if (shade_constants.simple_shade)
		{
			VEC_SHADE_SIMPLE_INIT4(light[3], light[2], light[1], light[0]);
			VEC_CALC_BLEND_ALPHA_INIT(src_alpha, dest_alpha);

			for (; rows_left > 0; --rows_left)
			{
				uint32_t texel[4];
				for (int i = 0; i < 4; i++)
				{
					texel[i] = bufplce[i][((local_vplce[i] >> FRACBITS) * height[i]) >> FRACBITS];
					local_vplce[i] += local_vince[i];
				}

				__m128i fg = _mm_set_epi32(texel[3], texel[2], texel[1], texel[0]);
				VEC_CALC_BLEND_ALPHA(fg);
				VEC_SHADE_SIMPLE(fg);

				const __m128i bg = _mm_loadu_si128((const __m128i*)dest);
				const __m128i zero = _mm_setzero_si128();

				// Widen to 16 bits per channel and weight each side by its alpha.
				const __m128i fg_term_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(fg, zero), fg_alpha_hi);
				const __m128i fg_term_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(fg, zero), fg_alpha_lo);
				const __m128i bg_term_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(bg, zero), bg_alpha_hi);
				const __m128i bg_term_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(bg, zero), bg_alpha_lo);

				const __m128i sum_hi = _mm_srli_epi16(_mm_adds_epu16(fg_term_hi, bg_term_hi), 8);
				const __m128i sum_lo = _mm_srli_epi16(_mm_adds_epu16(fg_term_lo, bg_term_lo), 8);

				_mm_storeu_si128((__m128i*)dest, _mm_packus_epi16(sum_lo, sum_hi));
				dest += pitch;
			}
		}
		else
		{
			VEC_SHADE_INIT4(light[3], light[2], light[1], light[0], shade_constants);
			VEC_CALC_BLEND_ALPHA_INIT(src_alpha, dest_alpha);

			for (; rows_left > 0; --rows_left)
			{
				uint32_t texel[4];
				for (int i = 0; i < 4; i++)
				{
					texel[i] = bufplce[i][((local_vplce[i] >> FRACBITS) * height[i]) >> FRACBITS];
					local_vplce[i] += local_vince[i];
				}

				__m128i fg = _mm_set_epi32(texel[3], texel[2], texel[1], texel[0]);
				VEC_CALC_BLEND_ALPHA(fg);
				VEC_SHADE(fg, shade_constants);

				const __m128i bg = _mm_loadu_si128((const __m128i*)dest);
				const __m128i zero = _mm_setzero_si128();

				// Widen to 16 bits per channel and weight each side by its alpha.
				const __m128i fg_term_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(fg, zero), fg_alpha_hi);
				const __m128i fg_term_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(fg, zero), fg_alpha_lo);
				const __m128i bg_term_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(bg, zero), bg_alpha_hi);
				const __m128i bg_term_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(bg, zero), bg_alpha_lo);

				const __m128i sum_hi = _mm_srli_epi16(_mm_adds_epu16(fg_term_hi, bg_term_hi), 8);
				const __m128i sum_lo = _mm_srli_epi16(_mm_adds_epu16(fg_term_lo, bg_term_lo), 8);

				_mm_storeu_si128((__m128i*)dest, _mm_packus_epi16(sum_lo, sum_hi));
				dest += pitch;
			}
		}
	}
};
// Drawer command that renders four texture columns side by side with clamped subtractive blending:
// each row computes (bg * bg_alpha - fg * fg_alpha) >> 8 per channel, where the difference
// saturates at zero and the result is packed back to 8 bits with saturation.
class VecCommand(Tmvline4SubClampRGBA) : public DrawerCommand
{
	BYTE * RESTRICT _dest;
	int _count;
	int _pitch;
	ShadeConstants _shade_constants;
	fixed_t _srcalpha;
	fixed_t _destalpha;
	fixed_t palookuplight[4];
	DWORD vplce[4];
	DWORD vince[4];
	const uint32 *RESTRICT bufplce[4];
	uint32_t bufheight[4];

public:
	// Snapshot the global drawer state so Execute() can run later without touching the globals.
	VecCommand(Tmvline4SubClampRGBA)()
	{
		for (int i = 0; i < 4; i++)
		{
			bufplce[i] = (const uint32 *)::bufplce[i];
			bufheight[i] = ::bufheight[i];
			vplce[i] = ::vplce[i];
			vince[i] = ::vince[i];
			palookuplight[i] = ::palookuplight[i];
		}
		_destalpha = dc_destalpha;
		_srcalpha = dc_srcalpha;
		_shade_constants = dc_shade_constants;
		_pitch = dc_pitch;
		_count = dc_count;
		_dest = dc_dest;
	}

	// Draws the rows that `thread` reports as its share of this command, using nearest sampling.
	void Execute(DrawerThread *thread) override
	{
		int rows_left = thread->count_for_thread(_dest_y, _count);
		if (rows_left <= 0)
			return;

		uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest);
		const int pitch = _pitch * thread->num_cores;

		uint32_t height[4];
		uint32_t half[4]; // computed as in the other column drawers, but not read by this one
		for (int i = 0; i < 4; i++)
		{
			height[i] = bufheight[i];
			half[i] = (0x80000000 + height[i] - 1) / height[i];
		}

		uint32_t light[4];
		for (int i = 0; i < 4; i++)
			light[i] = LightBgra::calc_light_multiplier(palookuplight[i]);

		ShadeConstants shade_constants = _shade_constants;

		uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
		uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);

		// Start each column at this thread's first row and step by num_cores rows at a time.
		const int rows_skipped = thread->skipped_by_thread(_dest_y);
		DWORD local_vplce[4];
		DWORD local_vince[4];
		for (int i = 0; i < 4; i++)
		{
			local_vplce[i] = vplce[i] + vince[i] * rows_skipped;
			local_vince[i] = vince[i] * thread->num_cores;
		}

		if (shade_constants.simple_shade)
		{
			VEC_SHADE_SIMPLE_INIT4(light[3], light[2], light[1], light[0]);
			VEC_CALC_BLEND_ALPHA_INIT(src_alpha, dest_alpha);

			for (; rows_left > 0; --rows_left)
			{
				uint32_t texel[4];
				for (int i = 0; i < 4; i++)
				{
					texel[i] = bufplce[i][((local_vplce[i] >> FRACBITS) * height[i]) >> FRACBITS];
					local_vplce[i] += local_vince[i];
				}

				__m128i fg = _mm_set_epi32(texel[3], texel[2], texel[1], texel[0]);
				VEC_CALC_BLEND_ALPHA(fg);
				VEC_SHADE_SIMPLE(fg);

				const __m128i bg = _mm_loadu_si128((const __m128i*)dest);
				const __m128i zero = _mm_setzero_si128();

				// Widen to 16 bits per channel and weight each side by its alpha.
				const __m128i fg_term_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(fg, zero), fg_alpha_hi);
				const __m128i fg_term_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(fg, zero), fg_alpha_lo);
				const __m128i bg_term_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(bg, zero), bg_alpha_hi);
				const __m128i bg_term_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(bg, zero), bg_alpha_lo);

				// Background minus foreground.
				const __m128i diff_hi = _mm_srli_epi16(_mm_subs_epu16(bg_term_hi, fg_term_hi), 8);
				const __m128i diff_lo = _mm_srli_epi16(_mm_subs_epu16(bg_term_lo, fg_term_lo), 8);

				_mm_storeu_si128((__m128i*)dest, _mm_packus_epi16(diff_lo, diff_hi));
				dest += pitch;
			}
		}
		else
		{
			VEC_SHADE_INIT4(light[3], light[2], light[1], light[0], shade_constants);
			VEC_CALC_BLEND_ALPHA_INIT(src_alpha, dest_alpha);

			for (; rows_left > 0; --rows_left)
			{
				uint32_t texel[4];
				for (int i = 0; i < 4; i++)
				{
					texel[i] = bufplce[i][((local_vplce[i] >> FRACBITS) * height[i]) >> FRACBITS];
					local_vplce[i] += local_vince[i];
				}

				__m128i fg = _mm_set_epi32(texel[3], texel[2], texel[1], texel[0]);
				VEC_CALC_BLEND_ALPHA(fg);
				VEC_SHADE(fg, shade_constants);

				const __m128i bg = _mm_loadu_si128((const __m128i*)dest);
				const __m128i zero = _mm_setzero_si128();

				// Widen to 16 bits per channel and weight each side by its alpha.
				const __m128i fg_term_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(fg, zero), fg_alpha_hi);
				const __m128i fg_term_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(fg, zero), fg_alpha_lo);
				const __m128i bg_term_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(bg, zero), bg_alpha_hi);
				const __m128i bg_term_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(bg, zero), bg_alpha_lo);

				// Background minus foreground.
				const __m128i diff_hi = _mm_srli_epi16(_mm_subs_epu16(bg_term_hi, fg_term_hi), 8);
				const __m128i diff_lo = _mm_srli_epi16(_mm_subs_epu16(bg_term_lo, fg_term_lo), 8);

				_mm_storeu_si128((__m128i*)dest, _mm_packus_epi16(diff_lo, diff_hi));
				dest += pitch;
			}
		}
	}
};
// Drawer command that renders four texture columns side by side with clamped reverse-subtractive
// blending: each row computes (fg * fg_alpha - bg * bg_alpha) >> 8 per channel, where the
// difference saturates at zero and the result is packed back to 8 bits with saturation.
class VecCommand(Tmvline4RevSubClampRGBA) : public DrawerCommand
{
	BYTE * RESTRICT _dest;
	int _count;
	int _pitch;
	ShadeConstants _shade_constants;
	fixed_t _srcalpha;
	fixed_t _destalpha;
	fixed_t palookuplight[4];
	DWORD vplce[4];
	DWORD vince[4];
	const uint32 *RESTRICT bufplce[4];
	uint32_t bufheight[4];

public:
	// Snapshot the global drawer state so Execute() can run later without touching the globals.
	VecCommand(Tmvline4RevSubClampRGBA)()
	{
		_dest = dc_dest;
		_count = dc_count;
		_pitch = dc_pitch;
		_shade_constants = dc_shade_constants;
		_srcalpha = dc_srcalpha;
		_destalpha = dc_destalpha;
		for (int i = 0; i < 4; i++)
		{
			palookuplight[i] = ::palookuplight[i];
			vplce[i] = ::vplce[i];
			vince[i] = ::vince[i];
			bufplce[i] = (const uint32 *)::bufplce[i];
			// Each column keeps its own texture height, indexed by column like the arrays above.
			bufheight[i] = ::bufheight[i];
		}
	}

	// Draws the rows that `thread` reports as its share of this command, using nearest sampling.
	void Execute(DrawerThread *thread) override
	{
		int count = thread->count_for_thread(_dest_y, _count);
		if (count <= 0)
			return;

		uint32_t *dest = thread->dest_for_thread(_dest_y, _pitch, (uint32_t*)_dest);
		int pitch = _pitch * thread->num_cores;

		uint32_t height[4];
		uint32_t light[4];
		for (int i = 0; i < 4; i++)
		{
			height[i] = bufheight[i];
			light[i] = LightBgra::calc_light_multiplier(palookuplight[i]);
		}

		ShadeConstants shade_constants = _shade_constants;

		uint32_t src_alpha = _srcalpha >> (FRACBITS - 8);
		uint32_t dest_alpha = _destalpha >> (FRACBITS - 8);

		// Start each column at this thread's first row and step by num_cores rows at a time.
		DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
		DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
		int skipped = thread->skipped_by_thread(_dest_y);
		for (int i = 0; i < 4; i++)
		{
			local_vplce[i] += local_vince[i] * skipped;
			local_vince[i] *= thread->num_cores;
		}

		if (shade_constants.simple_shade)
		{
			VEC_SHADE_SIMPLE_INIT4(light[3], light[2], light[1], light[0]);
			VEC_CALC_BLEND_ALPHA_INIT(src_alpha, dest_alpha);

			do
			{
				uint32_t pix[4];
				for (int i = 0; i < 4; i++)
				{
					pix[i] = bufplce[i][((local_vplce[i] >> FRACBITS) * height[i]) >> FRACBITS];
					local_vplce[i] += local_vince[i];
				}

				__m128i fg = _mm_set_epi32(pix[3], pix[2], pix[1], pix[0]);
				VEC_CALC_BLEND_ALPHA(fg);
				VEC_SHADE_SIMPLE(fg);

				// Widen both sides to 16 bits per channel before weighting them by their alphas.
				__m128i bg = _mm_loadu_si128((const __m128i*)dest);
				__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
				__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
				__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
				__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());

				// Foreground minus background.
				__m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8);
				__m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8);
				__m128i out = _mm_packus_epi16(out_lo, out_hi);

				_mm_storeu_si128((__m128i*)dest, out);
				dest += pitch;
			} while (--count);
		}
		else
		{
			VEC_SHADE_INIT4(light[3], light[2], light[1], light[0], shade_constants);
			VEC_CALC_BLEND_ALPHA_INIT(src_alpha, dest_alpha);

			do
			{
				uint32_t pix[4];
				for (int i = 0; i < 4; i++)
				{
					pix[i] = bufplce[i][((local_vplce[i] >> FRACBITS) * height[i]) >> FRACBITS];
					local_vplce[i] += local_vince[i];
				}

				__m128i fg = _mm_set_epi32(pix[3], pix[2], pix[1], pix[0]);
				VEC_CALC_BLEND_ALPHA(fg);
				VEC_SHADE(fg, shade_constants);

				// Widen both sides to 16 bits per channel before weighting them by their alphas.
				__m128i bg = _mm_loadu_si128((const __m128i*)dest);
				__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128());
				__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128());
				__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128());
				__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128());

				// Foreground minus background.
				__m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8);
				__m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8);
				__m128i out = _mm_packus_epi16(out_lo, out_hi);

				_mm_storeu_si128((__m128i*)dest, out);
				dest += pitch;
			} while (--count);
		}
	}
};

View file

@ -53,8 +53,13 @@ extern unsigned int *horizspan[4];
#ifndef NO_SSE #ifndef NO_SSE
#ifdef _MSC_VER
#pragma warning(disable: 4101) // warning C4101: unreferenced local variable
#endif
// Generate SSE drawers: // Generate SSE drawers:
#define VecCommand(name) name##_SSE_Command #define VecCommand(name) name##_SSE_Command
#define VEC_SHADE_VARS SSE_SHADE_VARS
#define VEC_SHADE_SIMPLE_INIT SSE_SHADE_SIMPLE_INIT #define VEC_SHADE_SIMPLE_INIT SSE_SHADE_SIMPLE_INIT
#define VEC_SHADE_SIMPLE_INIT4 SSE_SHADE_SIMPLE_INIT4 #define VEC_SHADE_SIMPLE_INIT4 SSE_SHADE_SIMPLE_INIT4
#define VEC_SHADE_SIMPLE SSE_SHADE_SIMPLE #define VEC_SHADE_SIMPLE SSE_SHADE_SIMPLE

View file

@ -60,6 +60,7 @@ public:
if (shade_constants.simple_shade) if (shade_constants.simple_shade)
{ {
VEC_SHADE_VARS();
VEC_SHADE_SIMPLE_INIT(light); VEC_SHADE_SIMPLE_INIT(light);
if (count & 1) { if (count & 1) {
@ -110,6 +111,7 @@ public:
} }
else else
{ {
VEC_SHADE_VARS();
VEC_SHADE_INIT(light, shade_constants); VEC_SHADE_INIT(light, shade_constants);
if (count & 1) { if (count & 1) {
@ -218,6 +220,7 @@ public:
if (shade_constants.simple_shade) if (shade_constants.simple_shade)
{ {
VEC_SHADE_VARS();
VEC_SHADE_SIMPLE_INIT(light); VEC_SHADE_SIMPLE_INIT(light);
__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha); __m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
@ -254,6 +257,7 @@ public:
} }
else else
{ {
VEC_SHADE_VARS();
VEC_SHADE_INIT(light, shade_constants); VEC_SHADE_INIT(light, shade_constants);
__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha); __m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
@ -421,6 +425,7 @@ public:
if (shade_constants.simple_shade) if (shade_constants.simple_shade)
{ {
VEC_SHADE_VARS();
VEC_SHADE_SIMPLE_INIT(light); VEC_SHADE_SIMPLE_INIT(light);
__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha); __m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
@ -457,6 +462,7 @@ public:
} }
else else
{ {
VEC_SHADE_VARS();
VEC_SHADE_INIT(light, shade_constants); VEC_SHADE_INIT(light, shade_constants);
__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha); __m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
@ -547,6 +553,7 @@ public:
if (shade_constants.simple_shade) if (shade_constants.simple_shade)
{ {
VEC_SHADE_VARS();
VEC_SHADE_SIMPLE_INIT(light); VEC_SHADE_SIMPLE_INIT(light);
__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha); __m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
@ -583,6 +590,7 @@ public:
} }
else else
{ {
VEC_SHADE_VARS();
VEC_SHADE_INIT(light, shade_constants); VEC_SHADE_INIT(light, shade_constants);
__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha); __m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
@ -673,6 +681,7 @@ public:
if (shade_constants.simple_shade) if (shade_constants.simple_shade)
{ {
VEC_SHADE_VARS();
VEC_SHADE_SIMPLE_INIT(light); VEC_SHADE_SIMPLE_INIT(light);
__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha); __m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);
@ -709,6 +718,7 @@ public:
} }
else else
{ {
VEC_SHADE_VARS();
VEC_SHADE_INIT(light, shade_constants); VEC_SHADE_INIT(light, shade_constants);
__m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha); __m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha);

View file

@ -1146,15 +1146,16 @@ WallscanSampler::WallscanSampler(int y1, float swal, double yrepeat, fixed_t xof
int mip_height = texture->GetHeight(); int mip_height = texture->GetHeight();
if (r_mipmap && texture->Mipmapped()) if (r_mipmap && texture->Mipmapped())
{ {
uint32_t xpos = (uint32_t)((((uint64_t)xoffset) << FRACBITS) / mip_width);
int level = (int)MAX(magnitude - 1.0, 0.0); int level = (int)MAX(magnitude - 1.0, 0.0);
while (level != 0) while (level != 0)
{ {
mipmap_offset += mip_width * mip_height; mipmap_offset += mip_width * mip_height;
xoffset >>= 1;
level >>= 1; level >>= 1;
mip_width = MAX(mip_width >> 1, 1); mip_width = MAX(mip_width >> 1, 1);
mip_height = MAX(mip_height >> 1, 1); mip_height = MAX(mip_height >> 1, 1);
} }
xoffset = (xpos >> FRACBITS) * mip_width;
} }
const uint32_t *pixels = texture->GetPixelsBgra() + mipmap_offset; const uint32_t *pixels = texture->GetPixelsBgra() + mipmap_offset;