diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 190a9560a..1453822ec 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -919,7 +919,9 @@ set( FASTMATH_PCH_SOURCES r_3dfloors.cpp r_bsp.cpp r_draw.cpp + r_draw_rgba.cpp r_drawt.cpp + r_drawt_rgba.cpp r_main.cpp r_plane.cpp r_segs.cpp diff --git a/src/doomtype.h b/src/doomtype.h index 34f750313..34e4e72e8 100644 --- a/src/doomtype.h +++ b/src/doomtype.h @@ -99,6 +99,11 @@ typedef TMap FClassMap; #endif +// Only use SSE intrinsics on Intel architecture +#if !defined(_M_IX86) && !defined(__i386__) && !defined(_M_X64) && !defined(__amd64__) +#define NO_SSE +#endif + #if defined(_MSC_VER) #define NOVTABLE __declspec(novtable) diff --git a/src/f_wipe.cpp b/src/f_wipe.cpp index a3ceb8d50..aa9038eeb 100644 --- a/src/f_wipe.cpp +++ b/src/f_wipe.cpp @@ -382,6 +382,9 @@ static bool (*wipes[])(int) = // Returns true if the wipe should be performed. bool wipe_StartScreen (int type) { + if (screen->IsBgra()) + return false; + CurrentWipeType = clamp(type, 0, wipe_NUMWIPES - 1); if (CurrentWipeType) @@ -395,11 +398,15 @@ bool wipe_StartScreen (int type) void wipe_EndScreen (void) { + if (screen->IsBgra()) + return; + if (CurrentWipeType) { wipe_scr_end = new short[SCREENWIDTH * SCREENHEIGHT / 2]; screen->GetBlock (0, 0, SCREENWIDTH, SCREENHEIGHT, (BYTE *)wipe_scr_end); screen->DrawBlock (0, 0, SCREENWIDTH, SCREENHEIGHT, (BYTE *)wipe_scr_start); // restore start scr. + // Initialize the wipe (*wipes[(CurrentWipeType-1)*3])(0); } @@ -410,6 +417,9 @@ bool wipe_ScreenWipe (int ticks) { bool rc; + if (screen->IsBgra()) + return true; + if (CurrentWipeType == wipe_None) return true; @@ -423,6 +433,9 @@ bool wipe_ScreenWipe (int ticks) // Final things for the wipe void wipe_Cleanup() { + if (screen->IsBgra()) + return; + if (wipe_scr_start != NULL) { delete[] wipe_scr_start; diff --git a/src/g_level.cpp b/src/g_level.cpp index 60e9b0699..b1d9c4959 100644 --- a/src/g_level.cpp +++ b/src/g_level.cpp @@ -1312,7 +1312,7 @@ void G_InitLevelLocals () level_info_t *info; BaseBlendA = 0.0f; // Remove underwater blend effect, if any - NormalLight.Maps = realcolormaps; + NormalLight.Maps = realcolormaps.Maps; // [BB] Instead of just setting the color, we also have to reset Desaturate and build the lights. NormalLight.ChangeColor (PalEntry (255, 255, 255), 0); diff --git a/src/g_shared/a_artifacts.cpp b/src/g_shared/a_artifacts.cpp index b5ac970ca..4dc154533 100644 --- a/src/g_shared/a_artifacts.cpp +++ b/src/g_shared/a_artifacts.cpp @@ -737,7 +737,8 @@ int APowerInvisibility::AlterWeaponSprite (visstyle_t *vis) if ((vis->Alpha < 0.25f && special1 > 0) || (vis->Alpha == 0)) { vis->Alpha = clamp((1.f - float(Strength/100)), 0.f, 1.f); - vis->colormap = SpecialColormaps[INVERSECOLORMAP].Colormap; + vis->BaseColormap = &SpecialColormaps[INVERSECOLORMAP]; + vis->ColormapNum = 0; } return -1; // This item is valid so another one shouldn't reset the translucency } diff --git a/src/g_strife/strife_sbar.cpp b/src/g_strife/strife_sbar.cpp index bcdf624d7..eb3fa2608 100644 --- a/src/g_strife/strife_sbar.cpp +++ b/src/g_strife/strife_sbar.cpp @@ -35,7 +35,6 @@ public: const BYTE *GetColumn (unsigned int column, const Span **spans_out); const BYTE *GetPixels (); bool CheckModified (); - void Unload (); void SetVial (int level); @@ -90,10 +89,6 @@ bool FHealthBar::CheckModified () return NeedRefresh; } -void FHealthBar::Unload () -{ -} - const BYTE *FHealthBar::GetColumn (unsigned int column, const Span **spans_out) { if (NeedRefresh) diff --git a/src/menu/playerdisplay.cpp b/src/menu/playerdisplay.cpp index c3d11a43a..16671975a 100644 --- a/src/menu/playerdisplay.cpp +++ b/src/menu/playerdisplay.cpp @@ -78,7 +78,6 @@ public: const BYTE *GetColumn(unsigned int column, const Span **spans_out); const BYTE *GetPixels(); - void Unload(); bool CheckModified(); protected: @@ -212,10 +211,6 @@ bool FBackdropTexture::CheckModified() return LastRenderTic != gametic; } -void FBackdropTexture::Unload() -{ -} - //============================================================================= // // diff --git a/src/posix/cocoa/i_video.mm b/src/posix/cocoa/i_video.mm index 5e073daf3..ddfccaa57 100644 --- a/src/posix/cocoa/i_video.mm +++ b/src/posix/cocoa/i_video.mm @@ -96,6 +96,17 @@ EXTERN_CVAR(Bool, ticker ) EXTERN_CVAR(Bool, vid_vsync) EXTERN_CVAR(Bool, vid_hidpi) +CUSTOM_CVAR(Bool, swtruecolor, false, CVAR_ARCHIVE | CVAR_GLOBALCONFIG | CVAR_NOINITCALL) +{ + // Strictly speaking this doesn't require a mode switch, but it is the easiest + // way to force a CreateFramebuffer call without a lot of refactoring. + extern int NewWidth, NewHeight, NewBits, DisplayBits; + NewWidth = screen->GetWidth(); + NewHeight = screen->GetHeight(); + NewBits = DisplayBits; + setmodeneeded = true; +} + CUSTOM_CVAR(Bool, fullscreen, false, CVAR_ARCHIVE | CVAR_GLOBALCONFIG) { extern int NewWidth, NewHeight, NewBits, DisplayBits; @@ -199,7 +210,7 @@ public: virtual EDisplayType GetDisplayType() { return DISPLAY_Both; } virtual void SetWindowedScale(float scale); - virtual DFrameBuffer* CreateFrameBuffer(int width, int height, bool fs, DFrameBuffer* old); + virtual DFrameBuffer* CreateFrameBuffer(int width, int height, bool bgra, bool fs, DFrameBuffer* old); virtual void StartModeIterator(int bits, bool fullscreen); virtual bool NextMode(int* width, int* height, bool* letterbox); @@ -238,7 +249,7 @@ private: class CocoaFrameBuffer : public DFrameBuffer { public: - CocoaFrameBuffer(int width, int height, bool fullscreen); + CocoaFrameBuffer(int width, int height, bool bgra, bool fullscreen); ~CocoaFrameBuffer(); virtual bool Lock(bool buffer); @@ -518,14 +529,14 @@ bool CocoaVideo::NextMode(int* const width, int* const height, bool* const lette return false; } -DFrameBuffer* CocoaVideo::CreateFrameBuffer(const int width, const int height, const bool fullscreen, DFrameBuffer* const old) +DFrameBuffer* CocoaVideo::CreateFrameBuffer(const int width, const int height, const bool bgra, const bool fullscreen, DFrameBuffer* const old) { PalEntry flashColor = 0; int flashAmount = 0; if (NULL != old) { - if (width == m_width && height == m_height) + if (width == m_width && height == m_height && bgra == old->IsBgra()) { SetMode(width, height, fullscreen, vid_hidpi); return old; @@ -542,7 +553,7 @@ DFrameBuffer* CocoaVideo::CreateFrameBuffer(const int width, const int height, c delete old; } - CocoaFrameBuffer* fb = new CocoaFrameBuffer(width, height, fullscreen); + CocoaFrameBuffer* fb = new CocoaFrameBuffer(width, height, bgra, fullscreen); fb->SetFlash(flashColor, flashAmount); SetMode(width, height, fullscreen, vid_hidpi); @@ -761,8 +772,8 @@ CocoaVideo* CocoaVideo::GetInstance() } -CocoaFrameBuffer::CocoaFrameBuffer(int width, int height, bool fullscreen) -: DFrameBuffer(width, height) +CocoaFrameBuffer::CocoaFrameBuffer(int width, int height, bool bgra, bool fullscreen) +: DFrameBuffer(width, height, bgra) , m_needPaletteUpdate(false) , m_gamma(0.0f) , m_needGammaUpdate(false) @@ -856,8 +867,15 @@ void CocoaFrameBuffer::Update() FlipCycles.Reset(); BlitCycles.Clock(); - GPfx.Convert(MemBuffer, Pitch, m_pixelBuffer, Width * BYTES_PER_PIXEL, - Width, Height, FRACUNIT, FRACUNIT, 0, 0); + if (IsBgra()) + { + CopyWithGammaBgra(m_pixelBuffer, Width * BYTES_PER_PIXEL, m_gammaTable[0], m_gammaTable[1], m_gammaTable[2], m_flashColor, m_flashAmount); + } + else + { + GPfx.Convert(MemBuffer, Pitch, m_pixelBuffer, Width * BYTES_PER_PIXEL, + Width, Height, FRACUNIT, FRACUNIT, 0, 0); + } FlipCycles.Clock(); Flip(); @@ -989,8 +1007,10 @@ void CocoaFrameBuffer::Flip() static const GLenum format = GL_ABGR_EXT; #endif // __LITTLE_ENDIAN__ - glTexImage2D(GL_TEXTURE_RECTANGLE_ARB, 0, GL_RGBA8, - Width, Height, 0, format, GL_UNSIGNED_BYTE, m_pixelBuffer); + if (IsBgra()) + glTexImage2D(GL_TEXTURE_RECTANGLE_ARB, 0, GL_RGBA8, Width, Height, 0, GL_BGRA_EXT, GL_UNSIGNED_BYTE, m_pixelBuffer); + else + glTexImage2D(GL_TEXTURE_RECTANGLE_ARB, 0, GL_RGBA8, Width, Height, 0, format, GL_UNSIGNED_BYTE, m_pixelBuffer); glBegin(GL_QUADS); glColor4f(1.0f, 1.0f, 1.0f, 1.0f); @@ -1064,7 +1084,7 @@ void I_CreateRenderer() DFrameBuffer* I_SetMode(int &width, int &height, DFrameBuffer* old) { - return Video->CreateFrameBuffer(width, height, fullscreen, old); + return Video->CreateFrameBuffer(width, height, swtruecolor, fullscreen, old); } bool I_CheckResolution(const int width, const int height, const int bits) diff --git a/src/posix/hardware.h b/src/posix/hardware.h index 618941fe5..3c06cb6c6 100644 --- a/src/posix/hardware.h +++ b/src/posix/hardware.h @@ -74,7 +74,7 @@ class IVideo virtual EDisplayType GetDisplayType () = 0; virtual void SetWindowedScale (float scale) = 0; - virtual DFrameBuffer *CreateFrameBuffer (int width, int height, bool fs, DFrameBuffer *old) = 0; + virtual DFrameBuffer *CreateFrameBuffer (int width, int height, bool bgra, bool fs, DFrameBuffer *old) = 0; virtual void StartModeIterator (int bits, bool fs) = 0; virtual bool NextMode (int *width, int *height, bool *letterbox) = 0; diff --git a/src/posix/sdl/hardware.cpp b/src/posix/sdl/hardware.cpp index 8bae3f900..221cca44a 100644 --- a/src/posix/sdl/hardware.cpp +++ b/src/posix/sdl/hardware.cpp @@ -51,6 +51,7 @@ EXTERN_CVAR (Bool, ticker) EXTERN_CVAR (Bool, fullscreen) +EXTERN_CVAR (Bool, swtruecolor) EXTERN_CVAR (Float, vid_winscale) IVideo *Video; @@ -128,7 +129,7 @@ DFrameBuffer *I_SetMode (int &width, int &height, DFrameBuffer *old) fs = fullscreen; break; } - DFrameBuffer *res = Video->CreateFrameBuffer (width, height, fs, old); + DFrameBuffer *res = Video->CreateFrameBuffer (width, height, swtruecolor, fs, old); /* Right now, CreateFrameBuffer cannot return NULL if (res == NULL) @@ -282,6 +283,16 @@ CUSTOM_CVAR (Int, vid_maxfps, 200, CVAR_ARCHIVE | CVAR_GLOBALCONFIG) extern int NewWidth, NewHeight, NewBits, DisplayBits; +CUSTOM_CVAR(Bool, swtruecolor, false, CVAR_ARCHIVE|CVAR_GLOBALCONFIG|CVAR_NOINITCALL) +{ + // Strictly speaking this doesn't require a mode switch, but it is the easiest + // way to force a CreateFramebuffer call without a lot of refactoring. + NewWidth = screen->GetWidth(); + NewHeight = screen->GetHeight(); + NewBits = DisplayBits; + setmodeneeded = true; +} + CUSTOM_CVAR (Bool, fullscreen, false, CVAR_ARCHIVE|CVAR_GLOBALCONFIG) { NewWidth = screen->GetWidth(); diff --git a/src/posix/sdl/sdlvideo.cpp b/src/posix/sdl/sdlvideo.cpp index 04c3a3f2e..56b883978 100644 --- a/src/posix/sdl/sdlvideo.cpp +++ b/src/posix/sdl/sdlvideo.cpp @@ -28,7 +28,7 @@ class SDLFB : public DFrameBuffer { DECLARE_CLASS(SDLFB, DFrameBuffer) public: - SDLFB (int width, int height, bool fullscreen, SDL_Window *oldwin); + SDLFB (int width, int height, bool bgra, bool fullscreen, SDL_Window *oldwin); ~SDLFB (); bool Lock (bool buffer); @@ -257,7 +257,7 @@ bool SDLVideo::NextMode (int *width, int *height, bool *letterbox) return false; } -DFrameBuffer *SDLVideo::CreateFrameBuffer (int width, int height, bool fullscreen, DFrameBuffer *old) +DFrameBuffer *SDLVideo::CreateFrameBuffer (int width, int height, bool bgra, bool fullscreen, DFrameBuffer *old) { static int retry = 0; static int owidth, oheight; @@ -271,7 +271,8 @@ DFrameBuffer *SDLVideo::CreateFrameBuffer (int width, int height, bool fullscree { // Reuse the old framebuffer if its attributes are the same SDLFB *fb = static_cast (old); if (fb->Width == width && - fb->Height == height) + fb->Height == height && + fb->Bgra == bgra) { bool fsnow = (SDL_GetWindowFlags (fb->Screen) & SDL_WINDOW_FULLSCREEN_DESKTOP) != 0; @@ -296,7 +297,7 @@ DFrameBuffer *SDLVideo::CreateFrameBuffer (int width, int height, bool fullscree flashAmount = 0; } - SDLFB *fb = new SDLFB (width, height, fullscreen, oldwin); + SDLFB *fb = new SDLFB (width, height, bgra, fullscreen, oldwin); // If we could not create the framebuffer, try again with slightly // different parameters in this order: @@ -335,7 +336,7 @@ DFrameBuffer *SDLVideo::CreateFrameBuffer (int width, int height, bool fullscree } ++retry; - fb = static_cast(CreateFrameBuffer (width, height, fullscreen, NULL)); + fb = static_cast(CreateFrameBuffer (width, height, bgra, fullscreen, NULL)); } retry = 0; @@ -350,8 +351,8 @@ void SDLVideo::SetWindowedScale (float scale) // FrameBuffer implementation ----------------------------------------------- -SDLFB::SDLFB (int width, int height, bool fullscreen, SDL_Window *oldwin) - : DFrameBuffer (width, height) +SDLFB::SDLFB (int width, int height, bool bgra, bool fullscreen, SDL_Window *oldwin) + : DFrameBuffer (width, height, bgra) { int i; @@ -494,7 +495,11 @@ void SDLFB::Update () pitch = Surface->pitch; } - if (NotPaletted) + if (Bgra) + { + CopyWithGammaBgra(pixels, pitch, GammaTable[0], GammaTable[1], GammaTable[2], Flash, FlashAmount); + } + else if (NotPaletted) { GPfx.Convert (MemBuffer, Pitch, pixels, pitch, Width, Height, @@ -674,13 +679,20 @@ void SDLFB::ResetSDLRenderer () SDL_SetRenderDrawColor(Renderer, 0, 0, 0, 255); Uint32 fmt; - switch(vid_displaybits) + if (Bgra) { - default: fmt = SDL_PIXELFORMAT_ARGB8888; break; - case 30: fmt = SDL_PIXELFORMAT_ARGB2101010; break; - case 24: fmt = SDL_PIXELFORMAT_RGB888; break; - case 16: fmt = SDL_PIXELFORMAT_RGB565; break; - case 15: fmt = SDL_PIXELFORMAT_ARGB1555; break; + fmt = SDL_PIXELFORMAT_ARGB8888; + } + else + { + switch (vid_displaybits) + { + default: fmt = SDL_PIXELFORMAT_ARGB8888; break; + case 30: fmt = SDL_PIXELFORMAT_ARGB2101010; break; + case 24: fmt = SDL_PIXELFORMAT_RGB888; break; + case 16: fmt = SDL_PIXELFORMAT_RGB565; break; + case 15: fmt = SDL_PIXELFORMAT_ARGB1555; break; + } } Texture = SDL_CreateTexture (Renderer, fmt, SDL_TEXTUREACCESS_STREAMING, Width, Height); diff --git a/src/posix/sdl/sdlvideo.h b/src/posix/sdl/sdlvideo.h index 072167b5a..385733bc1 100644 --- a/src/posix/sdl/sdlvideo.h +++ b/src/posix/sdl/sdlvideo.h @@ -10,7 +10,7 @@ class SDLVideo : public IVideo EDisplayType GetDisplayType () { return DISPLAY_Both; } void SetWindowedScale (float scale); - DFrameBuffer *CreateFrameBuffer (int width, int height, bool fs, DFrameBuffer *old); + DFrameBuffer *CreateFrameBuffer (int width, int height, bool bgra, bool fs, DFrameBuffer *old); void StartModeIterator (int bits, bool fs); bool NextMode (int *width, int *height, bool *letterbox); diff --git a/src/r_data/colormaps.cpp b/src/r_data/colormaps.cpp index b46342463..ffaaa38ac 100644 --- a/src/r_data/colormaps.cpp +++ b/src/r_data/colormaps.cpp @@ -71,7 +71,7 @@ struct FakeCmap }; TArray fakecmaps; -BYTE *realcolormaps; +FColormap realcolormaps; size_t numfakecmaps; @@ -408,7 +408,7 @@ void R_SetDefaultColormap (const char *name) foo.Color = 0xFFFFFF; foo.Fade = 0; - foo.Maps = realcolormaps; + foo.Maps = realcolormaps.Maps; foo.Desaturate = 0; foo.Next = NULL; foo.BuildLights (); @@ -430,7 +430,7 @@ void R_SetDefaultColormap (const char *name) remap[0] = 0; for (i = 0; i < NUMCOLORMAPS; ++i) { - BYTE *map2 = &realcolormaps[i*256]; + BYTE *map2 = &realcolormaps.Maps[i*256]; lumpr.Read (map, 256); for (j = 0; j < 256; ++j) { @@ -454,11 +454,7 @@ void R_DeinitColormaps () { SpecialColormaps.Clear(); fakecmaps.Clear(); - if (realcolormaps != NULL) - { - delete[] realcolormaps; - realcolormaps = NULL; - } + delete[] realcolormaps.Maps; FreeSpecialLights(); } @@ -501,7 +497,7 @@ void R_InitColormaps () } } } - realcolormaps = new BYTE[256*NUMCOLORMAPS*fakecmaps.Size()]; + realcolormaps.Maps = new BYTE[256*NUMCOLORMAPS*fakecmaps.Size()]; R_SetDefaultColormap ("COLORMAP"); if (fakecmaps.Size() > 1) @@ -523,7 +519,7 @@ void R_InitColormaps () { int k, r, g, b; FWadLump lump = Wads.OpenLumpNum (fakecmaps[j].lump); - BYTE *const map = realcolormaps + NUMCOLORMAPS*256*j; + BYTE *const map = realcolormaps.Maps + NUMCOLORMAPS*256*j; for (k = 0; k < NUMCOLORMAPS; ++k) { @@ -550,8 +546,8 @@ void R_InitColormaps () } NormalLight.Color = PalEntry (255, 255, 255); NormalLight.Fade = 0; - NormalLight.Maps = realcolormaps; - NormalLightHasFixedLights = R_CheckForFixedLights(realcolormaps); + NormalLight.Maps = realcolormaps.Maps; + NormalLightHasFixedLights = R_CheckForFixedLights(realcolormaps.Maps); numfakecmaps = fakecmaps.Size(); // build default special maps (e.g. invulnerability) diff --git a/src/r_data/colormaps.h b/src/r_data/colormaps.h index 0764191a3..bda6a5ea4 100644 --- a/src/r_data/colormaps.h +++ b/src/r_data/colormaps.h @@ -1,18 +1,26 @@ #ifndef __RES_CMAP_H #define __RES_CMAP_H +struct FColormap; + void R_InitColormaps (); void R_DeinitColormaps (); DWORD R_ColormapNumForName(const char *name); // killough 4/4/98 void R_SetDefaultColormap (const char *name); // [RH] change normal fadetable DWORD R_BlendForColormap (DWORD map); // [RH] return calculated blend for a colormap -extern BYTE *realcolormaps; // [RH] make the colormaps externally visible +extern FColormap realcolormaps; // [RH] make the colormaps externally visible extern size_t numfakecmaps; +struct FColormap +{ + BYTE *Maps = nullptr; + PalEntry Color = 0xffffffff; + PalEntry Fade = 0xff000000; + int Desaturate = 0; +}; - -struct FDynamicColormap +struct FDynamicColormap : FColormap { void ChangeFade (PalEntry fadecolor); void ChangeColor (PalEntry lightcolor, int desaturate); @@ -20,10 +28,6 @@ struct FDynamicColormap void BuildLights (); static void RebuildAllLights(); - BYTE *Maps; - PalEntry Color; - PalEntry Fade; - int Desaturate; FDynamicColormap *Next; }; @@ -43,8 +47,13 @@ enum }; -struct FSpecialColormap +struct FSpecialColormap : FColormap { + FSpecialColormap() + { + Maps = Colormap; + } + float ColorizeStart[3]; float ColorizeEnd[3]; BYTE Colormap[256]; diff --git a/src/r_defs.h b/src/r_defs.h index f27ac2716..c0f878664 100644 --- a/src/r_defs.h +++ b/src/r_defs.h @@ -1397,11 +1397,13 @@ struct FMiniBSP // typedef BYTE lighttable_t; // This could be wider for >8 bit display. +struct FColormap; // This encapsulates the fields of vissprite_t that can be altered by AlterWeaponSprite struct visstyle_t { - lighttable_t *colormap; + int ColormapNum; // Which colormap is rendered + FColormap *BaseColormap; // Base colormap used together with ColormapNum float Alpha; FRenderStyle RenderStyle; }; diff --git a/src/r_draw.cpp b/src/r_draw.cpp index 80b91ed2d..682ed4668 100644 --- a/src/r_draw.cpp +++ b/src/r_draw.cpp @@ -38,6 +38,8 @@ #include "r_data/r_translate.h" #include "v_palette.h" #include "r_data/colormaps.h" +#include "r_plane.h" +#include "r_draw_rgba.h" #include "gi.h" #include "stats.h" @@ -70,6 +72,19 @@ int scaledviewwidth; // screen depth and asm/no asm. void (*R_DrawColumnHoriz)(void); void (*R_DrawColumn)(void); +void (*R_FillColumn)(void); +void (*R_FillAddColumn)(void); +void (*R_FillAddClampColumn)(void); +void (*R_FillSubClampColumn)(void); +void (*R_FillRevSubClampColumn)(void); +void (*R_DrawAddColumn)(void); +void (*R_DrawTlatedAddColumn)(void); +void (*R_DrawAddClampColumn)(void); +void (*R_DrawAddClampTranslatedColumn)(void); +void (*R_DrawSubClampColumn)(void); +void (*R_DrawSubClampTranslatedColumn)(void); +void (*R_DrawRevSubClampColumn)(void); +void (*R_DrawRevSubClampTranslatedColumn)(void); void (*R_DrawFuzzColumn)(void); void (*R_DrawTranslatedColumn)(void); void (*R_DrawShadedColumn)(void); @@ -79,7 +94,48 @@ void (*R_DrawSpanTranslucent)(void); void (*R_DrawSpanMaskedTranslucent)(void); void (*R_DrawSpanAddClamp)(void); void (*R_DrawSpanMaskedAddClamp)(void); -void (*rt_map4cols)(int,int,int); +void (*R_FillSpan)(void); +void (*R_FillColumnHoriz)(void); +void (*R_DrawFogBoundary)(int x1, int x2, short *uclip, short *dclip); +void (*R_MapTiltedPlane)(int y, int x1); +void (*R_MapColoredPlane)(int y, int x1); +void (*R_DrawParticle)(vissprite_t *); +void (*R_SetupDrawSlab)(FColormap *base_colormap, float light, int shade); +void (*R_DrawSlab)(int dx, fixed_t v, int dy, fixed_t vi, const BYTE *vptr, BYTE *p); +fixed_t (*tmvline1_add)(); +void (*tmvline4_add)(); +fixed_t (*tmvline1_addclamp)(); +void (*tmvline4_addclamp)(); +fixed_t (*tmvline1_subclamp)(); +void (*tmvline4_subclamp)(); +fixed_t (*tmvline1_revsubclamp)(); +void (*tmvline4_revsubclamp)(); +void (*rt_copy1col)(int hx, int sx, int yl, int yh); +void (*rt_copy4cols)(int sx, int yl, int yh); +void (*rt_shaded1col)(int hx, int sx, int yl, int yh); +void (*rt_shaded4cols)(int sx, int yl, int yh); +void (*rt_map1col)(int hx, int sx, int yl, int yh); +void (*rt_add1col)(int hx, int sx, int yl, int yh); +void (*rt_addclamp1col)(int hx, int sx, int yl, int yh); +void (*rt_subclamp1col)(int hx, int sx, int yl, int yh); +void (*rt_revsubclamp1col)(int hx, int sx, int yl, int yh); +void (*rt_tlate1col)(int hx, int sx, int yl, int yh); +void (*rt_tlateadd1col)(int hx, int sx, int yl, int yh); +void (*rt_tlateaddclamp1col)(int hx, int sx, int yl, int yh); +void (*rt_tlatesubclamp1col)(int hx, int sx, int yl, int yh); +void (*rt_tlaterevsubclamp1col)(int hx, int sx, int yl, int yh); +void (*rt_map4cols)(int sx, int yl, int yh); +void (*rt_add4cols)(int sx, int yl, int yh); +void (*rt_addclamp4cols)(int sx, int yl, int yh); +void (*rt_subclamp4cols)(int sx, int yl, int yh); +void (*rt_revsubclamp4cols)(int sx, int yl, int yh); +void (*rt_tlate4cols)(int sx, int yl, int yh); +void (*rt_tlateadd4cols)(int sx, int yl, int yh); +void (*rt_tlateaddclamp4cols)(int sx, int yl, int yh); +void (*rt_tlatesubclamp4cols)(int sx, int yl, int yh); +void (*rt_tlaterevsubclamp4cols)(int sx, int yl, int yh); +void (*rt_initcols)(BYTE *buffer); +void (*rt_span_coverage)(int x, int start, int stop); // // R_DrawColumn @@ -90,18 +146,27 @@ extern "C" { int dc_pitch=0xABadCafe; // [RH] Distance between rows lighttable_t* dc_colormap; +FColormap *dc_fcolormap; +ShadeConstants dc_shade_constants; +fixed_t dc_light; int dc_x; int dc_yl; int dc_yh; fixed_t dc_iscale; fixed_t dc_texturefrac; +uint32_t dc_textureheight; int dc_color; // [RH] Color for column filler DWORD dc_srccolor; +uint32_t dc_srccolor_bgra; DWORD *dc_srcblend; // [RH] Source and destination DWORD *dc_destblend; // blending lookups +fixed_t dc_srcalpha; // Alpha value used by dc_srcblend +fixed_t dc_destalpha; // Alpha value used by dc_destblend // first pixel in a column (possibly virtual) const BYTE* dc_source; +const BYTE* dc_source2; +uint32_t dc_texturefracx; BYTE* dc_dest; int dc_count; @@ -109,7 +174,11 @@ int dc_count; DWORD vplce[4]; DWORD vince[4]; BYTE* palookupoffse[4]; +fixed_t palookuplight[4]; const BYTE* bufplce[4]; +const BYTE* bufplce2[4]; +uint32_t buftexturefracx[4]; +uint32_t bufheight[4]; // just for profiling int dccount; @@ -120,10 +189,10 @@ BYTE *dc_translation; BYTE shadetables[NUMCOLORMAPS*16*256]; FDynamicColormap ShadeFakeColormap[16]; BYTE identitymap[256]; +FDynamicColormap identitycolormap; EXTERN_CVAR (Int, r_columnmethod) - void R_InitShadeMaps() { int i,j; @@ -161,6 +230,10 @@ void R_InitShadeMaps() { identitymap[i] = i; } + identitycolormap.Color = ~0u; + identitycolormap.Desaturate = 0; + identitycolormap.Next = NULL; + identitycolormap.Maps = identitymap; } /************************************/ @@ -223,7 +296,7 @@ void R_DrawColumnP_C (void) #endif // [RH] Just fills a column with a color -void R_FillColumnP (void) +void R_FillColumnP_C (void) { int count; BYTE* dest; @@ -247,7 +320,7 @@ void R_FillColumnP (void) } } -void R_FillAddColumn (void) +void R_FillAddColumn_C (void) { int count; BYTE *dest; @@ -271,10 +344,9 @@ void R_FillAddColumn (void) *dest = RGB32k.All[bg & (bg>>15)]; dest += pitch; } while (--count); - } -void R_FillAddClampColumn (void) +void R_FillAddClampColumn_C (void) { int count; BYTE *dest; @@ -304,10 +376,9 @@ void R_FillAddClampColumn (void) *dest = RGB32k.All[a & (a>>15)]; dest += pitch; } while (--count); - } -void R_FillSubClampColumn (void) +void R_FillSubClampColumn_C (void) { int count; BYTE *dest; @@ -336,10 +407,9 @@ void R_FillSubClampColumn (void) *dest = RGB32k.All[a & (a>>15)]; dest += pitch; } while (--count); - } -void R_FillRevSubClampColumn (void) +void R_FillRevSubClampColumn_C (void) { int count; BYTE *dest; @@ -368,13 +438,11 @@ void R_FillRevSubClampColumn (void) *dest = RGB32k.All[a & (a>>15)]; dest += pitch; } while (--count); - } // // Spectre/Invisibility. // -#define FUZZTABLE 50 extern "C" { @@ -647,8 +715,8 @@ void R_DrawTlatedAddColumnP_C (void) fg = fg2rgb[fg]; bg = bg2rgb[bg]; - fg = (fg+bg) | 0x1f07c1f; - *dest = RGB32k.All[fg & (fg>>15)]; + fg = (fg + bg) | 0x1f07c1f; + *dest = RGB32k.All[fg & (fg >> 15)]; dest += pitch; frac += fracstep; } while (--count); @@ -937,8 +1005,6 @@ void R_DrawRevSubClampTranslatedColumnP_C () } } - - // // R_DrawSpan // With DOOM style restrictions on view orientation, @@ -966,7 +1032,10 @@ int ds_y; int ds_x1; int ds_x2; +FColormap* ds_fcolormap; lighttable_t* ds_colormap; +ShadeConstants ds_shade_constants; +dsfixed_t ds_light; dsfixed_t ds_xfrac; dsfixed_t ds_yfrac; @@ -977,6 +1046,7 @@ int ds_ybits; // start of a floor/ceiling tile image const BYTE* ds_source; +bool ds_source_mipmapped; // just for profiling int dscount; @@ -997,13 +1067,14 @@ extern "C" BYTE *ds_curcolormap, *ds_cursource, *ds_curtiltedsource; // //========================================================================== -void R_SetSpanSource(const BYTE *pixels) +void R_SetSpanSource(FTexture *tex) { - ds_source = pixels; + ds_source = r_swtruecolor ? (const BYTE*)tex->GetPixelsBgra() : tex->GetPixels(); + ds_source_mipmapped = tex->Mipmapped(); #ifdef X86_ASM - if (ds_cursource != ds_source) + if (!r_swtruecolor && ds_cursource != ds_source) { - R_SetSpanSource_ASM(pixels); + R_SetSpanSource_ASM(ds_source); } #endif } @@ -1016,11 +1087,11 @@ void R_SetSpanSource(const BYTE *pixels) // //========================================================================== -void R_SetSpanColormap(BYTE *colormap) +void R_SetSpanColormap(FDynamicColormap *colormap, int shade) { - ds_colormap = colormap; + R_SetDSColorMapLight(colormap, 0, shade); #ifdef X86_ASM - if (ds_colormap != ds_curcolormap) + if (!r_swtruecolor && ds_colormap != ds_curcolormap) { R_SetSpanColormap_ASM (ds_colormap); } @@ -1049,7 +1120,8 @@ void R_SetupSpanBits(FTexture *tex) ds_ybits--; } #ifdef X86_ASM - R_SetSpanSize_ASM (ds_xbits, ds_ybits); + if (!r_swtruecolor) + R_SetSpanSize_ASM (ds_xbits, ds_ybits); #endif } @@ -1090,6 +1162,7 @@ void R_DrawSpanP_C (void) if (ds_xbits == 6 && ds_ybits == 6) { // 64x64 is the most common case by far, so special case it. + do { // Current texture index in u,v. @@ -1471,11 +1544,12 @@ void R_DrawSpanMaskedAddClampP_C (void) } // [RH] Just fill a span with a color -void R_FillSpan (void) +void R_FillSpan_C (void) { memset (ylookup[ds_y] + ds_x1 + dc_destorg, ds_color, ds_x2 - ds_x1 + 1); } + // Draw a voxel slab // // "Build Engine & Tools" Copyright (c) 1993-1997 Ken Silverman @@ -1572,17 +1646,19 @@ extern "C" void R_DrawSlabC(int dx, fixed_t v, int dy, fixed_t vi, const BYTE *v // wallscan stuff, in C +int vlinebits; +int mvlinebits; + #ifndef X86_ASM static DWORD vlinec1 (); -static int vlinebits; DWORD (*dovline1)() = vlinec1; DWORD (*doprevline1)() = vlinec1; #ifdef X64_ASM extern "C" void vlinetallasm4(); -#define dovline4 vlinetallasm4 extern "C" void setupvlinetallasm (int); +void (*dovline4)() = vlinetallasm4; #else static void vlinec4 (); void (*dovline4)() = vlinec4; @@ -1590,7 +1666,6 @@ void (*dovline4)() = vlinec4; static DWORD mvlinec1(); static void mvlinec4(); -static int mvlinebits; DWORD (*domvline1)() = mvlinec1; void (*domvline4)() = mvlinec4; @@ -1624,6 +1699,12 @@ void (*domvline4)() = mvlineasm4; void setupvline (int fracbits) { + if (r_swtruecolor) + { + vlinebits = fracbits; + return; + } + #ifdef X86_ASM if (CPU.Family <= 5) { @@ -1679,7 +1760,9 @@ DWORD vlinec1 () return frac; } +#endif +#if !defined(X86_ASM) void vlinec4 () { BYTE *dest = dc_dest; @@ -1700,13 +1783,20 @@ void vlinec4 () void setupmvline (int fracbits) { + if (!r_swtruecolor) + { #if defined(X86_ASM) - setupmvlineasm (fracbits); - domvline1 = mvlineasm1; - domvline4 = mvlineasm4; + setupmvlineasm(fracbits); + domvline1 = mvlineasm1; + domvline4 = mvlineasm4; #else - mvlinebits = fracbits; + mvlinebits = fracbits; #endif + } + else + { + mvlinebits = fracbits; + } } #if !defined(X86_ASM) @@ -1788,7 +1878,7 @@ static void R_DrawFogBoundaryLine (int y, int x) } while (++x <= x2); } -void R_DrawFogBoundary (int x1, int x2, short *uclip, short *dclip) +void R_DrawFogBoundary_C (int x1, int x2, short *uclip, short *dclip) { // This is essentially the same as R_MapVisPlane but with an extra step // to create new horizontal spans whenever the light changes enough that @@ -1808,7 +1898,7 @@ void R_DrawFogBoundary (int x1, int x2, short *uclip, short *dclip) clearbufshort (spanend+t2, b2-t2, x); } - dc_colormap = basecolormapdata + (rcolormap << COLORMAPSHIFT); + R_SetColorMapLight(basecolormap, (float)light, wallshade); for (--x; x >= x1; --x) { @@ -1833,7 +1923,7 @@ void R_DrawFogBoundary (int x1, int x2, short *uclip, short *dclip) clearbufshort (spanend+t2, b2-t2, x); } rcolormap = lcolormap; - dc_colormap = basecolormapdata + (lcolormap << COLORMAPSHIFT); + R_SetColorMapLight(basecolormap, (float)light, wallshade); } else { @@ -1884,7 +1974,7 @@ void setuptmvline (int bits) tmvlinebits = bits; } -fixed_t tmvline1_add () +fixed_t tmvline1_add_C () { DWORD fracstep = dc_iscale; DWORD frac = dc_texturefrac; @@ -1915,7 +2005,7 @@ fixed_t tmvline1_add () return frac; } -void tmvline4_add () +void tmvline4_add_C () { BYTE *dest = dc_dest; int count = dc_count; @@ -1942,7 +2032,7 @@ void tmvline4_add () } while (--count); } -fixed_t tmvline1_addclamp () +fixed_t tmvline1_addclamp_C () { DWORD fracstep = dc_iscale; DWORD frac = dc_texturefrac; @@ -1978,7 +2068,7 @@ fixed_t tmvline1_addclamp () return frac; } -void tmvline4_addclamp () +void tmvline4_addclamp_C () { BYTE *dest = dc_dest; int count = dc_count; @@ -2010,7 +2100,7 @@ void tmvline4_addclamp () } while (--count); } -fixed_t tmvline1_subclamp () +fixed_t tmvline1_subclamp_C () { DWORD fracstep = dc_iscale; DWORD frac = dc_texturefrac; @@ -2045,7 +2135,7 @@ fixed_t tmvline1_subclamp () return frac; } -void tmvline4_subclamp () +void tmvline4_subclamp_C () { BYTE *dest = dc_dest; int count = dc_count; @@ -2076,7 +2166,7 @@ void tmvline4_subclamp () } while (--count); } -fixed_t tmvline1_revsubclamp () +fixed_t tmvline1_revsubclamp_C () { DWORD fracstep = dc_iscale; DWORD frac = dc_texturefrac; @@ -2111,7 +2201,7 @@ fixed_t tmvline1_revsubclamp () return frac; } -void tmvline4_revsubclamp () +void tmvline4_revsubclamp_C () { BYTE *dest = dc_dest; int count = dc_count; @@ -2142,7 +2232,6 @@ void tmvline4_revsubclamp () } while (--count); } - //========================================================================== // // R_GetColumn @@ -2159,43 +2248,242 @@ const BYTE *R_GetColumn (FTexture *tex, int col) { col = width + (col % width); } - return tex->GetColumn (col, NULL); -} + if (r_swtruecolor) + return (const BYTE *)tex->GetColumnBgra(col, NULL); + else + return tex->GetColumn(col, NULL); +} // [RH] Initialize the column drawer pointers void R_InitColumnDrawers () { -#ifdef X86_ASM - R_DrawColumn = R_DrawColumnP_ASM; - R_DrawColumnHoriz = R_DrawColumnHorizP_ASM; - R_DrawFuzzColumn = R_DrawFuzzColumnP_ASM; - R_DrawTranslatedColumn = R_DrawTranslatedColumnP_C; - R_DrawShadedColumn = R_DrawShadedColumnP_C; - R_DrawSpan = R_DrawSpanP_ASM; - R_DrawSpanMasked = R_DrawSpanMaskedP_ASM; - if (CPU.Family <= 5) + // Save a copy when switching to true color mode as the assembly palette drawers might change them + static bool pointers_saved = false; + static DWORD(*dovline1_saved)(); + static DWORD(*doprevline1_saved)(); + static DWORD(*domvline1_saved)(); + static void(*dovline4_saved)(); + static void(*domvline4_saved)(); + + if (r_swtruecolor) { - rt_map4cols = rt_map4cols_asm2; + if (!pointers_saved) + { + pointers_saved = true; + dovline1_saved = dovline1; + doprevline1_saved = doprevline1; + domvline1_saved = domvline1; + dovline4_saved = dovline4; + domvline4_saved = domvline4; + } + + R_DrawColumnHoriz = R_DrawColumnHoriz_rgba; + R_DrawColumn = R_DrawColumn_rgba; + R_DrawFuzzColumn = R_DrawFuzzColumn_rgba; + R_DrawTranslatedColumn = R_DrawTranslatedColumn_rgba; + R_DrawShadedColumn = R_DrawShadedColumn_rgba; + R_DrawSpanMasked = R_DrawSpanMasked_rgba; + R_DrawSpan = R_DrawSpan_rgba; + + R_DrawSpanTranslucent = R_DrawSpanTranslucent_rgba; + R_DrawSpanMaskedTranslucent = R_DrawSpanMaskedTranslucent_rgba; + R_DrawSpanAddClamp = R_DrawSpanAddClamp_rgba; + R_DrawSpanMaskedAddClamp = R_DrawSpanMaskedAddClamp_rgba; + R_FillColumn = R_FillColumn_rgba; + R_FillAddColumn = R_FillAddColumn_rgba; + R_FillAddClampColumn = R_FillAddClampColumn_rgba; + R_FillSubClampColumn = R_FillSubClampColumn_rgba; + R_FillRevSubClampColumn = R_FillRevSubClampColumn_rgba; + R_DrawAddColumn = R_DrawAddColumn_rgba; + R_DrawTlatedAddColumn = R_DrawTlatedAddColumn_rgba; + R_DrawAddClampColumn = R_DrawAddClampColumn_rgba; + R_DrawAddClampTranslatedColumn = R_DrawAddClampTranslatedColumn_rgba; + R_DrawSubClampColumn = R_DrawSubClampColumn_rgba; + R_DrawSubClampTranslatedColumn = R_DrawSubClampTranslatedColumn_rgba; + R_DrawRevSubClampColumn = R_DrawRevSubClampColumn_rgba; + R_DrawRevSubClampTranslatedColumn = R_DrawRevSubClampTranslatedColumn_rgba; + R_FillSpan = R_FillSpan_rgba; + R_DrawFogBoundary = R_DrawFogBoundary_rgba; + R_FillColumnHoriz = R_FillColumnHoriz_rgba; + + R_DrawFogBoundary = R_DrawFogBoundary_rgba; + R_MapTiltedPlane = R_MapTiltedPlane_rgba; + R_MapColoredPlane = R_MapColoredPlane_rgba; + R_DrawParticle = R_DrawParticle_rgba; + + R_SetupDrawSlab = R_SetupDrawSlab_rgba; + R_DrawSlab = R_DrawSlab_rgba; + + tmvline1_add = tmvline1_add_rgba; + tmvline4_add = tmvline4_add_rgba; + tmvline1_addclamp = tmvline1_addclamp_rgba; + tmvline4_addclamp = tmvline4_addclamp_rgba; + tmvline1_subclamp = tmvline1_subclamp_rgba; + tmvline4_subclamp = tmvline4_subclamp_rgba; + tmvline1_revsubclamp = tmvline1_revsubclamp_rgba; + tmvline4_revsubclamp = tmvline4_revsubclamp_rgba; + + rt_copy1col = rt_copy1col_rgba; + rt_copy4cols = rt_copy4cols_rgba; + rt_map1col = rt_map1col_rgba; + rt_map4cols = rt_map4cols_rgba; + rt_shaded1col = rt_shaded1col_rgba; + rt_shaded4cols = rt_shaded4cols_rgba; + rt_add1col = rt_add1col_rgba; + rt_add4cols = rt_add4cols_rgba; + rt_addclamp1col = rt_addclamp1col_rgba; + rt_addclamp4cols = rt_addclamp4cols_rgba; + rt_subclamp1col = rt_subclamp1col_rgba; + rt_revsubclamp1col = rt_revsubclamp1col_rgba; + rt_tlate1col = rt_tlate1col_rgba; + rt_tlateadd1col = rt_tlateadd1col_rgba; + rt_tlateaddclamp1col = rt_tlateaddclamp1col_rgba; + rt_tlatesubclamp1col = rt_tlatesubclamp1col_rgba; + rt_tlaterevsubclamp1col = rt_tlaterevsubclamp1col_rgba; + rt_subclamp4cols = rt_subclamp4cols_rgba; + rt_revsubclamp4cols = rt_revsubclamp4cols_rgba; + rt_tlate4cols = rt_tlate4cols_rgba; + rt_tlateadd4cols = rt_tlateadd4cols_rgba; + rt_tlateaddclamp4cols = rt_tlateaddclamp4cols_rgba; + rt_tlatesubclamp4cols = rt_tlatesubclamp4cols_rgba; + rt_tlaterevsubclamp4cols = rt_tlaterevsubclamp4cols_rgba; + rt_initcols = rt_initcols_rgba; + rt_span_coverage = rt_span_coverage_rgba; + + dovline1 = vlinec1_rgba; + doprevline1 = vlinec1_rgba; + domvline1 = mvlinec1_rgba; + + dovline4 = vlinec4_rgba; + domvline4 = mvlinec4_rgba; } else { - rt_map4cols = rt_map4cols_asm1; - } +#ifdef X86_ASM + R_DrawColumn = R_DrawColumnP_ASM; + R_DrawColumnHoriz = R_DrawColumnHorizP_ASM; + R_DrawFuzzColumn = R_DrawFuzzColumnP_ASM; + R_DrawTranslatedColumn = R_DrawTranslatedColumnP_C; + R_DrawShadedColumn = R_DrawShadedColumnP_C; + R_DrawSpan = R_DrawSpanP_ASM; + R_DrawSpanMasked = R_DrawSpanMaskedP_ASM; + if (CPU.Family <= 5) + { + rt_map4cols = rt_map4cols_asm2; + } + else + { + rt_map4cols = rt_map4cols_asm1; + } #else - R_DrawColumnHoriz = R_DrawColumnHorizP_C; - R_DrawColumn = R_DrawColumnP_C; - R_DrawFuzzColumn = R_DrawFuzzColumnP_C; - R_DrawTranslatedColumn = R_DrawTranslatedColumnP_C; - R_DrawShadedColumn = R_DrawShadedColumnP_C; - R_DrawSpan = R_DrawSpanP_C; - R_DrawSpanMasked = R_DrawSpanMaskedP_C; - rt_map4cols = rt_map4cols_c; + R_DrawColumnHoriz = R_DrawColumnHorizP_C; + R_DrawColumn = R_DrawColumnP_C; + R_DrawFuzzColumn = R_DrawFuzzColumnP_C; + R_DrawTranslatedColumn = R_DrawTranslatedColumnP_C; + R_DrawShadedColumn = R_DrawShadedColumnP_C; + R_DrawSpan = R_DrawSpanP_C; + R_DrawSpanMasked = R_DrawSpanMaskedP_C; + rt_map4cols = rt_map4cols_c; #endif - R_DrawSpanTranslucent = R_DrawSpanTranslucentP_C; - R_DrawSpanMaskedTranslucent = R_DrawSpanMaskedTranslucentP_C; - R_DrawSpanAddClamp = R_DrawSpanAddClampP_C; - R_DrawSpanMaskedAddClamp = R_DrawSpanMaskedAddClampP_C; + R_DrawSpanTranslucent = R_DrawSpanTranslucentP_C; + R_DrawSpanMaskedTranslucent = R_DrawSpanMaskedTranslucentP_C; + R_DrawSpanAddClamp = R_DrawSpanAddClampP_C; + R_DrawSpanMaskedAddClamp = R_DrawSpanMaskedAddClampP_C; + R_FillColumn = R_FillColumnP_C; + R_FillAddColumn = R_FillAddColumn_C; + R_FillAddClampColumn = R_FillAddClampColumn_C; + R_FillSubClampColumn = R_FillSubClampColumn_C; + R_FillRevSubClampColumn = R_FillRevSubClampColumn_C; + R_DrawAddColumn = R_DrawAddColumnP_C; + R_DrawTlatedAddColumn = R_DrawTlatedAddColumnP_C; + R_DrawAddClampColumn = R_DrawAddClampColumnP_C; + R_DrawAddClampTranslatedColumn = R_DrawAddClampTranslatedColumnP_C; + R_DrawSubClampColumn = R_DrawSubClampColumnP_C; + R_DrawSubClampTranslatedColumn = R_DrawSubClampTranslatedColumnP_C; + R_DrawRevSubClampColumn = R_DrawRevSubClampColumnP_C; + R_DrawRevSubClampTranslatedColumn = R_DrawRevSubClampTranslatedColumnP_C; + R_FillSpan = R_FillSpan_C; + R_DrawFogBoundary = R_DrawFogBoundary_C; + R_FillColumnHoriz = R_FillColumnHorizP_C; + + R_DrawFogBoundary = R_DrawFogBoundary_C; + R_MapTiltedPlane = R_MapTiltedPlane_C; + R_MapColoredPlane = R_MapColoredPlane_C; + R_DrawParticle = R_DrawParticle_C; + +#ifdef X86_ASM + R_SetupDrawSlab = [](FColormap *colormap, float light, int shade) { R_SetupDrawSlabA(colormap->Maps + (GETPALOOKUP(light, shade) << COLORMAPSHIFT)); }; + R_DrawSlab = R_DrawSlabA; +#else + R_SetupDrawSlab = [](FColormap *colormap, float light, int shade) { R_SetupDrawSlabC(colormap->Maps + (GETPALOOKUP(light, shade) << COLORMAPSHIFT)); }; + R_DrawSlab = R_DrawSlabC; +#endif + + tmvline1_add = tmvline1_add_C; + tmvline4_add = tmvline4_add_C; + tmvline1_addclamp = tmvline1_addclamp_C; + tmvline4_addclamp = tmvline4_addclamp_C; + tmvline1_subclamp = tmvline1_subclamp_C; + tmvline4_subclamp = tmvline4_subclamp_C; + tmvline1_revsubclamp = tmvline1_revsubclamp_C; + tmvline4_revsubclamp = tmvline4_revsubclamp_C; + +#ifdef X86_ASM + rt_copy1col = rt_copy1col_asm; + rt_copy4cols = rt_copy4cols_asm; + rt_map1col = rt_map1col_asm; + rt_shaded4cols = rt_shaded4cols_asm; + rt_add4cols = rt_add4cols_asm; + rt_addclamp4cols = rt_addclamp4cols_asm; +#else + rt_copy1col = rt_copy1col_c; + rt_copy4cols = rt_copy4cols_c; + rt_map1col = rt_map1col_c; + rt_shaded4cols = rt_shaded4cols_c; + rt_add4cols = rt_add4cols_c; + rt_addclamp4cols = rt_addclamp4cols_c; +#endif + rt_shaded1col = rt_shaded1col_c; + rt_add1col = rt_add1col_c; + rt_addclamp1col = rt_addclamp1col_c; + rt_subclamp1col = rt_subclamp1col_c; + rt_revsubclamp1col = rt_revsubclamp1col_c; + rt_tlate1col = rt_tlate1col_c; + rt_tlateadd1col = rt_tlateadd1col_c; + rt_tlateaddclamp1col = rt_tlateaddclamp1col_c; + rt_tlatesubclamp1col = rt_tlatesubclamp1col_c; + rt_tlaterevsubclamp1col = rt_tlaterevsubclamp1col_c; + rt_subclamp4cols = rt_subclamp4cols_c; + rt_revsubclamp4cols = rt_revsubclamp4cols_c; + rt_tlate4cols = rt_tlate4cols_c; + rt_tlateadd4cols = rt_tlateadd4cols_c; + rt_tlateaddclamp4cols = rt_tlateaddclamp4cols_c; + rt_tlatesubclamp4cols = rt_tlatesubclamp4cols_c; + rt_tlaterevsubclamp4cols = rt_tlaterevsubclamp4cols_c; + rt_initcols = rt_initcols_pal; + rt_span_coverage = rt_span_coverage_pal; + + if (pointers_saved) + { + pointers_saved = false; + dovline1 = dovline1_saved; + doprevline1 = doprevline1_saved; + domvline1 = domvline1_saved; + dovline4 = dovline4_saved; + domvline4 = domvline4_saved; + } + } + + colfunc = basecolfunc = R_DrawColumn; + fuzzcolfunc = R_DrawFuzzColumn; + transcolfunc = R_DrawTranslatedColumn; + spanfunc = R_DrawSpan; + + // [RH] Horizontal column drawers + hcolfunc_pre = R_DrawColumnHoriz; + hcolfunc_post1 = rt_map1col; + hcolfunc_post4 = rt_map4cols; } // [RH] Choose column drawers in a single place @@ -2213,7 +2501,7 @@ static bool R_SetBlendFunc (int op, fixed_t fglevel, fixed_t bglevel, int flags) { if (flags & STYLEF_ColorIsFixed) { - colfunc = R_FillColumnP; + colfunc = R_FillColumn; hcolfunc_post1 = rt_copy1col; hcolfunc_post4 = rt_copy4cols; } @@ -2235,16 +2523,22 @@ static bool R_SetBlendFunc (int op, fixed_t fglevel, fixed_t bglevel, int flags) { dc_srcblend = Col2RGB8_Inverse[fglevel>>10]; dc_destblend = Col2RGB8_LessPrecision[bglevel>>10]; + dc_srcalpha = fglevel; + dc_destalpha = bglevel; } else if (op == STYLEOP_Add && fglevel + bglevel <= FRACUNIT) { dc_srcblend = Col2RGB8[fglevel>>10]; dc_destblend = Col2RGB8[bglevel>>10]; + dc_srcalpha = fglevel; + dc_destalpha = bglevel; } else { dc_srcblend = Col2RGB8_LessPrecision[fglevel>>10]; dc_destblend = Col2RGB8_LessPrecision[bglevel>>10]; + dc_srcalpha = fglevel; + dc_destalpha = bglevel; } switch (op) { @@ -2263,13 +2557,13 @@ static bool R_SetBlendFunc (int op, fixed_t fglevel, fixed_t bglevel, int flags) } else if (dc_translation == NULL) { - colfunc = R_DrawAddColumnP_C; + colfunc = R_DrawAddColumn; hcolfunc_post1 = rt_add1col; hcolfunc_post4 = rt_add4cols; } else { - colfunc = R_DrawTlatedAddColumnP_C; + colfunc = R_DrawTlatedAddColumn; hcolfunc_post1 = rt_tlateadd1col; hcolfunc_post4 = rt_tlateadd4cols; } @@ -2284,13 +2578,13 @@ static bool R_SetBlendFunc (int op, fixed_t fglevel, fixed_t bglevel, int flags) } else if (dc_translation == NULL) { - colfunc = R_DrawAddClampColumnP_C; + colfunc = R_DrawAddClampColumn; hcolfunc_post1 = rt_addclamp1col; hcolfunc_post4 = rt_addclamp4cols; } else { - colfunc = R_DrawAddClampTranslatedColumnP_C; + colfunc = R_DrawAddClampTranslatedColumn; hcolfunc_post1 = rt_tlateaddclamp1col; hcolfunc_post4 = rt_tlateaddclamp4cols; } @@ -2306,13 +2600,13 @@ static bool R_SetBlendFunc (int op, fixed_t fglevel, fixed_t bglevel, int flags) } else if (dc_translation == NULL) { - colfunc = R_DrawSubClampColumnP_C; + colfunc = R_DrawSubClampColumn; hcolfunc_post1 = rt_subclamp1col; hcolfunc_post4 = rt_subclamp4cols; } else { - colfunc = R_DrawSubClampTranslatedColumnP_C; + colfunc = R_DrawSubClampTranslatedColumn; hcolfunc_post1 = rt_tlatesubclamp1col; hcolfunc_post4 = rt_tlatesubclamp4cols; } @@ -2331,13 +2625,13 @@ static bool R_SetBlendFunc (int op, fixed_t fglevel, fixed_t bglevel, int flags) } else if (dc_translation == NULL) { - colfunc = R_DrawRevSubClampColumnP_C; + colfunc = R_DrawRevSubClampColumn; hcolfunc_post1 = rt_revsubclamp1col; hcolfunc_post4 = rt_revsubclamp4cols; } else { - colfunc = R_DrawRevSubClampTranslatedColumnP_C; + colfunc = R_DrawRevSubClampTranslatedColumn; hcolfunc_post1 = rt_tlaterevsubclamp1col; hcolfunc_post4 = rt_tlaterevsubclamp4cols; } @@ -2412,11 +2706,15 @@ ESPSResult R_SetPatchStyle (FRenderStyle style, fixed_t alpha, int translation, colfunc = R_DrawShadedColumn; hcolfunc_post1 = rt_shaded1col; hcolfunc_post4 = rt_shaded4cols; - dc_color = fixedcolormap ? fixedcolormap[APART(color)] : basecolormap->Maps[APART(color)]; - dc_colormap = (basecolormap = &ShadeFakeColormap[16-alpha])->Maps; + dc_color = fixedcolormap ? fixedcolormap->Maps[APART(color)] : basecolormap->Maps[APART(color)]; + basecolormap = &ShadeFakeColormap[16-alpha]; if (fixedlightlev >= 0 && fixedcolormap == NULL) { - dc_colormap += fixedlightlev; + R_SetColorMapLight(basecolormap, 0, FIXEDLIGHT2SHADE(fixedlightlev)); + } + else + { + R_SetColorMapLight(basecolormap, 0, 0); } return r_columnmethod ? DoDraw1 : DoDraw0; } @@ -2426,10 +2724,10 @@ ESPSResult R_SetPatchStyle (FRenderStyle style, fixed_t alpha, int translation, if (style.Flags & STYLEF_ColorIsFixed) { - int x = fglevel >> 10; - int r = RPART(color); - int g = GPART(color); - int b = BPART(color); + uint32_t x = fglevel >> 10; + uint32_t r = RPART(color); + uint32_t g = GPART(color); + uint32_t b = BPART(color); // dc_color is used by the rt_* routines. It is indexed into dc_srcblend. dc_color = RGB32k.RGB[r>>3][g>>3][b>>3]; if (style.Flags & STYLEF_InvertSource) @@ -2438,11 +2736,13 @@ ESPSResult R_SetPatchStyle (FRenderStyle style, fixed_t alpha, int translation, g = 255 - g; b = 255 - b; } + uint32_t alpha = clamp(fglevel >> (FRACBITS - 8), 0, 255); + dc_srccolor_bgra = (alpha << 24) | (r << 16) | (g << 8) | b; // dc_srccolor is used by the R_Fill* routines. It is premultiplied // with the alpha. dc_srccolor = ((((r*x)>>4)<<20) | ((g*x)>>4) | ((((b)*x)>>4)<<10)) & 0x3feffbff; - hcolfunc_pre = R_FillColumnHorizP; - dc_colormap = identitymap; + hcolfunc_pre = R_FillColumnHoriz; + R_SetColorMapLight(&identitycolormap, 0, 0); } if (!R_SetBlendFunc (style.BlendOp, fglevel, bglevel, style.Flags)) @@ -2459,25 +2759,25 @@ void R_FinishSetPatchStyle () bool R_GetTransMaskDrawers (fixed_t (**tmvline1)(), void (**tmvline4)()) { - if (colfunc == R_DrawAddColumnP_C) + if (colfunc == R_DrawAddColumn) { *tmvline1 = tmvline1_add; *tmvline4 = tmvline4_add; return true; } - if (colfunc == R_DrawAddClampColumnP_C) + if (colfunc == R_DrawAddClampColumn) { *tmvline1 = tmvline1_addclamp; *tmvline4 = tmvline4_addclamp; return true; } - if (colfunc == R_DrawSubClampColumnP_C) + if (colfunc == R_DrawSubClampColumn) { *tmvline1 = tmvline1_subclamp; *tmvline4 = tmvline4_subclamp; return true; } - if (colfunc == R_DrawRevSubClampColumnP_C) + if (colfunc == R_DrawRevSubClampColumn) { *tmvline1 = tmvline1_revsubclamp; *tmvline4 = tmvline4_revsubclamp; @@ -2486,3 +2786,70 @@ bool R_GetTransMaskDrawers (fixed_t (**tmvline1)(), void (**tmvline4)()) return false; } +void R_SetTranslationMap(lighttable_t *translation) +{ + dc_fcolormap = nullptr; + dc_colormap = translation; + if (r_swtruecolor) + { + dc_shade_constants.light_red = 256; + dc_shade_constants.light_green = 256; + dc_shade_constants.light_blue = 256; + dc_shade_constants.light_alpha = 256; + dc_shade_constants.fade_red = 0; + dc_shade_constants.fade_green = 0; + dc_shade_constants.fade_blue = 0; + dc_shade_constants.fade_alpha = 256; + dc_shade_constants.desaturate = 0; + dc_shade_constants.simple_shade = true; + dc_light = 0; + } +} + +void R_SetColorMapLight(FColormap *base_colormap, float light, int shade) +{ + dc_fcolormap = base_colormap; + if (r_swtruecolor) + { + dc_shade_constants.light_red = dc_fcolormap->Color.r * 256 / 255; + dc_shade_constants.light_green = dc_fcolormap->Color.g * 256 / 255; + dc_shade_constants.light_blue = dc_fcolormap->Color.b * 256 / 255; + dc_shade_constants.light_alpha = dc_fcolormap->Color.a * 256 / 255; + dc_shade_constants.fade_red = dc_fcolormap->Fade.r; + dc_shade_constants.fade_green = dc_fcolormap->Fade.g; + dc_shade_constants.fade_blue = dc_fcolormap->Fade.b; + dc_shade_constants.fade_alpha = dc_fcolormap->Fade.a; + dc_shade_constants.desaturate = MIN(abs(dc_fcolormap->Desaturate), 255) * 255 / 256; + dc_shade_constants.simple_shade = (dc_fcolormap->Color.d == 0x00ffffff && dc_fcolormap->Fade.d == 0x00000000 && dc_fcolormap->Desaturate == 0); + dc_colormap = base_colormap->Maps; + dc_light = LIGHTSCALE(light, shade); + } + else + { + dc_colormap = base_colormap->Maps + (GETPALOOKUP(light, shade) << COLORMAPSHIFT); + } +} + +void R_SetDSColorMapLight(FColormap *base_colormap, float light, int shade) +{ + ds_fcolormap = base_colormap; + if (r_swtruecolor) + { + ds_shade_constants.light_red = ds_fcolormap->Color.r * 256 / 255; + ds_shade_constants.light_green = ds_fcolormap->Color.g * 256 / 255; + ds_shade_constants.light_blue = ds_fcolormap->Color.b * 256 / 255; + ds_shade_constants.light_alpha = ds_fcolormap->Color.a * 256 / 255; + ds_shade_constants.fade_red = ds_fcolormap->Fade.r; + ds_shade_constants.fade_green = ds_fcolormap->Fade.g; + ds_shade_constants.fade_blue = ds_fcolormap->Fade.b; + ds_shade_constants.fade_alpha = ds_fcolormap->Fade.a; + ds_shade_constants.desaturate = MIN(abs(ds_fcolormap->Desaturate), 255) * 255 / 256; + ds_shade_constants.simple_shade = (ds_fcolormap->Color.d == 0x00ffffff && ds_fcolormap->Fade.d == 0x00000000 && ds_fcolormap->Desaturate == 0); + ds_colormap = base_colormap->Maps; + ds_light = LIGHTSCALE(light, shade); + } + else + { + ds_colormap = base_colormap->Maps + (GETPALOOKUP(light, shade) << COLORMAPSHIFT); + } +} diff --git a/src/r_draw.h b/src/r_draw.h index cb2f68f33..591ae0b5f 100644 --- a/src/r_draw.h +++ b/src/r_draw.h @@ -25,24 +25,55 @@ #include "r_defs.h" +// Spectre/Invisibility. +#define FUZZTABLE 50 +extern "C" int fuzzoffset[FUZZTABLE + 1]; // [RH] +1 for the assembly routine +extern "C" int fuzzpos; +extern "C" int fuzzviewheight; + +struct FColormap; + +struct ShadeConstants +{ + uint16_t light_alpha; + uint16_t light_red; + uint16_t light_green; + uint16_t light_blue; + uint16_t fade_alpha; + uint16_t fade_red; + uint16_t fade_green; + uint16_t fade_blue; + uint16_t desaturate; + bool simple_shade; +}; + extern "C" int ylookup[MAXHEIGHT]; extern "C" int dc_pitch; // [RH] Distance between rows extern "C" lighttable_t*dc_colormap; +extern "C" FColormap *dc_fcolormap; +extern "C" ShadeConstants dc_shade_constants; +extern "C" fixed_t dc_light; extern "C" int dc_x; extern "C" int dc_yl; extern "C" int dc_yh; extern "C" fixed_t dc_iscale; extern double dc_texturemid; extern "C" fixed_t dc_texturefrac; +extern "C" uint32_t dc_textureheight; extern "C" int dc_color; // [RH] For flat colors (no texturing) extern "C" DWORD dc_srccolor; +extern "C" uint32_t dc_srccolor_bgra; extern "C" DWORD *dc_srcblend; extern "C" DWORD *dc_destblend; +extern "C" fixed_t dc_srcalpha; +extern "C" fixed_t dc_destalpha; // first pixel in a column extern "C" const BYTE* dc_source; +extern "C" const BYTE* dc_source2; +extern "C" uint32_t dc_texturefracx; extern "C" BYTE *dc_dest, *dc_destorg; extern "C" int dc_count; @@ -50,7 +81,11 @@ extern "C" int dc_count; extern "C" DWORD vplce[4]; extern "C" DWORD vince[4]; extern "C" BYTE* palookupoffse[4]; +extern "C" fixed_t palookuplight[4]; extern "C" const BYTE* bufplce[4]; +extern "C" const BYTE* bufplce2[4]; +extern "C" uint32_t buftexturefracx[4]; +extern "C" uint32_t bufheight[4]; // [RH] Temporary buffer for column drawing extern "C" BYTE *dc_temp; @@ -58,7 +93,6 @@ extern "C" unsigned int dc_tspans[4][MAXHEIGHT]; extern "C" unsigned int *dc_ctspan[4]; extern "C" unsigned int horizspans[4]; - // [RH] Pointers to the different column and span drawers... // The span blitting interface. @@ -67,12 +101,7 @@ extern void (*R_DrawColumn)(void); extern DWORD (*dovline1) (); extern DWORD (*doprevline1) (); -#ifdef X64_ASM -#define dovline4 vlinetallasm4 -extern "C" void vlinetallasm4(); -#else extern void (*dovline4) (); -#endif extern void setupvline (int); extern DWORD (*domvline1) (); @@ -94,8 +123,8 @@ extern void (*R_DrawTranslatedColumn)(void); // Span drawing for rows, floor/ceiling. No Spectre effect needed. extern void (*R_DrawSpan)(void); void R_SetupSpanBits(FTexture *tex); -void R_SetSpanColormap(BYTE *colormap); -void R_SetSpanSource(const BYTE *pixels); +void R_SetSpanColormap(FDynamicColormap *colormap, int shade); +void R_SetSpanSource(FTexture *tex); // Span drawing for masked textures. extern void (*R_DrawSpanMasked)(void); @@ -125,33 +154,33 @@ extern "C" void rt_copy1col_c (int hx, int sx, int yl, int yh); void rt_copy4cols_c (int sx, int yl, int yh); -void rt_shaded1col (int hx, int sx, int yl, int yh); +void rt_shaded1col_c (int hx, int sx, int yl, int yh); void rt_shaded4cols_c (int sx, int yl, int yh); void rt_shaded4cols_asm (int sx, int yl, int yh); void rt_map1col_c (int hx, int sx, int yl, int yh); -void rt_add1col (int hx, int sx, int yl, int yh); -void rt_addclamp1col (int hx, int sx, int yl, int yh); -void rt_subclamp1col (int hx, int sx, int yl, int yh); -void rt_revsubclamp1col (int hx, int sx, int yl, int yh); +void rt_add1col_c (int hx, int sx, int yl, int yh); +void rt_addclamp1col_c (int hx, int sx, int yl, int yh); +void rt_subclamp1col_c (int hx, int sx, int yl, int yh); +void rt_revsubclamp1col_c (int hx, int sx, int yl, int yh); -void rt_tlate1col (int hx, int sx, int yl, int yh); -void rt_tlateadd1col (int hx, int sx, int yl, int yh); -void rt_tlateaddclamp1col (int hx, int sx, int yl, int yh); -void rt_tlatesubclamp1col (int hx, int sx, int yl, int yh); -void rt_tlaterevsubclamp1col (int hx, int sx, int yl, int yh); +void rt_tlate1col_c (int hx, int sx, int yl, int yh); +void rt_tlateadd1col_c (int hx, int sx, int yl, int yh); +void rt_tlateaddclamp1col_c (int hx, int sx, int yl, int yh); +void rt_tlatesubclamp1col_c (int hx, int sx, int yl, int yh); +void rt_tlaterevsubclamp1col_c (int hx, int sx, int yl, int yh); void rt_map4cols_c (int sx, int yl, int yh); void rt_add4cols_c (int sx, int yl, int yh); void rt_addclamp4cols_c (int sx, int yl, int yh); -void rt_subclamp4cols (int sx, int yl, int yh); -void rt_revsubclamp4cols (int sx, int yl, int yh); +void rt_subclamp4cols_c (int sx, int yl, int yh); +void rt_revsubclamp4cols_c (int sx, int yl, int yh); -void rt_tlate4cols (int sx, int yl, int yh); -void rt_tlateadd4cols (int sx, int yl, int yh); -void rt_tlateaddclamp4cols (int sx, int yl, int yh); -void rt_tlatesubclamp4cols (int sx, int yl, int yh); -void rt_tlaterevsubclamp4cols (int sx, int yl, int yh); +void rt_tlate4cols_c (int sx, int yl, int yh); +void rt_tlateadd4cols_c (int sx, int yl, int yh); +void rt_tlateaddclamp4cols_c (int sx, int yl, int yh); +void rt_tlatesubclamp4cols_c (int sx, int yl, int yh); +void rt_tlaterevsubclamp4cols_c (int sx, int yl, int yh); void rt_copy1col_asm (int hx, int sx, int yl, int yh); void rt_map1col_asm (int hx, int sx, int yl, int yh); @@ -163,30 +192,49 @@ void rt_add4cols_asm (int sx, int yl, int yh); void rt_addclamp4cols_asm (int sx, int yl, int yh); } -extern void (*rt_map4cols)(int sx, int yl, int yh); +extern void (*rt_copy1col)(int hx, int sx, int yl, int yh); +extern void (*rt_copy4cols)(int sx, int yl, int yh); -#ifdef X86_ASM -#define rt_copy1col rt_copy1col_asm -#define rt_copy4cols rt_copy4cols_asm -#define rt_map1col rt_map1col_asm -#define rt_shaded4cols rt_shaded4cols_asm -#define rt_add4cols rt_add4cols_asm -#define rt_addclamp4cols rt_addclamp4cols_asm -#else -#define rt_copy1col rt_copy1col_c -#define rt_copy4cols rt_copy4cols_c -#define rt_map1col rt_map1col_c -#define rt_shaded4cols rt_shaded4cols_c -#define rt_add4cols rt_add4cols_c -#define rt_addclamp4cols rt_addclamp4cols_c -#endif +extern void (*rt_shaded1col)(int hx, int sx, int yl, int yh); +extern void (*rt_shaded4cols)(int sx, int yl, int yh); + +extern void (*rt_map1col)(int hx, int sx, int yl, int yh); +extern void (*rt_add1col)(int hx, int sx, int yl, int yh); +extern void (*rt_addclamp1col)(int hx, int sx, int yl, int yh); +extern void (*rt_subclamp1col)(int hx, int sx, int yl, int yh); +extern void (*rt_revsubclamp1col)(int hx, int sx, int yl, int yh); + +extern void (*rt_tlate1col)(int hx, int sx, int yl, int yh); +extern void (*rt_tlateadd1col)(int hx, int sx, int yl, int yh); +extern void (*rt_tlateaddclamp1col)(int hx, int sx, int yl, int yh); +extern void (*rt_tlatesubclamp1col)(int hx, int sx, int yl, int yh); +extern void (*rt_tlaterevsubclamp1col)(int hx, int sx, int yl, int yh); + +extern void (*rt_map4cols)(int sx, int yl, int yh); +extern void (*rt_add4cols)(int sx, int yl, int yh); +extern void (*rt_addclamp4cols)(int sx, int yl, int yh); +extern void (*rt_subclamp4cols)(int sx, int yl, int yh); +extern void (*rt_revsubclamp4cols)(int sx, int yl, int yh); + +extern void (*rt_tlate4cols)(int sx, int yl, int yh); +extern void (*rt_tlateadd4cols)(int sx, int yl, int yh); +extern void (*rt_tlateaddclamp4cols)(int sx, int yl, int yh); +extern void (*rt_tlatesubclamp4cols)(int sx, int yl, int yh); +extern void (*rt_tlaterevsubclamp4cols)(int sx, int yl, int yh); + +extern void (*rt_initcols)(BYTE *buffer); +extern void (*rt_span_coverage)(int x, int start, int stop); void rt_draw4cols (int sx); // [RH] Preps the temporary horizontal buffer. -void rt_initcols (BYTE *buffer=NULL); +void rt_initcols_pal (BYTE *buffer); -void R_DrawFogBoundary (int x1, int x2, short *uclip, short *dclip); +void rt_span_coverage_pal(int x, int start, int stop); + +extern void (*R_DrawFogBoundary)(int x1, int x2, short *uclip, short *dclip); + +void R_DrawFogBoundary_C (int x1, int x2, short *uclip, short *dclip); #ifdef X86_ASM @@ -218,26 +266,47 @@ void R_DrawSpanMaskedTranslucentP_C (void); void R_DrawTlatedLucentColumnP_C (void); #define R_DrawTlatedLucentColumn R_DrawTlatedLucentColumnP_C -void R_FillColumnP (void); -void R_FillColumnHorizP (void); -void R_FillSpan (void); +extern void(*R_FillColumn)(void); +extern void(*R_FillAddColumn)(void); +extern void(*R_FillAddClampColumn)(void); +extern void(*R_FillSubClampColumn)(void); +extern void(*R_FillRevSubClampColumn)(void); +extern void(*R_DrawAddColumn)(void); +extern void(*R_DrawTlatedAddColumn)(void); +extern void(*R_DrawAddClampColumn)(void); +extern void(*R_DrawAddClampTranslatedColumn)(void); +extern void(*R_DrawSubClampColumn)(void); +extern void(*R_DrawSubClampTranslatedColumn)(void); +extern void(*R_DrawRevSubClampColumn)(void); +extern void(*R_DrawRevSubClampTranslatedColumn)(void); + +extern void(*R_FillSpan)(void); +extern void(*R_FillColumnHoriz)(void); + +void R_FillColumnP_C (void); + +void R_FillColumnHorizP_C (void); +void R_FillSpan_C (void); + +extern void(*R_SetupDrawSlab)(FColormap *base_colormap, float light, int shade); +extern void(*R_DrawSlab)(int dx, fixed_t v, int dy, fixed_t vi, const BYTE *vptr, BYTE *p); #ifdef X86_ASM -#define R_SetupDrawSlab R_SetupDrawSlabA -#define R_DrawSlab R_DrawSlabA +extern "C" void R_SetupDrawSlabA(const BYTE *colormap); +extern "C" void R_DrawSlabA(int dx, fixed_t v, int dy, fixed_t vi, const BYTE *vptr, BYTE *p); #else -#define R_SetupDrawSlab R_SetupDrawSlabC -#define R_DrawSlab R_DrawSlabC +extern "C" void R_SetupDrawSlabC(const BYTE *colormap); +extern "C" void R_DrawSlabC(int dx, fixed_t v, int dy, fixed_t vi, const BYTE *vptr, BYTE *p); #endif -extern "C" void R_SetupDrawSlab(const BYTE *colormap); -extern "C" void R_DrawSlab(int dx, fixed_t v, int dy, fixed_t vi, const BYTE *vptr, BYTE *p); - extern "C" int ds_y; extern "C" int ds_x1; extern "C" int ds_x2; +extern "C" FColormap* ds_fcolormap; extern "C" lighttable_t* ds_colormap; +extern "C" ShadeConstants ds_shade_constants; +extern "C" dsfixed_t ds_light; extern "C" dsfixed_t ds_xfrac; extern "C" dsfixed_t ds_yfrac; @@ -249,12 +318,14 @@ extern "C" fixed_t ds_alpha; // start of a 64*64 tile image extern "C" const BYTE* ds_source; +extern "C" bool ds_source_mipmapped; extern "C" int ds_color; // [RH] For flat color (no texturing) extern BYTE shadetables[/*NUMCOLORMAPS*16*256*/]; extern FDynamicColormap ShadeFakeColormap[16]; extern BYTE identitymap[256]; +extern FDynamicColormap identitycolormap; extern BYTE *dc_translation; // [RH] Added for muliresolution support @@ -278,6 +349,15 @@ inline ESPSResult R_SetPatchStyle(FRenderStyle style, float alpha, int translati // style was STYLE_Shade void R_FinishSetPatchStyle (); +extern fixed_t(*tmvline1_add)(); +extern void(*tmvline4_add)(); +extern fixed_t(*tmvline1_addclamp)(); +extern void(*tmvline4_addclamp)(); +extern fixed_t(*tmvline1_subclamp)(); +extern void(*tmvline4_subclamp)(); +extern fixed_t(*tmvline1_revsubclamp)(); +extern void(*tmvline4_revsubclamp)(); + // transmaskwallscan calls this to find out what column drawers to use bool R_GetTransMaskDrawers (fixed_t (**tmvline1)(), void (**tmvline4)()); @@ -293,4 +373,19 @@ void maskwallscan (int x1, int x2, short *uwal, short *dwal, float *swal, fixed_ // transmaskwallscan is like maskwallscan, but it can also blend to the background void transmaskwallscan (int x1, int x2, short *uwal, short *dwal, float *swal, fixed_t *lwal, double yrepeat, const BYTE *(*getcol)(FTexture *tex, int col)=R_GetColumn); +// Sets dc_colormap and dc_light to their appropriate values depending on the output format (pal vs true color) +void R_SetColorMapLight(FColormap *base_colormap, float light, int shade); + +// Same as R_SetColorMapLight, but for ds_colormap and ds_light +void R_SetDSColorMapLight(FColormap *base_colormap, float light, int shade); + +void R_SetTranslationMap(lighttable_t *translation); + +extern bool r_swtruecolor; + +EXTERN_CVAR(Bool, r_multithreaded); +EXTERN_CVAR(Bool, r_magfilter); +EXTERN_CVAR(Bool, r_minfilter); +EXTERN_CVAR(Bool, r_mipmap); + #endif diff --git a/src/r_draw_rgba.cpp b/src/r_draw_rgba.cpp new file mode 100644 index 000000000..69ebfeb84 --- /dev/null +++ b/src/r_draw_rgba.cpp @@ -0,0 +1,2969 @@ +// Emacs style mode select -*- C++ -*- +//----------------------------------------------------------------------------- +// +// $Id:$ +// +// Copyright (C) 1993-1996 by id Software, Inc. +// +// This source is available for distribution and/or modification +// only under the terms of the DOOM Source Code License as +// published by id Software. All rights reserved. +// +// The source is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// FITNESS FOR A PARTICULAR PURPOSE. See the DOOM Source Code License +// for more details. +// +// $Log:$ +// +// DESCRIPTION: +// True color span/column drawing functions. +// +//----------------------------------------------------------------------------- + +#include + +#include "templates.h" +#include "doomdef.h" +#include "i_system.h" +#include "w_wad.h" +#include "r_local.h" +#include "v_video.h" +#include "doomstat.h" +#include "st_stuff.h" +#include "g_game.h" +#include "g_level.h" +#include "r_data/r_translate.h" +#include "v_palette.h" +#include "r_data/colormaps.h" +#include "r_plane.h" +#include "r_draw_rgba.h" + +#include "gi.h" +#include "stats.h" +#include "x86.h" +#ifndef NO_SSE +#include +#include +#endif +#include + +extern "C" short spanend[MAXHEIGHT]; +extern float rw_light; +extern float rw_lightstep; +extern int wallshade; + +// Use multiple threads when drawing +CVAR(Bool, r_multithreaded, true, 0); + +// Use linear filtering when scaling up +CVAR(Bool, r_magfilter, false, CVAR_ARCHIVE | CVAR_GLOBALCONFIG); + +// Use linear filtering when scaling down +CVAR(Bool, r_minfilter, true, CVAR_ARCHIVE | CVAR_GLOBALCONFIG); + +// Use mipmapped textures +CVAR(Bool, r_mipmap, true, CVAR_ARCHIVE | CVAR_GLOBALCONFIG); + +#ifndef NO_SSE + +#ifdef _MSC_VER +#pragma warning(disable: 4101) // warning C4101: unreferenced local variable +#endif + +// Generate SSE drawers: +#define VecCommand(name) name##_SSE_Command +#define VEC_SHADE_VARS SSE_SHADE_VARS +#define VEC_SHADE_SIMPLE_INIT SSE_SHADE_SIMPLE_INIT +#define VEC_SHADE_SIMPLE_INIT4 SSE_SHADE_SIMPLE_INIT4 +#define VEC_SHADE_SIMPLE SSE_SHADE_SIMPLE +#define VEC_SHADE_INIT SSE_SHADE_INIT +#define VEC_SHADE_INIT4 SSE_SHADE_INIT4 +#define VEC_SHADE SSE_SHADE +#include "r_draw_rgba_sse.h" +/* +// Generate AVX drawers: +#undef VecCommand +#undef VEC_SHADE_SIMPLE_INIT +#undef VEC_SHADE_SIMPLE_INIT4 +#undef VEC_SHADE_SIMPLE +#undef VEC_SHADE_INIT +#undef VEC_SHADE_INIT4 +#undef VEC_SHADE +#define VecCommand(name) name##_AVX_Command +#define VEC_SHADE_SIMPLE_INIT AVX_LINEAR_SHADE_SIMPLE_INIT +#define VEC_SHADE_SIMPLE_INIT4 AVX_LINEAR_SHADE_SIMPLE_INIT4 +#define VEC_SHADE_SIMPLE AVX_LINEAR_SHADE_SIMPLE +#define VEC_SHADE_INIT AVX_LINEAR_SHADE_INIT +#define VEC_SHADE_INIT4 AVX_LINEAR_SHADE_INIT4 +#define VEC_SHADE AVX_LINEAR_SHADE +#include "r_draw_rgba_sse.h" +*/ +#endif + +///////////////////////////////////////////////////////////////////////////// + +#ifndef NO_SSE +__m128i SampleBgra::samplertable[256 * 2]; +#endif + +DrawerCommandQueue *DrawerCommandQueue::Instance() +{ + static DrawerCommandQueue queue; + return &queue; +} + +DrawerCommandQueue::DrawerCommandQueue() +{ +#ifndef NO_SSE + for (int inv_b = 0; inv_b < 16; inv_b++) + { + for (int inv_a = 0; inv_a < 16; inv_a++) + { + int a = 16 - inv_a; + int b = 16 - inv_b; + + int ab = a * b; + int invab = inv_a * b; + int ainvb = a * inv_b; + int invainvb = inv_a * inv_b; + + __m128i ab_invab = _mm_set_epi16(invab, invab, invab, invab, ab, ab, ab, ab); + __m128i ainvb_invainvb = _mm_set_epi16(invainvb, invainvb, invainvb, invainvb, ainvb, ainvb, ainvb, ainvb); + + _mm_store_si128(SampleBgra::samplertable + inv_b * 32 + inv_a * 2, ab_invab); + _mm_store_si128(SampleBgra::samplertable + inv_b * 32 + inv_a * 2 + 1, ainvb_invainvb); + } + } +#endif +} + +DrawerCommandQueue::~DrawerCommandQueue() +{ + StopThreads(); +} + +void* DrawerCommandQueue::AllocMemory(size_t size) +{ + // Make sure allocations remain 16-byte aligned + size = (size + 15) / 16 * 16; + + auto queue = Instance(); + if (queue->memorypool_pos + size > memorypool_size) + return nullptr; + + void *data = queue->memorypool + queue->memorypool_pos; + queue->memorypool_pos += size; + return data; +} + +void DrawerCommandQueue::Begin() +{ + auto queue = Instance(); + queue->Finish(); + queue->threaded_render++; +} + +void DrawerCommandQueue::End() +{ + auto queue = Instance(); + queue->Finish(); + if (queue->threaded_render > 0) + queue->threaded_render--; +} + +void DrawerCommandQueue::WaitForWorkers() +{ + Instance()->Finish(); +} + +void DrawerCommandQueue::Finish() +{ + auto queue = Instance(); + if (queue->commands.empty()) + return; + + // Give worker threads something to do: + + std::unique_lock start_lock(queue->start_mutex); + queue->active_commands.swap(queue->commands); + queue->run_id++; + start_lock.unlock(); + + queue->StartThreads(); + queue->start_condition.notify_all(); + + // Do one thread ourselves: + + DrawerThread thread; + thread.core = 0; + thread.num_cores = queue->threads.size() + 1; + + for (int pass = 0; pass < queue->num_passes; pass++) + { + thread.pass_start_y = pass * queue->rows_in_pass; + thread.pass_end_y = (pass + 1) * queue->rows_in_pass; + if (pass + 1 == queue->num_passes) + thread.pass_end_y = MAX(thread.pass_end_y, MAXHEIGHT); + + size_t size = queue->active_commands.size(); + for (size_t i = 0; i < size; i++) + { + auto &command = queue->active_commands[i]; + command->Execute(&thread); + } + } + + // Wait for everyone to finish: + + std::unique_lock end_lock(queue->end_mutex); + queue->end_condition.wait(end_lock, [&]() { return queue->finished_threads == queue->threads.size(); }); + + // Clean up batch: + + for (auto &command : queue->active_commands) + command->~DrawerCommand(); + queue->active_commands.clear(); + queue->memorypool_pos = 0; + queue->finished_threads = 0; +} + +void DrawerCommandQueue::StartThreads() +{ + if (!threads.empty()) + return; + + int num_threads = std::thread::hardware_concurrency(); + if (num_threads == 0) + num_threads = 4; + + threads.resize(num_threads - 1); + + for (int i = 0; i < num_threads - 1; i++) + { + DrawerCommandQueue *queue = this; + DrawerThread *thread = &threads[i]; + thread->core = i + 1; + thread->num_cores = num_threads; + thread->thread = std::thread([=]() + { + int run_id = 0; + while (true) + { + // Wait until we are signalled to run: + std::unique_lock start_lock(queue->start_mutex); + queue->start_condition.wait(start_lock, [&]() { return queue->run_id != run_id || queue->shutdown_flag; }); + if (queue->shutdown_flag) + break; + run_id = queue->run_id; + start_lock.unlock(); + + // Do the work: + for (int pass = 0; pass < queue->num_passes; pass++) + { + thread->pass_start_y = pass * queue->rows_in_pass; + thread->pass_end_y = (pass + 1) * queue->rows_in_pass; + if (pass + 1 == queue->num_passes) + thread->pass_end_y = MAX(thread->pass_end_y, MAXHEIGHT); + + size_t size = queue->active_commands.size(); + for (size_t i = 0; i < size; i++) + { + auto &command = queue->active_commands[i]; + command->Execute(thread); + } + } + + // Notify main thread that we finished: + std::unique_lock end_lock(queue->end_mutex); + queue->finished_threads++; + end_lock.unlock(); + queue->end_condition.notify_all(); + } + }); + } +} + +void DrawerCommandQueue::StopThreads() +{ + std::unique_lock lock(start_mutex); + shutdown_flag = true; + lock.unlock(); + start_condition.notify_all(); + for (auto &thread : threads) + thread.thread.join(); + threads.clear(); + lock.lock(); + shutdown_flag = false; +} + +///////////////////////////////////////////////////////////////////////////// + +class DrawerColumnCommand : public DrawerCommand +{ +public: + int _count; + BYTE * RESTRICT _dest; + int _pitch; + DWORD _iscale; + DWORD _texturefrac; + + DrawerColumnCommand() + { + _count = dc_count; + _dest = dc_dest; + _iscale = dc_iscale; + _texturefrac = dc_texturefrac; + _pitch = dc_pitch; + } + + class LoopIterator + { + public: + int count; + uint32_t *dest; + int pitch; + fixed_t fracstep; + fixed_t frac; + + LoopIterator(DrawerColumnCommand *command, DrawerThread *thread) + { + count = thread->count_for_thread(command->_dest_y, command->_count); + if (count <= 0) + return; + + dest = thread->dest_for_thread(command->_dest_y, command->_pitch, (uint32_t*)command->_dest); + pitch = command->_pitch * thread->num_cores; + + fracstep = command->_iscale * thread->num_cores; + frac = command->_texturefrac + command->_iscale * thread->skipped_by_thread(command->_dest_y); + } + + uint32_t sample_index() + { + return frac >> FRACBITS; + } + + explicit operator bool() + { + return count > 0; + } + + bool next() + { + dest += pitch; + frac += fracstep; + return (--count) != 0; + } + }; +}; + +class DrawColumnRGBACommand : public DrawerColumnCommand +{ + uint32_t _light; + const BYTE * RESTRICT _source; + ShadeConstants _shade_constants; + BYTE * RESTRICT _colormap; + +public: + DrawColumnRGBACommand() + { + _light = LightBgra::calc_light_multiplier(dc_light); + _shade_constants = dc_shade_constants; + _source = dc_source; + _colormap = dc_colormap; + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + uint32_t fg = LightBgra::shade_pal_index(_colormap[_source[loop.sample_index()]], _light, _shade_constants); + *loop.dest = BlendBgra::copy(fg); + } while (loop.next()); + } +}; + +class FillColumnRGBACommand : public DrawerColumnCommand +{ + uint32_t _color; + +public: + FillColumnRGBACommand() + { + uint32_t light = LightBgra::calc_light_multiplier(dc_light); + _color = LightBgra::shade_pal_index_simple(dc_color, light); + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + *loop.dest = BlendBgra::copy(_color); + } while (loop.next()); + } +}; + +class FillAddColumnRGBACommand : public DrawerColumnCommand +{ + uint32_t _srccolor; + +public: + FillAddColumnRGBACommand() + { + _srccolor = dc_srccolor_bgra; + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + + uint32_t alpha = APART(_srccolor); + alpha += alpha >> 7; + + do + { + *loop.dest = BlendBgra::add(_srccolor, *loop.dest, alpha, 256 - alpha); + } while (loop.next()); + } +}; + +class FillAddClampColumnRGBACommand : public DrawerColumnCommand +{ + int _color; + uint32_t _srccolor; + uint32_t _srcalpha; + uint32_t _destalpha; + +public: + FillAddClampColumnRGBACommand() + { + _color = dc_color; + _srccolor = dc_srccolor_bgra; + _srcalpha = dc_srcalpha >> (FRACBITS - 8); + _destalpha = dc_destalpha >> (FRACBITS - 8); + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + *loop.dest = BlendBgra::add(_srccolor, *loop.dest, _srcalpha, _destalpha); + } while (loop.next()); + } +}; + +class FillSubClampColumnRGBACommand : public DrawerColumnCommand +{ + uint32_t _srccolor; + uint32_t _srcalpha; + uint32_t _destalpha; + +public: + FillSubClampColumnRGBACommand() + { + _srccolor = dc_srccolor_bgra; + _srcalpha = dc_srcalpha >> (FRACBITS - 8); + _destalpha = dc_destalpha >> (FRACBITS - 8); + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + *loop.dest = BlendBgra::sub(_srccolor, *loop.dest, _srcalpha, _destalpha); + } while (loop.next()); + } +}; + +class FillRevSubClampColumnRGBACommand : public DrawerColumnCommand +{ + uint32_t _srccolor; + uint32_t _srcalpha; + uint32_t _destalpha; + +public: + FillRevSubClampColumnRGBACommand() + { + _srccolor = dc_srccolor_bgra; + _srcalpha = dc_srcalpha >> (FRACBITS - 8); + _destalpha = dc_destalpha >> (FRACBITS - 8); + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + *loop.dest = BlendBgra::revsub(_srccolor, *loop.dest, _srcalpha, _destalpha); + } while (loop.next()); + } +}; + +class DrawAddColumnRGBACommand : public DrawerColumnCommand +{ + const BYTE * RESTRICT _source; + uint32_t _light; + ShadeConstants _shade_constants; + uint32_t _srcalpha; + uint32_t _destalpha; + BYTE * RESTRICT _colormap; + +public: + DrawAddColumnRGBACommand() + { + _source = dc_source; + _light = LightBgra::calc_light_multiplier(dc_light); + _shade_constants = dc_shade_constants; + _srcalpha = dc_srcalpha >> (FRACBITS - 8); + _destalpha = dc_destalpha >> (FRACBITS - 8); + _colormap = dc_colormap; + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + uint32_t fg = LightBgra::shade_pal_index(_colormap[_source[loop.sample_index()]], _light, _shade_constants); + *loop.dest = BlendBgra::add(fg, *loop.dest, _srcalpha, _destalpha); + } while (loop.next()); + } +}; + +class DrawTranslatedColumnRGBACommand : public DrawerColumnCommand +{ + fixed_t _light; + ShadeConstants _shade_constants; + BYTE * RESTRICT _translation; + const BYTE * RESTRICT _source; + +public: + DrawTranslatedColumnRGBACommand() + { + _light = LightBgra::calc_light_multiplier(dc_light); + _shade_constants = dc_shade_constants; + _translation = dc_translation; + _source = dc_source; + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + uint32_t fg = LightBgra::shade_pal_index(_translation[_source[loop.sample_index()]], _light, _shade_constants); + *loop.dest = BlendBgra::copy(fg); + } while (loop.next()); + } +}; + +class DrawTlatedAddColumnRGBACommand : public DrawerColumnCommand +{ + fixed_t _light; + ShadeConstants _shade_constants; + BYTE * RESTRICT _translation; + const BYTE * RESTRICT _source; + uint32_t _srcalpha; + uint32_t _destalpha; + +public: + DrawTlatedAddColumnRGBACommand() + { + _light = LightBgra::calc_light_multiplier(dc_light); + _shade_constants = dc_shade_constants; + _translation = dc_translation; + _source = dc_source; + _srcalpha = dc_srcalpha >> (FRACBITS - 8); + _destalpha = dc_destalpha >> (FRACBITS - 8); + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + uint32_t fg = LightBgra::shade_pal_index(_translation[_source[loop.sample_index()]], _light, _shade_constants); + *loop.dest = BlendBgra::add(fg, *loop.dest, _srcalpha, _destalpha); + } while (loop.next()); + } +}; + +class DrawShadedColumnRGBACommand : public DrawerColumnCommand +{ +private: + const BYTE * RESTRICT _source; + lighttable_t * RESTRICT _colormap; + uint32_t _color; + +public: + DrawShadedColumnRGBACommand() + { + _source = dc_source; + _colormap = dc_colormap; + _color = LightBgra::shade_pal_index_simple(dc_color, LightBgra::calc_light_multiplier(dc_light)); + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + uint32_t alpha = clamp(_colormap[_source[loop.sample_index()]], 0, 64) * 4; + uint32_t inv_alpha = 256 - alpha; + *loop.dest = BlendBgra::add(_color, *loop.dest, alpha, inv_alpha); + } while (loop.next()); + } +}; + +class DrawAddClampColumnRGBACommand : public DrawerColumnCommand +{ + const BYTE * RESTRICT _source; + uint32_t _light; + ShadeConstants _shade_constants; + uint32_t _srcalpha; + uint32_t _destalpha; + +public: + DrawAddClampColumnRGBACommand() + { + _source = dc_source; + _light = LightBgra::calc_light_multiplier(dc_light); + _shade_constants = dc_shade_constants; + _srcalpha = dc_srcalpha >> (FRACBITS - 8); + _destalpha = dc_destalpha >> (FRACBITS - 8); + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + uint32_t fg = LightBgra::shade_pal_index(_source[loop.sample_index()], _light, _shade_constants); + *loop.dest = BlendBgra::add(fg, *loop.dest, _srcalpha, _destalpha); + } while (loop.next()); + } +}; + +class DrawAddClampTranslatedColumnRGBACommand : public DrawerColumnCommand +{ + BYTE * RESTRICT _translation; + const BYTE * RESTRICT _source; + uint32_t _light; + ShadeConstants _shade_constants; + uint32_t _srcalpha; + uint32_t _destalpha; + +public: + DrawAddClampTranslatedColumnRGBACommand() + { + _translation = dc_translation; + _source = dc_source; + _light = LightBgra::calc_light_multiplier(dc_light); + _shade_constants = dc_shade_constants; + _srcalpha = dc_srcalpha >> (FRACBITS - 8); + _destalpha = dc_destalpha >> (FRACBITS - 8); + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + uint32_t fg = LightBgra::shade_pal_index(_translation[_source[loop.sample_index()]], _light, _shade_constants); + *loop.dest = BlendBgra::add(fg, *loop.dest, _srcalpha, _destalpha); + } while (loop.next()); + } +}; + +class DrawSubClampColumnRGBACommand : public DrawerColumnCommand +{ + const BYTE * RESTRICT _source; + uint32_t _light; + ShadeConstants _shade_constants; + uint32_t _srcalpha; + uint32_t _destalpha; + +public: + DrawSubClampColumnRGBACommand() + { + _source = dc_source; + _light = LightBgra::calc_light_multiplier(dc_light); + _shade_constants = dc_shade_constants; + _srcalpha = dc_srcalpha >> (FRACBITS - 8); + _destalpha = dc_destalpha >> (FRACBITS - 8); + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + uint32_t fg = LightBgra::shade_pal_index(_source[loop.sample_index()], _light, _shade_constants); + *loop.dest = BlendBgra::sub(fg, *loop.dest, _srcalpha, _destalpha); + } while (loop.next()); + } +}; + +class DrawSubClampTranslatedColumnRGBACommand : public DrawerColumnCommand +{ + const BYTE * RESTRICT _source; + uint32_t _light; + ShadeConstants _shade_constants; + uint32_t _srcalpha; + uint32_t _destalpha; + BYTE * RESTRICT _translation; + +public: + DrawSubClampTranslatedColumnRGBACommand() + { + _source = dc_source; + _light = LightBgra::calc_light_multiplier(dc_light); + _shade_constants = dc_shade_constants; + _srcalpha = dc_srcalpha >> (FRACBITS - 8); + _destalpha = dc_destalpha >> (FRACBITS - 8); + _translation = dc_translation; + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + uint32_t fg = LightBgra::shade_pal_index(_translation[_source[loop.sample_index()]], _light, _shade_constants); + *loop.dest = BlendBgra::sub(fg, *loop.dest, _srcalpha, _destalpha); + } while (loop.next()); + } +}; + +class DrawRevSubClampColumnRGBACommand : public DrawerColumnCommand +{ + const BYTE * RESTRICT _source; + uint32_t _light; + ShadeConstants _shade_constants; + uint32_t _srcalpha; + uint32_t _destalpha; + +public: + DrawRevSubClampColumnRGBACommand() + { + _source = dc_source; + _light = LightBgra::calc_light_multiplier(dc_light); + _shade_constants = dc_shade_constants; + _srcalpha = dc_srcalpha >> (FRACBITS - 8); + _destalpha = dc_destalpha >> (FRACBITS - 8); + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + uint32_t fg = LightBgra::shade_pal_index(_source[loop.sample_index()], _light, _shade_constants); + *loop.dest = BlendBgra::revsub(fg, *loop.dest, _srcalpha, _destalpha); + } while (loop.next()); + } +}; + +class DrawRevSubClampTranslatedColumnRGBACommand : public DrawerColumnCommand +{ + const BYTE * RESTRICT _source; + uint32_t _light; + ShadeConstants _shade_constants; + uint32_t _srcalpha; + uint32_t _destalpha; + BYTE * RESTRICT _translation; + +public: + DrawRevSubClampTranslatedColumnRGBACommand() + { + _source = dc_source; + _light = LightBgra::calc_light_multiplier(dc_light); + _shade_constants = dc_shade_constants; + _srcalpha = dc_srcalpha >> (FRACBITS - 8); + _destalpha = dc_destalpha >> (FRACBITS - 8); + _translation = dc_translation; + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + uint32_t fg = LightBgra::shade_pal_index(_translation[_source[loop.sample_index()]], _light, _shade_constants); + *loop.dest = BlendBgra::revsub(fg, *loop.dest, _srcalpha, _destalpha); + } while (loop.next()); + } +}; + +class DrawFuzzColumnRGBACommand : public DrawerCommand +{ + int _x; + int _yl; + int _yh; + BYTE * RESTRICT _destorg; + int _pitch; + int _fuzzpos; + int _fuzzviewheight; + +public: + DrawFuzzColumnRGBACommand() + { + _x = dc_x; + _yl = dc_yl; + _yh = dc_yh; + _destorg = dc_destorg; + _pitch = dc_pitch; + _fuzzpos = fuzzpos; + _fuzzviewheight = fuzzviewheight; + } + + void Execute(DrawerThread *thread) override + { + int yl = MAX(_yl, 1); + int yh = MIN(_yh, _fuzzviewheight); + + int count = thread->count_for_thread(yl, yh - yl + 1); + + // Zero length. + if (count <= 0) + return; + + uint32_t *dest = thread->dest_for_thread(yl, _pitch, ylookup[yl] + _x + (uint32_t*)_destorg); + + int pitch = _pitch * thread->num_cores; + int fuzzstep = thread->num_cores; + int fuzz = (_fuzzpos + thread->skipped_by_thread(yl)) % FUZZTABLE; + + yl += thread->skipped_by_thread(yl); + + // Handle the case where we would go out of bounds at the top: + if (yl < fuzzstep) + { + uint32_t *srcdest = dest + fuzzoffset[fuzz] * fuzzstep + pitch; + //assert(static_cast((srcdest - (uint32_t*)dc_destorg) / (_pitch)) < viewheight); + + uint32_t bg = *srcdest; + + uint32_t red = RPART(bg) * 3 / 4; + uint32_t green = GPART(bg) * 3 / 4; + uint32_t blue = BPART(bg) * 3 / 4; + + *dest = 0xff000000 | (red << 16) | (green << 8) | blue; + dest += pitch; + fuzz += fuzzstep; + fuzz %= FUZZTABLE; + + count--; + if (count == 0) + return; + } + + bool lowerbounds = (yl + (count + fuzzstep - 1) * fuzzstep > _fuzzviewheight); + if (lowerbounds) + count--; + + // Fuzz where fuzzoffset stays within bounds + while (count > 0) + { + int available = (FUZZTABLE - fuzz); + int next_wrap = available / fuzzstep; + if (available % fuzzstep != 0) + next_wrap++; + + int cnt = MIN(count, next_wrap); + count -= cnt; + do + { + uint32_t *srcdest = dest + fuzzoffset[fuzz] * fuzzstep; + //assert(static_cast((srcdest - (uint32_t*)dc_destorg) / (_pitch)) < viewheight); + + uint32_t bg = *srcdest; + + uint32_t red = RPART(bg) * 3 / 4; + uint32_t green = GPART(bg) * 3 / 4; + uint32_t blue = BPART(bg) * 3 / 4; + + *dest = 0xff000000 | (red << 16) | (green << 8) | blue; + dest += pitch; + fuzz += fuzzstep; + } while (--cnt); + + fuzz %= FUZZTABLE; + } + + // Handle the case where we would go out of bounds at the bottom + if (lowerbounds) + { + uint32_t *srcdest = dest + fuzzoffset[fuzz] * fuzzstep - pitch; + //assert(static_cast((srcdest - (uint32_t*)dc_destorg) / (_pitch)) < viewheight); + + uint32_t bg = *srcdest; + + uint32_t red = RPART(bg) * 3 / 4; + uint32_t green = GPART(bg) * 3 / 4; + uint32_t blue = BPART(bg) * 3 / 4; + + *dest = 0xff000000 | (red << 16) | (green << 8) | blue; + } + } +}; + +///////////////////////////////////////////////////////////////////////////// + +class DrawerSpanCommand : public DrawerCommand +{ +public: + fixed_t _xfrac; + fixed_t _yfrac; + fixed_t _xstep; + fixed_t _ystep; + int _x1; + int _x2; + int _y; + int _xbits; + int _ybits; + BYTE * RESTRICT _destorg; + + const uint32_t * RESTRICT _source; + uint32_t _light; + ShadeConstants _shade_constants; + bool _nearest_filter; + + uint32_t _srcalpha; + uint32_t _destalpha; + + DrawerSpanCommand() + { + _xfrac = ds_xfrac; + _yfrac = ds_yfrac; + _xstep = ds_xstep; + _ystep = ds_ystep; + _x1 = ds_x1; + _x2 = ds_x2; + _y = ds_y; + _xbits = ds_xbits; + _ybits = ds_ybits; + _destorg = dc_destorg; + + _source = (const uint32_t*)ds_source; + _light = LightBgra::calc_light_multiplier(ds_light); + _shade_constants = ds_shade_constants; + _nearest_filter = !SampleBgra::span_sampler_setup(_source, _xbits, _ybits, _xstep, _ystep, ds_source_mipmapped); + + _srcalpha = dc_srcalpha >> (FRACBITS - 8); + _destalpha = dc_destalpha >> (FRACBITS - 8); + } + + class LoopIterator + { + public: + uint32_t *dest; + int count; + dsfixed_t xfrac; + dsfixed_t yfrac; + dsfixed_t xstep; + dsfixed_t ystep; + BYTE yshift; + BYTE xshift; + int xmask; + bool is_64x64; + bool skipped; + + LoopIterator(DrawerSpanCommand *command, DrawerThread *thread) + { + dest = ylookup[command->_y] + command->_x1 + (uint32_t*)command->_destorg; + count = command->_x2 - command->_x1 + 1; + xfrac = command->_xfrac; + yfrac = command->_yfrac; + xstep = command->_xstep; + ystep = command->_ystep; + yshift = 32 - command->_ybits; + xshift = yshift - command->_xbits; + xmask = ((1 << command->_xbits) - 1) << command->_ybits; + is_64x64 = command->_xbits == 6 && command->_ybits == 6; + skipped = thread->line_skipped_by_thread(command->_y); + } + + // 64x64 is the most common case by far, so special case it. + int spot64() + { + return ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); + } + + int spot() + { + return ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + } + + explicit operator bool() + { + return !skipped && count > 0; + } + + bool next() + { + dest++; + xfrac += xstep; + yfrac += ystep; + return (--count) != 0; + } + }; +}; + +class DrawSpanRGBACommand : public DrawerSpanCommand +{ +public: + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + + if (_nearest_filter) + { + if (loop.is_64x64) + { + do + { + *loop.dest = LightBgra::shade_bgra(_source[loop.spot64()], _light, _shade_constants); + } while (loop.next()); + } + else + { + do + { + *loop.dest = LightBgra::shade_bgra(_source[loop.spot()], _light, _shade_constants); + } while (loop.next()); + } + } + else + { + if (loop.is_64x64) + { + do + { + *loop.dest = LightBgra::shade_bgra(SampleBgra::sample_bilinear(_source, loop.xfrac, loop.yfrac, 26, 26), _light, _shade_constants); + } while (loop.next()); + } + else + { + do + { + *loop.dest = LightBgra::shade_bgra(SampleBgra::sample_bilinear(_source, loop.xfrac, loop.yfrac, 32 - _xbits, 32 - _ybits), _light, _shade_constants); + } while (loop.next()); + } + } + } +}; + +class DrawSpanMaskedRGBACommand : public DrawerSpanCommand +{ +public: + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + + if (_nearest_filter) + { + if (loop.is_64x64) + { + do + { + uint32_t fg = LightBgra::shade_bgra(_source[loop.spot64()], _light, _shade_constants); + *loop.dest = BlendBgra::alpha_blend(fg, *loop.dest); + } while (loop.next()); + } + else + { + do + { + uint32_t fg = LightBgra::shade_bgra(_source[loop.spot()], _light, _shade_constants); + *loop.dest = BlendBgra::alpha_blend(fg, *loop.dest); + } while (loop.next()); + } + } + else + { + if (loop.is_64x64) + { + do + { + uint32_t fg = LightBgra::shade_bgra(SampleBgra::sample_bilinear(_source, loop.xfrac, loop.yfrac, 26, 26), _light, _shade_constants); + *loop.dest = BlendBgra::alpha_blend(fg, *loop.dest); + } while (loop.next()); + } + else + { + do + { + uint32_t fg = LightBgra::shade_bgra(SampleBgra::sample_bilinear(_source, loop.xfrac, loop.yfrac, 32 - _xbits, 32 - _ybits), _light, _shade_constants); + *loop.dest = BlendBgra::alpha_blend(fg, *loop.dest); + } while (loop.next()); + } + } + } +}; + +class DrawSpanTranslucentRGBACommand : public DrawerSpanCommand +{ +public: + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + + if (loop.is_64x64) + { + do + { + uint32_t fg = LightBgra::shade_bgra(_source[loop.spot64()], _light, _shade_constants); + *loop.dest = BlendBgra::add(fg, *loop.dest, _srcalpha, _destalpha); + } while (loop.next()); + } + else + { + do + { + uint32_t fg = LightBgra::shade_bgra(_source[loop.spot()], _light, _shade_constants); + *loop.dest = BlendBgra::add(fg, *loop.dest, _srcalpha, _destalpha); + } while (loop.next()); + } + } +}; + +class DrawSpanMaskedTranslucentRGBACommand : public DrawerSpanCommand +{ +public: + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + + if (loop.is_64x64) + { + do + { + uint32_t fg = LightBgra::shade_bgra(_source[loop.spot64()], _light, _shade_constants); + *loop.dest = BlendBgra::add(fg, *loop.dest, _srcalpha, calc_blend_bgalpha(fg, _destalpha)); + } while (loop.next()); + } + else + { + do + { + uint32_t fg = LightBgra::shade_bgra(_source[loop.spot()], _light, _shade_constants); + *loop.dest = BlendBgra::add(fg, *loop.dest, _srcalpha, calc_blend_bgalpha(fg, _destalpha)); + } while (loop.next()); + } + } +}; + +class DrawSpanAddClampRGBACommand : public DrawerSpanCommand +{ +public: + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + + if (loop.is_64x64) + { + do + { + uint32_t fg = LightBgra::shade_bgra(_source[loop.spot64()], _light, _shade_constants); + *loop.dest = BlendBgra::add(fg, *loop.dest, _srcalpha, _destalpha); + } while (loop.next()); + } + else + { + do + { + uint32_t fg = LightBgra::shade_bgra(_source[loop.spot()], _light, _shade_constants); + *loop.dest = BlendBgra::add(fg, *loop.dest, _srcalpha, _destalpha); + } while (loop.next()); + } + } +}; + +class DrawSpanMaskedAddClampRGBACommand : public DrawerSpanCommand +{ +public: + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + + if (loop.is_64x64) + { + do + { + uint32_t fg = LightBgra::shade_bgra(_source[loop.spot64()], _light, _shade_constants); + *loop.dest = BlendBgra::add(fg, *loop.dest, _srcalpha, calc_blend_bgalpha(fg, _destalpha)); + } while (loop.next()); + } + else + { + do + { + uint32_t fg = LightBgra::shade_bgra(_source[loop.spot()], _light, _shade_constants); + *loop.dest = BlendBgra::add(fg, *loop.dest, _srcalpha, calc_blend_bgalpha(fg, _destalpha)); + } while (loop.next()); + } + } +}; + +class FillSpanRGBACommand : public DrawerCommand +{ + int _x1; + int _x2; + int _y; + BYTE * RESTRICT _destorg; + fixed_t _light; + int _color; + +public: + FillSpanRGBACommand() + { + _x1 = ds_x1; + _x2 = ds_x2; + _y = ds_y; + _destorg = dc_destorg; + _light = ds_light; + _color = ds_color; + } + + void Execute(DrawerThread *thread) override + { + if (thread->line_skipped_by_thread(_y)) + return; + + uint32_t *dest = ylookup[_y] + _x1 + (uint32_t*)_destorg; + int count = (_x2 - _x1 + 1); + uint32_t light = LightBgra::calc_light_multiplier(_light); + uint32_t color = LightBgra::shade_pal_index_simple(_color, light); + for (int i = 0; i < count; i++) + dest[i] = color; + } +}; + +///////////////////////////////////////////////////////////////////////////// + +class DrawSlabRGBACommand : public DrawerCommand +{ + int _dx; + fixed_t _v; + int _dy; + fixed_t _vi; + const BYTE *_voxelptr; + uint32_t *_p; + ShadeConstants _shade_constants; + const BYTE *_colormap; + fixed_t _light; + int _pitch; + int _start_y; + +public: + DrawSlabRGBACommand(int dx, fixed_t v, int dy, fixed_t vi, const BYTE *vptr, BYTE *p, ShadeConstants shade_constants, const BYTE *colormap, fixed_t light) + { + _dx = dx; + _v = v; + _dy = dy; + _vi = vi; + _voxelptr = vptr; + _p = (uint32_t *)p; + _shade_constants = shade_constants; + _colormap = colormap; + _light = light; + _pitch = dc_pitch; + _start_y = static_cast((p - dc_destorg) / (dc_pitch * 4)); + assert(dx > 0); + } + + void Execute(DrawerThread *thread) override + { + int dx = _dx; + fixed_t v = _v; + int dy = _dy; + fixed_t vi = _vi; + const BYTE *vptr = _voxelptr; + uint32_t *p = _p; + ShadeConstants shade_constants = _shade_constants; + const BYTE *colormap = _colormap; + uint32_t light = LightBgra::calc_light_multiplier(_light); + int pitch = _pitch; + int x; + + dy = thread->count_for_thread(_start_y, dy); + p = thread->dest_for_thread(_start_y, pitch, p); + v += vi * thread->skipped_by_thread(_start_y); + vi *= thread->num_cores; + pitch *= thread->num_cores; + + if (dx == 1) + { + while (dy > 0) + { + *p = LightBgra::shade_pal_index(colormap[vptr[v >> FRACBITS]], light, shade_constants); + p += pitch; + v += vi; + dy--; + } + } + else if (dx == 2) + { + while (dy > 0) + { + uint32_t color = LightBgra::shade_pal_index(colormap[vptr[v >> FRACBITS]], light, shade_constants); + p[0] = color; + p[1] = color; + p += pitch; + v += vi; + dy--; + } + } + else if (dx == 3) + { + while (dy > 0) + { + uint32_t color = LightBgra::shade_pal_index(colormap[vptr[v >> FRACBITS]], light, shade_constants); + p[0] = color; + p[1] = color; + p[2] = color; + p += pitch; + v += vi; + dy--; + } + } + else if (dx == 4) + { + while (dy > 0) + { + uint32_t color = LightBgra::shade_pal_index(colormap[vptr[v >> FRACBITS]], light, shade_constants); + p[0] = color; + p[1] = color; + p[2] = color; + p[3] = color; + p += pitch; + v += vi; + dy--; + } + } + else while (dy > 0) + { + uint32_t color = LightBgra::shade_pal_index(colormap[vptr[v >> FRACBITS]], light, shade_constants); + // The optimizer will probably turn this into a memset call. + // Since dx is not likely to be large, I'm not sure that's a good thing, + // hence the alternatives above. + for (x = 0; x < dx; x++) + { + p[x] = color; + } + p += pitch; + v += vi; + dy--; + } + } +}; + +///////////////////////////////////////////////////////////////////////////// + +class DrawerWall1Command : public DrawerCommand +{ +public: + BYTE * RESTRICT _dest; + int _pitch; + int _count; + DWORD _texturefrac; + uint32_t _texturefracx; + DWORD _iscale; + uint32_t _textureheight; + + const uint32 * RESTRICT _source; + const uint32 * RESTRICT _source2; + uint32_t _light; + ShadeConstants _shade_constants; + + uint32_t _srcalpha; + uint32_t _destalpha; + + DrawerWall1Command() + { + _dest = dc_dest; + _pitch = dc_pitch; + _count = dc_count; + _texturefrac = dc_texturefrac; + _texturefracx = dc_texturefracx; + _iscale = dc_iscale; + _textureheight = dc_textureheight; + + _source = (const uint32 *)dc_source; + _source2 = (const uint32 *)dc_source2; + _light = LightBgra::calc_light_multiplier(dc_light); + _shade_constants = dc_shade_constants; + + _srcalpha = dc_srcalpha >> (FRACBITS - 8); + _destalpha = dc_destalpha >> (FRACBITS - 8); + } + + class LoopIterator + { + public: + uint32_t *dest; + int pitch; + int count; + uint32_t fracstep; + uint32_t frac; + uint32_t texturefracx; + uint32_t height; + uint32_t one; + + LoopIterator(DrawerWall1Command *command, DrawerThread *thread) + { + count = thread->count_for_thread(command->_dest_y, command->_count); + if (count <= 0) + return; + + fracstep = command->_iscale * thread->num_cores; + frac = command->_texturefrac + command->_iscale * thread->skipped_by_thread(command->_dest_y); + texturefracx = command->_texturefracx; + dest = thread->dest_for_thread(command->_dest_y, command->_pitch, (uint32_t*)command->_dest); + pitch = command->_pitch * thread->num_cores; + + height = command->_textureheight; + one = ((0x80000000 + height - 1) / height) * 2 + 1; + } + + explicit operator bool() + { + return count > 0; + } + + int sample_index() + { + return ((frac >> FRACBITS) * height) >> FRACBITS; + } + + bool next() + { + frac += fracstep; + dest += pitch; + return (--count) != 0; + } + }; +}; + +class DrawerWall4Command : public DrawerCommand +{ +public: + BYTE * RESTRICT _dest; + int _count; + int _pitch; + ShadeConstants _shade_constants; + uint32_t _vplce[4]; + uint32_t _vince[4]; + uint32_t _buftexturefracx[4]; + uint32_t _bufheight[4]; + const uint32_t * RESTRICT _bufplce[4]; + const uint32_t * RESTRICT _bufplce2[4]; + uint32_t _light[4]; + + uint32_t _srcalpha; + uint32_t _destalpha; + + DrawerWall4Command() + { + _dest = dc_dest; + _count = dc_count; + _pitch = dc_pitch; + _shade_constants = dc_shade_constants; + for (int i = 0; i < 4; i++) + { + _vplce[i] = vplce[i]; + _vince[i] = vince[i]; + _buftexturefracx[i] = buftexturefracx[i]; + _bufheight[i] = bufheight[i]; + _bufplce[i] = (const uint32_t *)bufplce[i]; + _bufplce2[i] = (const uint32_t *)bufplce2[i]; + _light[i] = LightBgra::calc_light_multiplier(palookuplight[i]); + } + _srcalpha = dc_srcalpha >> (FRACBITS - 8); + _destalpha = dc_destalpha >> (FRACBITS - 8); + } + + class LoopIterator + { + public: + uint32_t *dest; + int pitch; + int count; + uint32_t vplce[4]; + uint32_t vince[4]; + uint32_t height[4]; + uint32_t one[4]; + + LoopIterator(DrawerWall4Command *command, DrawerThread *thread) + { + count = thread->count_for_thread(command->_dest_y, command->_count); + if (count <= 0) + return; + + dest = thread->dest_for_thread(command->_dest_y, command->_pitch, (uint32_t*)command->_dest); + pitch = command->_pitch * thread->num_cores; + + int skipped = thread->skipped_by_thread(command->_dest_y); + for (int i = 0; i < 4; i++) + { + vplce[i] = command->_vplce[i] + command->_vince[i] * skipped; + vince[i] = command->_vince[i] * thread->num_cores; + height[i] = command->_bufheight[i]; + one[i] = ((0x80000000 + height[i] - 1) / height[i]) * 2 + 1; + } + } + + explicit operator bool() + { + return count > 0; + } + + int sample_index(int col) + { + return ((vplce[col] >> FRACBITS) * height[col]) >> FRACBITS; + } + + bool next() + { + vplce[0] += vince[0]; + vplce[1] += vince[1]; + vplce[2] += vince[2]; + vplce[3] += vince[3]; + dest += pitch; + return (--count) != 0; + } + }; + +#ifdef NO_SSE + struct NearestSampler + { + FORCEINLINE static uint32_t Sample1(DrawerWall4Command &cmd, LoopIterator &loop, int index) + { + return cmd._bufplce[index][loop.sample_index(index)]; + } + }; + struct LinearSampler + { + FORCEINLINE static uint32_t Sample1(DrawerWall4Command &cmd, LoopIterator &loop, int index) + { + return SampleBgra::sample_bilinear(cmd._bufplce[index], cmd._bufplce2[index], cmd._buftexturefracx[index], loop.vplce[index], loop.one[index], loop.height[index]); + } + }; +#else + struct NearestSampler + { + FORCEINLINE static __m128i Sample4(DrawerWall4Command &cmd, LoopIterator &loop) + { + return _mm_set_epi32(cmd._bufplce[3][loop.sample_index(3)], cmd._bufplce[2][loop.sample_index(2)], cmd._bufplce[1][loop.sample_index(1)], cmd._bufplce[0][loop.sample_index(0)]); + } + }; + + struct LinearSampler + { + FORCEINLINE static __m128i Sample4(DrawerWall4Command &cmd, LoopIterator &loop) + { + __m128i fg; + VEC_SAMPLE_BILINEAR4_COLUMN(fg, cmd._bufplce, cmd._bufplce2, cmd._buftexturefracx, loop.vplce, loop.one, loop.height); + return fg; + } + }; +#endif + +#ifdef NO_SSE + template + struct Copy + { + Copy(DrawerWall4Command &cmd, LoopIterator &loop) + { + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + for (int i = 0; i < 4; i++) + { + uint32_t fg = LightBgra::shade_bgra(Sampler::Sample1(cmd, loop, i), cmd._light[i], cmd._shade_constants); + loop.dest[i] = BlendBgra::copy(fg); + } + } + }; + + template + struct Mask + { + Mask(DrawerWall4Command &cmd, LoopIterator &loop) + { + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + for (int i = 0; i < 4; i++) + { + uint32_t fg = LightBgra::shade_bgra(Sampler::Sample1(cmd, loop, i), cmd._light[i], cmd._shade_constants); + loop.dest[i] = BlendBgra::alpha_blend(fg, loop.dest[i]); + } + } + }; + + template + struct TMaskAdd + { + TMaskAdd(DrawerWall4Command &cmd, LoopIterator &loop) + { + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + for (int i = 0; i < 4; i++) + { + uint32_t fg = LightBgra::shade_bgra(Sampler::Sample1(cmd, loop, i), cmd._light[i], cmd._shade_constants); + loop.dest[i] = BlendBgra::add(fg, loop.dest[i], cmd._srcalpha, calc_blend_bgalpha(fg, cmd._destalpha)); + } + } + }; + + template + struct TMaskSub + { + TMaskSub(DrawerWall4Command &cmd, LoopIterator &loop) + { + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + for (int i = 0; i < 4; i++) + { + uint32_t fg = LightBgra::shade_bgra(Sampler::Sample1(cmd, loop, i), cmd._light[i], cmd._shade_constants); + loop.dest[i] = BlendBgra::sub(fg, loop.dest[i], cmd._srcalpha, calc_blend_bgalpha(fg, cmd._destalpha)); + } + } + }; + + template + struct TMaskRevSub + { + TMaskRevSub(DrawerWall4Command &cmd, LoopIterator &loop) + { + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + for (int i = 0; i < 4; i++) + { + uint32_t fg = LightBgra::shade_bgra(Sampler::Sample1(cmd, loop, i), cmd._light[i], cmd._shade_constants); + loop.dest[i] = BlendBgra::revsub(fg, loop.dest[i], cmd._srcalpha, calc_blend_bgalpha(fg, cmd._destalpha)); + } + } + }; + + typedef Copy CopyNearestSimple; + typedef Copy CopyLinearSimple; + typedef Copy CopyNearest; + typedef Copy CopyLinear; + typedef Mask MaskNearestSimple; + typedef Mask MaskLinearSimple; + typedef Mask MaskNearest; + typedef Mask MaskLinear; + typedef TMaskAdd TMaskAddNearestSimple; + typedef TMaskAdd TMaskAddLinearSimple; + typedef TMaskAdd TMaskAddNearest; + typedef TMaskAdd TMaskAddLinear; + typedef TMaskSub TMaskSubNearestSimple; + typedef TMaskSub TMaskSubLinearSimple; + typedef TMaskSub TMaskSubNearest; + typedef TMaskSub TMaskSubLinear; + typedef TMaskRevSub TMaskRevSubNearestSimple; + typedef TMaskRevSub TMaskRevSubLinearSimple; + typedef TMaskRevSub TMaskRevSubNearest; + typedef TMaskRevSub TMaskRevSubLinear; +#else + template + struct CopySimple + { + VEC_SHADE_VARS(); + CopySimple(DrawerWall4Command &cmd, LoopIterator &loop) + { + VEC_SHADE_SIMPLE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0]); + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + __m128i fg = Sampler::Sample4(cmd, loop); + VEC_SHADE_SIMPLE(fg); + _mm_storeu_si128((__m128i*)loop.dest, fg); + } + }; + + template + struct Copy + { + VEC_SHADE_VARS(); + Copy(DrawerWall4Command &cmd, LoopIterator &loop) + { + VEC_SHADE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0], cmd._shade_constants); + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + __m128i fg = Sampler::Sample4(cmd, loop); + VEC_SHADE(fg, cmd._shade_constants); + _mm_storeu_si128((__m128i*)loop.dest, fg); + } + }; + + template + struct MaskSimple + { + VEC_SHADE_VARS(); + MaskSimple(DrawerWall4Command &cmd, LoopIterator &loop) + { + VEC_SHADE_SIMPLE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0]); + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + __m128i fg = Sampler::Sample4(cmd, loop); + __m128i bg = _mm_loadu_si128((const __m128i*)loop.dest); + VEC_SHADE_SIMPLE(fg); + VEC_ALPHA_BLEND(fg, bg); + _mm_storeu_si128((__m128i*)loop.dest, fg); + } + }; + + template + struct Mask + { + VEC_SHADE_VARS(); + Mask(DrawerWall4Command &cmd, LoopIterator &loop) + { + VEC_SHADE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0], cmd._shade_constants); + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + __m128i fg = Sampler::Sample4(cmd, loop); + __m128i bg = _mm_loadu_si128((const __m128i*)loop.dest); + VEC_SHADE(fg, cmd._shade_constants); + VEC_ALPHA_BLEND(fg, bg); + _mm_storeu_si128((__m128i*)loop.dest, fg); + } + }; + + template + struct TMaskAddSimple + { + VEC_SHADE_VARS(); + VEC_CALC_BLEND_ALPHA_VARS(); + TMaskAddSimple(DrawerWall4Command &cmd, LoopIterator &loop) + { + VEC_SHADE_SIMPLE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0]); + VEC_CALC_BLEND_ALPHA_INIT(cmd._srcalpha, cmd._destalpha); + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + __m128i fg = Sampler::Sample4(cmd, loop); + __m128i bg = _mm_loadu_si128((const __m128i*)loop.dest); + + VEC_CALC_BLEND_ALPHA(fg); + VEC_SHADE_SIMPLE(fg); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + __m128i out_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8); + __m128i out_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8); + __m128i out = _mm_packus_epi16(out_lo, out_hi); + + _mm_storeu_si128((__m128i*)loop.dest, out); + } + }; + + template + struct TMaskAdd + { + VEC_SHADE_VARS(); + VEC_CALC_BLEND_ALPHA_VARS(); + TMaskAdd(DrawerWall4Command &cmd, LoopIterator &loop) + { + VEC_SHADE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0], cmd._shade_constants); + VEC_CALC_BLEND_ALPHA_INIT(cmd._srcalpha, cmd._destalpha); + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + __m128i fg = Sampler::Sample4(cmd, loop); + __m128i bg = _mm_loadu_si128((const __m128i*)loop.dest); + + VEC_CALC_BLEND_ALPHA(fg); + VEC_SHADE_SIMPLE(fg); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + __m128i out_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8); + __m128i out_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8); + __m128i out = _mm_packus_epi16(out_lo, out_hi); + + _mm_storeu_si128((__m128i*)loop.dest, out); + } + }; + + template + struct TMaskSubSimple + { + VEC_SHADE_VARS(); + VEC_CALC_BLEND_ALPHA_VARS(); + TMaskSubSimple(DrawerWall4Command &cmd, LoopIterator &loop) + { + VEC_SHADE_SIMPLE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0]); + VEC_CALC_BLEND_ALPHA_INIT(cmd._srcalpha, cmd._destalpha); + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + __m128i fg = Sampler::Sample4(cmd, loop); + __m128i bg = _mm_loadu_si128((const __m128i*)loop.dest); + + VEC_CALC_BLEND_ALPHA(fg); + VEC_SHADE_SIMPLE(fg); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + __m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_hi, bg_alpha_hi), _mm_mullo_epi16(fg_hi, fg_alpha_hi)), 8); + __m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_lo, bg_alpha_lo), _mm_mullo_epi16(fg_lo, fg_alpha_lo)), 8); + __m128i out = _mm_packus_epi16(out_lo, out_hi); + + _mm_storeu_si128((__m128i*)loop.dest, out); + } + }; + + template + struct TMaskSub + { + VEC_SHADE_VARS(); + VEC_CALC_BLEND_ALPHA_VARS(); + TMaskSub(DrawerWall4Command &cmd, LoopIterator &loop) + { + VEC_SHADE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0], cmd._shade_constants); + VEC_CALC_BLEND_ALPHA_INIT(cmd._srcalpha, cmd._destalpha); + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + __m128i fg = Sampler::Sample4(cmd, loop); + __m128i bg = _mm_loadu_si128((const __m128i*)loop.dest); + + VEC_CALC_BLEND_ALPHA(fg); + VEC_SHADE_SIMPLE(fg); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + __m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_hi, bg_alpha_hi), _mm_mullo_epi16(fg_hi, fg_alpha_hi)), 8); + __m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_lo, bg_alpha_lo), _mm_mullo_epi16(fg_lo, fg_alpha_lo)), 8); + __m128i out = _mm_packus_epi16(out_lo, out_hi); + + _mm_storeu_si128((__m128i*)loop.dest, out); + } + }; + + template + struct TMaskRevSubSimple + { + VEC_SHADE_VARS(); + VEC_CALC_BLEND_ALPHA_VARS(); + TMaskRevSubSimple(DrawerWall4Command &cmd, LoopIterator &loop) + { + VEC_SHADE_SIMPLE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0]); + VEC_CALC_BLEND_ALPHA_INIT(cmd._srcalpha, cmd._destalpha); + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + __m128i fg = Sampler::Sample4(cmd, loop); + __m128i bg = _mm_loadu_si128((const __m128i*)loop.dest); + + VEC_CALC_BLEND_ALPHA(fg); + VEC_SHADE_SIMPLE(fg); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + __m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8); + __m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8); + __m128i out = _mm_packus_epi16(out_lo, out_hi); + + _mm_storeu_si128((__m128i*)loop.dest, out); + } + }; + + template + struct TMaskRevSub + { + VEC_SHADE_VARS(); + VEC_CALC_BLEND_ALPHA_VARS(); + TMaskRevSub(DrawerWall4Command &cmd, LoopIterator &loop) + { + VEC_SHADE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0], cmd._shade_constants); + VEC_CALC_BLEND_ALPHA_INIT(cmd._srcalpha, cmd._destalpha); + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + __m128i fg = Sampler::Sample4(cmd, loop); + __m128i bg = _mm_loadu_si128((const __m128i*)loop.dest); + + VEC_CALC_BLEND_ALPHA(fg); + VEC_SHADE_SIMPLE(fg); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + __m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8); + __m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8); + __m128i out = _mm_packus_epi16(out_lo, out_hi); + + _mm_storeu_si128((__m128i*)loop.dest, out); + } + }; + + typedef CopySimple CopyNearestSimple; + typedef CopySimple CopyLinearSimple; + typedef Copy CopyNearest; + typedef Copy CopyLinear; + typedef MaskSimple MaskNearestSimple; + typedef MaskSimple MaskLinearSimple; + typedef Mask MaskNearest; + typedef Mask MaskLinear; + typedef TMaskAddSimple TMaskAddNearestSimple; + typedef TMaskAddSimple TMaskAddLinearSimple; + typedef TMaskAdd TMaskAddNearest; + typedef TMaskAdd TMaskAddLinear; + typedef TMaskSubSimple TMaskSubNearestSimple; + typedef TMaskSubSimple TMaskSubLinearSimple; + typedef TMaskSub TMaskSubNearest; + typedef TMaskSub TMaskSubLinear; + typedef TMaskRevSubSimple TMaskRevSubNearestSimple; + typedef TMaskRevSubSimple TMaskRevSubLinearSimple; + typedef TMaskRevSub TMaskRevSubNearest; + typedef TMaskRevSub TMaskRevSubLinear; +#endif +}; + +typedef DrawerBlendCommand Vlinec4NearestSimpleRGBACommand; +typedef DrawerBlendCommand Vlinec4NearestRGBACommand; +typedef DrawerBlendCommand Vlinec4LinearSimpleRGBACommand; +typedef DrawerBlendCommand Vlinec4LinearRGBACommand; +typedef DrawerBlendCommand Mvlinec4NearestSimpleRGBACommand; +typedef DrawerBlendCommand Mvlinec4NearestRGBACommand; +typedef DrawerBlendCommand Mvlinec4LinearSimpleRGBACommand; +typedef DrawerBlendCommand Mvlinec4LinearRGBACommand; +typedef DrawerBlendCommand Tmvline4AddNearestSimpleRGBACommand; +typedef DrawerBlendCommand Tmvline4AddNearestRGBACommand; +typedef DrawerBlendCommand Tmvline4AddLinearSimpleRGBACommand; +typedef DrawerBlendCommand Tmvline4AddLinearRGBACommand; +typedef DrawerBlendCommand Tmvline4AddClampNearestSimpleRGBACommand; +typedef DrawerBlendCommand Tmvline4AddClampNearestRGBACommand; +typedef DrawerBlendCommand Tmvline4AddClampLinearSimpleRGBACommand; +typedef DrawerBlendCommand Tmvline4AddClampLinearRGBACommand; +typedef DrawerBlendCommand Tmvline4SubClampNearestSimpleRGBACommand; +typedef DrawerBlendCommand Tmvline4SubClampNearestRGBACommand; +typedef DrawerBlendCommand Tmvline4SubClampLinearSimpleRGBACommand; +typedef DrawerBlendCommand Tmvline4SubClampLinearRGBACommand; +typedef DrawerBlendCommand Tmvline4RevSubClampNearestSimpleRGBACommand; +typedef DrawerBlendCommand Tmvline4RevSubClampNearestRGBACommand; +typedef DrawerBlendCommand Tmvline4RevSubClampLinearSimpleRGBACommand; +typedef DrawerBlendCommand Tmvline4RevSubClampLinearRGBACommand; + +class Vlinec1RGBACommand : public DrawerWall1Command +{ +public: + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + + if (_source2 == nullptr) + { + do + { + uint32_t fg = LightBgra::shade_bgra(_source[loop.sample_index()], _light, _shade_constants); + *loop.dest = BlendBgra::copy(fg); + } while (loop.next()); + } + else + { + do + { + uint32_t fg = LightBgra::shade_bgra(SampleBgra::sample_bilinear(_source, _source2, loop.texturefracx, loop.frac, loop.one, loop.height), _light, _shade_constants); + *loop.dest = BlendBgra::copy(fg); + } while (loop.next()); + } + } +}; + +class Mvlinec1RGBACommand : public DrawerWall1Command +{ +public: + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + + if (_source2 == nullptr) + { + do + { + uint32_t fg = LightBgra::shade_bgra(_source[loop.sample_index()], _light, _shade_constants); + *loop.dest = BlendBgra::alpha_blend(fg, *loop.dest); + } while (loop.next()); + } + else + { + do + { + uint32_t fg = LightBgra::shade_bgra(SampleBgra::sample_bilinear(_source, _source2, loop.texturefracx, loop.frac, loop.one, loop.height), _light, _shade_constants); + *loop.dest = BlendBgra::alpha_blend(fg, *loop.dest); + } while (loop.next()); + } + } +}; + +class Tmvline1AddRGBACommand : public DrawerWall1Command +{ +public: + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + uint32_t fg = LightBgra::shade_bgra(_source[loop.sample_index()], _light, _shade_constants); + *loop.dest = BlendBgra::add(fg, *loop.dest, _srcalpha, calc_blend_bgalpha(fg, _destalpha)); + } while (loop.next()); + } +}; + +class Tmvline1AddClampRGBACommand : public DrawerWall1Command +{ +public: + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + uint32_t fg = LightBgra::shade_bgra(_source[loop.sample_index()], _light, _shade_constants); + *loop.dest = BlendBgra::add(fg, *loop.dest, _srcalpha, calc_blend_bgalpha(fg, _destalpha)); + } while (loop.next()); + } +}; + +class Tmvline1SubClampRGBACommand : public DrawerWall1Command +{ +public: + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + uint32_t fg = LightBgra::shade_bgra(_source[loop.sample_index()], _light, _shade_constants); + *loop.dest = BlendBgra::sub(fg, *loop.dest, _srcalpha, calc_blend_bgalpha(fg, _destalpha)); + } while (loop.next()); + } +}; + +class Tmvline1RevSubClampRGBACommand : public DrawerWall1Command +{ +public: + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + uint32_t fg = LightBgra::shade_bgra(_source[loop.sample_index()], _light, _shade_constants); + *loop.dest = BlendBgra::revsub(fg, *loop.dest, _srcalpha, calc_blend_bgalpha(fg, _destalpha)); + } while (loop.next()); + } +}; + +///////////////////////////////////////////////////////////////////////////// + +class DrawFogBoundaryLineRGBACommand : public DrawerCommand +{ + int _y; + int _x; + int _x2; + BYTE * RESTRICT _destorg; + fixed_t _light; + ShadeConstants _shade_constants; + +public: + DrawFogBoundaryLineRGBACommand(int y, int x, int x2) + { + _y = y; + _x = x; + _x2 = x2; + + _destorg = dc_destorg; + _light = dc_light; + _shade_constants = dc_shade_constants; + } + + void Execute(DrawerThread *thread) override + { + if (thread->line_skipped_by_thread(_y)) + return; + + int y = _y; + int x = _x; + int x2 = _x2; + + uint32_t *dest = ylookup[y] + (uint32_t*)_destorg; + + uint32_t light = LightBgra::calc_light_multiplier(_light); + ShadeConstants constants = _shade_constants; + + do + { + uint32_t red = (dest[x] >> 16) & 0xff; + uint32_t green = (dest[x] >> 8) & 0xff; + uint32_t blue = dest[x] & 0xff; + + if (constants.simple_shade) + { + red = red * light / 256; + green = green * light / 256; + blue = blue * light / 256; + } + else + { + uint32_t inv_light = 256 - light; + uint32_t inv_desaturate = 256 - constants.desaturate; + + uint32_t intensity = ((red * 77 + green * 143 + blue * 37) >> 8) * constants.desaturate; + + red = (red * inv_desaturate + intensity) / 256; + green = (green * inv_desaturate + intensity) / 256; + blue = (blue * inv_desaturate + intensity) / 256; + + red = (constants.fade_red * inv_light + red * light) / 256; + green = (constants.fade_green * inv_light + green * light) / 256; + blue = (constants.fade_blue * inv_light + blue * light) / 256; + + red = (red * constants.light_red) / 256; + green = (green * constants.light_green) / 256; + blue = (blue * constants.light_blue) / 256; + } + + dest[x] = 0xff000000 | (red << 16) | (green << 8) | blue; + } while (++x <= x2); + } +}; + +class DrawTiltedSpanRGBACommand : public DrawerCommand +{ + int _x1; + int _x2; + int _y; + BYTE * RESTRICT _destorg; + fixed_t _light; + ShadeConstants _shade_constants; + FVector3 _plane_sz; + FVector3 _plane_su; + FVector3 _plane_sv; + bool _plane_shade; + int _planeshade; + float _planelightfloat; + fixed_t _pviewx; + fixed_t _pviewy; + int _xbits; + int _ybits; + const uint32_t * RESTRICT _source; + +public: + DrawTiltedSpanRGBACommand(int y, int x1, int x2, const FVector3 &plane_sz, const FVector3 &plane_su, const FVector3 &plane_sv, bool plane_shade, int planeshade, float planelightfloat, fixed_t pviewx, fixed_t pviewy) + { + _x1 = x1; + _x2 = x2; + _y = y; + _destorg = dc_destorg; + _light = ds_light; + _shade_constants = ds_shade_constants; + _plane_sz = plane_sz; + _plane_su = plane_su; + _plane_sv = plane_sv; + _plane_shade = plane_shade; + _planeshade = planeshade; + _planelightfloat = planelightfloat; + _pviewx = pviewx; + _pviewy = pviewy; + _source = (const uint32_t*)ds_source; + _xbits = ds_xbits; + _ybits = ds_ybits; + } + + void Execute(DrawerThread *thread) override + { + if (thread->line_skipped_by_thread(_y)) + return; + + //#define SPANSIZE 32 + //#define INVSPAN 0.03125f + //#define SPANSIZE 8 + //#define INVSPAN 0.125f + #define SPANSIZE 16 + #define INVSPAN 0.0625f + + int source_width = 1 << _xbits; + int source_height = 1 << _ybits; + + uint32_t *dest = ylookup[_y] + _x1 + (uint32_t*)_destorg; + int count = _x2 - _x1 + 1; + + // Depth (Z) change across the span + double iz = _plane_sz[2] + _plane_sz[1] * (centery - _y) + _plane_sz[0] * (_x1 - centerx); + + // Light change across the span + fixed_t lightstart = _light; + fixed_t lightend = lightstart; + if (_plane_shade) + { + double vis_start = iz * _planelightfloat; + double vis_end = (iz + _plane_sz[0] * count) * _planelightfloat; + + lightstart = LIGHTSCALE(vis_start, _planeshade); + lightend = LIGHTSCALE(vis_end, _planeshade); + } + fixed_t light = lightstart; + fixed_t steplight = (lightend - lightstart) / count; + + // Texture coordinates + double uz = _plane_su[2] + _plane_su[1] * (centery - _y) + _plane_su[0] * (_x1 - centerx); + double vz = _plane_sv[2] + _plane_sv[1] * (centery - _y) + _plane_sv[0] * (_x1 - centerx); + double startz = 1.f / iz; + double startu = uz*startz; + double startv = vz*startz; + double izstep = _plane_sz[0] * SPANSIZE; + double uzstep = _plane_su[0] * SPANSIZE; + double vzstep = _plane_sv[0] * SPANSIZE; + + // Linear interpolate in sizes of SPANSIZE to increase speed + while (count >= SPANSIZE) + { + iz += izstep; + uz += uzstep; + vz += vzstep; + + double endz = 1.f / iz; + double endu = uz*endz; + double endv = vz*endz; + uint32_t stepu = (uint32_t)(SQWORD((endu - startu) * INVSPAN)); + uint32_t stepv = (uint32_t)(SQWORD((endv - startv) * INVSPAN)); + uint32_t u = (uint32_t)(SQWORD(startu) + _pviewx); + uint32_t v = (uint32_t)(SQWORD(startv) + _pviewy); + + for (int i = 0; i < SPANSIZE; i++) + { + uint32_t sx = ((u >> 16) * source_width) >> 16; + uint32_t sy = ((v >> 16) * source_height) >> 16; + uint32_t fg = _source[sy + sx * source_height]; + + if (_shade_constants.simple_shade) + *(dest++) = LightBgra::shade_bgra_simple(fg, LightBgra::calc_light_multiplier(light)); + else + *(dest++) = LightBgra::shade_bgra(fg, LightBgra::calc_light_multiplier(light), _shade_constants); + + u += stepu; + v += stepv; + light += steplight; + } + startu = endu; + startv = endv; + count -= SPANSIZE; + } + + // The last few pixels at the end + while (count > 0) + { + double endz = 1.f / iz; + startu = uz*endz; + startv = vz*endz; + uint32_t u = (uint32_t)(SQWORD(startu) + _pviewx); + uint32_t v = (uint32_t)(SQWORD(startv) + _pviewy); + + uint32_t sx = ((u >> 16) * source_width) >> 16; + uint32_t sy = ((v >> 16) * source_height) >> 16; + uint32_t fg = _source[sy + sx * source_height]; + + if (_shade_constants.simple_shade) + *(dest++) = LightBgra::shade_bgra_simple(fg, LightBgra::calc_light_multiplier(light)); + else + *(dest++) = LightBgra::shade_bgra(fg, LightBgra::calc_light_multiplier(light), _shade_constants); + + iz += _plane_sz[0]; + uz += _plane_su[0]; + vz += _plane_sv[0]; + light += steplight; + count--; + } + } +}; + +class DrawColoredSpanRGBACommand : public DrawerCommand +{ + int _y; + int _x1; + int _x2; + BYTE * RESTRICT _destorg; + fixed_t _light; + int _color; + +public: + DrawColoredSpanRGBACommand(int y, int x1, int x2) + { + _y = y; + _x1 = x1; + _x2 = x2; + + _destorg = dc_destorg; + _light = ds_light; + _color = ds_color; + } + + void Execute(DrawerThread *thread) override + { + if (thread->line_skipped_by_thread(_y)) + return; + + int y = _y; + int x1 = _x1; + int x2 = _x2; + + uint32_t *dest = ylookup[y] + x1 + (uint32_t*)_destorg; + int count = (x2 - x1 + 1); + uint32_t light = LightBgra::calc_light_multiplier(_light); + uint32_t color = LightBgra::shade_pal_index_simple(_color, light); + for (int i = 0; i < count; i++) + dest[i] = color; + } +}; + +class FillTransColumnRGBACommand : public DrawerCommand +{ + int _x; + int _y1; + int _y2; + int _color; + int _a; + BYTE * RESTRICT _destorg; + int _pitch; + fixed_t _light; + +public: + FillTransColumnRGBACommand(int x, int y1, int y2, int color, int a) + { + _x = x; + _y1 = y1; + _y2 = y2; + _color = color; + _a = a; + + _destorg = dc_destorg; + _pitch = dc_pitch; + } + + void Execute(DrawerThread *thread) override + { + int x = _x; + int y1 = _y1; + int y2 = _y2; + int color = _color; + int a = _a; + + int ycount = thread->count_for_thread(y1, y2 - y1 + 1); + if (ycount <= 0) + return; + + uint32_t fg = GPalette.BaseColors[color].d; + uint32_t fg_red = (fg >> 16) & 0xff; + uint32_t fg_green = (fg >> 8) & 0xff; + uint32_t fg_blue = fg & 0xff; + + uint32_t alpha = a + 1; + uint32_t inv_alpha = 256 - alpha; + + fg_red *= alpha; + fg_green *= alpha; + fg_blue *= alpha; + + int spacing = _pitch * thread->num_cores; + uint32_t *dest = thread->dest_for_thread(y1, _pitch, ylookup[y1] + x + (uint32_t*)_destorg); + + for (int y = 0; y < ycount; y++) + { + uint32_t bg_red = (*dest >> 16) & 0xff; + uint32_t bg_green = (*dest >> 8) & 0xff; + uint32_t bg_blue = (*dest) & 0xff; + + uint32_t red = (fg_red + bg_red * inv_alpha) / 256; + uint32_t green = (fg_green + bg_green * inv_alpha) / 256; + uint32_t blue = (fg_blue + bg_blue * inv_alpha) / 256; + + *dest = 0xff000000 | (red << 16) | (green << 8) | blue; + dest += spacing; + } + } +}; + +ApplySpecialColormapRGBACommand::ApplySpecialColormapRGBACommand(FSpecialColormap *colormap, DFrameBuffer *screen) +{ + buffer = screen->GetBuffer(); + pitch = screen->GetPitch(); + width = screen->GetWidth(); + height = screen->GetHeight(); + + start_red = (int)(colormap->ColorizeStart[0] * 255); + start_green = (int)(colormap->ColorizeStart[1] * 255); + start_blue = (int)(colormap->ColorizeStart[2] * 255); + end_red = (int)(colormap->ColorizeEnd[0] * 255); + end_green = (int)(colormap->ColorizeEnd[1] * 255); + end_blue = (int)(colormap->ColorizeEnd[2] * 255); +} + +#ifdef NO_SSE +void ApplySpecialColormapRGBACommand::Execute(DrawerThread *thread) +{ + int y = thread->skipped_by_thread(0); + int count = thread->count_for_thread(0, height); + while (count > 0) + { + BYTE *pixels = buffer + y * pitch * 4; + for (int x = 0; x < width; x++) + { + int fg_red = pixels[2]; + int fg_green = pixels[1]; + int fg_blue = pixels[0]; + + int gray = (fg_red * 77 + fg_green * 143 + fg_blue * 37) >> 8; + gray += (gray >> 7); // gray*=256/255 + int inv_gray = 256 - gray; + + int red = clamp((start_red * inv_gray + end_red * gray) >> 8, 0, 255); + int green = clamp((start_green * inv_gray + end_green * gray) >> 8, 0, 255); + int blue = clamp((start_blue * inv_gray + end_blue * gray) >> 8, 0, 255); + + pixels[0] = (BYTE)blue; + pixels[1] = (BYTE)green; + pixels[2] = (BYTE)red; + pixels[3] = 0xff; + + pixels += 4; + } + y += thread->num_cores; + count--; + } +} +#else +void ApplySpecialColormapRGBACommand::Execute(DrawerThread *thread) +{ + int y = thread->skipped_by_thread(0); + int count = thread->count_for_thread(0, height); + __m128i gray_weight = _mm_set_epi16(256, 77, 143, 37, 256, 77, 143, 37); + __m128i start_end = _mm_set_epi16(255, start_red, start_green, start_blue, 255, end_red, end_green, end_blue); + while (count > 0) + { + BYTE *pixels = buffer + y * pitch * 4; + int sse_length = width / 4; + for (int x = 0; x < sse_length; x++) + { + // Unpack to integers: + __m128i p = _mm_loadu_si128((const __m128i*)pixels); + + __m128i p16_0 = _mm_unpacklo_epi8(p, _mm_setzero_si128()); + __m128i p16_1 = _mm_unpackhi_epi8(p, _mm_setzero_si128()); + + // Add gray weighting to colors + __m128i mullo0 = _mm_mullo_epi16(p16_0, gray_weight); + __m128i mullo1 = _mm_mullo_epi16(p16_1, gray_weight); + __m128i p32_0 = _mm_unpacklo_epi16(mullo0, _mm_setzero_si128()); + __m128i p32_1 = _mm_unpackhi_epi16(mullo0, _mm_setzero_si128()); + __m128i p32_2 = _mm_unpacklo_epi16(mullo1, _mm_setzero_si128()); + __m128i p32_3 = _mm_unpackhi_epi16(mullo1, _mm_setzero_si128()); + + // Transpose to get color components in individual vectors: + __m128 tmpx = _mm_castsi128_ps(p32_0); + __m128 tmpy = _mm_castsi128_ps(p32_1); + __m128 tmpz = _mm_castsi128_ps(p32_2); + __m128 tmpw = _mm_castsi128_ps(p32_3); + _MM_TRANSPOSE4_PS(tmpx, tmpy, tmpz, tmpw); + __m128i blue = _mm_castps_si128(tmpx); + __m128i green = _mm_castps_si128(tmpy); + __m128i red = _mm_castps_si128(tmpz); + __m128i alpha = _mm_castps_si128(tmpw); + + // Calculate gray and 256-gray values: + __m128i gray = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(red, green), blue), 8); + __m128i inv_gray = _mm_sub_epi32(_mm_set1_epi32(256), gray); + + // p32 = start * inv_gray + end * gray: + __m128i gray0 = _mm_shuffle_epi32(gray, _MM_SHUFFLE(0, 0, 0, 0)); + __m128i gray1 = _mm_shuffle_epi32(gray, _MM_SHUFFLE(1, 1, 1, 1)); + __m128i gray2 = _mm_shuffle_epi32(gray, _MM_SHUFFLE(2, 2, 2, 2)); + __m128i gray3 = _mm_shuffle_epi32(gray, _MM_SHUFFLE(3, 3, 3, 3)); + __m128i inv_gray0 = _mm_shuffle_epi32(inv_gray, _MM_SHUFFLE(0, 0, 0, 0)); + __m128i inv_gray1 = _mm_shuffle_epi32(inv_gray, _MM_SHUFFLE(1, 1, 1, 1)); + __m128i inv_gray2 = _mm_shuffle_epi32(inv_gray, _MM_SHUFFLE(2, 2, 2, 2)); + __m128i inv_gray3 = _mm_shuffle_epi32(inv_gray, _MM_SHUFFLE(3, 3, 3, 3)); + __m128i gray16_0 = _mm_packs_epi32(gray0, inv_gray0); + __m128i gray16_1 = _mm_packs_epi32(gray1, inv_gray1); + __m128i gray16_2 = _mm_packs_epi32(gray2, inv_gray2); + __m128i gray16_3 = _mm_packs_epi32(gray3, inv_gray3); + __m128i gray16_0_mullo = _mm_mullo_epi16(gray16_0, start_end); + __m128i gray16_1_mullo = _mm_mullo_epi16(gray16_1, start_end); + __m128i gray16_2_mullo = _mm_mullo_epi16(gray16_2, start_end); + __m128i gray16_3_mullo = _mm_mullo_epi16(gray16_3, start_end); + __m128i gray16_0_mulhi = _mm_mulhi_epi16(gray16_0, start_end); + __m128i gray16_1_mulhi = _mm_mulhi_epi16(gray16_1, start_end); + __m128i gray16_2_mulhi = _mm_mulhi_epi16(gray16_2, start_end); + __m128i gray16_3_mulhi = _mm_mulhi_epi16(gray16_3, start_end); + p32_0 = _mm_srli_epi32(_mm_add_epi32(_mm_unpacklo_epi16(gray16_0_mullo, gray16_0_mulhi), _mm_unpackhi_epi16(gray16_0_mullo, gray16_0_mulhi)), 8); + p32_1 = _mm_srli_epi32(_mm_add_epi32(_mm_unpacklo_epi16(gray16_1_mullo, gray16_1_mulhi), _mm_unpackhi_epi16(gray16_1_mullo, gray16_1_mulhi)), 8); + p32_2 = _mm_srli_epi32(_mm_add_epi32(_mm_unpacklo_epi16(gray16_2_mullo, gray16_2_mulhi), _mm_unpackhi_epi16(gray16_2_mullo, gray16_2_mulhi)), 8); + p32_3 = _mm_srli_epi32(_mm_add_epi32(_mm_unpacklo_epi16(gray16_3_mullo, gray16_3_mulhi), _mm_unpackhi_epi16(gray16_3_mullo, gray16_3_mulhi)), 8); + + p16_0 = _mm_packs_epi32(p32_0, p32_1); + p16_1 = _mm_packs_epi32(p32_2, p32_3); + p = _mm_packus_epi16(p16_0, p16_1); + + _mm_storeu_si128((__m128i*)pixels, p); + pixels += 16; + } + + for (int x = sse_length * 4; x < width; x++) + { + int fg_red = pixels[2]; + int fg_green = pixels[1]; + int fg_blue = pixels[0]; + + int gray = (fg_red * 77 + fg_green * 143 + fg_blue * 37) >> 8; + gray += (gray >> 7); // gray*=256/255 + int inv_gray = 256 - gray; + + int red = clamp((start_red * inv_gray + end_red * gray) >> 8, 0, 255); + int green = clamp((start_green * inv_gray + end_green * gray) >> 8, 0, 255); + int blue = clamp((start_blue * inv_gray + end_blue * gray) >> 8, 0, 255); + + pixels[0] = (BYTE)blue; + pixels[1] = (BYTE)green; + pixels[2] = (BYTE)red; + pixels[3] = 0xff; + + pixels += 4; + } + + y += thread->num_cores; + count--; + } +} +#endif + +///////////////////////////////////////////////////////////////////////////// + +void R_BeginDrawerCommands() +{ + DrawerCommandQueue::Begin(); +} + +void R_EndDrawerCommands() +{ + DrawerCommandQueue::End(); +} + +void R_DrawColumn_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_FillColumn_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_FillAddColumn_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_FillAddClampColumn_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_FillSubClampColumn_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_FillRevSubClampColumn_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_DrawFuzzColumn_rgba() +{ + DrawerCommandQueue::QueueCommand(); + + dc_yl = MAX(dc_yl, 1); + dc_yh = MIN(dc_yh, fuzzviewheight); + if (dc_yl <= dc_yh) + fuzzpos = (fuzzpos + dc_yh - dc_yl + 1) % FUZZTABLE; +} + +void R_DrawAddColumn_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_DrawTranslatedColumn_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_DrawTlatedAddColumn_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_DrawShadedColumn_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_DrawAddClampColumn_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_DrawAddClampTranslatedColumn_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_DrawSubClampColumn_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_DrawSubClampTranslatedColumn_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_DrawRevSubClampColumn_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_DrawRevSubClampTranslatedColumn_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_DrawSpan_rgba() +{ +#ifdef NO_SSE + DrawerCommandQueue::QueueCommand(); +#else + DrawerCommandQueue::QueueCommand(); +#endif +} + +void R_DrawSpanMasked_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_DrawSpanTranslucent_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_DrawSpanMaskedTranslucent_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_DrawSpanAddClamp_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_DrawSpanMaskedAddClamp_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_FillSpan_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_DrawTiltedSpan_rgba(int y, int x1, int x2, const FVector3 &plane_sz, const FVector3 &plane_su, const FVector3 &plane_sv, bool plane_shade, int planeshade, float planelightfloat, fixed_t pviewx, fixed_t pviewy) +{ + DrawerCommandQueue::QueueCommand(y, x1, x2, plane_sz, plane_su, plane_sv, plane_shade, planeshade, planelightfloat, pviewx, pviewy); +} + +void R_DrawColoredSpan_rgba(int y, int x1, int x2) +{ + DrawerCommandQueue::QueueCommand(y, x1, x2); +} + +static ShadeConstants slab_rgba_shade_constants; +static const BYTE *slab_rgba_colormap; +static fixed_t slab_rgba_light; + +void R_SetupDrawSlab_rgba(FColormap *base_colormap, float light, int shade) +{ + slab_rgba_shade_constants.light_red = base_colormap->Color.r * 256 / 255; + slab_rgba_shade_constants.light_green = base_colormap->Color.g * 256 / 255; + slab_rgba_shade_constants.light_blue = base_colormap->Color.b * 256 / 255; + slab_rgba_shade_constants.light_alpha = base_colormap->Color.a * 256 / 255; + slab_rgba_shade_constants.fade_red = base_colormap->Fade.r; + slab_rgba_shade_constants.fade_green = base_colormap->Fade.g; + slab_rgba_shade_constants.fade_blue = base_colormap->Fade.b; + slab_rgba_shade_constants.fade_alpha = base_colormap->Fade.a; + slab_rgba_shade_constants.desaturate = MIN(abs(base_colormap->Desaturate), 255) * 255 / 256; + slab_rgba_shade_constants.simple_shade = (base_colormap->Color.d == 0x00ffffff && base_colormap->Fade.d == 0x00000000 && base_colormap->Desaturate == 0); + slab_rgba_colormap = base_colormap->Maps; + slab_rgba_light = LIGHTSCALE(light, shade); +} + +void R_DrawSlab_rgba(int dx, fixed_t v, int dy, fixed_t vi, const BYTE *vptr, BYTE *p) +{ + DrawerCommandQueue::QueueCommand(dx, v, dy, vi, vptr, p, slab_rgba_shade_constants, slab_rgba_colormap, slab_rgba_light); +} + +DWORD vlinec1_rgba() +{ + DrawerCommandQueue::QueueCommand(); + return dc_texturefrac + dc_count * dc_iscale; +} + +template +void queue_wallcommand() +{ + if (bufplce2[0] == nullptr && dc_shade_constants.simple_shade) + DrawerCommandQueue::QueueCommand(); + else if (bufplce2[0] == nullptr) + DrawerCommandQueue::QueueCommand(); + else if (dc_shade_constants.simple_shade) + DrawerCommandQueue::QueueCommand(); + else + DrawerCommandQueue::QueueCommand(); +} + +void vlinec4_rgba() +{ + queue_wallcommand(); + for (int i = 0; i < 4; i++) + vplce[i] += vince[i] * dc_count; +} + +DWORD mvlinec1_rgba() +{ + DrawerCommandQueue::QueueCommand(); + return dc_texturefrac + dc_count * dc_iscale; +} + +void mvlinec4_rgba() +{ + queue_wallcommand(); + for (int i = 0; i < 4; i++) + vplce[i] += vince[i] * dc_count; +} + +fixed_t tmvline1_add_rgba() +{ + DrawerCommandQueue::QueueCommand(); + return dc_texturefrac + dc_count * dc_iscale; +} + +void tmvline4_add_rgba() +{ + queue_wallcommand(); + for (int i = 0; i < 4; i++) + vplce[i] += vince[i] * dc_count; +} + +fixed_t tmvline1_addclamp_rgba() +{ + DrawerCommandQueue::QueueCommand(); + return dc_texturefrac + dc_count * dc_iscale; +} + +void tmvline4_addclamp_rgba() +{ + queue_wallcommand(); + for (int i = 0; i < 4; i++) + vplce[i] += vince[i] * dc_count; +} + +fixed_t tmvline1_subclamp_rgba() +{ + DrawerCommandQueue::QueueCommand(); + return dc_texturefrac + dc_count * dc_iscale; +} + +void tmvline4_subclamp_rgba() +{ + queue_wallcommand(); + for (int i = 0; i < 4; i++) + vplce[i] += vince[i] * dc_count; +} + +fixed_t tmvline1_revsubclamp_rgba() +{ + DrawerCommandQueue::QueueCommand(); + return dc_texturefrac + dc_count * dc_iscale; +} + +void tmvline4_revsubclamp_rgba() +{ + queue_wallcommand(); + for (int i = 0; i < 4; i++) + vplce[i] += vince[i] * dc_count; +} + +void R_DrawFogBoundarySection_rgba(int y, int y2, int x1) +{ + for (; y < y2; ++y) + { + int x2 = spanend[y]; + DrawerCommandQueue::QueueCommand(y, x1, x2); + } +} + +void R_DrawFogBoundary_rgba(int x1, int x2, short *uclip, short *dclip) +{ + // To do: we do not need to create new spans when using rgba output - instead we should calculate light on a per pixel basis + + // This is essentially the same as R_MapVisPlane but with an extra step + // to create new horizontal spans whenever the light changes enough that + // we need to use a new colormap. + + double lightstep = rw_lightstep; + double light = rw_light + rw_lightstep*(x2 - x1 - 1); + int x = x2 - 1; + int t2 = uclip[x]; + int b2 = dclip[x]; + int rcolormap = GETPALOOKUP(light, wallshade); + int lcolormap; + BYTE *basecolormapdata = basecolormap->Maps; + + if (b2 > t2) + { + clearbufshort(spanend + t2, b2 - t2, x); + } + + R_SetColorMapLight(basecolormap, (float)light, wallshade); + + BYTE *fake_dc_colormap = basecolormap->Maps + (GETPALOOKUP(light, wallshade) << COLORMAPSHIFT); + + for (--x; x >= x1; --x) + { + int t1 = uclip[x]; + int b1 = dclip[x]; + const int xr = x + 1; + int stop; + + light -= rw_lightstep; + lcolormap = GETPALOOKUP(light, wallshade); + if (lcolormap != rcolormap) + { + if (t2 < b2 && rcolormap != 0) + { // Colormap 0 is always the identity map, so rendering it is + // just a waste of time. + R_DrawFogBoundarySection_rgba(t2, b2, xr); + } + if (t1 < t2) t2 = t1; + if (b1 > b2) b2 = b1; + if (t2 < b2) + { + clearbufshort(spanend + t2, b2 - t2, x); + } + rcolormap = lcolormap; + R_SetColorMapLight(basecolormap, (float)light, wallshade); + fake_dc_colormap = basecolormap->Maps + (GETPALOOKUP(light, wallshade) << COLORMAPSHIFT); + } + else + { + if (fake_dc_colormap != basecolormapdata) + { + stop = MIN(t1, b2); + while (t2 < stop) + { + int y = t2++; + DrawerCommandQueue::QueueCommand(y, xr, spanend[y]); + } + stop = MAX(b1, t2); + while (b2 > stop) + { + int y = --b2; + DrawerCommandQueue::QueueCommand(y, xr, spanend[y]); + } + } + else + { + t2 = MAX(t2, MIN(t1, b2)); + b2 = MIN(b2, MAX(b1, t2)); + } + + stop = MIN(t2, b1); + while (t1 < stop) + { + spanend[t1++] = x; + } + stop = MAX(b2, t2); + while (b1 > stop) + { + spanend[--b1] = x; + } + } + + t2 = uclip[x]; + b2 = dclip[x]; + } + if (t2 < b2 && rcolormap != 0) + { + R_DrawFogBoundarySection_rgba(t2, b2, x1); + } +} diff --git a/src/r_draw_rgba.h b/src/r_draw_rgba.h new file mode 100644 index 000000000..ca54f7263 --- /dev/null +++ b/src/r_draw_rgba.h @@ -0,0 +1,994 @@ +// Emacs style mode select -*- C++ -*- +//----------------------------------------------------------------------------- +// +// $Id:$ +// +// Copyright (C) 1993-1996 by id Software, Inc. +// +// This source is available for distribution and/or modification +// only under the terms of the DOOM Source Code License as +// published by id Software. All rights reserved. +// +// The source is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// FITNESS FOR A PARTICULAR PURPOSE. See the DOOM Source Code License +// for more details. +// +// DESCRIPTION: +// System specific interface stuff. +// +//----------------------------------------------------------------------------- + + +#ifndef __R_DRAW_RGBA__ +#define __R_DRAW_RGBA__ + +#include "r_draw.h" +#include "v_palette.h" +#include +#include +#include +#include +#include + +#ifndef NO_SSE +#include +#endif + +///////////////////////////////////////////////////////////////////////////// +// Drawer functions: + +void rt_initcols_rgba(BYTE *buffer); +void rt_span_coverage_rgba(int x, int start, int stop); + +void rt_copy1col_rgba(int hx, int sx, int yl, int yh); +void rt_copy4cols_rgba(int sx, int yl, int yh); +void rt_shaded1col_rgba(int hx, int sx, int yl, int yh); +void rt_shaded4cols_rgba(int sx, int yl, int yh); +void rt_map1col_rgba(int hx, int sx, int yl, int yh); +void rt_add1col_rgba(int hx, int sx, int yl, int yh); +void rt_addclamp1col_rgba(int hx, int sx, int yl, int yh); +void rt_subclamp1col_rgba(int hx, int sx, int yl, int yh); +void rt_revsubclamp1col_rgba(int hx, int sx, int yl, int yh); +void rt_tlate1col_rgba(int hx, int sx, int yl, int yh); +void rt_tlateadd1col_rgba(int hx, int sx, int yl, int yh); +void rt_tlateaddclamp1col_rgba(int hx, int sx, int yl, int yh); +void rt_tlatesubclamp1col_rgba(int hx, int sx, int yl, int yh); +void rt_tlaterevsubclamp1col_rgba(int hx, int sx, int yl, int yh); +void rt_map4cols_rgba(int sx, int yl, int yh); +void rt_add4cols_rgba(int sx, int yl, int yh); +void rt_addclamp4cols_rgba(int sx, int yl, int yh); +void rt_subclamp4cols_rgba(int sx, int yl, int yh); +void rt_revsubclamp4cols_rgba(int sx, int yl, int yh); +void rt_tlate4cols_rgba(int sx, int yl, int yh); +void rt_tlateadd4cols_rgba(int sx, int yl, int yh); +void rt_tlateaddclamp4cols_rgba(int sx, int yl, int yh); +void rt_tlatesubclamp4cols_rgba(int sx, int yl, int yh); +void rt_tlaterevsubclamp4cols_rgba(int sx, int yl, int yh); + +void R_DrawColumnHoriz_rgba(); +void R_DrawColumn_rgba(); +void R_DrawFuzzColumn_rgba(); +void R_DrawTranslatedColumn_rgba(); +void R_DrawShadedColumn_rgba(); + +void R_FillColumn_rgba(); +void R_FillAddColumn_rgba(); +void R_FillAddClampColumn_rgba(); +void R_FillSubClampColumn_rgba(); +void R_FillRevSubClampColumn_rgba(); +void R_DrawAddColumn_rgba(); +void R_DrawTlatedAddColumn_rgba(); +void R_DrawAddClampColumn_rgba(); +void R_DrawAddClampTranslatedColumn_rgba(); +void R_DrawSubClampColumn_rgba(); +void R_DrawSubClampTranslatedColumn_rgba(); +void R_DrawRevSubClampColumn_rgba(); +void R_DrawRevSubClampTranslatedColumn_rgba(); + +void R_DrawSpan_rgba(void); +void R_DrawSpanMasked_rgba(void); +void R_DrawSpanTranslucent_rgba(); +void R_DrawSpanMaskedTranslucent_rgba(); +void R_DrawSpanAddClamp_rgba(); +void R_DrawSpanMaskedAddClamp_rgba(); +void R_FillSpan_rgba(); + +void R_DrawTiltedSpan_rgba(int y, int x1, int x2, const FVector3 &plane_sz, const FVector3 &plane_su, const FVector3 &plane_sv, bool plane_shade, int planeshade, float planelightfloat, fixed_t pviewx, fixed_t pviewy); +void R_DrawColoredSpan_rgba(int y, int x1, int x2); + +void R_SetupDrawSlab_rgba(FColormap *base_colormap, float light, int shade); +void R_DrawSlab_rgba(int dx, fixed_t v, int dy, fixed_t vi, const BYTE *vptr, BYTE *p); + +void R_DrawFogBoundary_rgba(int x1, int x2, short *uclip, short *dclip); + +DWORD vlinec1_rgba(); +void vlinec4_rgba(); +DWORD mvlinec1_rgba(); +void mvlinec4_rgba(); +fixed_t tmvline1_add_rgba(); +void tmvline4_add_rgba(); +fixed_t tmvline1_addclamp_rgba(); +void tmvline4_addclamp_rgba(); +fixed_t tmvline1_subclamp_rgba(); +void tmvline4_subclamp_rgba(); +fixed_t tmvline1_revsubclamp_rgba(); +void tmvline4_revsubclamp_rgba(); + +void R_FillColumnHoriz_rgba(); +void R_FillSpan_rgba(); + +///////////////////////////////////////////////////////////////////////////// +// Multithreaded rendering infrastructure: + +// Redirect drawer commands to worker threads +void R_BeginDrawerCommands(); + +// Wait until all drawers finished executing +void R_EndDrawerCommands(); + +struct FSpecialColormap; +class DrawerCommandQueue; + +// Worker data for each thread executing drawer commands +class DrawerThread +{ +public: + std::thread thread; + + // Thread line index of this thread + int core = 0; + + // Number of active threads + int num_cores = 1; + + // Range of rows processed this pass + int pass_start_y = 0; + int pass_end_y = MAXHEIGHT; + + uint32_t dc_temp_rgbabuff_rgba[MAXHEIGHT * 4]; + uint32_t *dc_temp_rgba; + + // Checks if a line is rendered by this thread + bool line_skipped_by_thread(int line) + { + return line < pass_start_y || line >= pass_end_y || line % num_cores != core; + } + + // The number of lines to skip to reach the first line to be rendered by this thread + int skipped_by_thread(int first_line) + { + int pass_skip = MAX(pass_start_y - first_line, 0); + int core_skip = (num_cores - (first_line + pass_skip - core) % num_cores) % num_cores; + return pass_skip + core_skip; + } + + // The number of lines to be rendered by this thread + int count_for_thread(int first_line, int count) + { + int lines_until_pass_end = MAX(pass_end_y - first_line, 0); + count = MIN(count, lines_until_pass_end); + int c = (count - skipped_by_thread(first_line) + num_cores - 1) / num_cores; + return MAX(c, 0); + } + + // Calculate the dest address for the first line to be rendered by this thread + uint32_t *dest_for_thread(int first_line, int pitch, uint32_t *dest) + { + return dest + skipped_by_thread(first_line) * pitch; + } +}; + +// Task to be executed by each worker thread +class DrawerCommand +{ +protected: + int _dest_y; + +public: + DrawerCommand() + { + _dest_y = static_cast((dc_dest - dc_destorg) / (dc_pitch * 4)); + } + + virtual void Execute(DrawerThread *thread) = 0; +}; + +EXTERN_CVAR(Bool, r_multithreaded) +EXTERN_CVAR(Bool, r_mipmap) + +// Manages queueing up commands and executing them on worker threads +class DrawerCommandQueue +{ + enum { memorypool_size = 16 * 1024 * 1024 }; + char memorypool[memorypool_size]; + size_t memorypool_pos = 0; + + std::vector commands; + + std::vector threads; + + std::mutex start_mutex; + std::condition_variable start_condition; + std::vector active_commands; + bool shutdown_flag = false; + int run_id = 0; + + std::mutex end_mutex; + std::condition_variable end_condition; + size_t finished_threads = 0; + + int threaded_render = 0; + DrawerThread single_core_thread; + int num_passes = 1; + int rows_in_pass = MAXHEIGHT; + + void StartThreads(); + void StopThreads(); + void Finish(); + + static DrawerCommandQueue *Instance(); + + DrawerCommandQueue(); + ~DrawerCommandQueue(); + +public: + // Allocate memory valid for the duration of a command execution + static void* AllocMemory(size_t size); + + // Queue command to be executed by drawer worker threads + template + static void QueueCommand(Types &&... args) + { + auto queue = Instance(); + if (queue->threaded_render == 0 || !r_multithreaded) + { + T command(std::forward(args)...); + command.Execute(&queue->single_core_thread); + } + else + { + void *ptr = AllocMemory(sizeof(T)); + if (!ptr) // Out of memory - render what we got + { + queue->Finish(); + ptr = AllocMemory(sizeof(T)); + if (!ptr) + return; + } + T *command = new (ptr)T(std::forward(args)...); + queue->commands.push_back(command); + } + } + + // Redirects all drawing commands to worker threads until End is called + // Begin/End blocks can be nested. + static void Begin(); + + // End redirection and wait until all worker threads finished executing + static void End(); + + // Waits until all worker threads finished executing + static void WaitForWorkers(); +}; + +///////////////////////////////////////////////////////////////////////////// +// Drawer commands: + +class ApplySpecialColormapRGBACommand : public DrawerCommand +{ + BYTE *buffer; + int pitch; + int width; + int height; + int start_red; + int start_green; + int start_blue; + int end_red; + int end_green; + int end_blue; + +public: + ApplySpecialColormapRGBACommand(FSpecialColormap *colormap, DFrameBuffer *screen); + void Execute(DrawerThread *thread) override; +}; + +template +class DrawerBlendCommand : public CommandType +{ +public: + void Execute(DrawerThread *thread) override + { + typename CommandType::LoopIterator loop(this, thread); + if (!loop) return; + BlendMode blend(*this, loop); + do + { + blend.Blend(*this, loop); + } while (loop.next()); + } +}; + +///////////////////////////////////////////////////////////////////////////// +// Pixel shading inline functions: + +// Give the compiler a strong hint we want these functions inlined: +#ifndef FORCEINLINE +#if defined(_MSC_VER) +#define FORCEINLINE __forceinline +#elif defined(__GNUC__) +#define FORCEINLINE __attribute__((always_inline)) inline +#else +#define FORCEINLINE inline +#endif +#endif + +// Promise compiler we have no aliasing of this pointer +#ifndef RESTRICT +#if defined(_MSC_VER) +#define RESTRICT __restrict +#elif defined(__GNUC__) +#define RESTRICT __restrict__ +#else +#define RESTRICT +#endif +#endif + +class LightBgra +{ +public: + // calculates the light constant passed to the shade_pal_index function + FORCEINLINE static uint32_t calc_light_multiplier(dsfixed_t light) + { + return 256 - (light >> (FRACBITS - 8)); + } + + // Calculates a ARGB8 color for the given palette index and light multiplier + FORCEINLINE static uint32_t shade_pal_index_simple(uint32_t index, uint32_t light) + { + const PalEntry &color = GPalette.BaseColors[index]; + uint32_t red = color.r; + uint32_t green = color.g; + uint32_t blue = color.b; + + red = red * light / 256; + green = green * light / 256; + blue = blue * light / 256; + + return 0xff000000 | (red << 16) | (green << 8) | blue; + } + + // Calculates a ARGB8 color for the given palette index, light multiplier and dynamic colormap + FORCEINLINE static uint32_t shade_pal_index(uint32_t index, uint32_t light, const ShadeConstants &constants) + { + const PalEntry &color = GPalette.BaseColors[index]; + uint32_t alpha = color.d & 0xff000000; + uint32_t red = color.r; + uint32_t green = color.g; + uint32_t blue = color.b; + if (constants.simple_shade) + { + red = red * light / 256; + green = green * light / 256; + blue = blue * light / 256; + } + else + { + uint32_t inv_light = 256 - light; + uint32_t inv_desaturate = 256 - constants.desaturate; + + uint32_t intensity = ((red * 77 + green * 143 + blue * 37) >> 8) * constants.desaturate; + + red = (red * inv_desaturate + intensity) / 256; + green = (green * inv_desaturate + intensity) / 256; + blue = (blue * inv_desaturate + intensity) / 256; + + red = (constants.fade_red * inv_light + red * light) / 256; + green = (constants.fade_green * inv_light + green * light) / 256; + blue = (constants.fade_blue * inv_light + blue * light) / 256; + + red = (red * constants.light_red) / 256; + green = (green * constants.light_green) / 256; + blue = (blue * constants.light_blue) / 256; + } + return alpha | (red << 16) | (green << 8) | blue; + } + + FORCEINLINE static uint32_t shade_bgra_simple(uint32_t color, uint32_t light) + { + uint32_t red = RPART(color) * light / 256; + uint32_t green = GPART(color) * light / 256; + uint32_t blue = BPART(color) * light / 256; + return 0xff000000 | (red << 16) | (green << 8) | blue; + } + + FORCEINLINE static uint32_t shade_bgra(uint32_t color, uint32_t light, const ShadeConstants &constants) + { + uint32_t alpha = color & 0xff000000; + uint32_t red = (color >> 16) & 0xff; + uint32_t green = (color >> 8) & 0xff; + uint32_t blue = color & 0xff; + if (constants.simple_shade) + { + red = red * light / 256; + green = green * light / 256; + blue = blue * light / 256; + } + else + { + uint32_t inv_light = 256 - light; + uint32_t inv_desaturate = 256 - constants.desaturate; + + uint32_t intensity = ((red * 77 + green * 143 + blue * 37) >> 8) * constants.desaturate; + + red = (red * inv_desaturate + intensity) / 256; + green = (green * inv_desaturate + intensity) / 256; + blue = (blue * inv_desaturate + intensity) / 256; + + red = (constants.fade_red * inv_light + red * light) / 256; + green = (constants.fade_green * inv_light + green * light) / 256; + blue = (constants.fade_blue * inv_light + blue * light) / 256; + + red = (red * constants.light_red) / 256; + green = (green * constants.light_green) / 256; + blue = (blue * constants.light_blue) / 256; + } + return alpha | (red << 16) | (green << 8) | blue; + } +}; + +class BlendBgra +{ +public: + FORCEINLINE static uint32_t copy(uint32_t fg) + { + return fg; + } + + FORCEINLINE static uint32_t add(uint32_t fg, uint32_t bg, uint32_t srcalpha, uint32_t destalpha) + { + uint32_t red = MIN((RPART(fg) * srcalpha + RPART(bg) * destalpha) >> 8, 255); + uint32_t green = MIN((GPART(fg) * srcalpha + GPART(bg) * destalpha) >> 8, 255); + uint32_t blue = MIN((BPART(fg) * srcalpha + BPART(bg) * destalpha) >> 8, 255); + return 0xff000000 | (red << 16) | (green << 8) | blue; + } + + FORCEINLINE static uint32_t sub(uint32_t fg, uint32_t bg, uint32_t srcalpha, uint32_t destalpha) + { + uint32_t red = clamp((0x10000 - RPART(fg) * srcalpha + RPART(bg) * destalpha) >> 8, 256, 256 + 255) - 256; + uint32_t green = clamp((0x10000 - GPART(fg) * srcalpha + GPART(bg) * destalpha) >> 8, 256, 256 + 255) - 256; + uint32_t blue = clamp((0x10000 - BPART(fg) * srcalpha + BPART(bg) * destalpha) >> 8, 256, 256 + 255) - 256; + return 0xff000000 | (red << 16) | (green << 8) | blue; + } + + FORCEINLINE static uint32_t revsub(uint32_t fg, uint32_t bg, uint32_t srcalpha, uint32_t destalpha) + { + uint32_t red = clamp((0x10000 + RPART(fg) * srcalpha - RPART(bg) * destalpha) >> 8, 256, 256 + 255) - 256; + uint32_t green = clamp((0x10000 + GPART(fg) * srcalpha - GPART(bg) * destalpha) >> 8, 256, 256 + 255) - 256; + uint32_t blue = clamp((0x10000 + BPART(fg) * srcalpha - BPART(bg) * destalpha) >> 8, 256, 256 + 255) - 256; + return 0xff000000 | (red << 16) | (green << 8) | blue; + } + + FORCEINLINE static uint32_t alpha_blend(uint32_t fg, uint32_t bg) + { + uint32_t alpha = APART(fg) + (APART(fg) >> 7); // 255 -> 256 + uint32_t inv_alpha = 256 - alpha; + uint32_t red = MIN(RPART(fg) * alpha + (RPART(bg) * inv_alpha) / 256, 255); + uint32_t green = MIN(GPART(fg) * alpha + (GPART(bg) * inv_alpha) / 256, 255); + uint32_t blue = MIN(BPART(fg) * alpha + (BPART(bg) * inv_alpha) / 256, 255); + return 0xff000000 | (red << 16) | (green << 8) | blue; + } +}; + +class SampleBgra +{ +public: + inline static bool span_sampler_setup(const uint32_t * RESTRICT &source, int &xbits, int &ybits, fixed_t xstep, fixed_t ystep, bool mipmapped) + { + // Is this a magfilter or minfilter? + fixed_t xmagnitude = abs(xstep) >> (32 - xbits - FRACBITS); + fixed_t ymagnitude = abs(ystep) >> (32 - ybits - FRACBITS); + fixed_t magnitude = (xmagnitude + ymagnitude) * 2 + (1 << (FRACBITS - 1)); + bool magnifying = (magnitude >> FRACBITS == 0); + + if (r_mipmap && mipmapped) + { + int level = magnitude >> (FRACBITS + 1); + while (level != 0) + { + if (xbits <= 2 || ybits <= 2) + break; + + source += (1 << (xbits)) * (1 << (ybits)); + xbits -= 1; + ybits -= 1; + level >>= 1; + } + } + + return (magnifying && r_magfilter) || (!magnifying && r_minfilter); + } + + FORCEINLINE static uint32_t sample_bilinear(const uint32_t *col0, const uint32_t *col1, uint32_t texturefracx, uint32_t texturefracy, uint32_t one, uint32_t height) + { + uint32_t frac_y0 = (texturefracy >> FRACBITS) * height; + uint32_t frac_y1 = ((texturefracy + one) >> FRACBITS) * height; + uint32_t y0 = frac_y0 >> FRACBITS; + uint32_t y1 = frac_y1 >> FRACBITS; + + uint32_t p00 = col0[y0]; + uint32_t p01 = col0[y1]; + uint32_t p10 = col1[y0]; + uint32_t p11 = col1[y1]; + + uint32_t inv_b = texturefracx; + uint32_t inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t red = (RPART(p00) * a * b + RPART(p01) * inv_a * b + RPART(p10) * a * inv_b + RPART(p11) * inv_a * inv_b + 127) >> 8; + uint32_t green = (GPART(p00) * a * b + GPART(p01) * inv_a * b + GPART(p10) * a * inv_b + GPART(p11) * inv_a * inv_b + 127) >> 8; + uint32_t blue = (BPART(p00) * a * b + BPART(p01) * inv_a * b + BPART(p10) * a * inv_b + BPART(p11) * inv_a * inv_b + 127) >> 8; + uint32_t alpha = (APART(p00) * a * b + APART(p01) * inv_a * b + APART(p10) * a * inv_b + APART(p11) * inv_a * inv_b + 127) >> 8; + + return (alpha << 24) | (red << 16) | (green << 8) | blue; + } + + FORCEINLINE static uint32_t sample_bilinear(const uint32_t *texture, dsfixed_t xfrac, dsfixed_t yfrac, int xbits, int ybits) + { + int xshift = (32 - xbits); + int yshift = (32 - ybits); + int xmask = (1 << xshift) - 1; + int ymask = (1 << yshift) - 1; + uint32_t x = xfrac >> xbits; + uint32_t y = yfrac >> ybits; + + uint32_t p00 = texture[(y & ymask) + ((x & xmask) << yshift)]; + uint32_t p01 = texture[((y + 1) & ymask) + ((x & xmask) << yshift)]; + uint32_t p10 = texture[(y & ymask) + (((x + 1) & xmask) << yshift)]; + uint32_t p11 = texture[((y + 1) & ymask) + (((x + 1) & xmask) << yshift)]; + + uint32_t inv_b = (xfrac >> (xbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (ybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t red = (RPART(p00) * a * b + RPART(p01) * inv_a * b + RPART(p10) * a * inv_b + RPART(p11) * inv_a * inv_b + 127) >> 8; + uint32_t green = (GPART(p00) * a * b + GPART(p01) * inv_a * b + GPART(p10) * a * inv_b + GPART(p11) * inv_a * inv_b + 127) >> 8; + uint32_t blue = (BPART(p00) * a * b + BPART(p01) * inv_a * b + BPART(p10) * a * inv_b + BPART(p11) * inv_a * inv_b + 127) >> 8; + uint32_t alpha = (APART(p00) * a * b + APART(p01) * inv_a * b + APART(p10) * a * inv_b + APART(p11) * inv_a * inv_b + 127) >> 8; + + return (alpha << 24) | (red << 16) | (green << 8) | blue; + } + +#ifndef NO_SSE + static __m128i samplertable[256 * 2]; +#endif +}; + +///////////////////////////////////////////////////////////////////////////// +// SSE/AVX shading macros: + +#define AVX2_SAMPLE_BILINEAR4_COLUMN_INIT(col0, col1, one, height, texturefracx) \ + const uint32_t *baseptr = col0[0]; \ + __m128i coloffsets0 = _mm_setr_epi32(col0[0] - baseptr, col0[1] - baseptr, col0[2] - baseptr, col0[3] - baseptr); \ + __m128i coloffsets1 = _mm_setr_epi32(col1[0] - baseptr, col1[1] - baseptr, col1[2] - baseptr, col1[3] - baseptr); \ + __m128i mone = _mm_loadu_si128((const __m128i*)one); \ + __m128i m127 = _mm_set1_epi16(127); \ + __m128i m16 = _mm_set1_epi32(16); \ + __m128i m15 = _mm_set1_epi32(15); \ + __m128i mheight = _mm_loadu_si128((const __m128i*)height); \ + __m128i mtexturefracx = _mm_loadu_si128((const __m128i*)texturefracx); + +#define AVX2_SAMPLE_BILINEAR4_COLUMN(fg, texturefracy) { \ + __m128i mtexturefracy = _mm_loadu_si128((const __m128i*)texturefracy); \ + __m128i multmp0 = _mm_srli_epi32(mtexturefracy, FRACBITS); \ + __m128i multmp1 = _mm_srli_epi32(_mm_add_epi32(mtexturefracy, mone), FRACBITS); \ + __m128i frac_y0 = _mm_or_si128(_mm_mul_epu32(multmp0, mheight), _mm_slli_si128(_mm_mul_epu32(_mm_srli_si128(multmp0, 4), _mm_srli_si128(mheight, 4)), 4)); \ + __m128i frac_y1 = _mm_or_si128(_mm_mul_epu32(multmp1, mheight), _mm_slli_si128(_mm_mul_epu32(_mm_srli_si128(multmp1, 4), _mm_srli_si128(mheight, 4)), 4)); \ + __m128i y0 = _mm_srli_epi32(frac_y0, FRACBITS); \ + __m128i y1 = _mm_srli_epi32(frac_y1, FRACBITS); \ + __m128i inv_b = mtexturefracx; \ + __m128i inv_a = _mm_and_si128(_mm_srli_epi32(frac_y1, FRACBITS - 4), m15); \ + __m128i a = _mm_sub_epi32(m16, inv_a); \ + __m128i b = _mm_sub_epi32(m16, inv_b); \ + __m128i ab = _mm_mullo_epi16(a, b); \ + __m128i invab = _mm_mullo_epi16(inv_a, b); \ + __m128i ainvb = _mm_mullo_epi16(a, inv_b); \ + __m128i invainvb = _mm_mullo_epi16(inv_a, inv_b); \ + __m128i ab_lo = _mm_shuffle_epi32(ab, _MM_SHUFFLE(1, 1, 0, 0)); \ + __m128i ab_hi = _mm_shuffle_epi32(ab, _MM_SHUFFLE(3, 3, 2, 2)); \ + __m128i invab_lo = _mm_shuffle_epi32(invab, _MM_SHUFFLE(1, 1, 0, 0)); \ + __m128i invab_hi = _mm_shuffle_epi32(invab, _MM_SHUFFLE(3, 3, 2, 2)); \ + __m128i ainvb_lo = _mm_shuffle_epi32(ainvb, _MM_SHUFFLE(1, 1, 0, 0)); \ + __m128i ainvb_hi = _mm_shuffle_epi32(ainvb, _MM_SHUFFLE(3, 3, 2, 2)); \ + __m128i invainvb_lo = _mm_shuffle_epi32(invainvb, _MM_SHUFFLE(1, 1, 0, 0)); \ + __m128i invainvb_hi = _mm_shuffle_epi32(invainvb, _MM_SHUFFLE(3, 3, 2, 2)); \ + ab_lo = _mm_or_si128(ab_lo, _mm_slli_epi32(ab_lo, 16)); \ + ab_hi = _mm_or_si128(ab_hi, _mm_slli_epi32(ab_hi, 16)); \ + invab_lo = _mm_or_si128(invab_lo, _mm_slli_epi32(invab_lo, 16)); \ + invab_hi = _mm_or_si128(invab_hi, _mm_slli_epi32(invab_hi, 16)); \ + ainvb_lo = _mm_or_si128(ainvb_lo, _mm_slli_epi32(ainvb_lo, 16)); \ + ainvb_hi = _mm_or_si128(ainvb_hi, _mm_slli_epi32(ainvb_hi, 16)); \ + invainvb_lo = _mm_or_si128(invainvb_lo, _mm_slli_epi32(invainvb_lo, 16)); \ + invainvb_hi = _mm_or_si128(invainvb_hi, _mm_slli_epi32(invainvb_hi, 16)); \ + __m128i p00 = _mm_i32gather_epi32((const int *)baseptr, _mm_add_epi32(y0, coloffsets0), 4); \ + __m128i p01 = _mm_i32gather_epi32((const int *)baseptr, _mm_add_epi32(y1, coloffsets0), 4); \ + __m128i p10 = _mm_i32gather_epi32((const int *)baseptr, _mm_add_epi32(y0, coloffsets1), 4); \ + __m128i p11 = _mm_i32gather_epi32((const int *)baseptr, _mm_add_epi32(y1, coloffsets1), 4); \ + __m128i p00_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(p00, _mm_setzero_si128()), ab_lo); \ + __m128i p01_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(p01, _mm_setzero_si128()), invab_lo); \ + __m128i p10_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(p10, _mm_setzero_si128()), ainvb_lo); \ + __m128i p11_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(p11, _mm_setzero_si128()), invainvb_lo); \ + __m128i p00_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(p00, _mm_setzero_si128()), ab_hi); \ + __m128i p01_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(p01, _mm_setzero_si128()), invab_hi); \ + __m128i p10_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(p10, _mm_setzero_si128()), ainvb_hi); \ + __m128i p11_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(p11, _mm_setzero_si128()), invainvb_hi); \ + __m128i fg_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_adds_epu16(_mm_adds_epu16(p00_lo, p01_lo), _mm_adds_epu16(p10_lo, p11_lo)), m127), 8); \ + __m128i fg_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_adds_epu16(_mm_adds_epu16(p00_hi, p01_hi), _mm_adds_epu16(p10_hi, p11_hi)), m127), 8); \ + fg = _mm_packus_epi16(fg_lo, fg_hi); \ +} + +#define VEC_SAMPLE_BILINEAR4_COLUMN(fg, col0, col1, texturefracx, texturefracy, one, height) { \ + __m128i m127 = _mm_set1_epi16(127); \ + fg = _mm_setzero_si128(); \ + for (int i = 0; i < 4; i++) \ + { \ + uint32_t frac_y0 = (texturefracy[i] >> FRACBITS) * height[i]; \ + uint32_t frac_y1 = ((texturefracy[i] + one[i]) >> FRACBITS) * height[i]; \ + uint32_t y0 = (frac_y0 >> FRACBITS); \ + uint32_t y1 = (frac_y1 >> FRACBITS); \ + \ + uint32_t inv_b = texturefracx[i]; \ + uint32_t inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; \ + \ + __m128i ab_invab = _mm_load_si128(SampleBgra::samplertable + inv_b * 32 + inv_a * 2); \ + __m128i ainvb_invainvb = _mm_load_si128(SampleBgra::samplertable + inv_b * 32 + inv_a * 2 + 1); \ + \ + __m128i gather = _mm_set_epi32(col1[i][y1], col1[i][y0], col0[i][y1], col0[i][y0]); \ + __m128i p0 = _mm_unpacklo_epi8(gather, _mm_setzero_si128()); \ + __m128i p1 = _mm_unpackhi_epi8(gather, _mm_setzero_si128()); \ + \ + __m128i tmp = _mm_adds_epu16(_mm_mullo_epi16(p0, ab_invab), _mm_mullo_epi16(p1, ainvb_invainvb)); \ + __m128i color = _mm_srli_epi16(_mm_adds_epu16(_mm_adds_epu16(_mm_srli_si128(tmp, 8), tmp), m127), 8); \ + \ + fg = _mm_or_si128(_mm_srli_si128(fg, 4), _mm_slli_si128(_mm_packus_epi16(color, _mm_setzero_si128()), 12)); \ + } \ +} + +#define VEC_SAMPLE_MIP_NEAREST4_COLUMN(fg, col0, col1, mipfrac, texturefracy, height0, height1) { \ + uint32_t y0[4], y1[4]; \ + for (int i = 0; i < 4; i++) \ + { \ + y0[i] = (texturefracy[i] >> FRACBITS) * height0[i]; \ + y1[i] = (texturefracy[i] >> FRACBITS) * height1[i]; \ + } \ + __m128i p0 = _mm_set_epi32(col0[y0[3]], col0[y0[2]], col0[y0[1]], col0[y0[0]]); \ + __m128i p1 = _mm_set_epi32(col1[y1[3]], col1[y1[2]], col1[y1[1]], col1[y1[0]]); \ + __m128i t = _mm_loadu_si128((const __m128i*)mipfrac); \ + __m128i inv_t = _mm_sub_epi32(_mm_set1_epi32(256), mipfrac); \ + __m128i p0_lo = _mm_unpacklo_epi8(p0, _mm_setzero_si128()); \ + __m128i p0_hi = _mm_unpackhi_epi8(p0, _mm_setzero_si128()); \ + __m128i p1_lo = _mm_unpacklo_epi8(p1, _mm_setzero_si128()); \ + __m128i p1_hi = _mm_unpackhi_epi8(p1, _mm_setzero_si128()); \ + __m128i fg_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(p0_lo, t), _mm_mullo_epi16(p1_lo, inv_t)), 8); \ + __m128i fg_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(p0_hi, t), _mm_mullo_epi16(p1_hi, inv_t)), 8); \ + fg = _mm_packus_epi16(fg_lo, fg_hi); \ +} + +#define VEC_SAMPLE_BILINEAR4_SPAN(fg, texture, xfrac, yfrac, xstep, ystep, xbits, ybits) { \ + int xshift = (32 - xbits); \ + int yshift = (32 - ybits); \ + int xmask = (1 << xshift) - 1; \ + int ymask = (1 << yshift) - 1; \ + \ + __m128i m127 = _mm_set1_epi16(127); \ + fg = _mm_setzero_si128(); \ + for (int i = 0; i < 4; i++) \ + { \ + uint32_t x = xfrac >> xbits; \ + uint32_t y = yfrac >> ybits; \ + \ + uint32_t p00 = texture[(y & ymask) + ((x & xmask) << yshift)]; \ + uint32_t p01 = texture[((y + 1) & ymask) + ((x & xmask) << yshift)]; \ + uint32_t p10 = texture[(y & ymask) + (((x + 1) & xmask) << yshift)]; \ + uint32_t p11 = texture[((y + 1) & ymask) + (((x + 1) & xmask) << yshift)]; \ + \ + uint32_t inv_b = (xfrac >> (xbits - 4)) & 15; \ + uint32_t inv_a = (yfrac >> (ybits - 4)) & 15; \ + \ + __m128i ab_invab = _mm_load_si128(SampleBgra::samplertable + inv_b * 32 + inv_a * 2); \ + __m128i ainvb_invainvb = _mm_load_si128(SampleBgra::samplertable + inv_b * 32 + inv_a * 2 + 1); \ + \ + __m128i p0 = _mm_unpacklo_epi8(_mm_set_epi32(0, 0, p01, p00), _mm_setzero_si128()); \ + __m128i p1 = _mm_unpacklo_epi8(_mm_set_epi32(0, 0, p11, p10), _mm_setzero_si128()); \ + \ + __m128i tmp = _mm_adds_epu16(_mm_mullo_epi16(p0, ab_invab), _mm_mullo_epi16(p1, ainvb_invainvb)); \ + __m128i color = _mm_srli_epi16(_mm_adds_epu16(_mm_adds_epu16(_mm_srli_si128(tmp, 8), tmp), m127), 8); \ + \ + fg = _mm_or_si128(_mm_srli_si128(fg, 4), _mm_slli_si128(_mm_packus_epi16(color, _mm_setzero_si128()), 12)); \ + \ + xfrac += xstep; \ + yfrac += ystep; \ + } \ +} + +// Calculate constants for a simple shade with gamma correction +#define AVX_LINEAR_SHADE_SIMPLE_INIT(light) \ + __m256 mlight_hi = _mm256_set_ps(1.0f, light * (1.0f/256.0f), light * (1.0f/256.0f), light * (1.0f/256.0f), 1.0f, light * (1.0f/256.0f), light * (1.0f/256.0f), light * (1.0f/256.0f)); \ + mlight_hi = _mm256_mul_ps(mlight_hi, mlight_hi); \ + __m256 mlight_lo = mlight_hi; \ + __m256 mrcp_255 = _mm256_set1_ps(1.0f/255.0f); \ + __m256 m255 = _mm256_set1_ps(255.0f); + +// Calculate constants for a simple shade with different light levels for each pixel and gamma correction +#define AVX_LINEAR_SHADE_SIMPLE_INIT4(light3, light2, light1, light0) \ + __m256 mlight_hi = _mm256_set_ps(1.0f, light1 * (1.0f/256.0f), light1 * (1.0f/256.0f), light1 * (1.0f/256.0f), 1.0f, light0 * (1.0f/256.0f), light0 * (1.0f/256.0f), light0 * (1.0f/256.0f)); \ + __m256 mlight_lo = _mm256_set_ps(1.0f, light3 * (1.0f/256.0f), light3 * (1.0f/256.0f), light3 * (1.0f/256.0f), 1.0f, light2 * (1.0f/256.0f), light2 * (1.0f/256.0f), light2 * (1.0f/256.0f)); \ + mlight_hi = _mm256_mul_ps(mlight_hi, mlight_hi); \ + mlight_lo = _mm256_mul_ps(mlight_lo, mlight_lo); \ + __m256 mrcp_255 = _mm256_set1_ps(1.0f/255.0f); \ + __m256 m255 = _mm256_set1_ps(255.0f); + +// Simple shade 4 pixels with gamma correction +#define AVX_LINEAR_SHADE_SIMPLE(fg) { \ + __m256i fg_16 = _mm256_set_m128i(_mm_unpackhi_epi8(fg, _mm_setzero_si128()), _mm_unpacklo_epi8(fg, _mm_setzero_si128())); \ + __m256 fg_hi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(fg_16, _mm256_setzero_si256())); \ + __m256 fg_lo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(fg_16, _mm256_setzero_si256())); \ + fg_hi = _mm256_mul_ps(fg_hi, mrcp_255); \ + fg_hi = _mm256_mul_ps(fg_hi, fg_hi); \ + fg_hi = _mm256_mul_ps(fg_hi, mlight_hi); \ + fg_hi = _mm256_sqrt_ps(fg_hi); \ + fg_hi = _mm256_mul_ps(fg_hi, m255); \ + fg_lo = _mm256_mul_ps(fg_lo, mrcp_255); \ + fg_lo = _mm256_mul_ps(fg_lo, fg_lo); \ + fg_lo = _mm256_mul_ps(fg_lo, mlight_lo); \ + fg_lo = _mm256_sqrt_ps(fg_lo); \ + fg_lo = _mm256_mul_ps(fg_lo, m255); \ + fg_16 = _mm256_packus_epi32(_mm256_cvtps_epi32(fg_lo), _mm256_cvtps_epi32(fg_hi)); \ + fg = _mm_packus_epi16(_mm256_extractf128_si256(fg_16, 0), _mm256_extractf128_si256(fg_16, 1)); \ +} + +// Calculate constants for a complex shade with gamma correction +#define AVX_LINEAR_SHADE_INIT(light, shade_constants) \ + __m256 mlight_hi = _mm256_set_ps(1.0f, light * (1.0f/256.0f), light * (1.0f/256.0f), light * (1.0f/256.0f), 1.0f, light * (1.0f/256.0f), light * (1.0f/256.0f), light * (1.0f/256.0f)); \ + mlight_hi = _mm256_mul_ps(mlight_hi, mlight_hi); \ + __m256 mlight_lo = mlight_hi; \ + __m256 mrcp_255 = _mm256_set1_ps(1.0f/255.0f); \ + __m256 m255 = _mm256_set1_ps(255.0f); \ + __m256 color = _mm256_set_ps( \ + 1.0f, shade_constants.light_red * (1.0f/256.0f), shade_constants.light_green * (1.0f/256.0f), shade_constants.light_blue * (1.0f/256.0f), \ + 1.0f, shade_constants.light_red * (1.0f/256.0f), shade_constants.light_green * (1.0f/256.0f), shade_constants.light_blue * (1.0f/256.0f)); \ + __m256 fade = _mm256_set_ps( \ + 0.0f, shade_constants.fade_red * (1.0f/256.0f), shade_constants.fade_green * (1.0f/256.0f), shade_constants.fade_blue * (1.0f/256.0f), \ + 0.0f, shade_constants.fade_red * (1.0f/256.0f), shade_constants.fade_green * (1.0f/256.0f), shade_constants.fade_blue * (1.0f/256.0f)); \ + __m256 fade_amount_hi = _mm256_mul_ps(fade, _mm256_sub_ps(_mm256_set1_ps(1.0f), mlight_hi)); \ + __m256 fade_amount_lo = _mm256_mul_ps(fade, _mm256_sub_ps(_mm256_set1_ps(1.0f), mlight_lo)); \ + __m256 inv_desaturate = _mm256_set1_ps((256 - shade_constants.desaturate) * (1.0f/256.0f)); \ + __m128 ss_desaturate = _mm_set_ss(shade_constants.desaturate * (1.0f/256.0f)); \ + __m128 intensity_weight = _mm_set_ps(0.0f, 77.0f/256.0f, 143.0f/256.0f, 37.0f/256.0f); + +// Calculate constants for a complex shade with different light levels for each pixel and gamma correction +#define AVX_LINEAR_SHADE_INIT4(light3, light2, light1, light0, shade_constants) \ + __m256 mlight_hi = _mm256_set_ps(1.0f, light1 * (1.0f/256.0f), light1 * (1.0f/256.0f), light1 * (1.0f/256.0f), 1.0f, light0 * (1.0f/256.0f), light0 * (1.0f/256.0f), light0 * (1.0f/256.0f)); \ + __m256 mlight_lo = _mm256_set_ps(1.0f, light3 * (1.0f/256.0f), light3 * (1.0f/256.0f), light3 * (1.0f/256.0f), 1.0f, light2 * (1.0f/256.0f), light2 * (1.0f/256.0f), light2 * (1.0f/256.0f)); \ + mlight_hi = _mm256_mul_ps(mlight_hi, mlight_hi); \ + mlight_lo = _mm256_mul_ps(mlight_lo, mlight_lo); \ + __m256 mrcp_255 = _mm256_set1_ps(1.0f/255.0f); \ + __m256 m255 = _mm256_set1_ps(255.0f); \ + __m256 color = _mm256_set_ps( \ + 1.0f, shade_constants.light_red * (1.0f/256.0f), shade_constants.light_green * (1.0f/256.0f), shade_constants.light_blue * (1.0f/256.0f), \ + 1.0f, shade_constants.light_red * (1.0f/256.0f), shade_constants.light_green * (1.0f/256.0f), shade_constants.light_blue * (1.0f/256.0f)); \ + __m256 fade = _mm256_set_ps( \ + 0.0f, shade_constants.fade_red * (1.0f/256.0f), shade_constants.fade_green * (1.0f/256.0f), shade_constants.fade_blue * (1.0f/256.0f), \ + 0.0f, shade_constants.fade_red * (1.0f/256.0f), shade_constants.fade_green * (1.0f/256.0f), shade_constants.fade_blue * (1.0f/256.0f)); \ + __m256 fade_amount_hi = _mm256_mul_ps(fade, _mm256_sub_ps(_mm256_set1_ps(1.0f), mlight_hi)); \ + __m256 fade_amount_lo = _mm256_mul_ps(fade, _mm256_sub_ps(_mm256_set1_ps(1.0f), mlight_lo)); \ + __m256 inv_desaturate = _mm256_set1_ps((256 - shade_constants.desaturate) * (1.0f/256.0f)); \ + __m128 ss_desaturate = _mm_set_ss(shade_constants.desaturate * (1.0f/256.0f)); \ + __m128 intensity_weight = _mm_set_ps(0.0f, 77.0f/256.0f, 143.0f/256.0f, 37.0f/256.0f); + +// Complex shade 4 pixels with gamma correction +#define AVX_LINEAR_SHADE(fg, shade_constants) { \ + __m256i fg_16 = _mm256_set_m128i(_mm_unpackhi_epi8(fg, _mm_setzero_si128()), _mm_unpacklo_epi8(fg, _mm_setzero_si128())); \ + __m256 fg_hi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(fg_16, _mm256_setzero_si256())); \ + __m256 fg_lo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(fg_16, _mm256_setzero_si256())); \ + fg_hi = _mm256_mul_ps(fg_hi, mrcp_255); \ + fg_hi = _mm256_mul_ps(fg_hi, fg_hi); \ + fg_lo = _mm256_mul_ps(fg_lo, mrcp_255); \ + fg_lo = _mm256_mul_ps(fg_lo, fg_lo); \ + \ + __m128 intensity_hi0 = _mm_mul_ps(_mm256_extractf128_ps(fg_hi, 0), intensity_weight); \ + __m128 intensity_hi1 = _mm_mul_ps(_mm256_extractf128_ps(fg_hi, 1), intensity_weight); \ + intensity_hi0 = _mm_mul_ss(_mm_add_ss(_mm_add_ss(intensity_hi0, _mm_shuffle_ps(intensity_hi0, intensity_hi0, _MM_SHUFFLE(1,1,1,1))), _mm_shuffle_ps(intensity_hi0, intensity_hi0, _MM_SHUFFLE(2,2,2,2))), ss_desaturate); \ + intensity_hi0 = _mm_shuffle_ps(intensity_hi0, intensity_hi0, _MM_SHUFFLE(0,0,0,0)); \ + intensity_hi1 = _mm_mul_ss(_mm_add_ss(_mm_add_ss(intensity_hi1, _mm_shuffle_ps(intensity_hi1, intensity_hi1, _MM_SHUFFLE(1,1,1,1))), _mm_shuffle_ps(intensity_hi1, intensity_hi1, _MM_SHUFFLE(2,2,2,2))), ss_desaturate); \ + intensity_hi1 = _mm_shuffle_ps(intensity_hi1, intensity_hi1, _MM_SHUFFLE(0,0,0,0)); \ + __m256 intensity_hi = _mm256_set_m128(intensity_hi1, intensity_hi0); \ + \ + fg_hi = _mm256_add_ps(_mm256_mul_ps(fg_hi, inv_desaturate), intensity_hi); \ + fg_hi = _mm256_add_ps(_mm256_mul_ps(fg_hi, mlight_hi), fade_amount_hi); \ + fg_hi = _mm256_mul_ps(fg_hi, color); \ + \ + __m128 intensity_lo0 = _mm_mul_ps(_mm256_extractf128_ps(fg_lo, 0), intensity_weight); \ + __m128 intensity_lo1 = _mm_mul_ps(_mm256_extractf128_ps(fg_lo, 1), intensity_weight); \ + intensity_lo0 = _mm_mul_ss(_mm_add_ss(_mm_add_ss(intensity_lo0, _mm_shuffle_ps(intensity_lo0, intensity_lo0, _MM_SHUFFLE(1,1,1,1))), _mm_shuffle_ps(intensity_lo0, intensity_lo0, _MM_SHUFFLE(2,2,2,2))), ss_desaturate); \ + intensity_lo0 = _mm_shuffle_ps(intensity_lo0, intensity_lo0, _MM_SHUFFLE(0,0,0,0)); \ + intensity_lo1 = _mm_mul_ss(_mm_add_ss(_mm_add_ss(intensity_lo1, _mm_shuffle_ps(intensity_lo1, intensity_lo1, _MM_SHUFFLE(1,1,1,1))), _mm_shuffle_ps(intensity_lo1, intensity_lo1, _MM_SHUFFLE(2,2,2,2))), ss_desaturate); \ + intensity_lo1 = _mm_shuffle_ps(intensity_lo1, intensity_lo1, _MM_SHUFFLE(0,0,0,0)); \ + __m256 intensity_lo = _mm256_set_m128(intensity_lo1, intensity_lo0); \ + \ + fg_lo = _mm256_add_ps(_mm256_mul_ps(fg_lo, inv_desaturate), intensity_lo); \ + fg_lo = _mm256_add_ps(_mm256_mul_ps(fg_lo, mlight_lo), fade_amount_lo); \ + fg_lo = _mm256_mul_ps(fg_lo, color); \ + \ + fg_hi = _mm256_sqrt_ps(fg_hi); \ + fg_hi = _mm256_mul_ps(fg_hi, m255); \ + fg_lo = _mm256_sqrt_ps(fg_lo); \ + fg_lo = _mm256_mul_ps(fg_lo, m255); \ + fg_16 = _mm256_packus_epi32(_mm256_cvtps_epi32(fg_lo), _mm256_cvtps_epi32(fg_hi)); \ + fg = _mm_packus_epi16(_mm256_extractf128_si256(fg_16, 0), _mm256_extractf128_si256(fg_16, 1)); \ +} + +/* +// Complex shade 8 pixels +#define AVX_SHADE(fg, shade_constants) { \ + __m256i fg_hi = _mm256_unpackhi_epi8(fg, _mm256_setzero_si256()); \ + __m256i fg_lo = _mm256_unpacklo_epi8(fg, _mm256_setzero_si256()); \ + \ + __m256i intensity_hi = _mm256_mullo_epi16(fg_hi, _mm256_set_epi16(0, 77, 143, 37, 0, 77, 143, 37, 0, 77, 143, 37, 0, 77, 143, 37)); \ + __m256i intensity_lo = _mm256_mullo_epi16(fg_lo, _mm256_set_epi16(0, 77, 143, 37, 0, 77, 143, 37, 0, 77, 143, 37, 0, 77, 143, 37)); \ + __m256i intensity = _mm256_mullo_epi16(_mm256_srli_epi16(_mm256_hadd_epi16(_mm256_hadd_epi16(intensity_lo, intensity_hi), _mm256_setzero_si256()), 8), desaturate); \ + intensity = _mm256_unpacklo_epi16(intensity, intensity); \ + intensity_hi = _mm256_unpackhi_epi32(intensity, intensity); \ + intensity_lo = _mm256_unpacklo_epi32(intensity, intensity); \ + \ + fg_hi = _mm256_srli_epi16(_mm256_adds_epu16(_mm256_mullo_epi16(fg_hi, inv_desaturate), intensity_hi), 8); \ + fg_hi = _mm256_srli_epi16(_mm256_adds_epu16(_mm256_mullo_epi16(fg_hi, mlight), fade_amount), 8); \ + fg_hi = _mm256_srli_epi16(_mm256_mullo_epi16(fg_hi, color), 8); \ + \ + fg_lo = _mm256_srli_epi16(_mm256_adds_epu16(_mm256_mullo_epi16(fg_lo, inv_desaturate), intensity_lo), 8); \ + fg_lo = _mm256_srli_epi16(_mm256_adds_epu16(_mm256_mullo_epi16(fg_lo, mlight), fade_amount), 8); \ + fg_lo = _mm256_srli_epi16(_mm256_mullo_epi16(fg_lo, color), 8); \ + \ + fg = _mm256_packus_epi16(fg_lo, fg_hi); \ +} +*/ + +// Normal premultiplied alpha blend using the alpha from fg +#define VEC_ALPHA_BLEND(fg,bg) { \ + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); \ + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); \ + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); \ + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); \ + __m128i m256 = _mm_set1_epi16(256); \ + __m128i alpha_hi = _mm_shufflehi_epi16(_mm_shufflelo_epi16(fg_hi, _MM_SHUFFLE(3,3,3,3)), _MM_SHUFFLE(3,3,3,3)); \ + __m128i alpha_lo = _mm_shufflehi_epi16(_mm_shufflelo_epi16(fg_lo, _MM_SHUFFLE(3,3,3,3)), _MM_SHUFFLE(3,3,3,3)); \ + alpha_hi = _mm_add_epi16(alpha_hi, _mm_srli_epi16(alpha_hi, 7)); \ + alpha_lo = _mm_add_epi16(alpha_lo, _mm_srli_epi16(alpha_lo, 7)); \ + __m128i inv_alpha_hi = _mm_sub_epi16(m256, alpha_hi); \ + __m128i inv_alpha_lo = _mm_sub_epi16(m256, alpha_lo); \ + fg_hi = _mm_mullo_epi16(fg_hi, alpha_hi); \ + fg_hi = _mm_srli_epi16(fg_hi, 8); \ + fg_lo = _mm_mullo_epi16(fg_lo, alpha_lo); \ + fg_lo = _mm_srli_epi16(fg_lo, 8); \ + fg = _mm_packus_epi16(fg_lo, fg_hi); \ + bg_hi = _mm_mullo_epi16(bg_hi, inv_alpha_hi); \ + bg_hi = _mm_srli_epi16(bg_hi, 8); \ + bg_lo = _mm_mullo_epi16(bg_lo, inv_alpha_lo); \ + bg_lo = _mm_srli_epi16(bg_lo, 8); \ + bg = _mm_packus_epi16(bg_lo, bg_hi); \ + fg = _mm_adds_epu8(fg, bg); \ +} + +// Calculates the final alpha values to be used when combined with the source texture alpha channel +FORCEINLINE uint32_t calc_blend_bgalpha(uint32_t fg, uint32_t dest_alpha) +{ + uint32_t alpha = fg >> 24; + alpha += alpha >> 7; + uint32_t inv_alpha = 256 - alpha; + return (dest_alpha * alpha + 256 * inv_alpha + 128) >> 8; +} + +#define VEC_CALC_BLEND_ALPHA_VARS() __m128i msrc_alpha, mdest_alpha, m256, m255, m128; + +#define VEC_CALC_BLEND_ALPHA_INIT(src_alpha, dest_alpha) \ + msrc_alpha = _mm_set1_epi16(src_alpha); \ + mdest_alpha = _mm_set1_epi16(dest_alpha * 255 / 256); \ + m256 = _mm_set1_epi16(256); \ + m255 = _mm_set1_epi16(255); \ + m128 = _mm_set1_epi16(128); + +// Calculates the final alpha values to be used when combined with the source texture alpha channel +#define VEC_CALC_BLEND_ALPHA(fg) \ + __m128i fg_alpha_hi, fg_alpha_lo, bg_alpha_hi, bg_alpha_lo; { \ + __m128i alpha_hi = _mm_shufflehi_epi16(_mm_shufflelo_epi16(_mm_unpackhi_epi8(fg, _mm_setzero_si128()), _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)); \ + __m128i alpha_lo = _mm_shufflehi_epi16(_mm_shufflelo_epi16(_mm_unpacklo_epi8(fg, _mm_setzero_si128()), _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)); \ + alpha_hi = _mm_add_epi16(alpha_hi, _mm_srli_epi16(alpha_hi, 7)); \ + alpha_lo = _mm_add_epi16(alpha_lo, _mm_srli_epi16(alpha_lo, 7)); \ + bg_alpha_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_adds_epu16(_mm_mullo_epi16(mdest_alpha, alpha_hi), _mm_mullo_epi16(m255, _mm_sub_epi16(m256, alpha_hi))), m128), 8); \ + bg_alpha_hi = _mm_add_epi16(bg_alpha_hi, _mm_srli_epi16(bg_alpha_hi, 7)); \ + bg_alpha_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_adds_epu16(_mm_mullo_epi16(mdest_alpha, alpha_lo), _mm_mullo_epi16(m255, _mm_sub_epi16(m256, alpha_lo))), m128), 8); \ + bg_alpha_lo = _mm_add_epi16(bg_alpha_lo, _mm_srli_epi16(bg_alpha_lo, 7)); \ + fg_alpha_hi = msrc_alpha; \ + fg_alpha_lo = msrc_alpha; \ + } + +#define SSE_SHADE_VARS() __m128i mlight_hi, mlight_lo, color, fade, fade_amount_hi, fade_amount_lo, inv_desaturate; + +// Calculate constants for a simple shade +#define SSE_SHADE_SIMPLE_INIT(light) \ + mlight_hi = _mm_set_epi16(256, light, light, light, 256, light, light, light); \ + mlight_lo = mlight_hi; + +// Calculate constants for a simple shade with different light levels for each pixel +#define SSE_SHADE_SIMPLE_INIT4(light3, light2, light1, light0) \ + mlight_hi = _mm_set_epi16(256, light1, light1, light1, 256, light0, light0, light0); \ + mlight_lo = _mm_set_epi16(256, light3, light3, light3, 256, light2, light2, light2); + +// Simple shade 4 pixels +#define SSE_SHADE_SIMPLE(fg) { \ + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); \ + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); \ + fg_hi = _mm_mullo_epi16(fg_hi, mlight_hi); \ + fg_hi = _mm_srli_epi16(fg_hi, 8); \ + fg_lo = _mm_mullo_epi16(fg_lo, mlight_lo); \ + fg_lo = _mm_srli_epi16(fg_lo, 8); \ + fg = _mm_packus_epi16(fg_lo, fg_hi); \ +} + +// Calculate constants for a complex shade +#define SSE_SHADE_INIT(light, shade_constants) \ + mlight_hi = _mm_set_epi16(256, light, light, light, 256, light, light, light); \ + mlight_lo = mlight_hi; \ + color = _mm_set_epi16( \ + 256, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, \ + 256, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); \ + fade = _mm_set_epi16( \ + 0, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, \ + 0, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); \ + fade_amount_hi = _mm_mullo_epi16(fade, _mm_subs_epu16(_mm_set1_epi16(256), mlight_hi)); \ + fade_amount_lo = fade_amount_hi; \ + inv_desaturate = _mm_set1_epi16(256 - shade_constants.desaturate); \ + +// Calculate constants for a complex shade with different light levels for each pixel +#define SSE_SHADE_INIT4(light3, light2, light1, light0, shade_constants) \ + mlight_hi = _mm_set_epi16(256, light1, light1, light1, 256, light0, light0, light0); \ + mlight_lo = _mm_set_epi16(256, light3, light3, light3, 256, light2, light2, light2); \ + color = _mm_set_epi16( \ + 256, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, \ + 256, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); \ + fade = _mm_set_epi16( \ + 0, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, \ + 0, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); \ + fade_amount_hi = _mm_mullo_epi16(fade, _mm_subs_epu16(_mm_set1_epi16(256), mlight_hi)); \ + fade_amount_lo = _mm_mullo_epi16(fade, _mm_subs_epu16(_mm_set1_epi16(256), mlight_lo)); \ + inv_desaturate = _mm_set1_epi16(256 - shade_constants.desaturate); \ + +// Complex shade 4 pixels +#define SSE_SHADE(fg, shade_constants) { \ + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); \ + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); \ + \ + __m128i intensity_hi = _mm_mullo_epi16(fg_hi, _mm_set_epi16(0, 77, 143, 37, 0, 77, 143, 37)); \ + uint16_t intensity_hi0 = ((_mm_extract_epi16(intensity_hi, 2) + _mm_extract_epi16(intensity_hi, 1) + _mm_extract_epi16(intensity_hi, 0)) >> 8) * shade_constants.desaturate; \ + uint16_t intensity_hi1 = ((_mm_extract_epi16(intensity_hi, 6) + _mm_extract_epi16(intensity_hi, 5) + _mm_extract_epi16(intensity_hi, 4)) >> 8) * shade_constants.desaturate; \ + intensity_hi = _mm_set_epi16(intensity_hi1, intensity_hi1, intensity_hi1, intensity_hi1, intensity_hi0, intensity_hi0, intensity_hi0, intensity_hi0); \ + \ + fg_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, inv_desaturate), intensity_hi), 8); \ + fg_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, mlight_hi), fade_amount_hi), 8); \ + fg_hi = _mm_srli_epi16(_mm_mullo_epi16(fg_hi, color), 8); \ + \ + __m128i intensity_lo = _mm_mullo_epi16(fg_lo, _mm_set_epi16(0, 77, 143, 37, 0, 77, 143, 37)); \ + uint16_t intensity_lo0 = ((_mm_extract_epi16(intensity_lo, 2) + _mm_extract_epi16(intensity_lo, 1) + _mm_extract_epi16(intensity_lo, 0)) >> 8) * shade_constants.desaturate; \ + uint16_t intensity_lo1 = ((_mm_extract_epi16(intensity_lo, 6) + _mm_extract_epi16(intensity_lo, 5) + _mm_extract_epi16(intensity_lo, 4)) >> 8) * shade_constants.desaturate; \ + intensity_lo = _mm_set_epi16(intensity_lo1, intensity_lo1, intensity_lo1, intensity_lo1, intensity_lo0, intensity_lo0, intensity_lo0, intensity_lo0); \ + \ + fg_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, inv_desaturate), intensity_lo), 8); \ + fg_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, mlight_lo), fade_amount_lo), 8); \ + fg_lo = _mm_srli_epi16(_mm_mullo_epi16(fg_lo, color), 8); \ + \ + fg = _mm_packus_epi16(fg_lo, fg_hi); \ +} + +#endif diff --git a/src/r_draw_rgba_sse.h b/src/r_draw_rgba_sse.h new file mode 100644 index 000000000..4ee557693 --- /dev/null +++ b/src/r_draw_rgba_sse.h @@ -0,0 +1,367 @@ +// +// SSE/AVX intrinsics based drawers for the r_draw family of drawers. +// +// Note: This header file is intentionally not guarded by a __R_DRAW_RGBA_SSE__ define. +// It is because the code is nearly identical for SSE vs AVX. The file is included +// multiple times by r_draw_rgba.cpp with different defines that changes the class +// names outputted and the type of intrinsics used. + +#ifdef _MSC_VER +#pragma warning(disable: 4752) // warning C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX +#endif + +class VecCommand(DrawSpanRGBA) : public DrawerCommand +{ + const uint32_t * RESTRICT _source; + fixed_t _xfrac; + fixed_t _yfrac; + fixed_t _xstep; + fixed_t _ystep; + int _x1; + int _x2; + int _y; + int _xbits; + int _ybits; + BYTE * RESTRICT _destorg; + fixed_t _light; + ShadeConstants _shade_constants; + bool _nearest_filter; + +public: + VecCommand(DrawSpanRGBA)() + { + _source = (const uint32_t*)ds_source; + _xfrac = ds_xfrac; + _yfrac = ds_yfrac; + _xstep = ds_xstep; + _ystep = ds_ystep; + _x1 = ds_x1; + _x2 = ds_x2; + _y = ds_y; + _xbits = ds_xbits; + _ybits = ds_ybits; + _destorg = dc_destorg; + _light = ds_light; + _shade_constants = ds_shade_constants; + _nearest_filter = !SampleBgra::span_sampler_setup(_source, _xbits, _ybits, _xstep, _ystep, ds_source_mipmapped); + } + + void Execute(DrawerThread *thread) override + { + if (thread->line_skipped_by_thread(_y)) + return; + + dsfixed_t xfrac; + dsfixed_t yfrac; + dsfixed_t xstep; + dsfixed_t ystep; + uint32_t* dest; + const uint32_t* source = _source; + int count; + int spot; + + xfrac = _xfrac; + yfrac = _yfrac; + + dest = ylookup[_y] + _x1 + (uint32_t*)_destorg; + + count = _x2 - _x1 + 1; + + xstep = _xstep; + ystep = _ystep; + + uint32_t light = LightBgra::calc_light_multiplier(_light); + ShadeConstants shade_constants = _shade_constants; + + if (_nearest_filter) + { + if (_xbits == 6 && _ybits == 6) + { + // 64x64 is the most common case by far, so special case it. + + int sse_count = count / 4; + count -= sse_count * 4; + + if (shade_constants.simple_shade) + { + VEC_SHADE_VARS(); + VEC_SHADE_SIMPLE_INIT(light); + + while (sse_count--) + { + // Current texture index in u,v. + spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); + uint32_t p0 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); + uint32_t p1 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); + uint32_t p2 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); + uint32_t p3 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + // Lookup pixel from flat texture tile, + // re-index using light/colormap. + __m128i fg = _mm_set_epi32(p3, p2, p1, p0); + VEC_SHADE_SIMPLE(fg); + _mm_storeu_si128((__m128i*)dest, fg); + + // Next step in u,v. + dest += 4; + } + } + else + { + VEC_SHADE_VARS(); + VEC_SHADE_INIT(light, shade_constants); + + while (sse_count--) + { + // Current texture index in u,v. + spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); + uint32_t p0 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); + uint32_t p1 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); + uint32_t p2 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); + uint32_t p3 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + // Lookup pixel from flat texture tile, + // re-index using light/colormap. + __m128i fg = _mm_set_epi32(p3, p2, p1, p0); + VEC_SHADE(fg, shade_constants); + _mm_storeu_si128((__m128i*)dest, fg); + + // Next step in u,v. + dest += 4; + } + } + + if (count == 0) + return; + + do + { + // Current texture index in u,v. + spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); + + // Lookup pixel from flat texture tile + *dest++ = LightBgra::shade_bgra(source[spot], light, shade_constants); + + // Next step in u,v. + xfrac += xstep; + yfrac += ystep; + } while (--count); + } + else + { + BYTE yshift = 32 - _ybits; + BYTE xshift = yshift - _xbits; + int xmask = ((1 << _xbits) - 1) << _ybits; + + int sse_count = count / 4; + count -= sse_count * 4; + + if (shade_constants.simple_shade) + { + VEC_SHADE_VARS(); + VEC_SHADE_SIMPLE_INIT(light); + + while (sse_count--) + { + spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + uint32_t p0 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + uint32_t p1 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + uint32_t p2 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + uint32_t p3 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + // Lookup pixel from flat texture tile + __m128i fg = _mm_set_epi32(p3, p2, p1, p0); + VEC_SHADE_SIMPLE(fg); + _mm_storeu_si128((__m128i*)dest, fg); + dest += 4; + } + } + else + { + VEC_SHADE_VARS(); + VEC_SHADE_INIT(light, shade_constants); + + while (sse_count--) + { + spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + uint32_t p0 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + uint32_t p1 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + uint32_t p2 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + uint32_t p3 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + // Lookup pixel from flat texture tile + __m128i fg = _mm_set_epi32(p3, p2, p1, p0); + VEC_SHADE(fg, shade_constants); + _mm_storeu_si128((__m128i*)dest, fg); + dest += 4; + } + } + + if (count == 0) + return; + + do + { + // Current texture index in u,v. + spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + + // Lookup pixel from flat texture tile + *dest++ = LightBgra::shade_bgra(source[spot], light, shade_constants); + + // Next step in u,v. + xfrac += xstep; + yfrac += ystep; + } while (--count); + } + } + else + { + if (_xbits == 6 && _ybits == 6) + { + // 64x64 is the most common case by far, so special case it. + + int sse_count = count / 4; + count -= sse_count * 4; + + if (shade_constants.simple_shade) + { + VEC_SHADE_VARS(); + VEC_SHADE_SIMPLE_INIT(light); + while (sse_count--) + { + __m128i fg; + VEC_SAMPLE_BILINEAR4_SPAN(fg, source, xfrac, yfrac, xstep, ystep, 26, 26); + VEC_SHADE_SIMPLE(fg); + _mm_storeu_si128((__m128i*)dest, fg); + dest += 4; + } + } + else + { + VEC_SHADE_VARS(); + VEC_SHADE_INIT(light, shade_constants); + while (sse_count--) + { + __m128i fg; + VEC_SAMPLE_BILINEAR4_SPAN(fg, source, xfrac, yfrac, xstep, ystep, 26, 26); + VEC_SHADE(fg, shade_constants); + _mm_storeu_si128((__m128i*)dest, fg); + dest += 4; + } + } + + if (count == 0) + return; + + do + { + *dest++ = LightBgra::shade_bgra(SampleBgra::sample_bilinear(source, xfrac, yfrac, 26, 26), light, shade_constants); + xfrac += xstep; + yfrac += ystep; + } while (--count); + } + else + { + int sse_count = count / 4; + count -= sse_count * 4; + + if (shade_constants.simple_shade) + { + VEC_SHADE_VARS(); + VEC_SHADE_SIMPLE_INIT(light); + while (sse_count--) + { + __m128i fg; + int tmpx = 32 - _xbits; + int tmpy = 32 - _ybits; + VEC_SAMPLE_BILINEAR4_SPAN(fg, source, xfrac, yfrac, xstep, ystep, tmpx, tmpy); + VEC_SHADE_SIMPLE(fg); + _mm_storeu_si128((__m128i*)dest, fg); + dest += 4; + } + } + else + { + VEC_SHADE_VARS(); + VEC_SHADE_INIT(light, shade_constants); + while (sse_count--) + { + __m128i fg; + int tmpx = 32 - _xbits; + int tmpy = 32 - _ybits; + VEC_SAMPLE_BILINEAR4_SPAN(fg, source, xfrac, yfrac, xstep, ystep, tmpx, tmpy); + VEC_SHADE(fg, shade_constants); + _mm_storeu_si128((__m128i*)dest, fg); + dest += 4; + } + } + + if (count == 0) + return; + + do + { + *dest++ = LightBgra::shade_bgra(SampleBgra::sample_bilinear(source, xfrac, yfrac, 32 - _xbits, 32 - _ybits), light, shade_constants); + xfrac += xstep; + yfrac += ystep; + } while (--count); + } + } + } +}; diff --git a/src/r_drawt.cpp b/src/r_drawt.cpp index e8faff0ce..837093044 100644 --- a/src/r_drawt.cpp +++ b/src/r_drawt.cpp @@ -313,21 +313,21 @@ void rt_Translate4cols(const BYTE *translation, int yl, int yh) } // Translates one span at hx to the screen at sx. -void rt_tlate1col (int hx, int sx, int yl, int yh) +void rt_tlate1col_c (int hx, int sx, int yl, int yh) { rt_Translate1col(dc_translation, hx, yl, yh); rt_map1col(hx, sx, yl, yh); } // Translates all four spans to the screen starting at sx. -void rt_tlate4cols (int sx, int yl, int yh) +void rt_tlate4cols_c (int sx, int yl, int yh) { rt_Translate4cols(dc_translation, yl, yh); rt_map4cols(sx, yl, yh); } // Adds one span at hx to the screen at sx without clamping. -void rt_add1col (int hx, int sx, int yl, int yh) +void rt_add1col_c (int hx, int sx, int yl, int yh) { BYTE *colormap; BYTE *source; @@ -417,21 +417,21 @@ void rt_add4cols_c (int sx, int yl, int yh) } // Translates and adds one span at hx to the screen at sx without clamping. -void rt_tlateadd1col (int hx, int sx, int yl, int yh) +void rt_tlateadd1col_c (int hx, int sx, int yl, int yh) { rt_Translate1col(dc_translation, hx, yl, yh); rt_add1col(hx, sx, yl, yh); } // Translates and adds all four spans to the screen starting at sx without clamping. -void rt_tlateadd4cols (int sx, int yl, int yh) +void rt_tlateadd4cols_c (int sx, int yl, int yh) { rt_Translate4cols(dc_translation, yl, yh); rt_add4cols(sx, yl, yh); } // Shades one span at hx to the screen at sx. -void rt_shaded1col (int hx, int sx, int yl, int yh) +void rt_shaded1col_c (int hx, int sx, int yl, int yh) { DWORD *fgstart; BYTE *colormap; @@ -507,7 +507,7 @@ void rt_shaded4cols_c (int sx, int yl, int yh) } // Adds one span at hx to the screen at sx with clamping. -void rt_addclamp1col (int hx, int sx, int yl, int yh) +void rt_addclamp1col_c (int hx, int sx, int yl, int yh) { BYTE *colormap; BYTE *source; @@ -556,13 +556,14 @@ void rt_addclamp4cols_c (int sx, int yl, int yh) return; count++; - DWORD *fg2rgb = dc_srcblend; - DWORD *bg2rgb = dc_destblend; dest = ylookup[yl] + sx + dc_destorg; source = &dc_temp[yl*4]; pitch = dc_pitch; colormap = dc_colormap; + DWORD *fg2rgb = dc_srcblend; + DWORD *bg2rgb = dc_destblend; + do { DWORD a = fg2rgb[colormap[source[0]]] + bg2rgb[dest[0]]; DWORD b = a; @@ -607,21 +608,21 @@ void rt_addclamp4cols_c (int sx, int yl, int yh) } // Translates and adds one span at hx to the screen at sx with clamping. -void rt_tlateaddclamp1col (int hx, int sx, int yl, int yh) +void rt_tlateaddclamp1col_c (int hx, int sx, int yl, int yh) { rt_Translate1col(dc_translation, hx, yl, yh); rt_addclamp1col(hx, sx, yl, yh); } // Translates and adds all four spans to the screen starting at sx with clamping. -void rt_tlateaddclamp4cols (int sx, int yl, int yh) +void rt_tlateaddclamp4cols_c (int sx, int yl, int yh) { rt_Translate4cols(dc_translation, yl, yh); rt_addclamp4cols(sx, yl, yh); } // Subtracts one span at hx to the screen at sx with clamping. -void rt_subclamp1col (int hx, int sx, int yl, int yh) +void rt_subclamp1col_c (int hx, int sx, int yl, int yh) { BYTE *colormap; BYTE *source; @@ -656,7 +657,7 @@ void rt_subclamp1col (int hx, int sx, int yl, int yh) } // Subtracts all four spans to the screen starting at sx with clamping. -void rt_subclamp4cols (int sx, int yl, int yh) +void rt_subclamp4cols_c (int sx, int yl, int yh) { BYTE *colormap; BYTE *source; @@ -716,21 +717,21 @@ void rt_subclamp4cols (int sx, int yl, int yh) } // Translates and subtracts one span at hx to the screen at sx with clamping. -void rt_tlatesubclamp1col (int hx, int sx, int yl, int yh) +void rt_tlatesubclamp1col_c (int hx, int sx, int yl, int yh) { rt_Translate1col(dc_translation, hx, yl, yh); rt_subclamp1col(hx, sx, yl, yh); } // Translates and subtracts all four spans to the screen starting at sx with clamping. -void rt_tlatesubclamp4cols (int sx, int yl, int yh) +void rt_tlatesubclamp4cols_c (int sx, int yl, int yh) { rt_Translate4cols(dc_translation, yl, yh); rt_subclamp4cols(sx, yl, yh); } // Subtracts one span at hx from the screen at sx with clamping. -void rt_revsubclamp1col (int hx, int sx, int yl, int yh) +void rt_revsubclamp1col_c (int hx, int sx, int yl, int yh) { BYTE *colormap; BYTE *source; @@ -765,7 +766,7 @@ void rt_revsubclamp1col (int hx, int sx, int yl, int yh) } // Subtracts all four spans from the screen starting at sx with clamping. -void rt_revsubclamp4cols (int sx, int yl, int yh) +void rt_revsubclamp4cols_c (int sx, int yl, int yh) { BYTE *colormap; BYTE *source; @@ -825,14 +826,14 @@ void rt_revsubclamp4cols (int sx, int yl, int yh) } // Translates and subtracts one span at hx from the screen at sx with clamping. -void rt_tlaterevsubclamp1col (int hx, int sx, int yl, int yh) +void rt_tlaterevsubclamp1col_c (int hx, int sx, int yl, int yh) { rt_Translate1col(dc_translation, hx, yl, yh); rt_revsubclamp1col(hx, sx, yl, yh); } // Translates and subtracts all four spans from the screen starting at sx with clamping. -void rt_tlaterevsubclamp4cols (int sx, int yl, int yh) +void rt_tlaterevsubclamp4cols_c (int sx, int yl, int yh) { rt_Translate4cols(dc_translation, yl, yh); rt_revsubclamp4cols(sx, yl, yh); @@ -855,18 +856,21 @@ void rt_draw4cols (int sx) } #ifdef X86_ASM - // Setup assembly routines for changed colormaps or other parameters. - if (hcolfunc_post4 == rt_shaded4cols) + if (!r_swtruecolor) { - R_SetupShadedCol(); - } - else if (hcolfunc_post4 == rt_addclamp4cols || hcolfunc_post4 == rt_tlateaddclamp4cols) - { - R_SetupAddClampCol(); - } - else if (hcolfunc_post4 == rt_add4cols || hcolfunc_post4 == rt_tlateadd4cols) - { - R_SetupAddCol(); + // Setup assembly routines for changed colormaps or other parameters. + if (hcolfunc_post4 == rt_shaded4cols) + { + R_SetupShadedCol(); + } + else if (hcolfunc_post4 == rt_addclamp4cols || hcolfunc_post4 == rt_tlateaddclamp4cols) + { + R_SetupAddClampCol(); + } + else if (hcolfunc_post4 == rt_add4cols || hcolfunc_post4 == rt_tlateadd4cols) + { + R_SetupAddCol(); + } } #endif @@ -1002,7 +1006,7 @@ void rt_draw4cols (int sx) // Before each pass through a rendering loop that uses these routines, // call this function to set up the span pointers. -void rt_initcols (BYTE *buff) +void rt_initcols_pal (BYTE *buff) { int y; @@ -1011,6 +1015,14 @@ void rt_initcols (BYTE *buff) horizspan[y] = dc_ctspan[y] = &dc_tspans[y][0]; } +void rt_span_coverage_pal(int x, int start, int stop) +{ + unsigned int **tspan = &dc_ctspan[x & 3]; + (*tspan)[0] = start; + (*tspan)[1] = stop; + *tspan += 2; +} + // Stretches a column into a temporary buffer which is later // drawn to the screen along with up to three other columns. void R_DrawColumnHorizP_C (void) @@ -1073,7 +1085,7 @@ void R_DrawColumnHorizP_C (void) } // [RH] Just fills a column with a given color -void R_FillColumnHorizP (void) +void R_FillColumnHorizP_C (void) { int count = dc_count; BYTE color = dc_color; @@ -1108,6 +1120,7 @@ void R_FillColumnHorizP (void) void R_DrawMaskedColumnHoriz (const BYTE *column, const FTexture::Span *span) { + int pixelsize = r_swtruecolor ? 4 : 1; const fixed_t texturemid = FLOAT2FIXED(dc_texturemid); while (span->Length != 0) { @@ -1177,7 +1190,7 @@ void R_DrawMaskedColumnHoriz (const BYTE *column, const FTexture::Span *span) } } dc_source = column + top; - dc_dest = ylookup[dc_yl] + dc_x + dc_destorg; + dc_dest = (ylookup[dc_yl] + dc_x) * pixelsize + dc_destorg; dc_count = dc_yh - dc_yl + 1; hcolfunc_pre (); } diff --git a/src/r_drawt_rgba.cpp b/src/r_drawt_rgba.cpp new file mode 100644 index 000000000..45bd5c029 --- /dev/null +++ b/src/r_drawt_rgba.cpp @@ -0,0 +1,995 @@ +/* +** r_drawt_rgba.cpp +** Faster column drawers for modern processors, true color edition +** +**--------------------------------------------------------------------------- +** Copyright 1998-2006 Randy Heit +** All rights reserved. +** +** Redistribution and use in source and binary forms, with or without +** modification, are permitted provided that the following conditions +** are met: +** +** 1. Redistributions of source code must retain the above copyright +** notice, this list of conditions and the following disclaimer. +** 2. Redistributions in binary form must reproduce the above copyright +** notice, this list of conditions and the following disclaimer in the +** documentation and/or other materials provided with the distribution. +** 3. The name of the author may not be used to endorse or promote products +** derived from this software without specific prior written permission. +** +** THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +** IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +** OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +** IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, +** INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT +** NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +** DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +** THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +** (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +** THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**--------------------------------------------------------------------------- +** +** True color versions of the similar functions in r_drawt.cpp +** Please see r_drawt.cpp for a description of the globals used. +*/ + +#include "templates.h" +#include "doomtype.h" +#include "doomdef.h" +#include "r_defs.h" +#include "r_draw.h" +#include "r_main.h" +#include "r_things.h" +#include "v_video.h" +#include "r_draw_rgba.h" +#ifndef NO_SSE +#include +#endif + +extern unsigned int dc_tspans[4][MAXHEIGHT]; +extern unsigned int *dc_ctspan[4]; +extern unsigned int *horizspan[4]; + +#ifndef NO_SSE + +#ifdef _MSC_VER +#pragma warning(disable: 4101) // warning C4101: unreferenced local variable +#endif + +// Generate SSE drawers: +#define VecCommand(name) name##_SSE_Command +#define VEC_SHADE_VARS SSE_SHADE_VARS +#define VEC_SHADE_SIMPLE_INIT SSE_SHADE_SIMPLE_INIT +#define VEC_SHADE_SIMPLE_INIT4 SSE_SHADE_SIMPLE_INIT4 +#define VEC_SHADE_SIMPLE SSE_SHADE_SIMPLE +#define VEC_SHADE_INIT SSE_SHADE_INIT +#define VEC_SHADE_INIT4 SSE_SHADE_INIT4 +#define VEC_SHADE SSE_SHADE +#include "r_drawt_rgba_sse.h" +/* +// Generate AVX drawers: +#undef VecCommand +#undef VEC_SHADE_SIMPLE_INIT +#undef VEC_SHADE_SIMPLE_INIT4 +#undef VEC_SHADE_SIMPLE +#undef VEC_SHADE_INIT +#undef VEC_SHADE_INIT4 +#undef VEC_SHADE +#define VecCommand(name) name##_AVX_Command +#define VEC_SHADE_SIMPLE_INIT AVX_LINEAR_SHADE_SIMPLE_INIT +#define VEC_SHADE_SIMPLE_INIT4 AVX_LINEAR_SHADE_SIMPLE_INIT4 +#define VEC_SHADE_SIMPLE AVX_LINEAR_SHADE_SIMPLE +#define VEC_SHADE_INIT AVX_LINEAR_SHADE_INIT +#define VEC_SHADE_INIT4 AVX_LINEAR_SHADE_INIT4 +#define VEC_SHADE AVX_LINEAR_SHADE +#include "r_drawt_rgba_sse.h" +*/ +#endif + +///////////////////////////////////////////////////////////////////////////// + +class DrawerRt1colCommand : public DrawerCommand +{ +public: + int hx; + int sx; + int yl; + int yh; + BYTE * RESTRICT _destorg; + int _pitch; + + uint32_t _light; + ShadeConstants _shade_constants; + BYTE * RESTRICT _colormap; + + uint32_t _srcalpha; + uint32_t _destalpha; + + DrawerRt1colCommand(int hx, int sx, int yl, int yh) + { + this->hx = hx; + this->sx = sx; + this->yl = yl; + this->yh = yh; + + _destorg = dc_destorg; + _pitch = dc_pitch; + + _light = LightBgra::calc_light_multiplier(dc_light); + _shade_constants = dc_shade_constants; + _colormap = dc_colormap; + + _srcalpha = dc_srcalpha >> (FRACBITS - 8); + _destalpha = dc_destalpha >> (FRACBITS - 8); + } + + class LoopIterator + { + public: + uint32_t *source; + uint32_t *dest; + int count; + int pitch, sincr; + + LoopIterator(DrawerRt1colCommand *command, DrawerThread *thread) + { + count = thread->count_for_thread(command->yl, (command->yh - command->yl + 1)); + if (count <= 0) + return; + + dest = thread->dest_for_thread(command->yl, command->_pitch, ylookup[command->yl] + command->sx + (uint32_t*)command->_destorg); + source = &thread->dc_temp_rgba[command->yl * 4 + command->hx] + thread->skipped_by_thread(command->yl) * 4; + pitch = command->_pitch * thread->num_cores; + sincr = thread->num_cores * 4; + } + + explicit operator bool() + { + return count > 0; + } + + bool next() + { + dest += pitch; + source += sincr; + return (--count) != 0; + } + }; +}; + +class DrawerRt4colsCommand : public DrawerCommand +{ +public: + int sx; + int yl; + int yh; + uint32_t _light; + ShadeConstants _shade_constants; + BYTE * RESTRICT _destorg; + int _pitch; + BYTE * RESTRICT _colormap; + uint32_t _srcalpha; + uint32_t _destalpha; + + DrawerRt4colsCommand(int sx, int yl, int yh) + { + this->sx = sx; + this->yl = yl; + this->yh = yh; + + _light = LightBgra::calc_light_multiplier(dc_light); + _shade_constants = dc_shade_constants; + _destorg = dc_destorg; + _pitch = dc_pitch; + _colormap = dc_colormap; + + _srcalpha = dc_srcalpha >> (FRACBITS - 8); + _destalpha = dc_destalpha >> (FRACBITS - 8); + } + + class LoopIterator + { + public: + uint32_t *source; + uint32_t *dest; + int count; + int pitch; + int sincr; + + LoopIterator(DrawerRt4colsCommand *command, DrawerThread *thread) + { + count = thread->count_for_thread(command->yl, command->yh - command->yl + 1); + if (count <= 0) + return; + + dest = thread->dest_for_thread(command->yl, command->_pitch, ylookup[command->yl] + command->sx + (uint32_t*)command->_destorg); + source = &thread->dc_temp_rgba[command->yl * 4] + thread->skipped_by_thread(command->yl) * 4; + pitch = command->_pitch * thread->num_cores; + sincr = thread->num_cores * 4; + } + + explicit operator bool() + { + return count > 0; + } + + bool next() + { + dest += pitch; + source += sincr; + return (--count) != 0; + } + }; +}; + +class RtCopy1colRGBACommand : public DrawerRt1colCommand +{ +public: + RtCopy1colRGBACommand(int hx, int sx, int yl, int yh) : DrawerRt1colCommand(hx, sx, yl, yh) + { + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + uint32_t fg = GPalette.BaseColors[*loop.source]; + *loop.dest = BlendBgra::copy(fg); + } while (loop.next()); + } +}; + +class RtMap1colRGBACommand : public DrawerRt1colCommand +{ +public: + RtMap1colRGBACommand(int hx, int sx, int yl, int yh) : DrawerRt1colCommand(hx, sx, yl, yh) + { + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + uint32_t fg = LightBgra::shade_pal_index(_colormap[*loop.source], _light, _shade_constants); + *loop.dest = BlendBgra::copy(fg); + } while (loop.next()); + } +}; + +class RtMap4colsRGBACommand : public DrawerRt4colsCommand +{ +public: + RtMap4colsRGBACommand(int sx, int yl, int yh) : DrawerRt4colsCommand(sx, yl, yh) + { + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + for (int i = 0; i < 4; i++) + { + uint32_t fg = LightBgra::shade_pal_index(_colormap[loop.source[i]], _light, _shade_constants); + loop.dest[i] = BlendBgra::copy(fg); + } + } while (loop.next()); + } +}; + +class RtAdd1colRGBACommand : public DrawerRt1colCommand +{ +public: + RtAdd1colRGBACommand(int hx, int sx, int yl, int yh) : DrawerRt1colCommand(hx, sx, yl, yh) + { + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + uint32_t fg = LightBgra::shade_pal_index(_colormap[*loop.source], _light, _shade_constants); + *loop.dest = BlendBgra::add(fg, *loop.dest, _srcalpha, _destalpha); + } while (loop.next()); + } +}; + +class RtAdd4colsRGBACommand : public DrawerRt4colsCommand +{ +public: + RtAdd4colsRGBACommand(int sx, int yl, int yh) : DrawerRt4colsCommand(sx, yl, yh) + { + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + for (int i = 0; i < 4; i++) + { + uint32_t fg = LightBgra::shade_pal_index(_colormap[loop.source[i]], _light, _shade_constants); + loop.dest[i] = BlendBgra::add(fg, loop.dest[i], _srcalpha, _destalpha); + } + } while (loop.next()); + } +}; + +class RtShaded1colRGBACommand : public DrawerRt1colCommand +{ + uint32_t _color; + +public: + RtShaded1colRGBACommand(int hx, int sx, int yl, int yh) : DrawerRt1colCommand(hx, sx, yl, yh) + { + _color = LightBgra::shade_pal_index(dc_color, _light, _shade_constants); + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + uint32_t alpha = _colormap[*loop.source] * 4; + uint32_t inv_alpha = 256 - alpha; + *loop.dest = BlendBgra::add(_color, *loop.dest, alpha, inv_alpha); + } while (loop.next()); + } +}; + +class RtShaded4colsRGBACommand : public DrawerRt4colsCommand +{ + uint32_t _color; + +public: + RtShaded4colsRGBACommand(int sx, int yl, int yh) : DrawerRt4colsCommand(sx, yl, yh) + { + _color = LightBgra::shade_pal_index(dc_color, _light, _shade_constants); + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + for (int i = 0; i < 4; i++) + { + uint32_t alpha = _colormap[loop.source[i]] * 4; + uint32_t inv_alpha = 256 - alpha; + loop.dest[i] = BlendBgra::add(_color, loop.dest[i], alpha, inv_alpha); + } + } while (loop.next()); + } +}; + +class RtAddClamp1colRGBACommand : public DrawerRt1colCommand +{ +public: + RtAddClamp1colRGBACommand(int hx, int sx, int yl, int yh) : DrawerRt1colCommand(hx, sx, yl, yh) + { + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + uint32_t fg = LightBgra::shade_pal_index(*loop.source, _light, _shade_constants); + *loop.dest = BlendBgra::add(fg, *loop.dest, _srcalpha, _destalpha); + } while (loop.next()); + } +}; + +class RtAddClamp4colsRGBACommand : public DrawerRt4colsCommand +{ +public: + RtAddClamp4colsRGBACommand(int sx, int yl, int yh) : DrawerRt4colsCommand(sx, yl, yh) + { + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + for (int i = 0; i < 4; i++) + { + uint32_t fg = LightBgra::shade_pal_index(loop.source[i], _light, _shade_constants); + loop.dest[i] = BlendBgra::add(fg, loop.dest[i], _srcalpha, _destalpha); + } + } while (loop.next()); + } +}; + +class RtSubClamp1colRGBACommand : public DrawerRt1colCommand +{ +public: + RtSubClamp1colRGBACommand(int hx, int sx, int yl, int yh) : DrawerRt1colCommand(hx, sx, yl, yh) + { + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + uint32_t fg = LightBgra::shade_pal_index(*loop.source, _light, _shade_constants); + *loop.dest = BlendBgra::sub(fg, *loop.dest, _srcalpha, _destalpha); + } while (loop.next()); + } +}; + +class RtSubClamp4colsRGBACommand : public DrawerRt4colsCommand +{ +public: + RtSubClamp4colsRGBACommand(int sx, int yl, int yh) : DrawerRt4colsCommand(sx, yl, yh) + { + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + for (int i = 0; i < 4; i++) + { + uint32_t fg = LightBgra::shade_pal_index(loop.source[i], _light, _shade_constants); + loop.dest[i] = BlendBgra::sub(fg, loop.dest[i], _srcalpha, _destalpha); + } + } while (loop.next()); + } +}; + +class RtRevSubClamp1colRGBACommand : public DrawerRt1colCommand +{ +public: + RtRevSubClamp1colRGBACommand(int hx, int sx, int yl, int yh) : DrawerRt1colCommand(hx, sx, yl, yh) + { + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + uint32_t fg = LightBgra::shade_pal_index(*loop.source, _light, _shade_constants); + *loop.dest = BlendBgra::revsub(fg, *loop.dest, _srcalpha, _destalpha); + } while (loop.next()); + } +}; + +class RtRevSubClamp4colsRGBACommand : public DrawerRt4colsCommand +{ +public: + RtRevSubClamp4colsRGBACommand(int sx, int yl, int yh) : DrawerRt4colsCommand(sx, yl, yh) + { + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + for (int i = 0; i < 4; i++) + { + uint32_t fg = LightBgra::shade_pal_index(loop.source[i], _light, _shade_constants); + loop.dest[i] = BlendBgra::revsub(fg, loop.dest[i], _srcalpha, _destalpha); + } + } while (loop.next()); + } +}; + +class RtTranslate1colRGBACommand : public DrawerCommand +{ + const BYTE * RESTRICT translation; + int hx; + int yl; + int yh; + +public: + RtTranslate1colRGBACommand(const BYTE *translation, int hx, int yl, int yh) + { + this->translation = translation; + this->hx = hx; + this->yl = yl; + this->yh = yh; + } + + void Execute(DrawerThread *thread) override + { + int count = yh - yl + 1; + uint32_t *source = &thread->dc_temp_rgba[yl*4 + hx]; + + // Things we do to hit the compiler's optimizer with a clue bat: + // 1. Parallelism is explicitly spelled out by using a separate + // C instruction for each assembly instruction. GCC lets me + // have four temporaries, but VC++ spills to the stack with + // more than two. Two is probably optimal, anyway. + // 2. The results of the translation lookups are explicitly + // stored in byte-sized variables. This causes the VC++ code + // to use byte mov instructions in most cases; for apparently + // random reasons, it will use movzx for some places. GCC + // ignores this and uses movzx always. + + // Do 8 rows at a time. + for (int count8 = count >> 3; count8; --count8) + { + int c0, c1; + BYTE b0, b1; + + c0 = source[0]; c1 = source[4]; + b0 = translation[c0]; b1 = translation[c1]; + source[0] = b0; source[4] = b1; + + c0 = source[8]; c1 = source[12]; + b0 = translation[c0]; b1 = translation[c1]; + source[8] = b0; source[12] = b1; + + c0 = source[16]; c1 = source[20]; + b0 = translation[c0]; b1 = translation[c1]; + source[16] = b0; source[20] = b1; + + c0 = source[24]; c1 = source[28]; + b0 = translation[c0]; b1 = translation[c1]; + source[24] = b0; source[28] = b1; + + source += 32; + } + // Finish by doing 1 row at a time. + for (count &= 7; count; --count, source += 4) + { + source[0] = translation[source[0]]; + } + } +}; + +class RtTranslate4colsRGBACommand : public DrawerCommand +{ + const BYTE * RESTRICT translation; + int yl; + int yh; + +public: + RtTranslate4colsRGBACommand(const BYTE *translation, int yl, int yh) + { + this->translation = translation; + this->yl = yl; + this->yh = yh; + } + + void Execute(DrawerThread *thread) override + { + int count = yh - yl + 1; + uint32_t *source = &thread->dc_temp_rgba[yl*4]; + int c0, c1; + BYTE b0, b1; + + // Do 2 rows at a time. + for (int count8 = count >> 1; count8; --count8) + { + c0 = source[0]; c1 = source[1]; + b0 = translation[c0]; b1 = translation[c1]; + source[0] = b0; source[1] = b1; + + c0 = source[2]; c1 = source[3]; + b0 = translation[c0]; b1 = translation[c1]; + source[2] = b0; source[3] = b1; + + c0 = source[4]; c1 = source[5]; + b0 = translation[c0]; b1 = translation[c1]; + source[4] = b0; source[5] = b1; + + c0 = source[6]; c1 = source[7]; + b0 = translation[c0]; b1 = translation[c1]; + source[6] = b0; source[7] = b1; + + source += 8; + } + // Do the final row if count was odd. + if (count & 1) + { + c0 = source[0]; c1 = source[1]; + b0 = translation[c0]; b1 = translation[c1]; + source[0] = b0; source[1] = b1; + + c0 = source[2]; c1 = source[3]; + b0 = translation[c0]; b1 = translation[c1]; + source[2] = b0; source[3] = b1; + } + } +}; + +class RtInitColsRGBACommand : public DrawerCommand +{ + BYTE * RESTRICT buff; + +public: + RtInitColsRGBACommand(BYTE *buff) + { + this->buff = buff; + } + + void Execute(DrawerThread *thread) override + { + thread->dc_temp_rgba = buff == NULL ? thread->dc_temp_rgbabuff_rgba : (uint32_t*)buff; + } +}; + +class DrawColumnHorizRGBACommand : public DrawerCommand +{ + int _count; + fixed_t _iscale; + fixed_t _texturefrac; + const BYTE * RESTRICT _source; + int _x; + int _yl; + int _yh; + +public: + DrawColumnHorizRGBACommand() + { + _count = dc_count; + _iscale = dc_iscale; + _texturefrac = dc_texturefrac; + _source = dc_source; + _x = dc_x; + _yl = dc_yl; + _yh = dc_yh; + } + + void Execute(DrawerThread *thread) override + { + int count = _count; + uint32_t *dest; + fixed_t fracstep; + fixed_t frac; + + if (count <= 0) + return; + + { + int x = _x & 3; + dest = &thread->dc_temp_rgba[x + 4 * _yl]; + } + fracstep = _iscale; + frac = _texturefrac; + + const BYTE *source = _source; + + if (count & 1) { + *dest = source[frac >> FRACBITS]; dest += 4; frac += fracstep; + } + if (count & 2) { + dest[0] = source[frac >> FRACBITS]; frac += fracstep; + dest[4] = source[frac >> FRACBITS]; frac += fracstep; + dest += 8; + } + if (count & 4) { + dest[0] = source[frac >> FRACBITS]; frac += fracstep; + dest[4] = source[frac >> FRACBITS]; frac += fracstep; + dest[8] = source[frac >> FRACBITS]; frac += fracstep; + dest[12] = source[frac >> FRACBITS]; frac += fracstep; + dest += 16; + } + count >>= 3; + if (!count) return; + + do + { + dest[0] = source[frac >> FRACBITS]; frac += fracstep; + dest[4] = source[frac >> FRACBITS]; frac += fracstep; + dest[8] = source[frac >> FRACBITS]; frac += fracstep; + dest[12] = source[frac >> FRACBITS]; frac += fracstep; + dest[16] = source[frac >> FRACBITS]; frac += fracstep; + dest[20] = source[frac >> FRACBITS]; frac += fracstep; + dest[24] = source[frac >> FRACBITS]; frac += fracstep; + dest[28] = source[frac >> FRACBITS]; frac += fracstep; + dest += 32; + } while (--count); + } +}; + +class FillColumnHorizRGBACommand : public DrawerCommand +{ + int _x; + int _yl; + int _yh; + int _count; + int _color; + +public: + FillColumnHorizRGBACommand() + { + _x = dc_x; + _count = dc_count; + _color = dc_color; + _yl = dc_yl; + _yh = dc_yh; + } + + void Execute(DrawerThread *thread) override + { + int count = _count; + int color = _color; + uint32_t *dest; + + if (count <= 0) + return; + + { + int x = _x & 3; + dest = &thread->dc_temp_rgba[x + 4 * _yl]; + } + + if (count & 1) { + *dest = color; + dest += 4; + } + if (!(count >>= 1)) + return; + do { + dest[0] = color; dest[4] = color; + dest += 8; + } while (--count); + } +}; + +///////////////////////////////////////////////////////////////////////////// + +// Copies one span at hx to the screen at sx. +void rt_copy1col_rgba (int hx, int sx, int yl, int yh) +{ + DrawerCommandQueue::QueueCommand(hx, sx, yl, yh); +} + +// Copies all four spans to the screen starting at sx. +void rt_copy4cols_rgba (int sx, int yl, int yh) +{ + // To do: we could do this with SSE using __m128i + rt_copy1col_rgba(0, sx, yl, yh); + rt_copy1col_rgba(1, sx + 1, yl, yh); + rt_copy1col_rgba(2, sx + 2, yl, yh); + rt_copy1col_rgba(3, sx + 3, yl, yh); +} + +// Maps one span at hx to the screen at sx. +void rt_map1col_rgba (int hx, int sx, int yl, int yh) +{ + DrawerCommandQueue::QueueCommand(hx, sx, yl, yh); +} + +// Maps all four spans to the screen starting at sx. +void rt_map4cols_rgba (int sx, int yl, int yh) +{ +#ifdef NO_SSE + DrawerCommandQueue::QueueCommand(sx, yl, yh); +#else + DrawerCommandQueue::QueueCommand(sx, yl, yh); +#endif +} + +void rt_Translate1col_rgba(const BYTE *translation, int hx, int yl, int yh) +{ + DrawerCommandQueue::QueueCommand(translation, hx, yl, yh); +} + +void rt_Translate4cols_rgba(const BYTE *translation, int yl, int yh) +{ + DrawerCommandQueue::QueueCommand(translation, yl, yh); +} + +// Translates one span at hx to the screen at sx. +void rt_tlate1col_rgba (int hx, int sx, int yl, int yh) +{ + rt_Translate1col_rgba(dc_translation, hx, yl, yh); + rt_map1col(hx, sx, yl, yh); +} + +// Translates all four spans to the screen starting at sx. +void rt_tlate4cols_rgba (int sx, int yl, int yh) +{ + rt_Translate4cols_rgba(dc_translation, yl, yh); + rt_map4cols(sx, yl, yh); +} + +// Adds one span at hx to the screen at sx without clamping. +void rt_add1col_rgba (int hx, int sx, int yl, int yh) +{ + DrawerCommandQueue::QueueCommand(hx, sx, yl, yh); +} + +// Adds all four spans to the screen starting at sx without clamping. +void rt_add4cols_rgba (int sx, int yl, int yh) +{ +#ifdef NO_SSE + DrawerCommandQueue::QueueCommand(sx, yl, yh); +#else + DrawerCommandQueue::QueueCommand(sx, yl, yh); +#endif +} + +// Translates and adds one span at hx to the screen at sx without clamping. +void rt_tlateadd1col_rgba (int hx, int sx, int yl, int yh) +{ + rt_Translate1col_rgba(dc_translation, hx, yl, yh); + rt_add1col(hx, sx, yl, yh); +} + +// Translates and adds all four spans to the screen starting at sx without clamping. +void rt_tlateadd4cols_rgba(int sx, int yl, int yh) +{ + rt_Translate4cols_rgba(dc_translation, yl, yh); + rt_add4cols(sx, yl, yh); +} + +// Shades one span at hx to the screen at sx. +void rt_shaded1col_rgba (int hx, int sx, int yl, int yh) +{ + DrawerCommandQueue::QueueCommand(hx, sx, yl, yh); +} + +// Shades all four spans to the screen starting at sx. +void rt_shaded4cols_rgba (int sx, int yl, int yh) +{ +#ifdef NO_SSE + DrawerCommandQueue::QueueCommand(sx, yl, yh); +#else + DrawerCommandQueue::QueueCommand(sx, yl, yh); +#endif +} + +// Adds one span at hx to the screen at sx with clamping. +void rt_addclamp1col_rgba (int hx, int sx, int yl, int yh) +{ + DrawerCommandQueue::QueueCommand(hx, sx, yl, yh); +} + +// Adds all four spans to the screen starting at sx with clamping. +void rt_addclamp4cols_rgba (int sx, int yl, int yh) +{ +#ifdef NO_SSE + DrawerCommandQueue::QueueCommand(sx, yl, yh); +#else + DrawerCommandQueue::QueueCommand(sx, yl, yh); +#endif +} + +// Translates and adds one span at hx to the screen at sx with clamping. +void rt_tlateaddclamp1col_rgba (int hx, int sx, int yl, int yh) +{ + rt_Translate1col_rgba(dc_translation, hx, yl, yh); + rt_addclamp1col_rgba(hx, sx, yl, yh); +} + +// Translates and adds all four spans to the screen starting at sx with clamping. +void rt_tlateaddclamp4cols_rgba (int sx, int yl, int yh) +{ + rt_Translate4cols_rgba(dc_translation, yl, yh); + rt_addclamp4cols(sx, yl, yh); +} + +// Subtracts one span at hx to the screen at sx with clamping. +void rt_subclamp1col_rgba (int hx, int sx, int yl, int yh) +{ + DrawerCommandQueue::QueueCommand(hx, sx, yl, yh); +} + +// Subtracts all four spans to the screen starting at sx with clamping. +void rt_subclamp4cols_rgba (int sx, int yl, int yh) +{ +#ifdef NO_SSE + DrawerCommandQueue::QueueCommand(sx, yl, yh); +#else + DrawerCommandQueue::QueueCommand(sx, yl, yh); +#endif +} + +// Translates and subtracts one span at hx to the screen at sx with clamping. +void rt_tlatesubclamp1col_rgba (int hx, int sx, int yl, int yh) +{ + rt_Translate1col_rgba(dc_translation, hx, yl, yh); + rt_subclamp1col_rgba(hx, sx, yl, yh); +} + +// Translates and subtracts all four spans to the screen starting at sx with clamping. +void rt_tlatesubclamp4cols_rgba (int sx, int yl, int yh) +{ + rt_Translate4cols_rgba(dc_translation, yl, yh); + rt_subclamp4cols_rgba(sx, yl, yh); +} + +// Subtracts one span at hx from the screen at sx with clamping. +void rt_revsubclamp1col_rgba (int hx, int sx, int yl, int yh) +{ + DrawerCommandQueue::QueueCommand(hx, sx, yl, yh); +} + +// Subtracts all four spans from the screen starting at sx with clamping. +void rt_revsubclamp4cols_rgba (int sx, int yl, int yh) +{ +#ifdef NO_SSE + DrawerCommandQueue::QueueCommand(sx, yl, yh); +#else + DrawerCommandQueue::QueueCommand(sx, yl, yh); +#endif +} + +// Translates and subtracts one span at hx from the screen at sx with clamping. +void rt_tlaterevsubclamp1col_rgba (int hx, int sx, int yl, int yh) +{ + rt_Translate1col_rgba(dc_translation, hx, yl, yh); + rt_revsubclamp1col_rgba(hx, sx, yl, yh); +} + +// Translates and subtracts all four spans from the screen starting at sx with clamping. +void rt_tlaterevsubclamp4cols_rgba (int sx, int yl, int yh) +{ + rt_Translate4cols_rgba(dc_translation, yl, yh); + rt_revsubclamp4cols_rgba(sx, yl, yh); +} + +// Before each pass through a rendering loop that uses these routines, +// call this function to set up the span pointers. +void rt_initcols_rgba (BYTE *buff) +{ + for (int y = 3; y >= 0; y--) + horizspan[y] = dc_ctspan[y] = &dc_tspans[y][0]; + + DrawerCommandQueue::QueueCommand(buff); +} + +void rt_span_coverage_rgba(int x, int start, int stop) +{ + unsigned int **tspan = &dc_ctspan[x & 3]; + (*tspan)[0] = start; + (*tspan)[1] = stop; + *tspan += 2; +} + +// Stretches a column into a temporary buffer which is later +// drawn to the screen along with up to three other columns. +void R_DrawColumnHoriz_rgba (void) +{ + if (dc_count <= 0) + return; + + int x = dc_x & 3; + unsigned int **span = &dc_ctspan[x]; + (*span)[0] = dc_yl; + (*span)[1] = dc_yh; + *span += 2; + + DrawerCommandQueue::QueueCommand(); +} + +// [RH] Just fills a column with a given color +void R_FillColumnHoriz_rgba (void) +{ + if (dc_count <= 0) + return; + + int x = dc_x & 3; + unsigned int **span = &dc_ctspan[x]; + (*span)[0] = dc_yl; + (*span)[1] = dc_yh; + *span += 2; + + DrawerCommandQueue::QueueCommand(); +} diff --git a/src/r_drawt_rgba_sse.h b/src/r_drawt_rgba_sse.h new file mode 100644 index 000000000..7a02f2282 --- /dev/null +++ b/src/r_drawt_rgba_sse.h @@ -0,0 +1,757 @@ +// +// SSE/AVX intrinsics based drawers for the r_drawt family of drawers. +// +// Note: This header file is intentionally not guarded by a __R_DRAWT_RGBA_SSE__ define. +// It is because the code is nearly identical for SSE vs AVX. The file is included +// multiple times by r_drawt_rgba.cpp with different defines that changes the class +// names outputted and the type of intrinsics used. + +#ifdef _MSC_VER +#pragma warning(disable: 4752) // warning C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX +#endif + +class VecCommand(RtMap4colsRGBA) : public DrawerCommand +{ + int sx; + int yl; + int yh; + fixed_t _light; + ShadeConstants _shade_constants; + BYTE * RESTRICT _destorg; + int _pitch; + BYTE * RESTRICT _colormap; + +public: + VecCommand(RtMap4colsRGBA)(int sx, int yl, int yh) + { + this->sx = sx; + this->yl = yl; + this->yh = yh; + + _light = dc_light; + _shade_constants = dc_shade_constants; + _destorg = dc_destorg; + _pitch = dc_pitch; + _colormap = dc_colormap; + } + + void Execute(DrawerThread *thread) override + { + uint32_t *source; + uint32_t *dest; + int count; + int pitch; + int sincr; + + count = thread->count_for_thread(yl, yh - yl + 1); + if (count <= 0) + return; + + ShadeConstants shade_constants = _shade_constants; + uint32_t light = LightBgra::calc_light_multiplier(_light); + uint32_t *palette = (uint32_t*)GPalette.BaseColors; + + dest = thread->dest_for_thread(yl, _pitch, ylookup[yl] + sx + (uint32_t*)_destorg); + source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4; + pitch = _pitch * thread->num_cores; + sincr = thread->num_cores * 4; + + BYTE *colormap = _colormap; + + if (shade_constants.simple_shade) + { + VEC_SHADE_VARS(); + VEC_SHADE_SIMPLE_INIT(light); + + if (count & 1) { + uint32_t p0 = colormap[source[0]]; + uint32_t p1 = colormap[source[1]]; + uint32_t p2 = colormap[source[2]]; + uint32_t p3 = colormap[source[3]]; + + // shade_pal_index: + __m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]); + VEC_SHADE_SIMPLE(fg); + _mm_storeu_si128((__m128i*)dest, fg); + + source += sincr; + dest += pitch; + } + if (!(count >>= 1)) + return; + + do { + // shade_pal_index 0-3 + { + uint32_t p0 = colormap[source[0]]; + uint32_t p1 = colormap[source[1]]; + uint32_t p2 = colormap[source[2]]; + uint32_t p3 = colormap[source[3]]; + + __m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]); + VEC_SHADE_SIMPLE(fg); + _mm_storeu_si128((__m128i*)dest, fg); + } + + // shade_pal_index 4-7 (pitch) + { + uint32_t p0 = colormap[source[sincr]]; + uint32_t p1 = colormap[source[sincr + 1]]; + uint32_t p2 = colormap[source[sincr + 2]]; + uint32_t p3 = colormap[source[sincr + 3]]; + + __m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]); + VEC_SHADE_SIMPLE(fg); + _mm_storeu_si128((__m128i*)(dest + pitch), fg); + } + + source += sincr * 2; + dest += pitch * 2; + } while (--count); + } + else + { + VEC_SHADE_VARS(); + VEC_SHADE_INIT(light, shade_constants); + + if (count & 1) { + uint32_t p0 = colormap[source[0]]; + uint32_t p1 = colormap[source[1]]; + uint32_t p2 = colormap[source[2]]; + uint32_t p3 = colormap[source[3]]; + + // shade_pal_index: + __m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]); + VEC_SHADE(fg, shade_constants); + _mm_storeu_si128((__m128i*)dest, fg); + + source += sincr; + dest += pitch; + } + if (!(count >>= 1)) + return; + + do { + // shade_pal_index 0-3 + { + uint32_t p0 = colormap[source[0]]; + uint32_t p1 = colormap[source[1]]; + uint32_t p2 = colormap[source[2]]; + uint32_t p3 = colormap[source[3]]; + + __m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]); + VEC_SHADE(fg, shade_constants); + _mm_storeu_si128((__m128i*)dest, fg); + } + + // shade_pal_index 4-7 (pitch) + { + uint32_t p0 = colormap[source[sincr]]; + uint32_t p1 = colormap[source[sincr + 1]]; + uint32_t p2 = colormap[source[sincr + 2]]; + uint32_t p3 = colormap[source[sincr + 3]]; + + __m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]); + VEC_SHADE(fg, shade_constants); + _mm_storeu_si128((__m128i*)(dest + pitch), fg); + } + + source += sincr * 2; + dest += pitch * 2; + } while (--count); + } + } +}; + +class VecCommand(RtAdd4colsRGBA) : public DrawerCommand +{ + int sx; + int yl; + int yh; + BYTE * RESTRICT _destorg; + int _pitch; + fixed_t _light; + ShadeConstants _shade_constants; + BYTE * RESTRICT _colormap; + fixed_t _srcalpha; + fixed_t _destalpha; + +public: + VecCommand(RtAdd4colsRGBA)(int sx, int yl, int yh) + { + this->sx = sx; + this->yl = yl; + this->yh = yh; + + _destorg = dc_destorg; + _pitch = dc_pitch; + _light = dc_light; + _shade_constants = dc_shade_constants; + _colormap = dc_colormap; + _srcalpha = dc_srcalpha; + _destalpha = dc_destalpha; + } + + void Execute(DrawerThread *thread) override + { + uint32_t *source; + uint32_t *dest; + int count; + int pitch; + int sincr; + + count = thread->count_for_thread(yl, yh - yl + 1); + if (count <= 0) + return; + + dest = thread->dest_for_thread(yl, _pitch, ylookup[yl] + sx + (uint32_t*)_destorg); + source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4; + pitch = _pitch * thread->num_cores; + sincr = 4 * thread->num_cores; + + uint32_t light = LightBgra::calc_light_multiplier(_light); + uint32_t *palette = (uint32_t*)GPalette.BaseColors; + BYTE *colormap = _colormap; + + uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8); + uint32_t bg_alpha = _destalpha >> (FRACBITS - 8); + + ShadeConstants shade_constants = _shade_constants; + + if (shade_constants.simple_shade) + { + VEC_SHADE_VARS(); + VEC_SHADE_SIMPLE_INIT(light); + + __m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha); + __m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha); + + do { + uint32_t p0 = colormap[source[0]]; + uint32_t p1 = colormap[source[1]]; + uint32_t p2 = colormap[source[2]]; + uint32_t p3 = colormap[source[3]]; + + // shade_pal_index: + __m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]); + VEC_SHADE_SIMPLE(fg); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + + // unpack bg: + __m128i bg = _mm_loadu_si128((const __m128i*)dest); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + // (fg_red * fg_alpha + bg_red * bg_alpha) / 256: + __m128i color_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, mfg_alpha), _mm_mullo_epi16(bg_hi, mbg_alpha)), 8); + __m128i color_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, mfg_alpha), _mm_mullo_epi16(bg_lo, mbg_alpha)), 8); + + __m128i color = _mm_packus_epi16(color_lo, color_hi); + _mm_storeu_si128((__m128i*)dest, color); + + source += sincr; + dest += pitch; + } while (--count); + } + else + { + VEC_SHADE_VARS(); + VEC_SHADE_INIT(light, shade_constants); + + __m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha); + __m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha); + + do { + uint32_t p0 = colormap[source[0]]; + uint32_t p1 = colormap[source[1]]; + uint32_t p2 = colormap[source[2]]; + uint32_t p3 = colormap[source[3]]; + + // shade_pal_index: + __m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]); + VEC_SHADE(fg, shade_constants); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + + // unpack bg: + __m128i bg = _mm_loadu_si128((const __m128i*)dest); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + // (fg_red * fg_alpha + bg_red * bg_alpha) / 256: + __m128i color_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, mfg_alpha), _mm_mullo_epi16(bg_hi, mbg_alpha)), 8); + __m128i color_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, mfg_alpha), _mm_mullo_epi16(bg_lo, mbg_alpha)), 8); + + __m128i color = _mm_packus_epi16(color_lo, color_hi); + _mm_storeu_si128((__m128i*)dest, color); + + source += sincr; + dest += pitch; + } while (--count); + } + } +}; + +class VecCommand(RtShaded4colsRGBA) : public DrawerCommand +{ + int sx; + int yl; + int yh; + lighttable_t * RESTRICT _colormap; + int _color; + BYTE * RESTRICT _destorg; + int _pitch; + fixed_t _light; + +public: + VecCommand(RtShaded4colsRGBA)(int sx, int yl, int yh) + { + this->sx = sx; + this->yl = yl; + this->yh = yh; + + _colormap = dc_colormap; + _color = dc_color; + _destorg = dc_destorg; + _pitch = dc_pitch; + _light = dc_light; + } + + void Execute(DrawerThread *thread) override + { + BYTE *colormap; + uint32_t *source; + uint32_t *dest; + int count; + int pitch; + int sincr; + + count = thread->count_for_thread(yl, yh - yl + 1); + if (count <= 0) + return; + + colormap = _colormap; + dest = thread->dest_for_thread(yl, _pitch, ylookup[yl] + sx + (uint32_t*)_destorg); + source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4; + pitch = _pitch * thread->num_cores; + sincr = 4 * thread->num_cores; + + __m128i fg = _mm_unpackhi_epi8(_mm_set1_epi32(LightBgra::shade_pal_index_simple(_color, LightBgra::calc_light_multiplier(_light))), _mm_setzero_si128()); + __m128i alpha_one = _mm_set1_epi16(64); + + do { + uint32_t p0 = colormap[source[0]]; + uint32_t p1 = colormap[source[1]]; + uint32_t p2 = colormap[source[2]]; + uint32_t p3 = colormap[source[3]]; + + __m128i alpha_hi = _mm_set_epi16(64, p3, p3, p3, 64, p2, p2, p2); + __m128i alpha_lo = _mm_set_epi16(64, p1, p1, p1, 64, p0, p0, p0); + __m128i inv_alpha_hi = _mm_subs_epu16(alpha_one, alpha_hi); + __m128i inv_alpha_lo = _mm_subs_epu16(alpha_one, alpha_lo); + + // unpack bg: + __m128i bg = _mm_loadu_si128((const __m128i*)dest); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + // (fg_red * alpha + bg_red * inv_alpha) / 64: + __m128i color_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg, alpha_hi), _mm_mullo_epi16(bg_hi, inv_alpha_hi)), 6); + __m128i color_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg, alpha_lo), _mm_mullo_epi16(bg_lo, inv_alpha_lo)), 6); + + __m128i color = _mm_packus_epi16(color_lo, color_hi); + _mm_storeu_si128((__m128i*)dest, color); + + source += sincr; + dest += pitch; + } while (--count); + } +}; + +class VecCommand(RtAddClamp4colsRGBA) : public DrawerCommand +{ + int sx; + int yl; + int yh; + BYTE * RESTRICT _destorg; + int _pitch; + fixed_t _light; + fixed_t _srcalpha; + fixed_t _destalpha; + ShadeConstants _shade_constants; + +public: + VecCommand(RtAddClamp4colsRGBA)(int sx, int yl, int yh) + { + this->sx = sx; + this->yl = yl; + this->yh = yh; + + _destorg = dc_destorg; + _pitch = dc_pitch; + _light = dc_light; + _srcalpha = dc_srcalpha; + _destalpha = dc_destalpha; + _shade_constants = dc_shade_constants; + } + + void Execute(DrawerThread *thread) override + { + uint32_t *source; + uint32_t *dest; + int count; + int pitch; + int sincr; + + count = thread->count_for_thread(yl, yh - yl + 1); + if (count <= 0) + return; + + dest = thread->dest_for_thread(yl, _pitch, ylookup[yl] + sx + (uint32_t*)_destorg); + source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4; + pitch = _pitch * thread->num_cores; + sincr = 4 * thread->num_cores; + + uint32_t light = LightBgra::calc_light_multiplier(_light); + uint32_t *palette = (uint32_t*)GPalette.BaseColors; + + uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8); + uint32_t bg_alpha = _destalpha >> (FRACBITS - 8); + + ShadeConstants shade_constants = _shade_constants; + + if (shade_constants.simple_shade) + { + VEC_SHADE_VARS(); + VEC_SHADE_SIMPLE_INIT(light); + + __m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha); + __m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha); + + do { + uint32_t p0 = source[0]; + uint32_t p1 = source[1]; + uint32_t p2 = source[2]; + uint32_t p3 = source[3]; + + // shade_pal_index: + __m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]); + VEC_SHADE_SIMPLE(fg); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + + // unpack bg: + __m128i bg = _mm_loadu_si128((const __m128i*)dest); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + // (fg_red * fg_alpha + bg_red * bg_alpha) / 256: + __m128i color_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, mfg_alpha), _mm_mullo_epi16(bg_hi, mbg_alpha)), 8); + __m128i color_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, mfg_alpha), _mm_mullo_epi16(bg_lo, mbg_alpha)), 8); + + __m128i color = _mm_packus_epi16(color_lo, color_hi); + _mm_storeu_si128((__m128i*)dest, color); + + source += sincr; + dest += pitch; + } while (--count); + } + else + { + VEC_SHADE_VARS(); + VEC_SHADE_INIT(light, shade_constants); + + __m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha); + __m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha); + + do { + uint32_t p0 = source[0]; + uint32_t p1 = source[1]; + uint32_t p2 = source[2]; + uint32_t p3 = source[3]; + + // shade_pal_index: + __m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]); + VEC_SHADE(fg, shade_constants); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + + // unpack bg: + __m128i bg = _mm_loadu_si128((const __m128i*)dest); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + // (fg_red * fg_alpha + bg_red * bg_alpha) / 256: + __m128i color_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, mfg_alpha), _mm_mullo_epi16(bg_hi, mbg_alpha)), 8); + __m128i color_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, mfg_alpha), _mm_mullo_epi16(bg_lo, mbg_alpha)), 8); + + __m128i color = _mm_packus_epi16(color_lo, color_hi); + _mm_storeu_si128((__m128i*)dest, color); + + source += sincr; + dest += pitch; + } while (--count); + } + } +}; + +class VecCommand(RtSubClamp4colsRGBA) : public DrawerCommand +{ + int sx; + int yl; + int yh; + BYTE * RESTRICT _destorg; + int _pitch; + fixed_t _light; + fixed_t _srcalpha; + fixed_t _destalpha; + ShadeConstants _shade_constants; + +public: + VecCommand(RtSubClamp4colsRGBA)(int sx, int yl, int yh) + { + this->sx = sx; + this->yl = yl; + this->yh = yh; + + _destorg = dc_destorg; + _pitch = dc_pitch; + _light = dc_light; + _srcalpha = dc_srcalpha; + _destalpha = dc_destalpha; + _shade_constants = dc_shade_constants; + } + + void Execute(DrawerThread *thread) override + { + uint32_t *source; + uint32_t *dest; + int count; + int pitch; + int sincr; + + count = thread->count_for_thread(yl, yh - yl + 1); + if (count <= 0) + return; + + dest = thread->dest_for_thread(yl, _pitch, ylookup[yl] + sx + (uint32_t*)_destorg); + source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4; + pitch = _pitch * thread->num_cores; + sincr = 4 * thread->num_cores; + + uint32_t light = LightBgra::calc_light_multiplier(_light); + uint32_t *palette = (uint32_t*)GPalette.BaseColors; + ShadeConstants shade_constants = _shade_constants; + + uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8); + uint32_t bg_alpha = _destalpha >> (FRACBITS - 8); + + if (shade_constants.simple_shade) + { + VEC_SHADE_VARS(); + VEC_SHADE_SIMPLE_INIT(light); + + __m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha); + __m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha); + + do { + uint32_t p0 = source[0]; + uint32_t p1 = source[1]; + uint32_t p2 = source[2]; + uint32_t p3 = source[3]; + + // shade_pal_index: + __m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]); + VEC_SHADE_SIMPLE(fg); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + + // unpack bg: + __m128i bg = _mm_loadu_si128((const __m128i*)dest); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + // (bg_red * bg_alpha - fg_red * fg_alpha) / 256: + __m128i color_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_hi, mbg_alpha), _mm_mullo_epi16(fg_hi, mfg_alpha)), 8); + __m128i color_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_lo, mbg_alpha), _mm_mullo_epi16(fg_lo, mfg_alpha)), 8); + + __m128i color = _mm_packus_epi16(color_lo, color_hi); + _mm_storeu_si128((__m128i*)dest, color); + + source += sincr; + dest += pitch; + } while (--count); + } + else + { + VEC_SHADE_VARS(); + VEC_SHADE_INIT(light, shade_constants); + + __m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha); + __m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha); + + do { + uint32_t p0 = source[0]; + uint32_t p1 = source[1]; + uint32_t p2 = source[2]; + uint32_t p3 = source[3]; + + // shade_pal_index: + __m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]); + VEC_SHADE(fg, shade_constants); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + + // unpack bg: + __m128i bg = _mm_loadu_si128((const __m128i*)dest); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + // (bg_red * bg_alpha - fg_red * fg_alpha) / 256: + __m128i color_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_hi, mbg_alpha), _mm_mullo_epi16(fg_hi, mfg_alpha)), 8); + __m128i color_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_lo, mbg_alpha), _mm_mullo_epi16(fg_lo, mfg_alpha)), 8); + + __m128i color = _mm_packus_epi16(color_lo, color_hi); + _mm_storeu_si128((__m128i*)dest, color); + + source += sincr; + dest += pitch; + } while (--count); + } + } +}; + +class VecCommand(RtRevSubClamp4colsRGBA) : public DrawerCommand +{ + int sx; + int yl; + int yh; + BYTE * RESTRICT _destorg; + int _pitch; + fixed_t _light; + fixed_t _srcalpha; + fixed_t _destalpha; + ShadeConstants _shade_constants; + +public: + VecCommand(RtRevSubClamp4colsRGBA)(int sx, int yl, int yh) + { + this->sx = sx; + this->yl = yl; + this->yh = yh; + + _destorg = dc_destorg; + _pitch = dc_pitch; + _light = dc_light; + _srcalpha = dc_srcalpha; + _destalpha = dc_destalpha; + _shade_constants = dc_shade_constants; + } + + void Execute(DrawerThread *thread) override + { + uint32_t *source; + uint32_t *dest; + int count; + int pitch; + int sincr; + + count = thread->count_for_thread(yl, yh - yl + 1); + if (count <= 0) + return; + + dest = thread->dest_for_thread(yl, _pitch, ylookup[yl] + sx + (uint32_t*)_destorg); + source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4; + pitch = _pitch * thread->num_cores; + sincr = 4 * thread->num_cores; + + uint32_t light = LightBgra::calc_light_multiplier(_light); + uint32_t *palette = (uint32_t*)GPalette.BaseColors; + ShadeConstants shade_constants = _shade_constants; + + uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8); + uint32_t bg_alpha = _destalpha >> (FRACBITS - 8); + + if (shade_constants.simple_shade) + { + VEC_SHADE_VARS(); + VEC_SHADE_SIMPLE_INIT(light); + + __m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha); + __m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha); + + do { + uint32_t p0 = source[0]; + uint32_t p1 = source[1]; + uint32_t p2 = source[2]; + uint32_t p3 = source[3]; + + // shade_pal_index: + __m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]); + VEC_SHADE_SIMPLE(fg); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + + // unpack bg: + __m128i bg = _mm_loadu_si128((const __m128i*)dest); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + // (fg_red * fg_alpha - bg_red * bg_alpha) / 256: + __m128i color_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_hi, mfg_alpha), _mm_mullo_epi16(bg_hi, mbg_alpha)), 8); + __m128i color_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_lo, mfg_alpha), _mm_mullo_epi16(bg_lo, mbg_alpha)), 8); + + __m128i color = _mm_packus_epi16(color_lo, color_hi); + _mm_storeu_si128((__m128i*)dest, color); + + source += sincr; + dest += pitch; + } while (--count); + } + else + { + VEC_SHADE_VARS(); + VEC_SHADE_INIT(light, shade_constants); + + __m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha); + __m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha); + + do { + uint32_t p0 = source[0]; + uint32_t p1 = source[1]; + uint32_t p2 = source[2]; + uint32_t p3 = source[3]; + + // shade_pal_index: + __m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]); + VEC_SHADE(fg, shade_constants); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + + // unpack bg: + __m128i bg = _mm_loadu_si128((const __m128i*)dest); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + // (fg_red * fg_alpha - bg_red * bg_alpha) / 256: + __m128i color_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_hi, mfg_alpha), _mm_mullo_epi16(bg_hi, mbg_alpha)), 8); + __m128i color_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_lo, mfg_alpha), _mm_mullo_epi16(bg_lo, mbg_alpha)), 8); + + __m128i color = _mm_packus_epi16(color_lo, color_hi); + _mm_storeu_si128((__m128i*)dest, color); + + source += sincr; + dest += pitch; + } while (--count); + } + } +}; diff --git a/src/r_main.cpp b/src/r_main.cpp index ce4841a2e..247a98125 100644 --- a/src/r_main.cpp +++ b/src/r_main.cpp @@ -40,6 +40,7 @@ #include "r_segs.h" #include "r_3dfloors.h" #include "r_sky.h" +#include "r_draw_rgba.h" #include "st_stuff.h" #include "c_cvars.h" #include "c_dispatch.h" @@ -104,6 +105,8 @@ bool r_dontmaplines; CVAR (String, r_viewsize, "", CVAR_NOSET) CVAR (Bool, r_shadercolormaps, true, CVAR_ARCHIVE) +bool r_swtruecolor; + double r_BaseVisibility; double r_WallVisibility; double r_FloorVisibility; @@ -117,7 +120,7 @@ double FocalLengthX; double FocalLengthY; FDynamicColormap*basecolormap; // [RH] colormap currently drawing with int fixedlightlev; -lighttable_t *fixedcolormap; +FColormap *fixedcolormap; FSpecialColormap *realfixedcolormap; double WallTMapScale2; @@ -397,16 +400,6 @@ void R_InitRenderer() R_InitPlanes (); R_InitShadeMaps(); R_InitColumnDrawers (); - - colfunc = basecolfunc = R_DrawColumn; - fuzzcolfunc = R_DrawFuzzColumn; - transcolfunc = R_DrawTranslatedColumn; - spanfunc = R_DrawSpan; - - // [RH] Horizontal column drawers - hcolfunc_pre = R_DrawColumnHoriz; - hcolfunc_post1 = rt_map1col; - hcolfunc_post4 = rt_map4cols; } //========================================================================== @@ -467,16 +460,16 @@ void R_SetupColormap(player_t *player) if (player->fixedcolormap >= 0 && player->fixedcolormap < (int)SpecialColormaps.Size()) { realfixedcolormap = &SpecialColormaps[player->fixedcolormap]; - if (RenderTarget == screen && (DFrameBuffer *)screen->Accel2D && r_shadercolormaps) + if (RenderTarget == screen && (r_swtruecolor || ((DFrameBuffer *)screen->Accel2D && r_shadercolormaps))) { // Render everything fullbright. The copy to video memory will // apply the special colormap, so it won't be restricted to the // palette. - fixedcolormap = realcolormaps; + fixedcolormap = &realcolormaps; } else { - fixedcolormap = SpecialColormaps[player->fixedcolormap].Colormap; + fixedcolormap = &SpecialColormaps[player->fixedcolormap]; } } else if (player->fixedlightlevel >= 0 && player->fixedlightlevel < NUMCOLORMAPS) @@ -487,7 +480,7 @@ void R_SetupColormap(player_t *player) // [RH] Inverse light for shooting the Sigil if (fixedcolormap == NULL && extralight == INT_MIN) { - fixedcolormap = SpecialColormaps[INVERSECOLORMAP].Colormap; + fixedcolormap = &SpecialColormaps[INVERSECOLORMAP]; extralight = 0; } } @@ -576,6 +569,9 @@ void R_HighlightPortal (PortalDrawseg* pds) // [ZZ] NO OVERFLOW CHECKS HERE // I believe it won't break. if it does, blame me. :( + if (r_swtruecolor) // Assuming this is just a debug function + return; + BYTE color = (BYTE)BestColor((DWORD *)GPalette.BaseColors, 255, 0, 0, 0, 255); BYTE* pixels = RenderTarget->GetBuffer(); @@ -623,12 +619,26 @@ void R_EnterPortal (PortalDrawseg* pds, int depth) int Ytop = pds->ceilingclip[x-pds->x1]; int Ybottom = pds->floorclip[x-pds->x1]; - BYTE *dest = RenderTarget->GetBuffer() + x + Ytop * spacing; - - for (int y = Ytop; y <= Ybottom; y++) + if (r_swtruecolor) { - *dest = color; - dest += spacing; + uint32_t *dest = (uint32_t*)RenderTarget->GetBuffer() + x + Ytop * spacing; + + uint32_t c = GPalette.BaseColors[color].d; + for (int y = Ytop; y <= Ybottom; y++) + { + *dest = c; + dest += spacing; + } + } + else + { + BYTE *dest = RenderTarget->GetBuffer() + x + Ytop * spacing; + + for (int y = Ytop; y <= Ybottom; y++) + { + *dest = color; + dest += spacing; + } } } @@ -797,7 +807,8 @@ void R_SetupBuffer () static BYTE *lastbuff = NULL; int pitch = RenderTarget->GetPitch(); - BYTE *lineptr = RenderTarget->GetBuffer() + viewwindowy*pitch + viewwindowx; + int pixelsize = r_swtruecolor ? 4 : 1; + BYTE *lineptr = RenderTarget->GetBuffer() + (viewwindowy*pitch + viewwindowx) * pixelsize; if (dc_pitch != pitch || lineptr != lastbuff) { @@ -847,10 +858,10 @@ void R_RenderActorView (AActor *actor, bool dontmaplines) // [RH] Show off segs if r_drawflat is 1 if (r_drawflat) { - hcolfunc_pre = R_FillColumnHorizP; + hcolfunc_pre = R_FillColumnHoriz; hcolfunc_post1 = rt_copy1col; hcolfunc_post4 = rt_copy4cols; - colfunc = R_FillColumnP; + colfunc = R_FillColumn; spanfunc = R_FillSpan; } else @@ -925,7 +936,7 @@ void R_RenderActorView (AActor *actor, bool dontmaplines) // If we don't want shadered colormaps, NULL it now so that the // copy to the screen does not use a special colormap shader. - if (!r_shadercolormaps) + if (!r_shadercolormaps && !r_swtruecolor) { realfixedcolormap = NULL; } @@ -943,6 +954,15 @@ void R_RenderViewToCanvas (AActor *actor, DCanvas *canvas, int x, int y, int width, int height, bool dontmaplines) { const bool savedviewactive = viewactive; + const bool savedoutputformat = r_swtruecolor; + + if (r_swtruecolor != canvas->IsBgra()) + { + r_swtruecolor = canvas->IsBgra(); + R_InitColumnDrawers(); + } + + R_BeginDrawerCommands(); viewwidth = width; RenderTarget = canvas; @@ -955,13 +975,22 @@ void R_RenderViewToCanvas (AActor *actor, DCanvas *canvas, R_RenderActorView (actor, dontmaplines); + R_EndDrawerCommands(); + RenderTarget = screen; bRenderingToCanvas = false; R_ExecuteSetViewSize (); screen->Lock (true); R_SetupBuffer (); screen->Unlock (); + viewactive = savedviewactive; + r_swtruecolor = savedoutputformat; + + if (r_swtruecolor != canvas->IsBgra()) + { + R_InitColumnDrawers(); + } } //========================================================================== diff --git a/src/r_main.h b/src/r_main.h index 24103393d..fa8fe0bb1 100644 --- a/src/r_main.h +++ b/src/r_main.h @@ -82,6 +82,16 @@ extern bool r_dontmaplines; // Change R_CalcTiltedLighting() when this changes. #define GETPALOOKUP(vis,shade) (clamp (((shade)-FLOAT2FIXED(MIN(MAXLIGHTVIS,double(vis))))>>FRACBITS, 0, NUMCOLORMAPS-1)) +// Calculate the light multiplier for dc_light/ds_light +// This is used instead of GETPALOOKUP when ds_colormap/dc_colormap is set to the base colormap +// Returns a value between 0 and 1 in fixed point +#define LIGHTSCALE(vis,shade) FLOAT2FIXED(clamp((FIXED2DBL(shade) - (MIN(MAXLIGHTVIS,double(vis)))) / NUMCOLORMAPS, 0.0, (NUMCOLORMAPS-1)/(double)NUMCOLORMAPS)) + +// Converts fixedlightlev into a shade value +#define FIXEDLIGHT2SHADE(lightlev) (((lightlev) >> COLORMAPSHIFT) << FRACBITS) + +extern bool r_swtruecolor; + extern double GlobVis; void R_SetVisibility(double visibility); @@ -96,7 +106,7 @@ extern double r_SpriteVisibility; extern int r_actualextralight; extern bool foggy; extern int fixedlightlev; -extern lighttable_t* fixedcolormap; +extern FColormap* fixedcolormap; extern FSpecialColormap*realfixedcolormap; diff --git a/src/r_plane.cpp b/src/r_plane.cpp index ff23492ab..706d6fad7 100644 --- a/src/r_plane.cpp +++ b/src/r_plane.cpp @@ -58,6 +58,7 @@ #include "r_3dfloors.h" #include "v_palette.h" #include "r_data/colormaps.h" +#include "r_draw_rgba.h" #ifdef _MSC_VER #pragma warning(disable:4244) @@ -227,12 +228,11 @@ void R_MapPlane (int y, int x1) if (plane_shade) { // Determine lighting based on the span's distance from the viewer. - ds_colormap = basecolormap->Maps + (GETPALOOKUP ( - GlobVis * fabs(CenterY - y), planeshade) << COLORMAPSHIFT); + R_SetDSColorMapLight(basecolormap, GlobVis * fabs(CenterY - y), planeshade); } #ifdef X86_ASM - if (ds_colormap != ds_curcolormap) + if (!r_swtruecolor && ds_colormap != ds_curcolormap) R_SetSpanColormap_ASM (ds_colormap); #endif @@ -355,7 +355,7 @@ void R_CalcTiltedLighting (double lval, double lend, int width) // //========================================================================== -void R_MapTiltedPlane (int y, int x1) +void R_MapTiltedPlane_C (int y, int x1) { int x2 = spanend[y]; int width = x2 - x1; @@ -392,7 +392,7 @@ void R_MapTiltedPlane (int y, int x1) u = SQWORD(uz*z) + pviewx; v = SQWORD(vz*z) + pviewy; - ds_colormap = tiltlighting[i]; + R_SetDSColorMapLight(tiltlighting[i], 0, 0); fb[i++] = ds_colormap[ds_source[(v >> vshift) | ((u >> ushift) & umask)]]; iz += plane_sz[0]; uz += plane_su[0]; @@ -478,17 +478,27 @@ void R_MapTiltedPlane (int y, int x1) #endif } +void R_MapTiltedPlane_rgba (int y, int x1) +{ + R_DrawTiltedSpan_rgba(y, x1, spanend[y], plane_sz, plane_su, plane_sv, plane_shade, planeshade, planelightfloat, pviewx, pviewy); +} + //========================================================================== // // R_MapColoredPlane // //========================================================================== -void R_MapColoredPlane (int y, int x1) +void R_MapColoredPlane_C (int y, int x1) { memset (ylookup[y] + x1 + dc_destorg, ds_color, spanend[y] - x1 + 1); } +void R_MapColoredPlane_rgba(int y, int x1) +{ + R_DrawColoredSpan_rgba(y, x1, spanend[y]); +} + //========================================================================== // // R_ClearPlanes @@ -841,15 +851,24 @@ extern FTexture *rw_pic; // Allow for layer skies up to 512 pixels tall. This is overkill, // since the most anyone can ever see of the sky is 500 pixels. // We need 4 skybufs because wallscan can draw up to 4 columns at a time. +// Need two versions - one for true color and one for palette static BYTE skybuf[4][512]; +static uint32_t skybuf_bgra[4][512]; static DWORD lastskycol[4]; +static DWORD lastskycol_bgra[4]; static int skycolplace; +static int skycolplace_bgra; // Get a column of sky when there is only one sky texture. static const BYTE *R_GetOneSkyColumn (FTexture *fronttex, int x) { angle_t column = (skyangle + xtoviewangle[x]) ^ skyflip; - return fronttex->GetColumn((UMulScale16(column, frontcyl) + frontpos) >> FRACBITS, NULL); + int tx = (UMulScale16(column, frontcyl) + frontpos) >> FRACBITS; + + if (!r_swtruecolor) + return fronttex->GetColumn(tx, NULL); + else + return (const BYTE *)fronttex->GetColumnBgra(tx, NULL); } // Get a column of sky when there are two overlapping sky textures @@ -864,38 +883,77 @@ static const BYTE *R_GetTwoSkyColumns (FTexture *fronttex, int x) DWORD skycol = (angle1 << 16) | angle2; int i; - for (i = 0; i < 4; ++i) + if (!r_swtruecolor) { - if (lastskycol[i] == skycol) + for (i = 0; i < 4; ++i) { - return skybuf[i]; + if (lastskycol[i] == skycol) + { + return skybuf[i]; + } } + + lastskycol[skycolplace] = skycol; + BYTE *composite = skybuf[skycolplace]; + skycolplace = (skycolplace + 1) & 3; + + // The ordering of the following code has been tuned to allow VC++ to optimize + // it well. In particular, this arrangement lets it keep count in a register + // instead of on the stack. + const BYTE *front = fronttex->GetColumn(angle1, NULL); + const BYTE *back = backskytex->GetColumn(angle2, NULL); + + int count = MIN(512, MIN(backskytex->GetHeight(), fronttex->GetHeight())); + i = 0; + do + { + if (front[i]) + { + composite[i] = front[i]; + } + else + { + composite[i] = back[i]; + } + } while (++i, --count); + return composite; } - - lastskycol[skycolplace] = skycol; - BYTE *composite = skybuf[skycolplace]; - skycolplace = (skycolplace + 1) & 3; - - // The ordering of the following code has been tuned to allow VC++ to optimize - // it well. In particular, this arrangement lets it keep count in a register - // instead of on the stack. - const BYTE *front = fronttex->GetColumn (angle1, NULL); - const BYTE *back = backskytex->GetColumn (angle2, NULL); - - int count = MIN (512, MIN (backskytex->GetHeight(), fronttex->GetHeight())); - i = 0; - do + else { - if (front[i]) + return R_GetOneSkyColumn(fronttex, x); + for (i = 0; i < 4; ++i) { - composite[i] = front[i]; + if (lastskycol_bgra[i] == skycol) + { + return (BYTE*)(skybuf_bgra[i]); + } } - else + + lastskycol_bgra[skycolplace_bgra] = skycol; + uint32_t *composite = skybuf_bgra[skycolplace_bgra]; + skycolplace_bgra = (skycolplace_bgra + 1) & 3; + + // The ordering of the following code has been tuned to allow VC++ to optimize + // it well. In particular, this arrangement lets it keep count in a register + // instead of on the stack. + const uint32_t *front = (const uint32_t *)fronttex->GetColumnBgra(angle1, NULL); + const uint32_t *back = (const uint32_t *)backskytex->GetColumnBgra(angle2, NULL); + + int count = MIN(512, MIN(backskytex->GetHeight(), fronttex->GetHeight())); + i = 0; + do { - composite[i] = back[i]; - } - } while (++i, --count); - return composite; + if (front[i]) + { + composite[i] = front[i]; + } + else + { + composite[i] = back[i]; + } + } while (++i, --count); + return (BYTE*)composite; + } } static void R_DrawSky (visplane_t *pl) @@ -930,6 +988,7 @@ static void R_DrawSky (visplane_t *pl) for (x = 0; x < 4; ++x) { lastskycol[x] = 0xffffffff; + lastskycol_bgra[x] = 0xffffffff; } rw_pic = frontskytex; @@ -943,6 +1002,7 @@ static void R_DrawSky (visplane_t *pl) for (x = 0; x < 4; ++x) { lastskycol[x] = 0xffffffff; + lastskycol_bgra[x] = 0xffffffff; } wallscan (pl->left, pl->right, (short *)pl->top, (short *)pl->bottom, swall, lwall, frontyScale, backskytex == NULL ? R_GetOneSkyColumn : R_GetTwoSkyColumns); @@ -951,7 +1011,7 @@ static void R_DrawSky (visplane_t *pl) { // The texture does not tile nicely frontyScale *= skyscale; frontiScale = 1 / frontyScale; - R_DrawSkyStriped (pl); + //R_DrawSkyStriped (pl); } } @@ -980,6 +1040,7 @@ static void R_DrawSkyStriped (visplane_t *pl) for (x = 0; x < 4; ++x) { lastskycol[x] = 0xffffffff; + lastskycol_bgra[x] = 0xffffffff; } wallscan (pl->left, pl->right, top, bot, swall, lwall, rw_pic->Scale.Y, backskytex == NULL ? R_GetOneSkyColumn : R_GetTwoSkyColumns); @@ -1098,7 +1159,7 @@ void R_DrawSinglePlane (visplane_t *pl, fixed_t alpha, bool additive, bool maske R_SetupSpanBits(tex); double xscale = pl->xform.xScale * tex->Scale.X; double yscale = pl->xform.yScale * tex->Scale.Y; - ds_source = tex->GetPixels (); + R_SetSpanSource(tex); basecolormap = pl->colormap; planeshade = LIGHT2SHADE(pl->lightlevel); @@ -1461,12 +1522,13 @@ void R_DrawSkyPlane (visplane_t *pl) bool fakefixed = false; if (fixedcolormap) { - dc_colormap = fixedcolormap; + R_SetColorMapLight(fixedcolormap, 0, 0); } else { fakefixed = true; - fixedcolormap = dc_colormap = NormalLight.Maps; + fixedcolormap = &NormalLight; + R_SetColorMapLight(fixedcolormap, 0, 0); } R_DrawSky (pl); @@ -1484,7 +1546,7 @@ void R_DrawSkyPlane (visplane_t *pl) void R_DrawNormalPlane (visplane_t *pl, double _xscale, double _yscale, fixed_t alpha, bool additive, bool masked) { #ifdef X86_ASM - if (ds_source != ds_cursource) + if (!r_swtruecolor && ds_source != ds_cursource) { R_SetSpanSource_ASM (ds_source); } @@ -1547,12 +1609,21 @@ void R_DrawNormalPlane (visplane_t *pl, double _xscale, double _yscale, fixed_t planeheight = fabs(pl->height.Zat0() - ViewPos.Z); GlobVis = r_FloorVisibility / planeheight; + ds_light = 0; if (fixedlightlev >= 0) - ds_colormap = basecolormap->Maps + fixedlightlev, plane_shade = false; + { + R_SetDSColorMapLight(basecolormap, 0, FIXEDLIGHT2SHADE(fixedlightlev)); + plane_shade = false; + } else if (fixedcolormap) - ds_colormap = fixedcolormap, plane_shade = false; + { + R_SetDSColorMapLight(fixedcolormap, 0, 0); + plane_shade = false; + } else + { plane_shade = true; + } if (spanfunc != R_FillSpan) { @@ -1565,12 +1636,16 @@ void R_DrawNormalPlane (visplane_t *pl, double _xscale, double _yscale, fixed_t spanfunc = R_DrawSpanMaskedTranslucent; dc_srcblend = Col2RGB8[alpha>>10]; dc_destblend = Col2RGB8[(OPAQUE-alpha)>>10]; + dc_srcalpha = alpha; + dc_destalpha = OPAQUE - alpha; } else { spanfunc = R_DrawSpanMaskedAddClamp; dc_srcblend = Col2RGB8_LessPrecision[alpha>>10]; dc_destblend = Col2RGB8_LessPrecision[FRACUNIT>>10]; + dc_srcalpha = alpha; + dc_destalpha = OPAQUE - alpha; } } else @@ -1587,12 +1662,16 @@ void R_DrawNormalPlane (visplane_t *pl, double _xscale, double _yscale, fixed_t spanfunc = R_DrawSpanTranslucent; dc_srcblend = Col2RGB8[alpha>>10]; dc_destblend = Col2RGB8[(OPAQUE-alpha)>>10]; + dc_srcalpha = alpha; + dc_destalpha = OPAQUE - alpha; } else { spanfunc = R_DrawSpanAddClamp; dc_srcblend = Col2RGB8_LessPrecision[alpha>>10]; dc_destblend = Col2RGB8_LessPrecision[FRACUNIT>>10]; + dc_srcalpha = alpha; + dc_destalpha = OPAQUE - alpha; } } else @@ -1708,11 +1787,20 @@ void R_DrawTiltedPlane (visplane_t *pl, double _xscale, double _yscale, fixed_t planelightfloat = -planelightfloat; if (fixedlightlev >= 0) - ds_colormap = basecolormap->Maps + fixedlightlev, plane_shade = false; + { + R_SetDSColorMapLight(basecolormap, 0, FIXEDLIGHT2SHADE(fixedlightlev)); + plane_shade = false; + } else if (fixedcolormap) - ds_colormap = fixedcolormap, plane_shade = false; + { + R_SetDSColorMapLight(fixedcolormap, 0, 0); + plane_shade = false; + } else - ds_colormap = basecolormap->Maps, plane_shade = true; + { + R_SetDSColorMapLight(basecolormap, 0, 0); + plane_shade = true; + } if (!plane_shade) { @@ -1723,9 +1811,16 @@ void R_DrawTiltedPlane (visplane_t *pl, double _xscale, double _yscale, fixed_t } #if defined(X86_ASM) - if (ds_source != ds_curtiltedsource) - R_SetTiltedSpanSource_ASM (ds_source); - R_MapVisPlane (pl, R_DrawTiltedPlane_ASM); + if (!r_swtruecolor) + { + if (ds_source != ds_curtiltedsource) + R_SetTiltedSpanSource_ASM(ds_source); + R_MapVisPlane(pl, R_DrawTiltedPlane_ASM); + } + else + { + R_MapVisPlane(pl, R_MapTiltedPlane); + } #else R_MapVisPlane (pl, R_MapTiltedPlane); #endif diff --git a/src/r_plane.h b/src/r_plane.h index d4db3dc09..b199d3477 100644 --- a/src/r_plane.h +++ b/src/r_plane.h @@ -93,6 +93,14 @@ void R_DrawNormalPlane (visplane_t *pl, double xscale, double yscale, fixed_t al void R_DrawTiltedPlane (visplane_t *pl, double xscale, double yscale, fixed_t alpha, bool additive, bool masked); void R_MapVisPlane (visplane_t *pl, void (*mapfunc)(int y, int x1)); +extern void(*R_MapColoredPlane)(int y, int x1); +extern void(*R_MapTiltedPlane)(int y, int x1); + +void R_MapTiltedPlane_C(int y, int x1); +void R_MapTiltedPlane_rgba(int y, int x); +void R_MapColoredPlane_C(int y, int x1); +void R_MapColoredPlane_rgba(int y, int x1); + visplane_t *R_FindPlane ( const secplane_t &height, FTextureID picnum, diff --git a/src/r_segs.cpp b/src/r_segs.cpp index edb1949b6..078f1d921 100644 --- a/src/r_segs.cpp +++ b/src/r_segs.cpp @@ -50,6 +50,7 @@ #include "r_plane.h" #include "r_segs.h" #include "r_3dfloors.h" +#include "r_draw.h" #include "v_palette.h" #include "r_data/colormaps.h" @@ -177,7 +178,7 @@ static void BlastMaskedColumn (void (*blastfunc)(const BYTE *pixels, const FText // calculate lighting if (fixedcolormap == NULL && fixedlightlev < 0) { - dc_colormap = basecolormap->Maps + (GETPALOOKUP (rw_light, wallshade) << COLORMAPSHIFT); + R_SetColorMapLight(basecolormap, rw_light, wallshade); } dc_iscale = xs_Fix<16>::ToFix(MaskedSWall[dc_x] * MaskedScaleY); @@ -313,9 +314,9 @@ void R_RenderMaskedSegRange (drawseg_t *ds, int x1, int x2) rw_scalestep = ds->iscalestep; if (fixedlightlev >= 0) - dc_colormap = basecolormap->Maps + fixedlightlev; + R_SetColorMapLight(basecolormap, 0, FIXEDLIGHT2SHADE(fixedlightlev)); else if (fixedcolormap != NULL) - dc_colormap = fixedcolormap; + R_SetColorMapLight(fixedcolormap, 0, 0); // find positioning texheight = tex->GetScaledHeightDouble(); @@ -461,7 +462,7 @@ void R_RenderMaskedSegRange (drawseg_t *ds, int x1, int x2) while (dc_x < stop) { - rt_initcols(); + rt_initcols(nullptr); BlastMaskedColumn (R_DrawMaskedColumnHoriz, tex); dc_x++; BlastMaskedColumn (R_DrawMaskedColumnHoriz, tex); dc_x++; BlastMaskedColumn (R_DrawMaskedColumnHoriz, tex); dc_x++; @@ -630,9 +631,9 @@ void R_RenderFakeWall(drawseg_t *ds, int x1, int x2, F3DFloor *rover) } if (fixedlightlev >= 0) - dc_colormap = basecolormap->Maps + fixedlightlev; + R_SetColorMapLight(basecolormap, 0, FIXEDLIGHT2SHADE(fixedlightlev)); else if (fixedcolormap != NULL) - dc_colormap = fixedcolormap; + R_SetColorMapLight(fixedcolormap, 0, 0); WallC.sz1 = ds->sz1; WallC.sz2 = ds->sz2; @@ -1065,50 +1066,308 @@ void R_RenderFakeWallRange (drawseg_t *ds, int x1, int x2) return; } -// prevlineasm1 is like vlineasm1 but skips the loop if only drawing one pixel -inline fixed_t prevline1 (fixed_t vince, BYTE *colormap, int count, fixed_t vplce, const BYTE *bufplce, BYTE *dest) +EXTERN_CVAR(Bool, r_mipmap) + +struct WallscanSampler { - dc_iscale = vince; - dc_colormap = colormap; - dc_count = count; - dc_texturefrac = vplce; - dc_source = bufplce; - dc_dest = dest; - return doprevline1 (); + WallscanSampler() { } + WallscanSampler(int y1, float swal, double yrepeat, fixed_t xoffset, FTexture *texture, const BYTE*(*getcol)(FTexture *texture, int x)); + + uint32_t uv_pos; + uint32_t uv_step; + uint32_t uv_max; + + const BYTE *source; + const BYTE *source2; + uint32_t texturefracx; + uint32_t height; +}; + +WallscanSampler::WallscanSampler(int y1, float swal, double yrepeat, fixed_t xoffset, FTexture *texture, const BYTE*(*getcol)(FTexture *texture, int x)) +{ + if (!r_swtruecolor) + { + height = texture->GetHeight(); + int uv_fracbits = 32 - texture->HeightBits; + uv_max = height << uv_fracbits; + + // Find start uv in [0-base_height[ range. + // Not using xs_ToFixed because it rounds the result and we need something that always rounds down to stay within the range. + double uv_stepd = swal * yrepeat; + double v = (dc_texturemid + uv_stepd * (y1 - CenterY + 0.5)) / height; + v = v - floor(v); + v *= height; + v *= (1 << uv_fracbits); + + uv_pos = (uint32_t)v; + uv_step = xs_ToFixed(uv_fracbits, uv_stepd); + if (uv_step == 0) // To prevent divide by zero elsewhere + uv_step = 1; + + source = getcol(texture, xoffset >> FRACBITS); + source2 = nullptr; + texturefracx = 0; + } + else + { + // Normalize to 0-1 range: + double uv_stepd = swal * yrepeat; + double v = (dc_texturemid + uv_stepd * (y1 - CenterY + 0.5)) / texture->GetHeight(); + v = v - floor(v); + double v_step = uv_stepd / texture->GetHeight(); + + if (isnan(v) || isnan(v_step)) // this should never happen, but it apparently does.. + { + uv_stepd = 0.0; + v = 0.0; + v_step = 0.0; + } + + // Convert to uint32: + uv_pos = (uint32_t)(v * 0x100000000LL); + uv_step = (uint32_t)(v_step * 0x100000000LL); + uv_max = 0; + + // Texture mipmap and filter selection: + if (getcol != R_GetColumn) + { + source = getcol(texture, xoffset >> FRACBITS); + source2 = nullptr; + height = texture->GetHeight(); + texturefracx = 0; + } + else + { + double magnitude = abs(uv_stepd * 2); + bool magnifying = magnitude < 1.0f; + + int mipmap_offset = 0; + int mip_width = texture->GetWidth(); + int mip_height = texture->GetHeight(); + if (r_mipmap && texture->Mipmapped()) + { + uint32_t xpos = (uint32_t)((((uint64_t)xoffset) << FRACBITS) / mip_width); + double texture_bias = 1.7f; + double level = MAX(magnitude - 3.0, 0.0); + while (level > texture_bias) + { + mipmap_offset += mip_width * mip_height; + level *= 0.5f; + mip_width = MAX(mip_width >> 1, 1); + mip_height = MAX(mip_height >> 1, 1); + } + xoffset = (xpos >> FRACBITS) * mip_width; + } + + const uint32_t *pixels = texture->GetPixelsBgra() + mipmap_offset; + + bool filter_nearest = (magnifying && !r_magfilter) || (!magnifying && !r_minfilter); + if (filter_nearest) + { + int tx = (xoffset >> FRACBITS) % mip_width; + if (tx < 0) + tx += mip_width; + source = (BYTE*)(pixels + tx * mip_height); + source2 = nullptr; + height = mip_height; + texturefracx = 0; + } + else + { + int tx0 = (xoffset >> FRACBITS) % mip_width; + if (tx0 < 0) + tx0 += mip_width; + int tx1 = (tx0 + 1) % mip_width; + source = (BYTE*)(pixels + tx0 * mip_height); + source2 = (BYTE*)(pixels + tx1 * mip_height); + height = mip_height; + texturefracx = (xoffset >> (FRACBITS - 4)) & 15; + } + } + } } -void wallscan (int x1, int x2, short *uwal, short *dwal, float *swal, fixed_t *lwal, - double yrepeat, const BYTE *(*getcol)(FTexture *tex, int x)) +// Draw a column with support for non-power-of-two ranges +void wallscan_drawcol1(int x, int y1, int y2, WallscanSampler &sampler, DWORD(*draw1column)()) { - int x, fracbits; - int y1ve[4], y2ve[4], u4, d4, z; - char bad; - float light = rw_light - rw_lightstep; - SDWORD xoffset; - BYTE *basecolormapdata; - double iscale; - - // This function also gets used to draw skies. Unlike BUILD, skies are - // drawn by visplane instead of by bunch, so these checks are invalid. - //if ((uwal[x1] > viewheight) && (uwal[x2] > viewheight)) return; - //if ((dwal[x1] < 0) && (dwal[x2] < 0)) return; - - if (rw_pic->UseType == FTexture::TEX_Null) + if (r_swtruecolor) { - return; + int count = y2 - y1; + + dc_source = sampler.source; + dc_source2 = sampler.source2; + dc_texturefracx = sampler.texturefracx; + dc_dest = (ylookup[y1] + x) * 4 + dc_destorg; + dc_count = count; + dc_iscale = sampler.uv_step; + dc_texturefrac = sampler.uv_pos; + dc_textureheight = sampler.height; + draw1column(); + + uint64_t step64 = sampler.uv_step; + uint64_t pos64 = sampler.uv_pos; + sampler.uv_pos = (uint32_t)(pos64 + step64 * count); } + else + { + if (sampler.uv_max == 0) // power of two + { + int count = y2 - y1; -//extern cycle_t WallScanCycles; -//clock (WallScanCycles); + dc_source = sampler.source; + dc_source2 = sampler.source2; + dc_texturefracx = sampler.texturefracx; + dc_dest = (ylookup[y1] + x) + dc_destorg; + dc_count = count; + dc_iscale = sampler.uv_step; + dc_texturefrac = sampler.uv_pos; + draw1column(); - rw_pic->GetHeight(); // Make sure texture size is loaded - fracbits = 32 - rw_pic->HeightBits; - setupvline(fracbits); - xoffset = rw_offset; - basecolormapdata = basecolormap->Maps; + uint64_t step64 = sampler.uv_step; + uint64_t pos64 = sampler.uv_pos; + sampler.uv_pos = (uint32_t)(pos64 + step64 * count); + } + else + { + uint32_t uv_pos = sampler.uv_pos; - x = x1; - //while ((umost[x] > dmost[x]) && (x < x2)) x++; + uint32_t left = y2 - y1; + while (left > 0) + { + uint32_t available = sampler.uv_max - uv_pos; + uint32_t next_uv_wrap = available / sampler.uv_step; + if (available % sampler.uv_step != 0) + next_uv_wrap++; + uint32_t count = MIN(left, next_uv_wrap); + + dc_source = sampler.source; + dc_source2 = sampler.source2; + dc_texturefracx = sampler.texturefracx; + dc_dest = (ylookup[y1] + x) + dc_destorg; + dc_count = count; + dc_iscale = sampler.uv_step; + dc_texturefrac = uv_pos; + draw1column(); + + left -= count; + uv_pos += sampler.uv_step * count; + if (uv_pos >= sampler.uv_max) + uv_pos -= sampler.uv_max; + } + + sampler.uv_pos = uv_pos; + } + } +} + +// Draw four columns with support for non-power-of-two ranges +void wallscan_drawcol4(int x, int y1, int y2, WallscanSampler *sampler, void(*draw4columns)()) +{ + if (r_swtruecolor) + { + int count = y2 - y1; + for (int i = 0; i < 4; i++) + { + bufplce[i] = sampler[i].source; + bufplce2[i] = sampler[i].source2; + buftexturefracx[i] = sampler[i].texturefracx; + bufheight[i] = sampler[i].height; + vplce[i] = sampler[i].uv_pos; + vince[i] = sampler[i].uv_step; + + uint64_t step64 = sampler[i].uv_step; + uint64_t pos64 = sampler[i].uv_pos; + sampler[i].uv_pos = (uint32_t)(pos64 + step64 * count); + } + dc_dest = (ylookup[y1] + x) * 4 + dc_destorg; + dc_count = count; + draw4columns(); + } + else + { + if (sampler[0].uv_max == 0) // power of two, no wrap handling needed + { + int count = y2 - y1; + for (int i = 0; i < 4; i++) + { + bufplce[i] = sampler[i].source; + bufplce2[i] = sampler[i].source2; + buftexturefracx[i] = sampler[i].texturefracx; + vplce[i] = sampler[i].uv_pos; + vince[i] = sampler[i].uv_step; + + uint64_t step64 = sampler[i].uv_step; + uint64_t pos64 = sampler[i].uv_pos; + sampler[i].uv_pos = (uint32_t)(pos64 + step64 * count); + } + dc_dest = (ylookup[y1] + x) + dc_destorg; + dc_count = count; + draw4columns(); + } + else + { + dc_dest = (ylookup[y1] + x) + dc_destorg; + for (int i = 0; i < 4; i++) + { + bufplce[i] = sampler[i].source; + bufplce2[i] = sampler[i].source2; + buftexturefracx[i] = sampler[i].texturefracx; + } + + uint32_t left = y2 - y1; + while (left > 0) + { + // Find which column wraps first + uint32_t count = left; + for (int i = 0; i < 4; i++) + { + uint32_t available = sampler[i].uv_max - sampler[i].uv_pos; + uint32_t next_uv_wrap = available / sampler[i].uv_step; + if (available % sampler[i].uv_step != 0) + next_uv_wrap++; + count = MIN(next_uv_wrap, count); + } + + // Draw until that column wraps + for (int i = 0; i < 4; i++) + { + vplce[i] = sampler[i].uv_pos; + vince[i] = sampler[i].uv_step; + } + dc_count = count; + draw4columns(); + + // Wrap the uv position + for (int i = 0; i < 4; i++) + { + sampler[i].uv_pos += sampler[i].uv_step * count; + if (sampler[i].uv_pos >= sampler[i].uv_max) + sampler[i].uv_pos -= sampler[i].uv_max; + } + + left -= count; + } + } + } +} + +typedef DWORD(*Draw1ColumnFuncPtr)(); +typedef void(*Draw4ColumnsFuncPtr)(); + +void wallscan_any( + int x1, int x2, short *uwal, short *dwal, float *swal, fixed_t *lwal, double yrepeat, + const BYTE *(*getcol)(FTexture *tex, int x), + void(setupwallscan(int bits, Draw1ColumnFuncPtr &draw1, Draw4ColumnsFuncPtr &draw2))) +{ + if (rw_pic->UseType == FTexture::TEX_Null) + return; + + fixed_t xoffset = rw_offset; + rw_pic->GetHeight(); // To ensure that rw_pic->HeightBits has been set + + DWORD(*draw1column)(); + void(*draw4columns)(); + setupwallscan(r_swtruecolor ? FRACBITS : 32 - rw_pic->HeightBits, draw1column, draw4columns); bool fixed = (fixedcolormap != NULL || fixedlightlev >= 0); if (fixed) @@ -1117,132 +1376,195 @@ void wallscan (int x1, int x2, short *uwal, short *dwal, float *swal, fixed_t *l palookupoffse[1] = dc_colormap; palookupoffse[2] = dc_colormap; palookupoffse[3] = dc_colormap; + palookuplight[0] = 0; + palookuplight[1] = 0; + palookuplight[2] = 0; + palookuplight[3] = 0; } - for(; (x < x2) && (x & 3); ++x) + if (fixedcolormap) + R_SetColorMapLight(fixedcolormap, 0, 0); + else + R_SetColorMapLight(basecolormap, 0, 0); + + float light = rw_light; + + // Calculate where 4 column alignment begins and ends: + int aligned_x1 = clamp((x1 + 3) / 4 * 4, x1, x2); + int aligned_x2 = clamp(x2 / 4 * 4, x1, x2); + + // First unaligned columns: + for (int x = x1; x < aligned_x1; x++, light += rw_lightstep) { - light += rw_lightstep; - y1ve[0] = uwal[x];//max(uwal[x],umost[x]); - y2ve[0] = dwal[x];//min(dwal[x],dmost[x]); - if (y2ve[0] <= y1ve[0]) continue; - assert (y1ve[0] < viewheight); - assert (y2ve[0] <= viewheight); + int y1 = uwal[x]; + int y2 = dwal[x]; + if (y2 <= y1) + continue; if (!fixed) - { // calculate lighting - dc_colormap = basecolormapdata + (GETPALOOKUP (light, wallshade) << COLORMAPSHIFT); - } + R_SetColorMapLight(basecolormap, light, wallshade); - dc_source = getcol (rw_pic, (lwal[x] + xoffset) >> FRACBITS); - dc_dest = ylookup[y1ve[0]] + x + dc_destorg; - dc_count = y2ve[0] - y1ve[0]; - iscale = swal[x] * yrepeat; - dc_iscale = xs_ToFixed(fracbits, iscale); - dc_texturefrac = xs_ToFixed(fracbits, dc_texturemid + iscale * (y1ve[0] - CenterY + 0.5)); - - dovline1(); + WallscanSampler sampler(y1, swal[x], yrepeat, lwal[x] + xoffset, rw_pic, getcol); + wallscan_drawcol1(x, y1, y2, sampler, draw1column); } - for(; x < x2-3; x += 4) + // The aligned columns + for (int x = aligned_x1; x < aligned_x2; x += 4) { - bad = 0; - for (z = 3; z>= 0; --z) - { - y1ve[z] = uwal[x+z];//max(uwal[x+z],umost[x+z]); - y2ve[z] = dwal[x+z];//min(dwal[x+z],dmost[x+z])-1; - if (y2ve[z] <= y1ve[z]) { bad += 1<> FRACBITS); - iscale = swal[x + z] * yrepeat; - vince[z] = xs_ToFixed(fracbits, iscale); - vplce[z] = xs_ToFixed(fracbits, dc_texturemid + iscale * (y1ve[z] - CenterY + 0.5)); - } - if (bad == 15) + float lights[4]; + for (int i = 0; i < 4; i++) { - light += rw_lightstep * 4; + lights[i] = light; + light += rw_lightstep; + } + + WallscanSampler sampler[4]; + for (int i = 0; i < 4; i++) + sampler[i] = WallscanSampler(y1[i], swal[x + i], yrepeat, lwal[x + i] + xoffset, rw_pic, getcol); + + // Figure out where we vertically can start and stop drawing 4 columns in one go + int middle_y1 = y1[0]; + int middle_y2 = y2[0]; + for (int i = 1; i < 4; i++) + { + middle_y1 = MAX(y1[i], middle_y1); + middle_y2 = MIN(y2[i], middle_y2); + } + + // If we got an empty column in our set we cannot draw 4 columns in one go: + bool empty_column_in_set = false; + int bilinear_count = 0; + for (int i = 0; i < 4; i++) + { + if (y2[i] <= y1[i]) + empty_column_in_set = true; + if (sampler[i].source2) + bilinear_count++; + } + + if (empty_column_in_set || middle_y2 <= middle_y1 || (bilinear_count > 0 && bilinear_count < 4)) + { + for (int i = 0; i < 4; i++) + { + if (y2[i] <= y1[i]) + continue; + + if (!fixed) + R_SetColorMapLight(basecolormap, lights[i], wallshade); + wallscan_drawcol1(x + i, y1[i], y2[i], sampler[i], draw1column); + } continue; } - if (!fixed) + // Draw the first rows where not all 4 columns are active + for (int i = 0; i < 4; i++) { - for (z = 0; z < 4; ++z) - { - light += rw_lightstep; - palookupoffse[z] = basecolormapdata + (GETPALOOKUP (light, wallshade) << COLORMAPSHIFT); - } + if (!fixed) + R_SetColorMapLight(basecolormap, lights[i], wallshade); + + if (y1[i] < middle_y1) + wallscan_drawcol1(x + i, y1[i], middle_y1, sampler[i], draw1column); } - u4 = MAX(MAX(y1ve[0],y1ve[1]),MAX(y1ve[2],y1ve[3])); - d4 = MIN(MIN(y2ve[0],y2ve[1]),MIN(y2ve[2],y2ve[3])); - - if ((bad != 0) || (u4 >= d4)) + // Draw the area where all 4 columns are active + if (!fixed) { - for (z = 0; z < 4; ++z) + for (int i = 0; i < 4; i++) { - if (!(bad & 1)) + if (r_swtruecolor) { - prevline1(vince[z],palookupoffse[z],y2ve[z]-y1ve[z],vplce[z],bufplce[z],ylookup[y1ve[z]]+x+z+dc_destorg); + palookupoffse[i] = basecolormap->Maps; + palookuplight[i] = LIGHTSCALE(lights[i], wallshade); + } + else + { + palookupoffse[i] = basecolormap->Maps + (GETPALOOKUP(lights[i], wallshade) << COLORMAPSHIFT); + palookuplight[i] = 0; } - bad >>= 1; - } - continue; - } - - for (z = 0; z < 4; ++z) - { - if (u4 > y1ve[z]) - { - vplce[z] = prevline1(vince[z],palookupoffse[z],u4-y1ve[z],vplce[z],bufplce[z],ylookup[y1ve[z]]+x+z+dc_destorg); } } + wallscan_drawcol4(x, middle_y1, middle_y2, sampler, draw4columns); - if (d4 > u4) + // Draw the last rows where not all 4 columns are active + for (int i = 0; i < 4; i++) { - dc_count = d4-u4; - dc_dest = ylookup[u4]+x+dc_destorg; - dovline4(); - } + if (!fixed) + R_SetColorMapLight(basecolormap, lights[i], wallshade); - BYTE *i = x+ylookup[d4]+dc_destorg; - for (z = 0; z < 4; ++z) - { - if (y2ve[z] > d4) - { - prevline1(vince[z],palookupoffse[0],y2ve[z]-d4,vplce[z],bufplce[z],i+z); - } + if (middle_y2 < y2[i]) + wallscan_drawcol1(x + i, middle_y2, y2[i], sampler[i], draw1column); } } - for(;x> FRACBITS); - dc_dest = ylookup[y1ve[0]] + x + dc_destorg; - dc_count = y2ve[0] - y1ve[0]; - iscale = swal[x] * yrepeat; - dc_iscale = xs_ToFixed(fracbits, iscale); - dc_texturefrac = xs_ToFixed(fracbits, dc_texturemid + iscale * (y1ve[0] - CenterY + 0.5)); - - dovline1(); + WallscanSampler sampler(y1, swal[x], yrepeat, lwal[x] + xoffset, rw_pic, getcol); + wallscan_drawcol1(x, y1, y2, sampler, draw1column); } -//unclock (WallScanCycles); - NetUpdate (); } +void wallscan(int x1, int x2, short *uwal, short *dwal, float *swal, fixed_t *lwal, double yrepeat, const BYTE *(*getcol)(FTexture *tex, int x)) +{ + wallscan_any(x1, x2, uwal, dwal, swal, lwal, yrepeat, getcol, [](int bits, Draw1ColumnFuncPtr &line1, Draw4ColumnsFuncPtr &line4) + { + setupvline(bits); + line1 = dovline1; + line4 = dovline4; + }); +} + +void maskwallscan(int x1, int x2, short *uwal, short *dwal, float *swal, fixed_t *lwal, double yrepeat, const BYTE *(*getcol)(FTexture *tex, int x)) +{ + if (!rw_pic->bMasked) // Textures that aren't masked can use the faster wallscan. + { + wallscan(x1, x2, uwal, dwal, swal, lwal, yrepeat, getcol); + } + else + { + wallscan_any(x1, x2, uwal, dwal, swal, lwal, yrepeat, getcol, [](int bits, Draw1ColumnFuncPtr &line1, Draw4ColumnsFuncPtr &line4) + { + setupmvline(bits); + line1 = domvline1; + line4 = domvline4; + }); + } +} + +void transmaskwallscan(int x1, int x2, short *uwal, short *dwal, float *swal, fixed_t *lwal, double yrepeat, const BYTE *(*getcol)(FTexture *tex, int x)) +{ + static fixed_t(*tmvline1)(); + static void(*tmvline4)(); + if (!R_GetTransMaskDrawers(&tmvline1, &tmvline4)) + { + // The current translucency is unsupported, so draw with regular maskwallscan instead. + maskwallscan(x1, x2, uwal, dwal, swal, lwal, yrepeat, getcol); + } + else + { + wallscan_any(x1, x2, uwal, dwal, swal, lwal, yrepeat, getcol, [](int bits, Draw1ColumnFuncPtr &line1, Draw4ColumnsFuncPtr &line4) + { + setuptmvline(bits); + line1 = reinterpret_cast(tmvline1); + line4 = tmvline4; + }); + } +} + void wallscan_striped (int x1, int x2, short *uwal, short *dwal, float *swal, fixed_t *lwal, double yrepeat) { FDynamicColormap *startcolormap = basecolormap; @@ -1416,358 +1738,6 @@ static void wallscan_np2_ds(drawseg_t *ds, int x1, int x2, short *uwal, short *d } } -inline fixed_t mvline1 (fixed_t vince, BYTE *colormap, int count, fixed_t vplce, const BYTE *bufplce, BYTE *dest) -{ - dc_iscale = vince; - dc_colormap = colormap; - dc_count = count; - dc_texturefrac = vplce; - dc_source = bufplce; - dc_dest = dest; - return domvline1 (); -} - -void maskwallscan (int x1, int x2, short *uwal, short *dwal, float *swal, fixed_t *lwal, - double yrepeat, const BYTE *(*getcol)(FTexture *tex, int x)) -{ - int x, fracbits; - BYTE *p; - int y1ve[4], y2ve[4], u4, d4, startx, dax, z; - char bad; - float light = rw_light - rw_lightstep; - SDWORD xoffset; - BYTE *basecolormapdata; - double iscale; - - if (rw_pic->UseType == FTexture::TEX_Null) - { - return; - } - - if (!rw_pic->bMasked) - { // Textures that aren't masked can use the faster wallscan. - wallscan (x1, x2, uwal, dwal, swal, lwal, yrepeat, getcol); - return; - } - -//extern cycle_t WallScanCycles; -//clock (WallScanCycles); - - rw_pic->GetHeight(); // Make sure texture size is loaded - fracbits = 32- rw_pic->HeightBits; - setupmvline(fracbits); - xoffset = rw_offset; - basecolormapdata = basecolormap->Maps; - - x = startx = x1; - p = x + dc_destorg; - - bool fixed = (fixedcolormap != NULL || fixedlightlev >= 0); - if (fixed) - { - palookupoffse[0] = dc_colormap; - palookupoffse[1] = dc_colormap; - palookupoffse[2] = dc_colormap; - palookupoffse[3] = dc_colormap; - } - - for(; (x < x2) && ((size_t)p & 3); ++x, ++p) - { - light += rw_lightstep; - y1ve[0] = uwal[x];//max(uwal[x],umost[x]); - y2ve[0] = dwal[x];//min(dwal[x],dmost[x]); - if (y2ve[0] <= y1ve[0]) continue; - - if (!fixed) - { // calculate lighting - dc_colormap = basecolormapdata + (GETPALOOKUP (light, wallshade) << COLORMAPSHIFT); - } - - dc_source = getcol (rw_pic, (lwal[x] + xoffset) >> FRACBITS); - dc_dest = ylookup[y1ve[0]] + p; - dc_count = y2ve[0] - y1ve[0]; - iscale = swal[x] * yrepeat; - dc_iscale = xs_ToFixed(fracbits, iscale); - dc_texturefrac = xs_ToFixed(fracbits, dc_texturemid + iscale * (y1ve[0] - CenterY + 0.5)); - - domvline1(); - } - - for(; x < x2-3; x += 4, p+= 4) - { - bad = 0; - for (z = 3, dax = x+3; z >= 0; --z, --dax) - { - y1ve[z] = uwal[dax]; - y2ve[z] = dwal[dax]; - if (y2ve[z] <= y1ve[z]) { bad += 1<> FRACBITS); - iscale = swal[dax] * yrepeat; - vince[z] = xs_ToFixed(fracbits, iscale); - vplce[z] = xs_ToFixed(fracbits, dc_texturemid + iscale * (y1ve[z] - CenterY + 0.5)); - } - if (bad == 15) - { - light += rw_lightstep * 4; - continue; - } - - if (!fixed) - { - for (z = 0; z < 4; ++z) - { - light += rw_lightstep; - palookupoffse[z] = basecolormapdata + (GETPALOOKUP (light, wallshade) << COLORMAPSHIFT); - } - } - - u4 = MAX(MAX(y1ve[0],y1ve[1]),MAX(y1ve[2],y1ve[3])); - d4 = MIN(MIN(y2ve[0],y2ve[1]),MIN(y2ve[2],y2ve[3])); - - if ((bad != 0) || (u4 >= d4)) - { - for (z = 0; z < 4; ++z) - { - if (!(bad & 1)) - { - mvline1(vince[z],palookupoffse[z],y2ve[z]-y1ve[z],vplce[z],bufplce[z],ylookup[y1ve[z]]+p+z); - } - bad >>= 1; - } - continue; - } - - for (z = 0; z < 4; ++z) - { - if (u4 > y1ve[z]) - { - vplce[z] = mvline1(vince[z],palookupoffse[z],u4-y1ve[z],vplce[z],bufplce[z],ylookup[y1ve[z]]+p+z); - } - } - - if (d4 > u4) - { - dc_count = d4-u4; - dc_dest = ylookup[u4]+p; - domvline4(); - } - - BYTE *i = p+ylookup[d4]; - for (z = 0; z < 4; ++z) - { - if (y2ve[z] > d4) - { - mvline1(vince[z],palookupoffse[0],y2ve[z]-d4,vplce[z],bufplce[z],i+z); - } - } - } - for(; x < x2; ++x, ++p) - { - light += rw_lightstep; - y1ve[0] = uwal[x]; - y2ve[0] = dwal[x]; - if (y2ve[0] <= y1ve[0]) continue; - - if (!fixed) - { // calculate lighting - dc_colormap = basecolormapdata + (GETPALOOKUP (light, wallshade) << COLORMAPSHIFT); - } - - dc_source = getcol (rw_pic, (lwal[x] + xoffset) >> FRACBITS); - dc_dest = ylookup[y1ve[0]] + p; - dc_count = y2ve[0] - y1ve[0]; - iscale = swal[x] * yrepeat; - dc_iscale = xs_ToFixed(fracbits, iscale); - dc_texturefrac = xs_ToFixed(fracbits, dc_texturemid + iscale * (y1ve[0] - CenterY + 0.5)); - - domvline1(); - } - -//unclock(WallScanCycles); - - NetUpdate (); -} - -inline void preptmvline1 (fixed_t vince, BYTE *colormap, int count, fixed_t vplce, const BYTE *bufplce, BYTE *dest) -{ - dc_iscale = vince; - dc_colormap = colormap; - dc_count = count; - dc_texturefrac = vplce; - dc_source = bufplce; - dc_dest = dest; -} - -void transmaskwallscan (int x1, int x2, short *uwal, short *dwal, float *swal, fixed_t *lwal, - double yrepeat, const BYTE *(*getcol)(FTexture *tex, int x)) -{ - fixed_t (*tmvline1)(); - void (*tmvline4)(); - int x, fracbits; - BYTE *p; - int y1ve[4], y2ve[4], u4, d4, startx, dax, z; - char bad; - float light = rw_light - rw_lightstep; - SDWORD xoffset; - BYTE *basecolormapdata; - double iscale; - - if (rw_pic->UseType == FTexture::TEX_Null) - { - return; - } - - if (!R_GetTransMaskDrawers (&tmvline1, &tmvline4)) - { - // The current translucency is unsupported, so draw with regular maskwallscan instead. - maskwallscan (x1, x2, uwal, dwal, swal, lwal, yrepeat, getcol); - return; - } - -//extern cycle_t WallScanCycles; -//clock (WallScanCycles); - - rw_pic->GetHeight(); // Make sure texture size is loaded - fracbits = 32 - rw_pic->HeightBits; - setuptmvline(fracbits); - xoffset = rw_offset; - basecolormapdata = basecolormap->Maps; - fixed_t centeryfrac = FLOAT2FIXED(CenterY); - - x = startx = x1; - p = x + dc_destorg; - - bool fixed = (fixedcolormap != NULL || fixedlightlev >= 0); - if (fixed) - { - palookupoffse[0] = dc_colormap; - palookupoffse[1] = dc_colormap; - palookupoffse[2] = dc_colormap; - palookupoffse[3] = dc_colormap; - } - - for(; (x < x2) && ((size_t)p & 3); ++x, ++p) - { - light += rw_lightstep; - y1ve[0] = uwal[x];//max(uwal[x],umost[x]); - y2ve[0] = dwal[x];//min(dwal[x],dmost[x]); - if (y2ve[0] <= y1ve[0]) continue; - - if (!fixed) - { // calculate lighting - dc_colormap = basecolormapdata + (GETPALOOKUP (light, wallshade) << COLORMAPSHIFT); - } - - dc_source = getcol (rw_pic, (lwal[x] + xoffset) >> FRACBITS); - dc_dest = ylookup[y1ve[0]] + p; - dc_count = y2ve[0] - y1ve[0]; - iscale = swal[x] * yrepeat; - dc_iscale = xs_ToFixed(fracbits, iscale); - dc_texturefrac = xs_ToFixed(fracbits, dc_texturemid + iscale * (y1ve[0] - CenterY + 0.5)); - - tmvline1(); - } - - for(; x < x2-3; x += 4, p+= 4) - { - bad = 0; - for (z = 3, dax = x+3; z >= 0; --z, --dax) - { - y1ve[z] = uwal[dax]; - y2ve[z] = dwal[dax]; - if (y2ve[z] <= y1ve[z]) { bad += 1<> FRACBITS); - iscale = swal[dax] * yrepeat; - vince[z] = xs_ToFixed(fracbits, iscale); - vplce[z] = xs_ToFixed(fracbits, dc_texturemid + vince[z] * (y1ve[z] - CenterY + 0.5)); - } - if (bad == 15) - { - light += rw_lightstep * 4; - continue; - } - - if (!fixed) - { - for (z = 0; z < 4; ++z) - { - light += rw_lightstep; - palookupoffse[z] = basecolormapdata + (GETPALOOKUP (light, wallshade) << COLORMAPSHIFT); - } - } - - u4 = MAX(MAX(y1ve[0],y1ve[1]),MAX(y1ve[2],y1ve[3])); - d4 = MIN(MIN(y2ve[0],y2ve[1]),MIN(y2ve[2],y2ve[3])); - - if ((bad != 0) || (u4 >= d4)) - { - for (z = 0; z < 4; ++z) - { - if (!(bad & 1)) - { - preptmvline1(vince[z],palookupoffse[z],y2ve[z]-y1ve[z],vplce[z],bufplce[z],ylookup[y1ve[z]]+p+z); - tmvline1(); - } - bad >>= 1; - } - continue; - } - - for (z = 0; z < 4; ++z) - { - if (u4 > y1ve[z]) - { - preptmvline1(vince[z],palookupoffse[z],u4-y1ve[z],vplce[z],bufplce[z],ylookup[y1ve[z]]+p+z); - vplce[z] = tmvline1(); - } - } - - if (d4 > u4) - { - dc_count = d4-u4; - dc_dest = ylookup[u4]+p; - tmvline4(); - } - - BYTE *i = p+ylookup[d4]; - for (z = 0; z < 4; ++z) - { - if (y2ve[z] > d4) - { - preptmvline1(vince[z],palookupoffse[0],y2ve[z]-d4,vplce[z],bufplce[z],i+z); - tmvline1(); - } - } - } - for(; x < x2; ++x, ++p) - { - light += rw_lightstep; - y1ve[0] = uwal[x]; - y2ve[0] = dwal[x]; - if (y2ve[0] <= y1ve[0]) continue; - - if (!fixed) - { // calculate lighting - dc_colormap = basecolormapdata + (GETPALOOKUP (light, wallshade) << COLORMAPSHIFT); - } - - dc_source = getcol (rw_pic, (lwal[x] + xoffset) >> FRACBITS); - dc_dest = ylookup[y1ve[0]] + p; - dc_count = y2ve[0] - y1ve[0]; - iscale = swal[x] * yrepeat; - dc_iscale = xs_ToFixed(fracbits, iscale); - dc_texturefrac = xs_ToFixed(fracbits, dc_texturemid + iscale * (y1ve[0] - CenterY + 0.5)); - - tmvline1(); - } - -//unclock(WallScanCycles); - - NetUpdate (); -} - // // R_RenderSegLoop // Draws zero, one, or two textures for walls. @@ -1788,9 +1758,9 @@ void R_RenderSegLoop () fixed_t xoffset = rw_offset; if (fixedlightlev >= 0) - dc_colormap = basecolormap->Maps + fixedlightlev; + R_SetColorMapLight(basecolormap, 0, FIXEDLIGHT2SHADE(fixedlightlev)); else if (fixedcolormap != NULL) - dc_colormap = fixedcolormap; + R_SetColorMapLight(fixedcolormap, 0, 0); // clip wall to the floor and ceiling for (x = x1; x < x2; ++x) @@ -3187,11 +3157,11 @@ static void R_RenderDecal (side_t *wall, DBaseDecal *decal, drawseg_t *clipper, rw_light = rw_lightleft + (x1 - WallC.sx1) * rw_lightstep; if (fixedlightlev >= 0) - dc_colormap = usecolormap->Maps + fixedlightlev; + R_SetColorMapLight(usecolormap, 0, FIXEDLIGHT2SHADE(fixedlightlev)); else if (fixedcolormap != NULL) - dc_colormap = fixedcolormap; + R_SetColorMapLight(fixedcolormap, 0, 0); else if (!foggy && (decal->RenderFlags & RF_FULLBRIGHT)) - dc_colormap = usecolormap->Maps; + R_SetColorMapLight(usecolormap, 0, 0); else calclighting = true; @@ -3242,7 +3212,7 @@ static void R_RenderDecal (side_t *wall, DBaseDecal *decal, drawseg_t *clipper, { if (calclighting) { // calculate lighting - dc_colormap = usecolormap->Maps + (GETPALOOKUP (rw_light, wallshade) << COLORMAPSHIFT); + R_SetColorMapLight(usecolormap, rw_light, wallshade); } R_WallSpriteColumn (R_DrawMaskedColumn); dc_x++; @@ -3252,9 +3222,9 @@ static void R_RenderDecal (side_t *wall, DBaseDecal *decal, drawseg_t *clipper, { if (calclighting) { // calculate lighting - dc_colormap = usecolormap->Maps + (GETPALOOKUP (rw_light, wallshade) << COLORMAPSHIFT); + R_SetColorMapLight(usecolormap, rw_light, wallshade); } - rt_initcols(); + rt_initcols(nullptr); for (int zz = 4; zz; --zz) { R_WallSpriteColumn (R_DrawMaskedColumnHoriz); @@ -3267,7 +3237,7 @@ static void R_RenderDecal (side_t *wall, DBaseDecal *decal, drawseg_t *clipper, { if (calclighting) { // calculate lighting - dc_colormap = usecolormap->Maps + (GETPALOOKUP (rw_light, wallshade) << COLORMAPSHIFT); + R_SetColorMapLight(usecolormap, rw_light, wallshade); } R_WallSpriteColumn (R_DrawMaskedColumn); dc_x++; diff --git a/src/r_swrenderer.cpp b/src/r_swrenderer.cpp index 07edf25e9..c81d2a110 100644 --- a/src/r_swrenderer.cpp +++ b/src/r_swrenderer.cpp @@ -42,7 +42,9 @@ #include "r_3dfloors.h" #include "textures/textures.h" #include "r_data/voxels.h" +#include "r_draw_rgba.h" +EXTERN_CVAR(Bool, r_shadercolormaps) class FArchive; void R_SWRSetWindow(int windowSize, int fullWidth, int fullHeight, int stHeight, int trueratio); @@ -58,6 +60,7 @@ void R_InitRenderer(); void FSoftwareRenderer::Init() { + r_swtruecolor = screen->IsBgra(); R_InitRenderer(); } @@ -85,11 +88,17 @@ void FSoftwareRenderer::PrecacheTexture(FTexture *tex, int cache) if (cache & FTextureManager::HIT_Columnmode) { const FTexture::Span *spanp; - tex->GetColumn(0, &spanp); + if (r_swtruecolor) + tex->GetColumnBgra(0, &spanp); + else + tex->GetColumn(0, &spanp); } else if (cache != 0) { - tex->GetPixels (); + if (r_swtruecolor) + tex->GetPixelsBgra(); + else + tex->GetPixels (); } else { @@ -155,9 +164,24 @@ void FSoftwareRenderer::Precache(BYTE *texhitlist, TMap &act void FSoftwareRenderer::RenderView(player_t *player) { + if (r_swtruecolor != screen->IsBgra()) + { + r_swtruecolor = screen->IsBgra(); + R_InitColumnDrawers(); + } + + R_BeginDrawerCommands(); R_RenderActorView (player->mo); // [RH] Let cameras draw onto textures that were visible this frame. FCanvasTextureInfo::UpdateAll (); + + // Apply special colormap if the target cannot do it + if (realfixedcolormap && r_swtruecolor && !(r_shadercolormaps && screen->Accel2D)) + { + DrawerCommandQueue::QueueCommand(realfixedcolormap, screen); + } + + R_EndDrawerCommands(); } //========================================================================== @@ -182,7 +206,7 @@ void FSoftwareRenderer::RemapVoxels() void FSoftwareRenderer::WriteSavePic (player_t *player, FILE *file, int width, int height) { - DCanvas *pic = new DSimpleCanvas (width, height); + DCanvas *pic = new DSimpleCanvas (width, height, false); PalEntry palette[256]; // Take a snapshot of the player's view @@ -311,27 +335,67 @@ void FSoftwareRenderer::CopyStackedViewParameters() void FSoftwareRenderer::RenderTextureView (FCanvasTexture *tex, AActor *viewpoint, int fov) { - BYTE *Pixels = const_cast(tex->GetPixels()); - DSimpleCanvas *Canvas = tex->GetCanvas(); + BYTE *Pixels = r_swtruecolor ? (BYTE*)tex->GetPixelsBgra() : (BYTE*)tex->GetPixels(); + DSimpleCanvas *Canvas = r_swtruecolor ? tex->GetCanvasBgra() : tex->GetCanvas(); // curse Doom's overuse of global variables in the renderer. // These get clobbered by rendering to a camera texture but they need to be preserved so the final rendering can be done with the correct palette. - unsigned char *savecolormap = fixedcolormap; + FColormap *savecolormap = fixedcolormap; FSpecialColormap *savecm = realfixedcolormap; DAngle savedfov = FieldOfView; R_SetFOV ((double)fov); R_RenderViewToCanvas (viewpoint, Canvas, 0, 0, tex->GetWidth(), tex->GetHeight(), tex->bFirstUpdate); R_SetFOV (savedfov); - if (Pixels == Canvas->GetBuffer()) + + if (Canvas->IsBgra()) { - FTexture::FlipSquareBlockRemap (Pixels, tex->GetWidth(), tex->GetHeight(), GPalette.Remap); + if (Pixels == Canvas->GetBuffer()) + { + FTexture::FlipSquareBlockBgra((uint32_t*)Pixels, tex->GetWidth(), tex->GetHeight()); + } + else + { + FTexture::FlipNonSquareBlockBgra((uint32_t*)Pixels, (const uint32_t*)Canvas->GetBuffer(), tex->GetWidth(), tex->GetHeight(), Canvas->GetPitch()); + } } else { - FTexture::FlipNonSquareBlockRemap (Pixels, Canvas->GetBuffer(), tex->GetWidth(), tex->GetHeight(), Canvas->GetPitch(), GPalette.Remap); + if (Pixels == Canvas->GetBuffer()) + { + FTexture::FlipSquareBlockRemap(Pixels, tex->GetWidth(), tex->GetHeight(), GPalette.Remap); + } + else + { + FTexture::FlipNonSquareBlockRemap(Pixels, Canvas->GetBuffer(), tex->GetWidth(), tex->GetHeight(), Canvas->GetPitch(), GPalette.Remap); + } } + + if (r_swtruecolor) + { + // True color render still sometimes uses palette textures (for sprites, mostly). + // We need to make sure that both pixel buffers contain data: + int width = tex->GetWidth(); + int height = tex->GetHeight(); + BYTE *palbuffer = (BYTE *)tex->GetPixels(); + uint32_t *bgrabuffer = (uint32_t*)tex->GetPixelsBgra(); + for (int x = 0; x < width; x++) + { + for (int y = 0; y < height; y++) + { + uint32_t color = bgrabuffer[y]; + int r = RPART(color); + int g = GPART(color); + int b = BPART(color); + palbuffer[y] = RGB32k.RGB[r >> 3][g >> 3][b >> 3]; + } + palbuffer += height; + bgrabuffer += height; + } + } + tex->SetUpdated(); + fixedcolormap = savecolormap; realfixedcolormap = savecm; } diff --git a/src/r_things.cpp b/src/r_things.cpp index 28b52129b..0823f6813 100644 --- a/src/r_things.cpp +++ b/src/r_things.cpp @@ -58,6 +58,7 @@ #include "r_plane.h" #include "r_segs.h" #include "r_3dfloors.h" +#include "r_draw_rgba.h" #include "v_palette.h" #include "r_data/r_translate.h" #include "r_data/colormaps.h" @@ -251,6 +252,7 @@ bool sprflipvert; void R_DrawMaskedColumn (const BYTE *column, const FTexture::Span *span) { + int pixelsize = r_swtruecolor ? 4 : 1; const fixed_t centeryfrac = FLOAT2FIXED(CenterY); const fixed_t texturemid = FLOAT2FIXED(dc_texturemid); while (span->Length != 0) @@ -321,7 +323,7 @@ void R_DrawMaskedColumn (const BYTE *column, const FTexture::Span *span) } } dc_source = column + top; - dc_dest = ylookup[dc_yl] + dc_x + dc_destorg; + dc_dest = (ylookup[dc_yl] + dc_x) * pixelsize + dc_destorg; dc_count = dc_yh - dc_yl + 1; colfunc (); } @@ -414,7 +416,7 @@ void R_DrawVisSprite (vissprite_t *vis) } fixed_t centeryfrac = FLOAT2FIXED(CenterY); - dc_colormap = vis->Style.colormap; + R_SetColorMapLight(vis->Style.BaseColormap, 0, vis->Style.ColormapNum << FRACBITS); mode = R_SetPatchStyle (vis->Style.RenderStyle, vis->Style.Alpha, vis->Translation, vis->FillColor); @@ -422,7 +424,7 @@ void R_DrawVisSprite (vissprite_t *vis) { // For shaded sprites, R_SetPatchStyle sets a dc_colormap to an alpha table, but // it is the brightest one. We need to get back to the proper light level for // this sprite. - dc_colormap += vis->ColormapNum << COLORMAPSHIFT; + R_SetColorMapLight(dc_fcolormap, 0, vis->Style.ColormapNum << FRACBITS); } if (mode != DontDraw) @@ -476,7 +478,7 @@ void R_DrawVisSprite (vissprite_t *vis) while (dc_x < stop4) { - rt_initcols(); + rt_initcols(nullptr); for (int zz = 4; zz; --zz) { pixels = tex->GetColumn (frac >> FRACBITS, &spans); @@ -544,11 +546,11 @@ void R_DrawWallSprite(vissprite_t *spr) rw_lightstep = float((GlobVis / spr->wallc.sz2 - rw_lightleft) / (spr->wallc.sx2 - spr->wallc.sx1)); rw_light = rw_lightleft + (x1 - spr->wallc.sx1) * rw_lightstep; if (fixedlightlev >= 0) - dc_colormap = usecolormap->Maps + fixedlightlev; + R_SetColorMapLight(usecolormap, 0, FIXEDLIGHT2SHADE(fixedlightlev)); else if (fixedcolormap != NULL) - dc_colormap = fixedcolormap; + R_SetColorMapLight(fixedcolormap, 0, 0); else if (!foggy && (spr->renderflags & RF_FULLBRIGHT)) - dc_colormap = usecolormap->Maps; + R_SetColorMapLight(usecolormap, 0, 0); else calclighting = true; @@ -599,7 +601,7 @@ void R_DrawWallSprite(vissprite_t *spr) { if (calclighting) { // calculate lighting - dc_colormap = usecolormap->Maps + (GETPALOOKUP (rw_light, shade) << COLORMAPSHIFT); + R_SetColorMapLight(usecolormap, rw_light, shade); } if (!R_ClipSpriteColumnWithPortals(spr)) R_WallSpriteColumn(R_DrawMaskedColumn); @@ -610,9 +612,9 @@ void R_DrawWallSprite(vissprite_t *spr) { if (calclighting) { // calculate lighting - dc_colormap = usecolormap->Maps + (GETPALOOKUP (rw_light, shade) << COLORMAPSHIFT); + R_SetColorMapLight(usecolormap, rw_light, shade); } - rt_initcols(); + rt_initcols(nullptr); for (int zz = 4; zz; --zz) { if (!R_ClipSpriteColumnWithPortals(spr)) @@ -626,7 +628,7 @@ void R_DrawWallSprite(vissprite_t *spr) { if (calclighting) { // calculate lighting - dc_colormap = usecolormap->Maps + (GETPALOOKUP (rw_light, shade) << COLORMAPSHIFT); + R_SetColorMapLight(usecolormap, rw_light, shade); } if (!R_ClipSpriteColumnWithPortals(spr)) R_WallSpriteColumn(R_DrawMaskedColumn); @@ -660,14 +662,14 @@ void R_DrawVisVoxel(vissprite_t *spr, int minslabz, int maxslabz, short *cliptop int flags = 0; // Do setup for blending. - dc_colormap = spr->Style.colormap; + R_SetColorMapLight(spr->Style.BaseColormap, 0, spr->Style.ColormapNum << FRACBITS); mode = R_SetPatchStyle(spr->Style.RenderStyle, spr->Style.Alpha, spr->Translation, spr->FillColor); if (mode == DontDraw) { return; } - if (colfunc == fuzzcolfunc || colfunc == R_FillColumnP) + if (colfunc == fuzzcolfunc || colfunc == R_FillColumn) { flags = DVF_OFFSCREEN | DVF_SPANSONLY; } @@ -686,12 +688,13 @@ void R_DrawVisVoxel(vissprite_t *spr, int minslabz, int maxslabz, short *cliptop // Render the voxel, either directly to the screen or offscreen. R_DrawVoxel(spr->pa.vpos, spr->pa.vang, spr->gpos, spr->Angle, - spr->xscale, FLOAT2FIXED(spr->yscale), spr->voxel, spr->Style.colormap, cliptop, clipbot, + spr->xscale, FLOAT2FIXED(spr->yscale), spr->voxel, spr->Style.BaseColormap, spr->Style.ColormapNum, cliptop, clipbot, minslabz, maxslabz, flags); // Blend the voxel, if that's what we need to do. if ((flags & ~DVF_MIRRORED) != 0) { + int pixelsize = r_swtruecolor ? 4 : 1; for (int x = 0; x < viewwidth; ++x) { if (!(flags & DVF_SPANSONLY) && (x & 3) == 0) @@ -706,15 +709,12 @@ void R_DrawVisVoxel(vissprite_t *spr, int minslabz, int maxslabz, short *cliptop dc_yl = span->Start; dc_yh = span->Stop - 1; dc_count = span->Stop - span->Start; - dc_dest = ylookup[span->Start] + x + dc_destorg; + dc_dest = (ylookup[span->Start] + x) * pixelsize + dc_destorg; colfunc(); } else { - unsigned int **tspan = &dc_ctspan[x & 3]; - (*tspan)[0] = span->Start; - (*tspan)[1] = span->Stop - 1; - *tspan += 2; + rt_span_coverage(x, span->Start, span->Stop - 1); } } if (!(flags & DVF_SPANSONLY) && (x & 3) == 3) @@ -1073,7 +1073,7 @@ void R_ProjectSprite (AActor *thing, int fakeside, F3DFloor *fakefloor, F3DFloor vis->Style.Alpha = float(thing->Alpha); vis->fakefloor = fakefloor; vis->fakeceiling = fakeceiling; - vis->ColormapNum = 0; + vis->Style.ColormapNum = 0; vis->bInMirror = MirrorFlags & RF_XFLIP; vis->bSplitSprite = false; @@ -1125,7 +1125,8 @@ void R_ProjectSprite (AActor *thing, int fakeside, F3DFloor *fakefloor, F3DFloor // get light level if (fixedcolormap != NULL) { // fixed map - vis->Style.colormap = fixedcolormap; + vis->Style.BaseColormap = fixedcolormap; + vis->Style.ColormapNum = 0; } else { @@ -1135,17 +1136,19 @@ void R_ProjectSprite (AActor *thing, int fakeside, F3DFloor *fakefloor, F3DFloor } if (fixedlightlev >= 0) { - vis->Style.colormap = mybasecolormap->Maps + fixedlightlev; + vis->Style.BaseColormap = mybasecolormap; + vis->Style.ColormapNum = fixedlightlev >> COLORMAPSHIFT; } else if (!foggy && ((renderflags & RF_FULLBRIGHT) || (thing->flags5 & MF5_BRIGHT))) { // full bright - vis->Style.colormap = mybasecolormap->Maps; + vis->Style.BaseColormap = mybasecolormap; + vis->Style.ColormapNum = 0; } else { // diminished light - vis->ColormapNum = GETPALOOKUP( + vis->Style.ColormapNum = GETPALOOKUP( r_SpriteVisibility / MAX(tz, MINZ), spriteshade); - vis->Style.colormap = mybasecolormap->Maps + (vis->ColormapNum << COLORMAPSHIFT); + vis->Style.BaseColormap = mybasecolormap; } } } @@ -1214,14 +1217,13 @@ static void R_ProjectWallSprite(AActor *thing, const DVector3 &pos, FTextureID p vis->Style.Alpha = float(thing->Alpha); vis->fakefloor = NULL; vis->fakeceiling = NULL; - vis->ColormapNum = 0; vis->bInMirror = MirrorFlags & RF_XFLIP; vis->pic = pic; vis->bIsVoxel = false; vis->bWallSprite = true; - vis->ColormapNum = GETPALOOKUP( + vis->Style.ColormapNum = GETPALOOKUP( r_SpriteVisibility / MAX(tz, MINZ), spriteshade); - vis->Style.colormap = basecolormap->Maps + (vis->ColormapNum << COLORMAPSHIFT); + vis->Style.BaseColormap = basecolormap; vis->wallc = wallc; } @@ -1401,7 +1403,7 @@ void R_DrawPSprite(DPSprite *pspr, AActor *owner, float bobx, float boby, double vis->yscale = float(pspriteyscale / tex->Scale.Y); vis->Translation = 0; // [RH] Use default colors vis->pic = tex; - vis->ColormapNum = 0; + vis->Style.ColormapNum = 0; if (flip) { @@ -1449,9 +1451,10 @@ void R_DrawPSprite(DPSprite *pspr, AActor *owner, float bobx, float boby, double } } - if (realfixedcolormap != nullptr) + if (realfixedcolormap != nullptr && (!r_swtruecolor || (r_shadercolormaps && screen->Accel2D))) { // fixed color - vis->Style.colormap = realfixedcolormap->Colormap; + vis->Style.BaseColormap = realfixedcolormap; + vis->Style.ColormapNum = 0; } else { @@ -1461,35 +1464,38 @@ void R_DrawPSprite(DPSprite *pspr, AActor *owner, float bobx, float boby, double } if (fixedlightlev >= 0) { - vis->Style.colormap = mybasecolormap->Maps + fixedlightlev; + vis->Style.BaseColormap = mybasecolormap; + vis->Style.ColormapNum = fixedlightlev >> COLORMAPSHIFT; } else if (!foggy && pspr->GetState()->GetFullbright()) { // full bright - vis->Style.colormap = mybasecolormap->Maps; // [RH] use basecolormap + vis->Style.BaseColormap = mybasecolormap; // [RH] use basecolormap + vis->Style.ColormapNum = 0; } else { // local light - vis->Style.colormap = mybasecolormap->Maps + (GETPALOOKUP(0, spriteshade) << COLORMAPSHIFT); + vis->Style.BaseColormap = mybasecolormap; + vis->Style.ColormapNum = GETPALOOKUP(0, spriteshade); } } if (camera->Inventory != nullptr) { - lighttable_t *oldcolormap = vis->Style.colormap; - camera->Inventory->AlterWeaponSprite(&vis->Style); - if (vis->Style.colormap != oldcolormap) + BYTE oldcolormapnum = vis->Style.ColormapNum; + FColormap *oldcolormap = vis->Style.BaseColormap; + camera->Inventory->AlterWeaponSprite (&vis->Style); + if (vis->Style.BaseColormap != oldcolormap || vis->Style.ColormapNum != oldcolormapnum) { // The colormap has changed. Is it one we can easily identify? // If not, then don't bother trying to identify it for // hardware accelerated drawing. - if (vis->Style.colormap < SpecialColormaps[0].Colormap || - vis->Style.colormap > SpecialColormaps.Last().Colormap) + if (vis->Style.BaseColormap < &SpecialColormaps[0] || + vis->Style.BaseColormap > &SpecialColormaps.Last()) { noaccel = true; } // Has the basecolormap changed? If so, we can't hardware accelerate it, // since we don't know what it is anymore. - else if (vis->Style.colormap < mybasecolormap->Maps || - vis->Style.colormap >= mybasecolormap->Maps + NUMCOLORMAPS * 256) + else if (vis->Style.BaseColormap != mybasecolormap) { noaccel = true; } @@ -1497,13 +1503,13 @@ void R_DrawPSprite(DPSprite *pspr, AActor *owner, float bobx, float boby, double } // If we're drawing with a special colormap, but shaders for them are disabled, do // not accelerate. - if (!r_shadercolormaps && (vis->Style.colormap >= SpecialColormaps[0].Colormap && - vis->Style.colormap <= SpecialColormaps.Last().Colormap)) + if (!r_shadercolormaps && (vis->Style.BaseColormap >= &SpecialColormaps[0] && + vis->Style.BaseColormap <= &SpecialColormaps.Last())) { noaccel = true; } // If drawing with a BOOM colormap, disable acceleration. - if (mybasecolormap == &NormalLight && NormalLight.Maps != realcolormaps) + if (mybasecolormap == &NormalLight && NormalLight.Maps != realcolormaps.Maps) { noaccel = true; } @@ -1520,7 +1526,8 @@ void R_DrawPSprite(DPSprite *pspr, AActor *owner, float bobx, float boby, double else { colormap_to_use = basecolormap; - vis->Style.colormap = basecolormap->Maps; + vis->Style.BaseColormap = basecolormap; + vis->Style.ColormapNum = 0; vis->Style.RenderStyle = STYLE_Normal; } @@ -1691,18 +1698,16 @@ void R_DrawRemainingPlayerSprites() FColormapStyle colormapstyle; bool usecolormapstyle = false; - if (vis->Style.colormap >= SpecialColormaps[0].Colormap && - vis->Style.colormap < SpecialColormaps[SpecialColormaps.Size()].Colormap) + if (vis->Style.BaseColormap >= &SpecialColormaps[0] && + vis->Style.BaseColormap < &SpecialColormaps[SpecialColormaps.Size()]) { - // Yuck! There needs to be a better way to store colormaps in the vissprite... :( - ptrdiff_t specialmap = (vis->Style.colormap - SpecialColormaps[0].Colormap) / sizeof(FSpecialColormap); - special = &SpecialColormaps[specialmap]; + special = static_cast(vis->Style.BaseColormap); } else if (colormap->Color == PalEntry(255,255,255) && colormap->Desaturate == 0) { overlay = colormap->Fade; - overlay.a = BYTE(((vis->Style.colormap - colormap->Maps) >> 8) * 255 / NUMCOLORMAPS); + overlay.a = BYTE(vis->Style.ColormapNum * 255 / NUMCOLORMAPS); } else { @@ -1710,7 +1715,7 @@ void R_DrawRemainingPlayerSprites() colormapstyle.Color = colormap->Color; colormapstyle.Fade = colormap->Fade; colormapstyle.Desaturate = colormap->Desaturate; - colormapstyle.FadeLevel = ((vis->Style.colormap - colormap->Maps) >> 8) / float(NUMCOLORMAPS); + colormapstyle.FadeLevel = vis->Style.ColormapNum / float(NUMCOLORMAPS); } screen->DrawTexture(vis->pic, viewwindowx + vispsprites[i].x1, @@ -1955,7 +1960,8 @@ void R_DrawSprite (vissprite_t *spr) int r1, r2; short topclip, botclip; short *clip1, *clip2; - lighttable_t *colormap = spr->Style.colormap; + FColormap *colormap = spr->Style.BaseColormap; + int colormapnum = spr->Style.ColormapNum; F3DFloor *rover; FDynamicColormap *mybasecolormap; @@ -2052,17 +2058,19 @@ void R_DrawSprite (vissprite_t *spr) } if (fixedlightlev >= 0) { - spr->Style.colormap = mybasecolormap->Maps + fixedlightlev; + spr->Style.BaseColormap = mybasecolormap; + spr->Style.ColormapNum = fixedlightlev >> COLORMAPSHIFT; } else if (!foggy && (spr->renderflags & RF_FULLBRIGHT)) { // full bright - spr->Style.colormap = mybasecolormap->Maps; + spr->Style.BaseColormap = mybasecolormap; + spr->Style.ColormapNum = 0; } else { // diminished light spriteshade = LIGHT2SHADE(sec->lightlevel + r_actualextralight); - spr->Style.colormap = mybasecolormap->Maps + (GETPALOOKUP ( - r_SpriteVisibility / MAX(MINZ, (double)spr->depth), spriteshade) << COLORMAPSHIFT); + spr->Style.BaseColormap = mybasecolormap; + spr->Style.ColormapNum = GETPALOOKUP(r_SpriteVisibility / MAX(MINZ, (double)spr->depth), spriteshade); } } } @@ -2210,7 +2218,8 @@ void R_DrawSprite (vissprite_t *spr) if (topclip >= botclip) { - spr->Style.colormap = colormap; + spr->Style.BaseColormap = colormap; + spr->Style.ColormapNum = colormapnum; return; } @@ -2340,7 +2349,8 @@ void R_DrawSprite (vissprite_t *spr) } if (i == x2) { - spr->Style.colormap = colormap; + spr->Style.BaseColormap = colormap; + spr->Style.ColormapNum = colormapnum; return; } } @@ -2358,7 +2368,8 @@ void R_DrawSprite (vissprite_t *spr) int maxvoxely = spr->gzb > hzb ? INT_MAX : xs_RoundToInt((spr->gzt - hzb) / spr->yscale); R_DrawVisVoxel(spr, minvoxely, maxvoxely, cliptop, clipbot); } - spr->Style.colormap = colormap; + spr->Style.BaseColormap = colormap; + spr->Style.ColormapNum = colormapnum; } // kg3D: @@ -2475,7 +2486,7 @@ void R_ProjectParticle (particle_t *particle, const sector_t *sector, int shade, int x1, x2, y1, y2; vissprite_t* vis; sector_t* heightsec = NULL; - BYTE* map; + FColormap* map; // [ZZ] Particle not visible through the portal plane if (CurrentPortal && !!P_PointOnLineSide(particle->Pos, CurrentPortal->dst)) @@ -2548,7 +2559,7 @@ void R_ProjectParticle (particle_t *particle, const sector_t *sector, int shade, botplane = &heightsec->ceilingplane; toppic = sector->GetTexture(sector_t::ceiling); botpic = heightsec->GetTexture(sector_t::ceiling); - map = heightsec->ColorMap->Maps; + map = heightsec->ColorMap; } else if (fakeside == FAKED_BelowFloor) { @@ -2556,7 +2567,7 @@ void R_ProjectParticle (particle_t *particle, const sector_t *sector, int shade, botplane = §or->floorplane; toppic = heightsec->GetTexture(sector_t::floor); botpic = sector->GetTexture(sector_t::floor); - map = heightsec->ColorMap->Maps; + map = heightsec->ColorMap; } else { @@ -2564,7 +2575,7 @@ void R_ProjectParticle (particle_t *particle, const sector_t *sector, int shade, botplane = &heightsec->floorplane; toppic = heightsec->GetTexture(sector_t::ceiling); botpic = heightsec->GetTexture(sector_t::floor); - map = sector->ColorMap->Maps; + map = sector->ColorMap; } } else @@ -2573,7 +2584,7 @@ void R_ProjectParticle (particle_t *particle, const sector_t *sector, int shade, botplane = §or->floorplane; toppic = sector->GetTexture(sector_t::ceiling); botpic = sector->GetTexture(sector_t::floor); - map = sector->ColorMap->Maps; + map = sector->ColorMap; } if (botpic != skyflatnum && particle->Pos.Z < botplane->ZatPoint (particle->Pos)) @@ -2602,25 +2613,28 @@ void R_ProjectParticle (particle_t *particle, const sector_t *sector, int shade, vis->renderflags = particle->trans; vis->FakeFlatStat = fakeside; vis->floorclip = 0; - vis->ColormapNum = 0; + vis->Style.ColormapNum = 0; if (fixedlightlev >= 0) { - vis->Style.colormap = map + fixedlightlev; + vis->Style.BaseColormap = map; + vis->Style.ColormapNum = fixedlightlev >> COLORMAPSHIFT; } else if (fixedcolormap) { - vis->Style.colormap = fixedcolormap; + vis->Style.BaseColormap = fixedcolormap; + vis->Style.ColormapNum = 0; } else if (particle->bright) { - vis->Style.colormap = map; + vis->Style.BaseColormap = map; + vis->Style.ColormapNum = 0; } else { // Particles are slightly more visible than regular sprites. - vis->ColormapNum = GETPALOOKUP(tiz * r_SpriteVisibility * 0.5, shade); - vis->Style.colormap = map + (vis->ColormapNum << COLORMAPSHIFT); + vis->Style.ColormapNum = GETPALOOKUP(tiz * r_SpriteVisibility * 0.5, shade); + vis->Style.BaseColormap = map; } } @@ -2649,13 +2663,13 @@ static void R_DrawMaskedSegsBehindParticle (const vissprite_t *vis) } } -void R_DrawParticle (vissprite_t *vis) +void R_DrawParticle_C (vissprite_t *vis) { DWORD *bg2rgb; int spacing; BYTE *dest; DWORD fg; - BYTE color = vis->Style.colormap[vis->startfrac]; + BYTE color = vis->Style.BaseColormap->Maps[(vis->Style.ColormapNum << COLORMAPSHIFT) + vis->startfrac]; int yl = vis->y1; int ycount = vis->y2 - yl + 1; int x1 = vis->x1; @@ -2714,12 +2728,64 @@ void R_DrawParticle (vissprite_t *vis) } } +void R_DrawParticle_rgba(vissprite_t *vis) +{ + int spacing; + uint32_t *dest; + BYTE color = vis->Style.BaseColormap->Maps[vis->startfrac]; + int yl = vis->y1; + int ycount = vis->y2 - yl + 1; + int x1 = vis->x1; + int countbase = vis->x2 - x1; + + R_DrawMaskedSegsBehindParticle(vis); + + DrawerCommandQueue::WaitForWorkers(); + + uint32_t fg = LightBgra::shade_pal_index_simple(color, LightBgra::calc_light_multiplier(LIGHTSCALE(0, vis->Style.ColormapNum << FRACBITS))); + uint32_t fg_red = (fg >> 16) & 0xff; + uint32_t fg_green = (fg >> 8) & 0xff; + uint32_t fg_blue = fg & 0xff; + + // vis->renderflags holds translucency level (0-255) + fixed_t fglevel = ((vis->renderflags + 1) << 8) & ~0x3ff; + uint32_t alpha = fglevel * 256 / FRACUNIT; + uint32_t inv_alpha = 256 - alpha; + + fg_red *= alpha; + fg_green *= alpha; + fg_blue *= alpha; + + spacing = RenderTarget->GetPitch(); + + for (int x = x1; x < (x1 + countbase); x++) + { + dc_x = x; + if (R_ClipSpriteColumnWithPortals(vis)) + continue; + dest = ylookup[yl] + x + (uint32_t*)dc_destorg; + for (int y = 0; y < ycount; y++) + { + uint32_t bg_red = (*dest >> 16) & 0xff; + uint32_t bg_green = (*dest >> 8) & 0xff; + uint32_t bg_blue = (*dest) & 0xff; + + uint32_t red = (fg_red + bg_red * inv_alpha) / 256; + uint32_t green = (fg_green + bg_green * inv_alpha) / 256; + uint32_t blue = (fg_blue + bg_blue * inv_alpha) / 256; + + *dest = 0xff000000 | (red << 16) | (green << 8) | blue; + dest += spacing; + } + } +} + extern double BaseYaspectMul;; void R_DrawVoxel(const FVector3 &globalpos, FAngle viewangle, const FVector3 &dasprpos, DAngle dasprang, fixed_t daxscale, fixed_t dayscale, FVoxel *voxobj, - lighttable_t *colormap, short *daumost, short *dadmost, int minslabz, int maxslabz, int flags) + FColormap *colormap, int colormapnum, short *daumost, short *dadmost, int minslabz, int maxslabz, int flags) { int i, j, k, x, y, syoff, ggxstart, ggystart, nxoff; fixed_t cosang, sinang, sprcosang, sprsinang; @@ -2761,7 +2827,9 @@ void R_DrawVoxel(const FVector3 &globalpos, FAngle viewangle, sprcosang = FLOAT2FIXED(dasprang.Cos()) >> 2; sprsinang = FLOAT2FIXED(-dasprang.Sin()) >> 2; - R_SetupDrawSlab(colormap); + R_SetupDrawSlab(colormap, 0.0f, colormapnum << FRACBITS); + + int pixelsize = r_swtruecolor ? 4 : 1; // Select mip level i = abs(DMulScale6(dasprx - globalposx, cosang, daspry - globalposy, sinang)); @@ -3016,7 +3084,7 @@ void R_DrawVoxel(const FVector3 &globalpos, FAngle viewangle, if (!(flags & DVF_OFFSCREEN)) { // Draw directly to the screen. - R_DrawSlab(xxr - xxl, yplc[xxl], z2 - z1, yinc, col, ylookup[z1] + lxt + xxl + dc_destorg); + R_DrawSlab(xxr - xxl, yplc[xxl], z2 - z1, yinc, col, (ylookup[z1] + lxt + xxl) * pixelsize + dc_destorg); } else { @@ -3247,12 +3315,12 @@ void R_CheckOffscreenBuffer(int width, int height, bool spansonly) { if (OffscreenColorBuffer == NULL) { - OffscreenColorBuffer = new BYTE[width * height]; + OffscreenColorBuffer = new BYTE[width * height * 4]; } else if (OffscreenBufferWidth != width || OffscreenBufferHeight != height) { delete[] OffscreenColorBuffer; - OffscreenColorBuffer = new BYTE[width * height]; + OffscreenColorBuffer = new BYTE[width * height * 4]; } } OffscreenBufferWidth = width; diff --git a/src/r_things.h b/src/r_things.h index 29e69d3a5..13f89574b 100644 --- a/src/r_things.h +++ b/src/r_things.h @@ -86,7 +86,6 @@ struct vissprite_t BYTE bSplitSprite:1; // [RH] Sprite was split by a drawseg BYTE bInMirror:1; // [RH] Sprite is "inside" a mirror BYTE FakeFlatStat; // [RH] which side of fake/floor ceiling sprite is on - BYTE ColormapNum; // Which colormap is rendered (needed for shaded drawer) short renderflags; DWORD Translation; // [RH] for color translation visstyle_t Style; @@ -97,7 +96,10 @@ struct vissprite_t struct particle_t; -void R_DrawParticle (vissprite_t *); +extern void(*R_DrawParticle)(vissprite_t *); +void R_DrawParticle_C (vissprite_t *); +void R_DrawParticle_rgba (vissprite_t *); + void R_ProjectParticle (particle_t *, const sector_t *sector, int shade, int fakeside); extern int MaxVisSprites; @@ -142,7 +144,7 @@ enum { DVF_OFFSCREEN = 1, DVF_SPANSONLY = 2, DVF_MIRRORED = 4 }; void R_DrawVoxel(const FVector3 &viewpos, FAngle viewangle, const FVector3 &sprpos, DAngle dasprang, fixed_t daxscale, fixed_t dayscale, struct FVoxel *voxobj, - lighttable_t *colormap, short *daumost, short *dadmost, int minslabz, int maxslabz, int flags); + FColormap *colormap, int colormapnum, short *daumost, short *dadmost, int minslabz, int maxslabz, int flags); void R_ClipVisSprite (vissprite_t *vis, int xl, int xh); diff --git a/src/r_utility.cpp b/src/r_utility.cpp index 00ac91c97..688555b21 100644 --- a/src/r_utility.cpp +++ b/src/r_utility.cpp @@ -889,11 +889,11 @@ void R_SetupFrame (AActor *actor) BaseBlendG = GPART(newblend); BaseBlendB = BPART(newblend); BaseBlendA = APART(newblend) / 255.f; - NormalLight.Maps = realcolormaps; + NormalLight.Maps = realcolormaps.Maps; } else { - NormalLight.Maps = realcolormaps + NUMCOLORMAPS*256*newblend; + NormalLight.Maps = realcolormaps.Maps + NUMCOLORMAPS*256*newblend; BaseBlendR = BaseBlendG = BaseBlendB = 0; BaseBlendA = 0.f; } diff --git a/src/textures/automaptexture.cpp b/src/textures/automaptexture.cpp index 67d68b9fe..9aac379ef 100644 --- a/src/textures/automaptexture.cpp +++ b/src/textures/automaptexture.cpp @@ -122,6 +122,7 @@ void FAutomapTexture::Unload () delete[] Pixels; Pixels = NULL; } + FTexture::Unload(); } //========================================================================== diff --git a/src/textures/buildtexture.cpp b/src/textures/buildtexture.cpp index bfcc6333d..1155dacc4 100644 --- a/src/textures/buildtexture.cpp +++ b/src/textures/buildtexture.cpp @@ -56,7 +56,6 @@ public: const BYTE *GetColumn (unsigned int column, const Span **spans_out); const BYTE *GetPixels (); - void Unload (); protected: const BYTE *Pixels; @@ -103,17 +102,6 @@ FBuildTexture::~FBuildTexture () // //========================================================================== -void FBuildTexture::Unload () -{ - // Nothing to do, since the pixels are accessed from memory-mapped files directly -} - -//========================================================================== -// -// -// -//========================================================================== - const BYTE *FBuildTexture::GetPixels () { return Pixels; diff --git a/src/textures/canvastexture.cpp b/src/textures/canvastexture.cpp index 062c3af1d..109d927ab 100644 --- a/src/textures/canvastexture.cpp +++ b/src/textures/canvastexture.cpp @@ -53,7 +53,6 @@ FCanvasTexture::FCanvasTexture (const char *name, int width, int height) DummySpans[1].TopOffset = 0; DummySpans[1].Length = 0; UseType = TEX_Wall; - Canvas = NULL; bNeedsUpdate = true; bDidUpdate = false; bHasCanvas = true; @@ -101,11 +100,22 @@ const BYTE *FCanvasTexture::GetPixels () return Pixels; } +const uint32_t *FCanvasTexture::GetPixelsBgra() +{ + bNeedsUpdate = true; + if (CanvasBgra == NULL) + { + MakeTextureBgra(); + } + return PixelsBgra; +} + void FCanvasTexture::MakeTexture () { - Canvas = new DSimpleCanvas (Width, Height); + Canvas = new DSimpleCanvas (Width, Height, false); Canvas->Lock (); GC::AddSoftRoot(Canvas); + if (Width != Height || Width != Canvas->GetPitch()) { Pixels = new BYTE[Width*Height]; @@ -113,29 +123,68 @@ void FCanvasTexture::MakeTexture () } else { - Pixels = Canvas->GetBuffer(); + Pixels = (BYTE*)Canvas->GetBuffer(); bPixelsAllocated = false; } + // Draw a special "unrendered" initial texture into the buffer. memset (Pixels, 0, Width*Height/2); memset (Pixels+Width*Height/2, 255, Width*Height/2); } +void FCanvasTexture::MakeTextureBgra() +{ + CanvasBgra = new DSimpleCanvas(Width, Height, true); + CanvasBgra->Lock(); + GC::AddSoftRoot(CanvasBgra); + + if (Width != Height || Width != CanvasBgra->GetPitch()) + { + PixelsBgra = new uint32_t[Width*Height]; + bPixelsAllocatedBgra = true; + } + else + { + PixelsBgra = (uint32_t*)CanvasBgra->GetBuffer(); + bPixelsAllocatedBgra = false; + } + + // Draw a special "unrendered" initial texture into the buffer. + memset(PixelsBgra, 0, Width*Height / 2 * 4); + memset(PixelsBgra + Width*Height / 2, 255, Width*Height / 2 * 4); +} + void FCanvasTexture::Unload () { if (bPixelsAllocated) { - if (Pixels != NULL) delete [] Pixels; + if (Pixels != NULL) delete[] Pixels; bPixelsAllocated = false; Pixels = NULL; } + if (bPixelsAllocatedBgra) + { + if (PixelsBgra != NULL) delete[] PixelsBgra; + bPixelsAllocatedBgra = false; + PixelsBgra = NULL; + } + if (Canvas != NULL) { GC::DelSoftRoot(Canvas); Canvas->Destroy(); Canvas = NULL; } + + if (CanvasBgra != NULL) + { + GC::DelSoftRoot(CanvasBgra); + CanvasBgra->Destroy(); + CanvasBgra = NULL; + } + + FTexture::Unload(); } bool FCanvasTexture::CheckModified () diff --git a/src/textures/ddstexture.cpp b/src/textures/ddstexture.cpp index 31e748022..fb4de34c5 100644 --- a/src/textures/ddstexture.cpp +++ b/src/textures/ddstexture.cpp @@ -401,6 +401,7 @@ void FDDSTexture::Unload () delete[] Pixels; Pixels = NULL; } + FTexture::Unload(); } //========================================================================== diff --git a/src/textures/flattexture.cpp b/src/textures/flattexture.cpp index 840d53aaf..08e0d1221 100644 --- a/src/textures/flattexture.cpp +++ b/src/textures/flattexture.cpp @@ -138,6 +138,7 @@ void FFlatTexture::Unload () delete[] Pixels; Pixels = NULL; } + FTexture::Unload(); } //========================================================================== diff --git a/src/textures/imgztexture.cpp b/src/textures/imgztexture.cpp index 1c262d707..04932d4bf 100644 --- a/src/textures/imgztexture.cpp +++ b/src/textures/imgztexture.cpp @@ -142,6 +142,7 @@ void FIMGZTexture::Unload () delete[] Pixels; Pixels = NULL; } + FTexture::Unload(); } //========================================================================== diff --git a/src/textures/jpegtexture.cpp b/src/textures/jpegtexture.cpp index 225396598..fc629b37e 100644 --- a/src/textures/jpegtexture.cpp +++ b/src/textures/jpegtexture.cpp @@ -295,11 +295,9 @@ FJPEGTexture::~FJPEGTexture () void FJPEGTexture::Unload () { - if (Pixels != NULL) - { - delete[] Pixels; - Pixels = NULL; - } + delete[] Pixels; + Pixels = NULL; + FTexture::Unload(); } //========================================================================== diff --git a/src/textures/multipatchtexture.cpp b/src/textures/multipatchtexture.cpp index b0db481a8..6ae45c785 100644 --- a/src/textures/multipatchtexture.cpp +++ b/src/textures/multipatchtexture.cpp @@ -362,6 +362,7 @@ void FMultiPatchTexture::Unload () delete[] Pixels; Pixels = NULL; } + FTexture::Unload(); } //========================================================================== diff --git a/src/textures/patchtexture.cpp b/src/textures/patchtexture.cpp index 423ce4deb..8388515c0 100644 --- a/src/textures/patchtexture.cpp +++ b/src/textures/patchtexture.cpp @@ -184,6 +184,7 @@ void FPatchTexture::Unload () delete[] Pixels; Pixels = NULL; } + FTexture::Unload(); } //========================================================================== diff --git a/src/textures/pcxtexture.cpp b/src/textures/pcxtexture.cpp index 0ec5d2933..42a13b85a 100644 --- a/src/textures/pcxtexture.cpp +++ b/src/textures/pcxtexture.cpp @@ -191,6 +191,7 @@ void FPCXTexture::Unload () delete[] Pixels; Pixels = NULL; } + FTexture::Unload(); } //========================================================================== diff --git a/src/textures/pngtexture.cpp b/src/textures/pngtexture.cpp index e47fa62c0..9a64bac61 100644 --- a/src/textures/pngtexture.cpp +++ b/src/textures/pngtexture.cpp @@ -369,11 +369,9 @@ FPNGTexture::~FPNGTexture () void FPNGTexture::Unload () { - if (Pixels != NULL) - { - delete[] Pixels; - Pixels = NULL; - } + delete[] Pixels; + Pixels = NULL; + FTexture::Unload(); } //========================================================================== @@ -446,6 +444,7 @@ const BYTE *FPNGTexture::GetPixels () return Pixels; } + //========================================================================== // // diff --git a/src/textures/rawpagetexture.cpp b/src/textures/rawpagetexture.cpp index 1402f8844..69313fd1c 100644 --- a/src/textures/rawpagetexture.cpp +++ b/src/textures/rawpagetexture.cpp @@ -206,6 +206,7 @@ void FRawPageTexture::Unload () delete[] Pixels; Pixels = NULL; } + FTexture::Unload(); } //========================================================================== diff --git a/src/textures/texture.cpp b/src/textures/texture.cpp index 7b90c295f..12e9d8549 100644 --- a/src/textures/texture.cpp +++ b/src/textures/texture.cpp @@ -45,6 +45,7 @@ #include "v_video.h" #include "m_fixed.h" #include "textures/textures.h" +#include "v_palette.h" typedef bool (*CheckFunc)(FileReader & file); typedef FTexture * (*CreateFunc)(FileReader & file, int lumpnum); @@ -175,6 +176,37 @@ FTexture::~FTexture () KillNative(); } +void FTexture::Unload() +{ + PixelsBgra = std::vector(); +} + +const uint32_t *FTexture::GetColumnBgra(unsigned int column, const Span **spans_out) +{ + const uint32_t *pixels = GetPixelsBgra(); + + column %= Width; + + if (spans_out != nullptr) + GetColumn(column, spans_out); + return pixels + column * Height; +} + +const uint32_t *FTexture::GetPixelsBgra() +{ + if (PixelsBgra.empty() || CheckModified()) + { + if (!GetColumn(0, nullptr)) + return nullptr; + + FBitmap bitmap; + bitmap.Create(GetWidth(), GetHeight()); + CopyTrueColorPixels(&bitmap, 0, 0); + GenerateBgraFromBitmap(bitmap); + } + return PixelsBgra.data(); +} + bool FTexture::CheckModified () { return false; @@ -318,6 +350,210 @@ void FTexture::FreeSpans (Span **spans) const M_Free (spans); } +void FTexture::GenerateBgraFromBitmap(const FBitmap &bitmap) +{ + CreatePixelsBgraWithMipmaps(); + + // Transpose + const uint32_t *src = (const uint32_t *)bitmap.GetPixels(); + uint32_t *dest = PixelsBgra.data(); + for (int x = 0; x < Width; x++) + { + for (int y = 0; y < Height; y++) + { + dest[y + x * Height] = src[x + y * Width]; + } + } + + GenerateBgraMipmaps(); +} + +void FTexture::CreatePixelsBgraWithMipmaps() +{ + int levels = MipmapLevels(); + int buffersize = 0; + for (int i = 0; i < levels; i++) + { + int w = MAX(Width >> i, 1); + int h = MAX(Height >> i, 1); + buffersize += w * h; + } + PixelsBgra.resize(buffersize, 0xffff0000); +} + +int FTexture::MipmapLevels() const +{ + int widthbits = 0; + while ((Width >> widthbits) != 0) widthbits++; + + int heightbits = 0; + while ((Height >> heightbits) != 0) heightbits++; + + return MAX(widthbits, heightbits); +} + +void FTexture::GenerateBgraMipmaps() +{ + struct Color4f + { + float a, r, g, b; + Color4f operator*(const Color4f &v) const { return Color4f{ a * v.a, r * v.r, g * v.g, b * v.b }; } + Color4f operator/(const Color4f &v) const { return Color4f{ a / v.a, r / v.r, g / v.g, b / v.b }; } + Color4f operator+(const Color4f &v) const { return Color4f{ a + v.a, r + v.r, g + v.g, b + v.b }; } + Color4f operator-(const Color4f &v) const { return Color4f{ a - v.a, r - v.r, g - v.g, b - v.b }; } + Color4f operator*(float s) const { return Color4f{ a * s, r * s, g * s, b * s }; } + Color4f operator/(float s) const { return Color4f{ a / s, r / s, g / s, b / s }; } + Color4f operator+(float s) const { return Color4f{ a + s, r + s, g + s, b + s }; } + Color4f operator-(float s) const { return Color4f{ a - s, r - s, g - s, b - s }; } + }; + + int levels = MipmapLevels(); + std::vector image(PixelsBgra.size()); + + // Convert to normalized linear colorspace + { + for (int x = 0; x < Width; x++) + { + for (int y = 0; y < Height; y++) + { + uint32_t c8 = PixelsBgra[x * Height + y]; + Color4f c; + c.a = powf(APART(c8) * (1.0f / 255.0f), 2.2f); + c.r = powf(RPART(c8) * (1.0f / 255.0f), 2.2f); + c.g = powf(GPART(c8) * (1.0f / 255.0f), 2.2f); + c.b = powf(BPART(c8) * (1.0f / 255.0f), 2.2f); + image[x * Height + y] = c; + } + } + } + + // Generate mipmaps + { + std::vector smoothed(Width * Height); + Color4f *src = image.data(); + Color4f *dest = src + Width * Height; + for (int i = 1; i < levels; i++) + { + int srcw = MAX(Width >> (i - 1), 1); + int srch = MAX(Height >> (i - 1), 1); + int w = MAX(Width >> i, 1); + int h = MAX(Height >> i, 1); + + // Downscale + for (int x = 0; x < w; x++) + { + int sx0 = x * 2; + int sx1 = MIN((x + 1) * 2, srcw - 1); + for (int y = 0; y < h; y++) + { + int sy0 = y * 2; + int sy1 = MIN((y + 1) * 2, srch - 1); + + Color4f src00 = src[sy0 + sx0 * srch]; + Color4f src01 = src[sy1 + sx0 * srch]; + Color4f src10 = src[sy0 + sx1 * srch]; + Color4f src11 = src[sy1 + sx1 * srch]; + Color4f c = (src00 + src01 + src10 + src11) * 0.25f; + + dest[y + x * h] = src00; + } + } + + // Sharpen filter with a 3x3 kernel: + for (int x = 0; x < w; x++) + { + for (int y = 0; y < h; y++) + { + Color4f c = { 0.0f, 0.0f, 0.0f, 0.0f }; + for (int kx = -1; kx < 2; kx++) + { + for (int ky = -1; ky < 2; ky++) + { + int a = y + ky; + int b = x + kx; + if (a < 0) a = h - 1; + if (a == h) a = 0; + if (b < 0) b = w - 1; + if (b == w) b = 0; + c = c + dest[a + b * h]; + } + } + c = c * (1.0f / 9.0f); + smoothed[y + x * h] = c; + } + } + float k = 0.04f; + for (int j = 0; j < w * h; j++) + dest[j] = dest[j] + (dest[j] - smoothed[j]) * k; + + src = dest; + dest += w * h; + } + } + + // Convert to bgra8 sRGB colorspace + { + Color4f *src = image.data() + Width * Height; + uint32_t *dest = PixelsBgra.data() + Width * Height; + for (int i = 1; i < levels; i++) + { + int w = MAX(Width >> i, 1); + int h = MAX(Height >> i, 1); + for (int j = 0; j < w * h; j++) + { + uint32_t a = (uint32_t)clamp(powf(src[j].a, 1.0f / 2.2f) * 255.0f + 0.5f, 0.0f, 255.0f); + uint32_t r = (uint32_t)clamp(powf(src[j].r, 1.0f / 2.2f) * 255.0f + 0.5f, 0.0f, 255.0f); + uint32_t g = (uint32_t)clamp(powf(src[j].g, 1.0f / 2.2f) * 255.0f + 0.5f, 0.0f, 255.0f); + uint32_t b = (uint32_t)clamp(powf(src[j].b, 1.0f / 2.2f) * 255.0f + 0.5f, 0.0f, 255.0f); + dest[j] = (a << 24) | (r << 16) | (g << 8) | b; + } + src += w * h; + dest += w * h; + } + } +} + +void FTexture::GenerateBgraMipmapsFast() +{ + uint32_t *src = PixelsBgra.data(); + uint32_t *dest = src + Width * Height; + int levels = MipmapLevels(); + for (int i = 1; i < levels; i++) + { + int srcw = MAX(Width >> (i - 1), 1); + int srch = MAX(Height >> (i - 1), 1); + int w = MAX(Width >> i, 1); + int h = MAX(Height >> i, 1); + + for (int x = 0; x < w; x++) + { + int sx0 = x * 2; + int sx1 = MIN((x + 1) * 2, srcw - 1); + + for (int y = 0; y < h; y++) + { + int sy0 = y * 2; + int sy1 = MIN((y + 1) * 2, srch - 1); + + uint32_t src00 = src[sy0 + sx0 * srch]; + uint32_t src01 = src[sy1 + sx0 * srch]; + uint32_t src10 = src[sy0 + sx1 * srch]; + uint32_t src11 = src[sy1 + sx1 * srch]; + + uint32_t alpha = (APART(src00) + APART(src01) + APART(src10) + APART(src11) + 2) / 4; + uint32_t red = (RPART(src00) + RPART(src01) + RPART(src10) + RPART(src11) + 2) / 4; + uint32_t green = (GPART(src00) + GPART(src01) + GPART(src10) + GPART(src11) + 2) / 4; + uint32_t blue = (BPART(src00) + BPART(src01) + BPART(src10) + BPART(src11) + 2) / 4; + + dest[y + x * h] = (alpha << 24) | (red << 16) | (green << 8) | blue; + } + } + + src = dest; + dest += w * h; + } +} + void FTexture::CopyToBlock (BYTE *dest, int dwidth, int dheight, int xpos, int ypos, int rotate, const BYTE *translation) { const BYTE *pixels = GetPixels(); @@ -384,6 +620,29 @@ void FTexture::FlipSquareBlock (BYTE *block, int x, int y) } } +void FTexture::FlipSquareBlockBgra(uint32_t *block, int x, int y) +{ + int i, j; + + if (x != y) return; + + for (i = 0; i < x; ++i) + { + uint32_t *corner = block + x*i + i; + int count = x - i; + if (count & 1) + { + count--; + swapvalues(corner[count], corner[count*x]); + } + for (j = 0; j < count; j += 2) + { + swapvalues(corner[j], corner[j*x]); + swapvalues(corner[j + 1], corner[(j + 1)*x]); + } + } +} + void FTexture::FlipSquareBlockRemap (BYTE *block, int x, int y, const BYTE *remap) { int i, j; @@ -427,6 +686,19 @@ void FTexture::FlipNonSquareBlock (BYTE *dst, const BYTE *src, int x, int y, int } } +void FTexture::FlipNonSquareBlockBgra(uint32_t *dst, const uint32_t *src, int x, int y, int srcpitch) +{ + int i, j; + + for (i = 0; i < x; ++i) + { + for (j = 0; j < y; ++j) + { + dst[i*y + j] = src[i + j*srcpitch]; + } + } +} + void FTexture::FlipNonSquareBlockRemap (BYTE *dst, const BYTE *src, int x, int y, int srcpitch, const BYTE *remap) { int i, j; @@ -580,10 +852,6 @@ FDummyTexture::FDummyTexture () UseType = TEX_Null; } -void FDummyTexture::Unload () -{ -} - void FDummyTexture::SetSize (int width, int height) { Width = width; diff --git a/src/textures/textures.h b/src/textures/textures.h index 14667093c..e5ecdc679 100644 --- a/src/textures/textures.h +++ b/src/textures/textures.h @@ -3,6 +3,7 @@ #include "doomtype.h" #include "vectors.h" +#include class FBitmap; struct FRemapTable; @@ -175,9 +176,18 @@ public: // Returns a single column of the texture virtual const BYTE *GetColumn (unsigned int column, const Span **spans_out) = 0; + // Returns a single column of the texture, in BGRA8 format + virtual const uint32_t *GetColumnBgra(unsigned int column, const Span **spans_out); + // Returns the whole texture, stored in column-major order virtual const BYTE *GetPixels () = 0; - + + // Returns the whole texture, stored in column-major order, in BGRA8 format + virtual const uint32_t *GetPixelsBgra(); + + // Returns true if GetPixelsBgra includes mipmaps + virtual bool Mipmapped() { return true; } + virtual int CopyTrueColorPixels(FBitmap *bmp, int x, int y, int rotate=0, FCopyInfo *inf = NULL); int CopyTrueColorTranslated(FBitmap *bmp, int x, int y, int rotate, FRemapTable *remap, FCopyInfo *inf = NULL); virtual bool UseBasePalette(); @@ -185,7 +195,7 @@ public: virtual FTexture *GetRedirect(bool wantwarped); virtual FTexture *GetRawTexture(); // for FMultiPatchTexture to override - virtual void Unload () = 0; + virtual void Unload (); // Returns the native pixel format for this image virtual FTextureFormat GetFormat(); @@ -262,10 +272,20 @@ protected: Rotations = other->Rotations; } + std::vector PixelsBgra; + + void GenerateBgraFromBitmap(const FBitmap &bitmap); + void CreatePixelsBgraWithMipmaps(); + void GenerateBgraMipmaps(); + void GenerateBgraMipmapsFast(); + int MipmapLevels() const; + public: static void FlipSquareBlock (BYTE *block, int x, int y); + static void FlipSquareBlockBgra (uint32_t *block, int x, int y); static void FlipSquareBlockRemap (BYTE *block, int x, int y, const BYTE *remap); static void FlipNonSquareBlock (BYTE *blockto, const BYTE *blockfrom, int x, int y, int srcpitch); + static void FlipNonSquareBlockBgra (uint32_t *blockto, const uint32_t *blockfrom, int x, int y, int srcpitch); static void FlipNonSquareBlockRemap (BYTE *blockto, const BYTE *blockfrom, int x, int y, int srcpitch, const BYTE *remap); friend class D3DTex; @@ -460,7 +480,6 @@ public: FDummyTexture (); const BYTE *GetColumn (unsigned int column, const Span **spans_out); const BYTE *GetPixels (); - void Unload (); void SetSize (int width, int height); }; @@ -474,6 +493,7 @@ public: virtual int CopyTrueColorPixels(FBitmap *bmp, int x, int y, int rotate=0, FCopyInfo *inf = NULL); const BYTE *GetColumn (unsigned int column, const Span **spans_out); const BYTE *GetPixels (); + const uint32_t *GetPixelsBgra() override; void Unload (); bool CheckModified (); @@ -508,21 +528,28 @@ public: const BYTE *GetColumn (unsigned int column, const Span **spans_out); const BYTE *GetPixels (); + const uint32_t *GetPixelsBgra() override; void Unload (); bool CheckModified (); void NeedUpdate() { bNeedsUpdate=true; } void SetUpdated() { bNeedsUpdate = false; bDidUpdate = true; bFirstUpdate = false; } DSimpleCanvas *GetCanvas() { return Canvas; } + DSimpleCanvas *GetCanvasBgra() { return CanvasBgra; } + bool Mipmapped() override { return false; } void MakeTexture (); + void MakeTextureBgra (); protected: - DSimpleCanvas *Canvas; - BYTE *Pixels; + DSimpleCanvas *Canvas = nullptr; + DSimpleCanvas *CanvasBgra = nullptr; + BYTE *Pixels = nullptr; + uint32_t *PixelsBgra = nullptr; Span DummySpans[2]; - bool bNeedsUpdate; - bool bDidUpdate; - bool bPixelsAllocated; + bool bNeedsUpdate = true; + bool bDidUpdate = false; + bool bPixelsAllocated = false; + bool bPixelsAllocatedBgra = false; public: bool bFirstUpdate; diff --git a/src/textures/tgatexture.cpp b/src/textures/tgatexture.cpp index b208a51a3..5e76a63b2 100644 --- a/src/textures/tgatexture.cpp +++ b/src/textures/tgatexture.cpp @@ -181,6 +181,7 @@ void FTGATexture::Unload () delete[] Pixels; Pixels = NULL; } + FTexture::Unload(); } //========================================================================== diff --git a/src/textures/warptexture.cpp b/src/textures/warptexture.cpp index a8a2ddb9e..91c7b9fc4 100644 --- a/src/textures/warptexture.cpp +++ b/src/textures/warptexture.cpp @@ -39,6 +39,7 @@ #include "r_utility.h" #include "textures/textures.h" #include "warpbuffer.h" +#include "v_palette.h" FWarpTexture::FWarpTexture (FTexture *source, int warptype) @@ -74,6 +75,7 @@ void FWarpTexture::Unload () Spans = NULL; } SourcePic->Unload (); + FTexture::Unload(); } bool FWarpTexture::CheckModified () @@ -92,6 +94,25 @@ const BYTE *FWarpTexture::GetPixels () return Pixels; } +const uint32_t *FWarpTexture::GetPixelsBgra() +{ + DWORD time = r_FrameTime; + if (Pixels == NULL || time != GenTime) + { + MakeTexture(time); + CreatePixelsBgraWithMipmaps(); + for (int i = 0; i < Width * Height; i++) + { + if (Pixels[i] != 0) + PixelsBgra[i] = 0xff000000 | GPalette.BaseColors[Pixels[i]].d; + else + PixelsBgra[i] = 0; + } + GenerateBgraMipmapsFast(); + } + return PixelsBgra.data(); +} + const BYTE *FWarpTexture::GetColumn (unsigned int column, const Span **spans_out) { DWORD time = r_FrameTime; diff --git a/src/v_draw.cpp b/src/v_draw.cpp index dada0cd57..cfcfd7507 100644 --- a/src/v_draw.cpp +++ b/src/v_draw.cpp @@ -44,6 +44,7 @@ #include "r_utility.h" #ifndef NO_SWRENDER #include "r_draw.h" +#include "r_draw_rgba.h" #include "r_main.h" #include "r_things.h" #endif @@ -137,6 +138,12 @@ void DCanvas::DrawTextureParms(FTexture *img, DrawParms &parms) static short bottomclipper[MAXWIDTH], topclipper[MAXWIDTH]; const BYTE *translation = NULL; + if (r_swtruecolor != IsBgra()) + { + r_swtruecolor = IsBgra(); + R_InitColumnDrawers(); + } + if (parms.masked) { spanptr = &spans; @@ -173,14 +180,14 @@ void DCanvas::DrawTextureParms(FTexture *img, DrawParms &parms) if (translation != NULL) { - dc_colormap = (lighttable_t *)translation; + R_SetTranslationMap((lighttable_t *)translation); } else { - dc_colormap = identitymap; + R_SetTranslationMap(identitymap); } - fixedcolormap = dc_colormap; + fixedcolormap = dc_fcolormap; ESPSResult mode = R_SetPatchStyle (parms.style, parms.Alpha, 0, parms.fillcolor); BYTE *destorgsave = dc_destorg; @@ -306,7 +313,7 @@ void DCanvas::DrawTextureParms(FTexture *img, DrawParms &parms) while (dc_x < stop4) { - rt_initcols(); + rt_initcols(nullptr); for (int zz = 4; zz; --zz) { pixels = img->GetColumn(frac >> FRACBITS, spanptr); @@ -1023,13 +1030,35 @@ void DCanvas::PUTTRANSDOT (int xx, int yy, int basecolor, int level) oldyyshifted = yy * GetPitch(); } - BYTE *spot = GetBuffer() + oldyyshifted + xx; - DWORD *bg2rgb = Col2RGB8[1+level]; - DWORD *fg2rgb = Col2RGB8[63-level]; - DWORD fg = fg2rgb[basecolor]; - DWORD bg = bg2rgb[*spot]; - bg = (fg+bg) | 0x1f07c1f; - *spot = RGB32k.All[bg&(bg>>15)]; + if (IsBgra()) + { + uint32_t *spot = (uint32_t*)GetBuffer() + oldyyshifted + xx; + + uint32_t fg = LightBgra::shade_pal_index_simple(basecolor, LightBgra::calc_light_multiplier(0)); + uint32_t fg_red = (fg >> 16) & 0xff; + uint32_t fg_green = (fg >> 8) & 0xff; + uint32_t fg_blue = fg & 0xff; + + uint32_t bg_red = (*spot >> 16) & 0xff; + uint32_t bg_green = (*spot >> 8) & 0xff; + uint32_t bg_blue = (*spot) & 0xff; + + uint32_t red = (fg_red + bg_red + 1) / 2; + uint32_t green = (fg_green + bg_green + 1) / 2; + uint32_t blue = (fg_blue + bg_blue + 1) / 2; + + *spot = 0xff000000 | (red << 16) | (green << 8) | blue; + } + else + { + BYTE *spot = GetBuffer() + oldyyshifted + xx; + DWORD *bg2rgb = Col2RGB8[1+level]; + DWORD *fg2rgb = Col2RGB8[63-level]; + DWORD fg = fg2rgb[basecolor]; + DWORD bg = bg2rgb[*spot]; + bg = (fg+bg) | 0x1f07c1f; + *spot = RGB32k.All[bg&(bg>>15)]; + } } void DCanvas::DrawLine(int x0, int y0, int x1, int y1, int palColor, uint32 realcolor) @@ -1073,27 +1102,65 @@ void DCanvas::DrawLine(int x0, int y0, int x1, int y1, int palColor, uint32 real { swapvalues (x0, x1); } - memset (GetBuffer() + y0*GetPitch() + x0, palColor, deltaX+1); + if (IsBgra()) + { + uint32_t fillColor = GPalette.BaseColors[palColor].d; + uint32_t *spot = (uint32_t*)GetBuffer() + y0*GetPitch() + x0; + for (int i = 0; i <= deltaX; i++) + spot[i] = fillColor; + } + else + { + memset (GetBuffer() + y0*GetPitch() + x0, palColor, deltaX+1); + } } else if (deltaX == 0) { // vertical line - BYTE *spot = GetBuffer() + y0*GetPitch() + x0; - int pitch = GetPitch (); - do + if (IsBgra()) { - *spot = palColor; - spot += pitch; - } while (--deltaY != 0); + uint32_t fillColor = GPalette.BaseColors[palColor].d; + uint32_t *spot = (uint32_t*)GetBuffer() + y0*GetPitch() + x0; + int pitch = GetPitch(); + do + { + *spot = fillColor; + spot += pitch; + } while (--deltaY != 0); + } + else + { + BYTE *spot = GetBuffer() + y0*GetPitch() + x0; + int pitch = GetPitch(); + do + { + *spot = palColor; + spot += pitch; + } while (--deltaY != 0); + } } else if (deltaX == deltaY) { // diagonal line. - BYTE *spot = GetBuffer() + y0*GetPitch() + x0; - int advance = GetPitch() + xDir; - do + if (IsBgra()) { - *spot = palColor; - spot += advance; - } while (--deltaY != 0); + uint32_t fillColor = GPalette.BaseColors[palColor].d; + uint32_t *spot = (uint32_t*)GetBuffer() + y0*GetPitch() + x0; + int advance = GetPitch() + xDir; + do + { + *spot = fillColor; + spot += advance; + } while (--deltaY != 0); + } + else + { + BYTE *spot = GetBuffer() + y0*GetPitch() + x0; + int advance = GetPitch() + xDir; + do + { + *spot = palColor; + spot += advance; + } while (--deltaY != 0); + } } else { @@ -1213,7 +1280,6 @@ void DCanvas::DrawPixel(int x, int y, int palColor, uint32 realcolor) void DCanvas::Clear (int left, int top, int right, int bottom, int palcolor, uint32 color) { int x, y; - BYTE *dest; if (left == right || top == bottom) { @@ -1243,12 +1309,28 @@ void DCanvas::Clear (int left, int top, int right, int bottom, int palcolor, uin palcolor = PalFromRGB(color); } - dest = Buffer + top * Pitch + left; - x = right - left; - for (y = top; y < bottom; y++) + if (IsBgra()) { - memset(dest, palcolor, x); - dest += Pitch; + uint32_t fill_color = GPalette.BaseColors[palcolor]; + + uint32_t *dest = (uint32_t*)Buffer + top * Pitch + left; + x = right - left; + for (y = top; y < bottom; y++) + { + for (int i = 0; i < x; i++) + dest[i] = fill_color; + dest += Pitch; + } + } + else + { + BYTE *dest = Buffer + top * Pitch + left; + x = right - left; + for (y = top; y < bottom; y++) + { + memset(dest, palcolor, x); + dest += Pitch; + } } } @@ -1339,8 +1421,11 @@ void DCanvas::FillSimplePoly(FTexture *tex, FVector2 *points, int npoints, // Setup constant texture mapping parameters. R_SetupSpanBits(tex); - R_SetSpanColormap(colormap != NULL ? &colormap->Maps[clamp(shade >> FRACBITS, 0, NUMCOLORMAPS-1) * 256] : identitymap); - R_SetSpanSource(tex->GetPixels()); + if (colormap) + R_SetSpanColormap(colormap, clamp(shade >> FRACBITS, 0, NUMCOLORMAPS - 1)); + else + R_SetSpanColormap(&identitycolormap, 0); + R_SetSpanSource(tex); scalex = double(1u << (32 - ds_xbits)) / scalex; scaley = double(1u << (32 - ds_ybits)) / scaley; ds_xstep = xs_RoundToInt(cosrot * scalex); @@ -1449,6 +1534,9 @@ void DCanvas::FillSimplePoly(FTexture *tex, FVector2 *points, int npoints, // void DCanvas::DrawBlock (int x, int y, int _width, int _height, const BYTE *src) const { + if (IsBgra()) + return; + int srcpitch = _width; int destpitch; BYTE *dest; @@ -1475,6 +1563,9 @@ void DCanvas::DrawBlock (int x, int y, int _width, int _height, const BYTE *src) // void DCanvas::GetBlock (int x, int y, int _width, int _height, BYTE *dest) const { + if (IsBgra()) + return; + const BYTE *src; #ifdef RANGECHECK diff --git a/src/v_font.cpp b/src/v_font.cpp index 052074d11..ef9b69dd1 100644 --- a/src/v_font.cpp +++ b/src/v_font.cpp @@ -1662,6 +1662,7 @@ void FFontChar1::Unload () delete[] Pixels; Pixels = NULL; } + FTexture::Unload(); } //========================================================================== @@ -1723,6 +1724,7 @@ void FFontChar2::Unload () delete[] Pixels; Pixels = NULL; } + FTexture::Unload(); } //========================================================================== diff --git a/src/v_video.cpp b/src/v_video.cpp index b9917a1cb..630f517ad 100644 --- a/src/v_video.cpp +++ b/src/v_video.cpp @@ -83,7 +83,7 @@ class DDummyFrameBuffer : public DFrameBuffer DECLARE_CLASS (DDummyFrameBuffer, DFrameBuffer); public: DDummyFrameBuffer (int width, int height) - : DFrameBuffer (0, 0) + : DFrameBuffer (0, 0, false) { Width = width; Height = height; @@ -119,7 +119,6 @@ public: const BYTE *GetColumn(unsigned int column, const Span **spans_out); const BYTE *GetPixels(); - void Unload(); bool CheckModified(); void SetTranslation(int num); @@ -208,13 +207,14 @@ DCanvas *DCanvas::CanvasChain = NULL; // //========================================================================== -DCanvas::DCanvas (int _width, int _height) +DCanvas::DCanvas (int _width, int _height, bool _bgra) { // Init member vars Buffer = NULL; LockCount = 0; Width = _width; Height = _height; + Bgra = _bgra; // Add to list of active canvases Next = CanvasChain; @@ -344,10 +344,7 @@ void DCanvas::Dim (PalEntry color, float damount, int x1, int y1, int w, int h) if (damount == 0.f) return; - DWORD *bg2rgb; - DWORD fg; int gap; - BYTE *spot; int x, y; if (x1 >= Width || y1 >= Height) @@ -367,31 +364,73 @@ void DCanvas::Dim (PalEntry color, float damount, int x1, int y1, int w, int h) return; } - { - int amount; - - amount = (int)(damount * 64); - bg2rgb = Col2RGB8[64-amount]; - - fg = (((color.r * amount) >> 4) << 20) | - ((color.g * amount) >> 4) | - (((color.b * amount) >> 4) << 10); - } - - spot = Buffer + x1 + y1*Pitch; gap = Pitch - w; - for (y = h; y != 0; y--) - { - for (x = w; x != 0; x--) - { - DWORD bg; - bg = bg2rgb[(*spot)&0xff]; - bg = (fg+bg) | 0x1f07c1f; - *spot = RGB32k.All[bg&(bg>>15)]; - spot++; + if (IsBgra()) + { + uint32_t *spot = (uint32_t*)Buffer + x1 + y1*Pitch; + + uint32_t fg = color.d; + uint32_t fg_red = (fg >> 16) & 0xff; + uint32_t fg_green = (fg >> 8) & 0xff; + uint32_t fg_blue = fg & 0xff; + + uint32_t alpha = (uint32_t)clamp(damount * 256 + 0.5f, 0.0f, 256.0f); + uint32_t inv_alpha = 256 - alpha; + + fg_red *= alpha; + fg_green *= alpha; + fg_blue *= alpha; + + for (y = h; y != 0; y--) + { + for (x = w; x != 0; x--) + { + uint32_t bg_red = (*spot >> 16) & 0xff; + uint32_t bg_green = (*spot >> 8) & 0xff; + uint32_t bg_blue = (*spot) & 0xff; + + uint32_t red = (fg_red + bg_red * inv_alpha) / 256; + uint32_t green = (fg_green + bg_green * inv_alpha) / 256; + uint32_t blue = (fg_blue + bg_blue * inv_alpha) / 256; + + *spot = 0xff000000 | (red << 16) | (green << 8) | blue; + spot++; + } + spot += gap; + } + } + else + { + BYTE *spot = Buffer + x1 + y1*Pitch; + + DWORD *bg2rgb; + DWORD fg; + + { + int amount; + + amount = (int)(damount * 64); + bg2rgb = Col2RGB8[64-amount]; + + fg = (((color.r * amount) >> 4) << 20) | + ((color.g * amount) >> 4) | + (((color.b * amount) >> 4) << 10); + } + + for (y = h; y != 0; y--) + { + for (x = w; x != 0; x--) + { + DWORD bg; + + bg = bg2rgb[(*spot)&0xff]; + bg = (fg+bg) | 0x1f07c1f; + *spot = RGB32k.All[bg&(bg>>15)]; + spot++; + } + spot += gap; } - spot += gap; } } @@ -408,8 +447,8 @@ void DCanvas::GetScreenshotBuffer(const BYTE *&buffer, int &pitch, ESSType &colo { Lock(true); buffer = GetBuffer(); - pitch = GetPitch(); - color_type = SS_PAL; + pitch = IsBgra() ? GetPitch() * 4 : GetPitch(); + color_type = IsBgra() ? SS_BGRA : SS_PAL; } //========================================================================== @@ -704,13 +743,12 @@ void DCanvas::CalcGamma (float gamma, BYTE gammalookup[256]) // I found this formula on the web at // , // but that page no longer exits. - double invgamma = 1.f / gamma; int i; for (i = 0; i < 256; i++) { - gammalookup[i] = (BYTE)(255.0 * pow (i / 255.0, invgamma)); + gammalookup[i] = (BYTE)(255.0 * pow (i / 255.0, invgamma) + 0.5); } } @@ -722,8 +760,8 @@ void DCanvas::CalcGamma (float gamma, BYTE gammalookup[256]) // //========================================================================== -DSimpleCanvas::DSimpleCanvas (int width, int height) - : DCanvas (width, height) +DSimpleCanvas::DSimpleCanvas (int width, int height, bool bgra) + : DCanvas (width, height, bgra) { // Making the pitch a power of 2 is very bad for performance // Try to maximize the number of cache lines that can be filled @@ -760,8 +798,9 @@ DSimpleCanvas::DSimpleCanvas (int width, int height) Pitch = width + MAX(0, CPU.DataL1LineSize - 8); } } - MemBuffer = new BYTE[Pitch * height]; - memset (MemBuffer, 0, Pitch * height); + int bytes_per_pixel = bgra ? 4 : 1; + MemBuffer = new BYTE[Pitch * height * bytes_per_pixel]; + memset (MemBuffer, 0, Pitch * height * bytes_per_pixel); } //========================================================================== @@ -830,13 +869,77 @@ void DSimpleCanvas::Unlock () // //========================================================================== -DFrameBuffer::DFrameBuffer (int width, int height) - : DSimpleCanvas (width, height) +DFrameBuffer::DFrameBuffer (int width, int height, bool bgra) + : DSimpleCanvas (width, height, bgra) { LastMS = LastSec = FrameCount = LastCount = LastTic = 0; Accel2D = false; } +//========================================================================== +// +// DFrameBuffer :: PostprocessBgra +// +// Copies data to destination buffer while performing gamma and flash. +// This is only needed if a target cannot do this with shaders. +// +//========================================================================== + +void DFrameBuffer::CopyWithGammaBgra(void *output, int pitch, const BYTE *gammared, const BYTE *gammagreen, const BYTE *gammablue, PalEntry flash, int flash_amount) +{ + const BYTE *gammatables[3] = { gammared, gammagreen, gammablue }; + + if (flash_amount > 0) + { + uint16_t inv_flash_amount = 256 - flash_amount; + uint16_t flash_red = flash.r * flash_amount; + uint16_t flash_green = flash.g * flash_amount; + uint16_t flash_blue = flash.b * flash_amount; + + for (int y = 0; y < Height; y++) + { + BYTE *dest = (BYTE*)output + y * pitch; + BYTE *src = MemBuffer + y * Pitch * 4; + for (int x = 0; x < Width; x++) + { + uint16_t fg_red = src[2]; + uint16_t fg_green = src[1]; + uint16_t fg_blue = src[0]; + uint16_t red = (fg_red * inv_flash_amount + flash_red) >> 8; + uint16_t green = (fg_green * inv_flash_amount + flash_green) >> 8; + uint16_t blue = (fg_blue * inv_flash_amount + flash_blue) >> 8; + + dest[0] = gammatables[2][blue]; + dest[1] = gammatables[1][green]; + dest[2] = gammatables[0][red]; + dest[3] = 0xff; + + dest += 4; + src += 4; + } + } + } + else + { + for (int y = 0; y < Height; y++) + { + BYTE *dest = (BYTE*)output + y * pitch; + BYTE *src = MemBuffer + y * Pitch * 4; + for (int x = 0; x < Width; x++) + { + dest[0] = gammatables[2][src[0]]; + dest[1] = gammatables[1][src[1]]; + dest[2] = gammatables[0][src[2]]; + dest[3] = 0xff; + + dest += 4; + src += 4; + } + } + } +} + + //========================================================================== // // DFrameBuffer :: DrawRateStuff @@ -898,10 +1001,21 @@ void DFrameBuffer::DrawRateStuff () // Buffer can be NULL if we're doing hardware accelerated 2D if (buffer != NULL) { - buffer += (GetHeight()-1) * GetPitch(); - - for (i = 0; i < tics*2; i += 2) buffer[i] = 0xff; - for ( ; i < 20*2; i += 2) buffer[i] = 0x00; + if (IsBgra()) + { + uint32_t *buffer32 = (uint32_t*)buffer; + buffer32 += (GetHeight() - 1) * GetPitch(); + + for (i = 0; i < tics * 2; i += 2) buffer32[i] = 0xffffffff; + for (; i < 20 * 2; i += 2) buffer32[i] = 0xff000000; + } + else + { + buffer += (GetHeight() - 1) * GetPitch(); + + for (i = 0; i < tics * 2; i += 2) buffer[i] = 0xff; + for (; i < 20 * 2; i += 2) buffer[i] = 0x00; + } } else { @@ -974,16 +1088,6 @@ void FPaletteTester::SetTranslation(int num) } } -//========================================================================== -// -// FPaletteTester :: Unload -// -//========================================================================== - -void FPaletteTester::Unload() -{ -} - //========================================================================== // // FPaletteTester :: GetColumn diff --git a/src/v_video.h b/src/v_video.h index 4f0a78675..b3be2578c 100644 --- a/src/v_video.h +++ b/src/v_video.h @@ -185,7 +185,7 @@ class DCanvas : public DObject { DECLARE_ABSTRACT_CLASS (DCanvas, DObject) public: - DCanvas (int width, int height); + DCanvas (int width, int height, bool bgra); virtual ~DCanvas (); // Member variable access @@ -193,6 +193,7 @@ public: inline int GetWidth () const { return Width; } inline int GetHeight () const { return Height; } inline int GetPitch () const { return Pitch; } + inline bool IsBgra() const { return Bgra; } virtual bool IsValid (); @@ -270,6 +271,7 @@ protected: int Height; int Pitch; int LockCount; + bool Bgra; bool ClipBox (int &left, int &top, int &width, int &height, const BYTE *&src, const int srcpitch) const; void DrawTextureV(FTexture *img, double x, double y, uint32 tag, va_list tags) = delete; @@ -292,7 +294,7 @@ class DSimpleCanvas : public DCanvas { DECLARE_CLASS (DSimpleCanvas, DCanvas) public: - DSimpleCanvas (int width, int height); + DSimpleCanvas (int width, int height, bool bgra); ~DSimpleCanvas (); bool IsValid (); @@ -330,7 +332,7 @@ class DFrameBuffer : public DSimpleCanvas { DECLARE_ABSTRACT_CLASS (DFrameBuffer, DSimpleCanvas) public: - DFrameBuffer (int width, int height); + DFrameBuffer (int width, int height, bool bgra); // Force the surface to use buffered output if true is passed. virtual bool Lock (bool buffered) = 0; @@ -421,6 +423,7 @@ public: protected: void DrawRateStuff (); void CopyFromBuff (BYTE *src, int srcPitch, int width, int height, BYTE *dest); + void CopyWithGammaBgra(void *output, int pitch, const BYTE *gammared, const BYTE *gammagreen, const BYTE *gammablue, PalEntry flash, int flash_amount); DFrameBuffer () {} diff --git a/src/win32/fb_d3d9.cpp b/src/win32/fb_d3d9.cpp index efdced151..fd84e3bbb 100644 --- a/src/win32/fb_d3d9.cpp +++ b/src/win32/fb_d3d9.cpp @@ -242,8 +242,8 @@ CVAR(Bool, vid_hwaalines, true, CVAR_ARCHIVE|CVAR_GLOBALCONFIG) // //========================================================================== -D3DFB::D3DFB (UINT adapter, int width, int height, bool fullscreen) - : BaseWinFB (width, height) +D3DFB::D3DFB (UINT adapter, int width, int height, bool bgra, bool fullscreen) + : BaseWinFB (width, height, bgra) { D3DPRESENT_PARAMETERS d3dpp; @@ -765,14 +765,16 @@ void D3DFB::KillNativeTexs() bool D3DFB::CreateFBTexture () { - if (FAILED(D3DDevice->CreateTexture(Width, Height, 1, D3DUSAGE_DYNAMIC, D3DFMT_L8, D3DPOOL_DEFAULT, &FBTexture, NULL))) + FBFormat = IsBgra() ? D3DFMT_A8R8G8B8 : D3DFMT_L8; + + if (FAILED(D3DDevice->CreateTexture(Width, Height, 1, D3DUSAGE_DYNAMIC, FBFormat, D3DPOOL_DEFAULT, &FBTexture, NULL))) { int pow2width, pow2height, i; for (i = 1; i < Width; i <<= 1) {} pow2width = i; for (i = 1; i < Height; i <<= 1) {} pow2height = i; - if (FAILED(D3DDevice->CreateTexture(pow2width, pow2height, 1, D3DUSAGE_DYNAMIC, D3DFMT_L8, D3DPOOL_DEFAULT, &FBTexture, NULL))) + if (FAILED(D3DDevice->CreateTexture(pow2width, pow2height, 1, D3DUSAGE_DYNAMIC, FBFormat, D3DPOOL_DEFAULT, &FBTexture, NULL))) { return false; } @@ -1304,20 +1306,45 @@ void D3DFB::Draw3DPart(bool copy3d) SUCCEEDED(FBTexture->LockRect (0, &lockrect, NULL, D3DLOCK_DISCARD))) || SUCCEEDED(FBTexture->LockRect (0, &lockrect, &texrect, 0))) { - if (lockrect.Pitch == Pitch && Pitch == Width) + if (IsBgra() && FBFormat == D3DFMT_A8R8G8B8) { - memcpy (lockrect.pBits, MemBuffer, Width * Height); + if (lockrect.Pitch == Pitch * sizeof(uint32_t) && Pitch == Width) + { + memcpy(lockrect.pBits, MemBuffer, Width * Height * sizeof(uint32_t)); + } + else + { + uint32_t *dest = (uint32_t *)lockrect.pBits; + uint32_t *src = (uint32_t*)MemBuffer; + for (int y = 0; y < Height; y++) + { + memcpy(dest, src, Width * sizeof(uint32_t)); + dest = reinterpret_cast(reinterpret_cast(dest) + lockrect.Pitch); + src += Pitch; + } + } + } + else if (!IsBgra() && FBFormat == D3DFMT_L8) + { + if (lockrect.Pitch == Pitch && Pitch == Width) + { + memcpy(lockrect.pBits, MemBuffer, Width * Height); + } + else + { + BYTE *dest = (BYTE *)lockrect.pBits; + BYTE *src = (BYTE *)MemBuffer; + for (int y = 0; y < Height; y++) + { + memcpy(dest, src, Width); + dest = reinterpret_cast(reinterpret_cast(dest) + lockrect.Pitch); + src += Pitch; + } + } } else { - BYTE *dest = (BYTE *)lockrect.pBits; - BYTE *src = MemBuffer; - for (int y = 0; y < Height; y++) - { - memcpy (dest, src, Width); - dest += lockrect.Pitch; - src += Pitch; - } + memset(lockrect.pBits, 0, lockrect.Pitch * Height); } FBTexture->UnlockRect (0); } @@ -1349,7 +1376,10 @@ void D3DFB::Draw3DPart(bool copy3d) memset(Constant, 0, sizeof(Constant)); SetAlphaBlend(D3DBLENDOP(0)); EnableAlphaTest(FALSE); - SetPixelShader(Shaders[SHADER_NormalColorPal]); + if (IsBgra()) + SetPixelShader(Shaders[SHADER_NormalColor]); + else + SetPixelShader(Shaders[SHADER_NormalColorPal]); if (copy3d) { FBVERTEX verts[4]; @@ -1367,7 +1397,10 @@ void D3DFB::Draw3DPart(bool copy3d) realfixedcolormap->ColorizeStart[1]/2, realfixedcolormap->ColorizeStart[2]/2, 0); color1 = D3DCOLOR_COLORVALUE(realfixedcolormap->ColorizeEnd[0]/2, realfixedcolormap->ColorizeEnd[1]/2, realfixedcolormap->ColorizeEnd[2]/2, 1); - SetPixelShader(Shaders[SHADER_SpecialColormapPal]); + if (IsBgra()) + SetPixelShader(Shaders[SHADER_SpecialColormap]); + else + SetPixelShader(Shaders[SHADER_SpecialColormapPal]); } } else @@ -1378,7 +1411,10 @@ void D3DFB::Draw3DPart(bool copy3d) CalcFullscreenCoords(verts, Accel2D, false, color0, color1); D3DDevice->DrawPrimitiveUP(D3DPT_TRIANGLEFAN, 2, verts, sizeof(FBVERTEX)); } - SetPixelShader(Shaders[SHADER_NormalColorPal]); + if (IsBgra()) + SetPixelShader(Shaders[SHADER_NormalColor]); + else + SetPixelShader(Shaders[SHADER_NormalColorPal]); } //========================================================================== diff --git a/src/win32/fb_ddraw.cpp b/src/win32/fb_ddraw.cpp index 7cc603786..5637e9695 100644 --- a/src/win32/fb_ddraw.cpp +++ b/src/win32/fb_ddraw.cpp @@ -32,7 +32,6 @@ ** */ - // HEADER FILES ------------------------------------------------------------ #define DIRECTDRAW_VERSION 0x0300 @@ -120,7 +119,7 @@ cycle_t BlitCycles; // CODE -------------------------------------------------------------------- DDrawFB::DDrawFB (int width, int height, bool fullscreen) - : BaseWinFB (width, height) + : BaseWinFB (width, height, false) { int i; diff --git a/src/win32/hardware.cpp b/src/win32/hardware.cpp index 8cc770556..49c970457 100644 --- a/src/win32/hardware.cpp +++ b/src/win32/hardware.cpp @@ -51,6 +51,7 @@ EXTERN_CVAR (Bool, ticker) EXTERN_CVAR (Bool, fullscreen) +EXTERN_CVAR (Bool, swtruecolor) EXTERN_CVAR (Float, vid_winscale) CVAR(Int, win_x, -1, CVAR_ARCHIVE | CVAR_GLOBALCONFIG) @@ -146,7 +147,7 @@ DFrameBuffer *I_SetMode (int &width, int &height, DFrameBuffer *old) } break; } - DFrameBuffer *res = Video->CreateFrameBuffer (width, height, fs, old); + DFrameBuffer *res = Video->CreateFrameBuffer (width, height, swtruecolor, fs, old); /* Right now, CreateFrameBuffer cannot return NULL if (res == NULL) @@ -312,6 +313,16 @@ void I_RestoreWindowedPos () extern int NewWidth, NewHeight, NewBits, DisplayBits; +CUSTOM_CVAR(Bool, swtruecolor, false, CVAR_ARCHIVE|CVAR_GLOBALCONFIG|CVAR_NOINITCALL) +{ + // Strictly speaking this doesn't require a mode switch, but it is the easiest + // way to force a CreateFramebuffer call without a lot of refactoring. + NewWidth = screen->GetWidth(); + NewHeight = screen->GetHeight(); + NewBits = DisplayBits; + setmodeneeded = true; +} + CUSTOM_CVAR (Bool, fullscreen, true, CVAR_ARCHIVE|CVAR_GLOBALCONFIG|CVAR_NOINITCALL) { NewWidth = screen->GetWidth(); diff --git a/src/win32/hardware.h b/src/win32/hardware.h index b2bafef32..184eeccf5 100644 --- a/src/win32/hardware.h +++ b/src/win32/hardware.h @@ -45,7 +45,7 @@ class IVideo virtual EDisplayType GetDisplayType () = 0; virtual void SetWindowedScale (float scale) = 0; - virtual DFrameBuffer *CreateFrameBuffer (int width, int height, bool fs, DFrameBuffer *old) = 0; + virtual DFrameBuffer *CreateFrameBuffer (int width, int height, bool bgra, bool fs, DFrameBuffer *old) = 0; virtual void StartModeIterator (int bits, bool fs) = 0; virtual bool NextMode (int *width, int *height, bool *letterbox) = 0; diff --git a/src/win32/win32iface.h b/src/win32/win32iface.h index 9b2754eae..d30475eb3 100644 --- a/src/win32/win32iface.h +++ b/src/win32/win32iface.h @@ -70,7 +70,7 @@ class Win32Video : public IVideo EDisplayType GetDisplayType () { return DISPLAY_Both; } void SetWindowedScale (float scale); - DFrameBuffer *CreateFrameBuffer (int width, int height, bool fs, DFrameBuffer *old); + DFrameBuffer *CreateFrameBuffer (int width, int height, bool bgra, bool fs, DFrameBuffer *old); void StartModeIterator (int bits, bool fs); bool NextMode (int *width, int *height, bool *letterbox); @@ -121,7 +121,7 @@ class BaseWinFB : public DFrameBuffer { DECLARE_ABSTRACT_CLASS(BaseWinFB, DFrameBuffer) public: - BaseWinFB (int width, int height) : DFrameBuffer (width, height), Windowed (true) {} + BaseWinFB (int width, int height, bool bgra) : DFrameBuffer (width, height, bgra), Windowed (true) {} bool IsFullscreen () { return !Windowed; } virtual void Blank () = 0; @@ -228,7 +228,7 @@ class D3DFB : public BaseWinFB { DECLARE_CLASS(D3DFB, BaseWinFB) public: - D3DFB (UINT adapter, int width, int height, bool fullscreen); + D3DFB (UINT adapter, int width, int height, bool bgra, bool fullscreen); ~D3DFB (); bool IsValid (); @@ -422,6 +422,7 @@ private: bool NeedPalUpdate; bool NeedGammaUpdate; int FBWidth, FBHeight; + D3DFORMAT FBFormat; bool VSync; RECT BlendingRect; int In2D; diff --git a/src/win32/win32video.cpp b/src/win32/win32video.cpp index 8eb2349ec..0d1fe0cff 100644 --- a/src/win32/win32video.cpp +++ b/src/win32/win32video.cpp @@ -629,7 +629,7 @@ bool Win32Video::NextMode (int *width, int *height, bool *letterbox) return false; } -DFrameBuffer *Win32Video::CreateFrameBuffer (int width, int height, bool fullscreen, DFrameBuffer *old) +DFrameBuffer *Win32Video::CreateFrameBuffer (int width, int height, bool bgra, bool fullscreen, DFrameBuffer *old) { static int retry = 0; static int owidth, oheight; @@ -645,7 +645,8 @@ DFrameBuffer *Win32Video::CreateFrameBuffer (int width, int height, bool fullscr BaseWinFB *fb = static_cast (old); if (fb->Width == width && fb->Height == height && - fb->Windowed == !fullscreen) + fb->Windowed == !fullscreen && + fb->Bgra == bgra) { return old; } @@ -662,12 +663,13 @@ DFrameBuffer *Win32Video::CreateFrameBuffer (int width, int height, bool fullscr if (D3D != NULL) { - fb = new D3DFB (m_Adapter, width, height, fullscreen); + fb = new D3DFB (m_Adapter, width, height, bgra, fullscreen); } else { fb = new DDrawFB (width, height, fullscreen); } + LOG1 ("New fb created @ %p\n", fb); // If we could not create the framebuffer, try again with slightly @@ -726,7 +728,7 @@ DFrameBuffer *Win32Video::CreateFrameBuffer (int width, int height, bool fullscr } ++retry; - fb = static_cast(CreateFrameBuffer (width, height, fullscreen, NULL)); + fb = static_cast(CreateFrameBuffer (width, height, bgra, fullscreen, NULL)); } retry = 0; diff --git a/wadsrc/static/language.enu b/wadsrc/static/language.enu index e91c9dd43..91459673a 100644 --- a/wadsrc/static/language.enu +++ b/wadsrc/static/language.enu @@ -1780,6 +1780,10 @@ DSPLYMNU_BRIGHTNESS = "Brightness"; DSPLYMNU_VSYNC = "Vertical Sync"; DSPLYMNU_CAPFPS = "Rendering Interpolation"; DSPLYMNU_COLUMNMETHOD = "Column render mode"; +DSPLYMNU_TRUECOLOR = "True color output"; +DSPLYMNU_MINFILTER = "Linear filter when downscaling"; +DSPLYMNU_MAGFILTER = "Linear filter when upscaling"; +DSPLYMNU_MIPMAP = "Use mipmapped textures"; DSPLYMNU_WIPETYPE = "Screen wipe style"; DSPLYMNU_SHOWENDOOM = "Show ENDOOM screen"; DSPLYMNU_PALLETEHACK = "DirectDraw palette hack"; // Not used diff --git a/wadsrc/static/menudef.txt b/wadsrc/static/menudef.txt index a605b5cc4..0202a40e6 100644 --- a/wadsrc/static/menudef.txt +++ b/wadsrc/static/menudef.txt @@ -661,6 +661,10 @@ OptionMenu "VideoOptions" Option "$DSPLYMNU_VSYNC", "vid_vsync", "OnOff" Option "$DSPLYMNU_CAPFPS", "cl_capfps", "OffOn" Option "$DSPLYMNU_COLUMNMETHOD", "r_columnmethod", "ColumnMethods" + Option "$DSPLYMNU_TRUECOLOR", "swtruecolor", "OnOff" + Option "$DSPLYMNU_MINFILTER", "r_minfilter", "OnOff" + Option "$DSPLYMNU_MAGFILTER", "r_magfilter", "OnOff" + Option "$DSPLYMNU_MIPMAP", "r_mipmap", "OnOff" StaticText " " Option "$DSPLYMNU_WIPETYPE", "wipetype", "Wipes"