diff --git a/CMakeLists.txt b/CMakeLists.txt index e54dd8a33..199a1fca8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required( VERSION 2.8.7 ) -project(GZDoom) +project(QZDoom) if( COMMAND cmake_policy ) if( POLICY CMP0011 ) @@ -73,7 +73,7 @@ IF( NOT CMAKE_BUILD_TYPE ) ENDIF() set( ZDOOM_OUTPUT_DIR ${CMAKE_BINARY_DIR} CACHE PATH "Directory where zdoom.pk3 and the executable will be created." ) -set( ZDOOM_EXE_NAME "gzdoom" CACHE FILEPATH "Name of the executable to create" ) +set( ZDOOM_EXE_NAME "qzdoom" CACHE FILEPATH "Name of the executable to create" ) if( MSVC ) # Allow the user to use ZDOOM_OUTPUT_DIR as a single release point. # Use zdoom, zdoomd, zdoom64, and zdoomd64 for the binary names diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 039a03b52..476e945bb 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -974,7 +974,9 @@ set( FASTMATH_PCH_SOURCES r_3dfloors.cpp r_bsp.cpp r_draw.cpp + r_draw_rgba.cpp r_drawt.cpp + r_drawt_rgba.cpp r_main.cpp r_plane.cpp r_segs.cpp diff --git a/src/doomtype.h b/src/doomtype.h index 129c5f122..cbf1fbd5e 100644 --- a/src/doomtype.h +++ b/src/doomtype.h @@ -99,6 +99,11 @@ typedef TMap FClassMap; #endif +// Only use SSE intrinsics on Intel architecture +#if !defined(_M_IX86) && !defined(__i386__) && !defined(_M_X64) && !defined(__amd64__) +#define NO_SSE +#endif + #if defined(_MSC_VER) #define NOVTABLE __declspec(novtable) diff --git a/src/f_wipe.cpp b/src/f_wipe.cpp index a3ceb8d50..aa9038eeb 100644 --- a/src/f_wipe.cpp +++ b/src/f_wipe.cpp @@ -382,6 +382,9 @@ static bool (*wipes[])(int) = // Returns true if the wipe should be performed. 
bool wipe_StartScreen (int type) { + if (screen->IsBgra()) + return false; + CurrentWipeType = clamp(type, 0, wipe_NUMWIPES - 1); if (CurrentWipeType) @@ -395,11 +398,15 @@ bool wipe_StartScreen (int type) void wipe_EndScreen (void) { + if (screen->IsBgra()) + return; + if (CurrentWipeType) { wipe_scr_end = new short[SCREENWIDTH * SCREENHEIGHT / 2]; screen->GetBlock (0, 0, SCREENWIDTH, SCREENHEIGHT, (BYTE *)wipe_scr_end); screen->DrawBlock (0, 0, SCREENWIDTH, SCREENHEIGHT, (BYTE *)wipe_scr_start); // restore start scr. + // Initialize the wipe (*wipes[(CurrentWipeType-1)*3])(0); } @@ -410,6 +417,9 @@ bool wipe_ScreenWipe (int ticks) { bool rc; + if (screen->IsBgra()) + return true; + if (CurrentWipeType == wipe_None) return true; @@ -423,6 +433,9 @@ bool wipe_ScreenWipe (int ticks) // Final things for the wipe void wipe_Cleanup() { + if (screen->IsBgra()) + return; + if (wipe_scr_start != NULL) { delete[] wipe_scr_start; diff --git a/src/g_level.cpp b/src/g_level.cpp index dcf97ba4f..c854fa849 100644 --- a/src/g_level.cpp +++ b/src/g_level.cpp @@ -1315,7 +1315,7 @@ void G_InitLevelLocals () level_info_t *info; BaseBlendA = 0.0f; // Remove underwater blend effect, if any - NormalLight.Maps = realcolormaps; + NormalLight.Maps = realcolormaps.Maps; // [BB] Instead of just setting the color, we also have to reset Desaturate and build the lights. 
NormalLight.ChangeColor (PalEntry (255, 255, 255), 0); diff --git a/src/g_shared/a_artifacts.cpp b/src/g_shared/a_artifacts.cpp index 7e4970e04..2a3746e75 100644 --- a/src/g_shared/a_artifacts.cpp +++ b/src/g_shared/a_artifacts.cpp @@ -743,7 +743,8 @@ int APowerInvisibility::AlterWeaponSprite (visstyle_t *vis) if ((vis->Alpha < 0.25f && special1 > 0) || (vis->Alpha == 0)) { vis->Alpha = clamp((1.f - float(Strength/100)), 0.f, 1.f); - vis->colormap = SpecialColormaps[INVERSECOLORMAP].Colormap; + vis->BaseColormap = &SpecialColormaps[INVERSECOLORMAP]; + vis->ColormapNum = 0; } return -1; // This item is valid so another one shouldn't reset the translucency } diff --git a/src/g_strife/strife_sbar.cpp b/src/g_strife/strife_sbar.cpp index bcdf624d7..eb3fa2608 100644 --- a/src/g_strife/strife_sbar.cpp +++ b/src/g_strife/strife_sbar.cpp @@ -35,7 +35,6 @@ public: const BYTE *GetColumn (unsigned int column, const Span **spans_out); const BYTE *GetPixels (); bool CheckModified (); - void Unload (); void SetVial (int level); @@ -90,10 +89,6 @@ bool FHealthBar::CheckModified () return NeedRefresh; } -void FHealthBar::Unload () -{ -} - const BYTE *FHealthBar::GetColumn (unsigned int column, const Span **spans_out) { if (NeedRefresh) diff --git a/src/gl/renderer/gl_colormap.h b/src/gl/renderer/gl_colormap.h index 2122b1248..d66950309 100644 --- a/src/gl/renderer/gl_colormap.h +++ b/src/gl/renderer/gl_colormap.h @@ -75,5 +75,4 @@ struct FColormap }; - #endif diff --git a/src/menu/playerdisplay.cpp b/src/menu/playerdisplay.cpp index c3d11a43a..16671975a 100644 --- a/src/menu/playerdisplay.cpp +++ b/src/menu/playerdisplay.cpp @@ -78,7 +78,6 @@ public: const BYTE *GetColumn(unsigned int column, const Span **spans_out); const BYTE *GetPixels(); - void Unload(); bool CheckModified(); protected: @@ -212,10 +211,6 @@ bool FBackdropTexture::CheckModified() return LastRenderTic != gametic; } -void FBackdropTexture::Unload() -{ -} - 
//============================================================================= // // diff --git a/src/posix/cocoa/i_video.mm b/src/posix/cocoa/i_video.mm index 08a563b25..ba3a3e27e 100644 --- a/src/posix/cocoa/i_video.mm +++ b/src/posix/cocoa/i_video.mm @@ -106,6 +106,17 @@ EXTERN_CVAR(Bool, ticker ) EXTERN_CVAR(Bool, vid_vsync) EXTERN_CVAR(Bool, vid_hidpi) +CUSTOM_CVAR(Bool, swtruecolor, true, CVAR_ARCHIVE | CVAR_GLOBALCONFIG | CVAR_NOINITCALL) +{ + // Strictly speaking this doesn't require a mode switch, but it is the easiest + // way to force a CreateFramebuffer call without a lot of refactoring. + extern int NewWidth, NewHeight, NewBits, DisplayBits; + NewWidth = screen->GetWidth(); + NewHeight = screen->GetHeight(); + NewBits = DisplayBits; + setmodeneeded = true; +} + CUSTOM_CVAR(Bool, fullscreen, false, CVAR_ARCHIVE | CVAR_GLOBALCONFIG) { extern int NewWidth, NewHeight, NewBits, DisplayBits; @@ -123,7 +134,7 @@ CUSTOM_CVAR(Bool, vid_autoswitch, true, CVAR_ARCHIVE | CVAR_GLOBALCONFIG | CVAR_ static int s_currentRenderer; -CUSTOM_CVAR(Int, vid_renderer, 1, CVAR_ARCHIVE | CVAR_GLOBALCONFIG | CVAR_NOINITCALL) +CUSTOM_CVAR(Int, vid_renderer, 0, CVAR_ARCHIVE | CVAR_GLOBALCONFIG | CVAR_NOINITCALL) { // 0: Software renderer // 1: OpenGL renderer @@ -238,7 +249,7 @@ public: virtual EDisplayType GetDisplayType() { return DISPLAY_Both; } virtual void SetWindowedScale(float scale); - virtual DFrameBuffer* CreateFrameBuffer(int width, int height, bool fs, DFrameBuffer* old); + virtual DFrameBuffer* CreateFrameBuffer(int width, int height, bool bgra, bool fs, DFrameBuffer* old); virtual void StartModeIterator(int bits, bool fullscreen); virtual bool NextMode(int* width, int* height, bool* letterbox); @@ -280,7 +291,7 @@ private: class CocoaFrameBuffer : public DFrameBuffer { public: - CocoaFrameBuffer(int width, int height, bool fullscreen); + CocoaFrameBuffer(int width, int height, bool bgra, bool fullscreen); ~CocoaFrameBuffer(); virtual bool Lock(bool buffer); @@ 
-590,14 +601,14 @@ bool CocoaVideo::NextMode(int* const width, int* const height, bool* const lette return false; } -DFrameBuffer* CocoaVideo::CreateFrameBuffer(const int width, const int height, const bool fullscreen, DFrameBuffer* const old) +DFrameBuffer* CocoaVideo::CreateFrameBuffer(const int width, const int height, const bool bgra, const bool fullscreen, DFrameBuffer* const old) { PalEntry flashColor = 0; int flashAmount = 0; if (NULL != old) { - if (width == m_width && height == m_height) + if (width == m_width && height == m_height && bgra == old->IsBgra()) { SetMode(width, height, fullscreen, vid_hidpi); return old; @@ -622,7 +633,7 @@ DFrameBuffer* CocoaVideo::CreateFrameBuffer(const int width, const int height, c } else { - fb = new CocoaFrameBuffer(width, height, fullscreen); + fb = new CocoaFrameBuffer(width, height, bgra, fullscreen); } fb->SetFlash(flashColor, flashAmount); @@ -846,8 +857,8 @@ CocoaVideo* CocoaVideo::GetInstance() // --------------------------------------------------------------------------- -CocoaFrameBuffer::CocoaFrameBuffer(int width, int height, bool fullscreen) -: DFrameBuffer(width, height) +CocoaFrameBuffer::CocoaFrameBuffer(int width, int height, bool bgra, bool fullscreen) +: DFrameBuffer(width, height, bgra) , m_needPaletteUpdate(false) , m_gamma(0.0f) , m_needGammaUpdate(false) @@ -949,8 +960,15 @@ void CocoaFrameBuffer::Update() FlipCycles.Reset(); BlitCycles.Clock(); - GPfx.Convert(MemBuffer, Pitch, m_pixelBuffer, Width * BYTES_PER_PIXEL, - Width, Height, FRACUNIT, FRACUNIT, 0, 0); + if (IsBgra()) + { + CopyWithGammaBgra(m_pixelBuffer, Width * BYTES_PER_PIXEL, m_gammaTable[0], m_gammaTable[1], m_gammaTable[2], m_flashColor, m_flashAmount); + } + else + { + GPfx.Convert(MemBuffer, Pitch, m_pixelBuffer, Width * BYTES_PER_PIXEL, + Width, Height, FRACUNIT, FRACUNIT, 0, 0); + } FlipCycles.Clock(); Flip(); @@ -1082,8 +1100,10 @@ void CocoaFrameBuffer::Flip() static const GLenum format = GL_ABGR_EXT; #endif // 
__LITTLE_ENDIAN__ - glTexImage2D(GL_TEXTURE_RECTANGLE_ARB, 0, GL_RGBA8, - Width, Height, 0, format, GL_UNSIGNED_BYTE, m_pixelBuffer); + if (IsBgra()) + glTexImage2D(GL_TEXTURE_RECTANGLE_ARB, 0, GL_RGBA8, Width, Height, 0, GL_BGRA_EXT, GL_UNSIGNED_BYTE, m_pixelBuffer); + else + glTexImage2D(GL_TEXTURE_RECTANGLE_ARB, 0, GL_RGBA8, Width, Height, 0, format, GL_UNSIGNED_BYTE, m_pixelBuffer); glBegin(GL_QUADS); glColor4f(1.0f, 1.0f, 1.0f, 1.0f); @@ -1313,7 +1333,7 @@ void I_CreateRenderer() DFrameBuffer* I_SetMode(int &width, int &height, DFrameBuffer* old) { - return Video->CreateFrameBuffer(width, height, fullscreen, old); + return Video->CreateFrameBuffer(width, height, swtruecolor, fullscreen, old); } bool I_CheckResolution(const int width, const int height, const int bits) diff --git a/src/posix/hardware.h b/src/posix/hardware.h index 618941fe5..3c06cb6c6 100644 --- a/src/posix/hardware.h +++ b/src/posix/hardware.h @@ -74,7 +74,7 @@ class IVideo virtual EDisplayType GetDisplayType () = 0; virtual void SetWindowedScale (float scale) = 0; - virtual DFrameBuffer *CreateFrameBuffer (int width, int height, bool fs, DFrameBuffer *old) = 0; + virtual DFrameBuffer *CreateFrameBuffer (int width, int height, bool bgra, bool fs, DFrameBuffer *old) = 0; virtual void StartModeIterator (int bits, bool fs) = 0; virtual bool NextMode (int *width, int *height, bool *letterbox) = 0; diff --git a/src/posix/sdl/hardware.cpp b/src/posix/sdl/hardware.cpp index f4ac13fc5..18c7ad737 100644 --- a/src/posix/sdl/hardware.cpp +++ b/src/posix/sdl/hardware.cpp @@ -53,6 +53,7 @@ EXTERN_CVAR (Bool, ticker) EXTERN_CVAR (Bool, fullscreen) +EXTERN_CVAR (Bool, swtruecolor) EXTERN_CVAR (Float, vid_winscale) IVideo *Video; @@ -64,7 +65,7 @@ void I_RestartRenderer(); int currentrenderer; // [ZDoomGL] -CUSTOM_CVAR (Int, vid_renderer, 1, CVAR_ARCHIVE | CVAR_GLOBALCONFIG | CVAR_NOINITCALL) +CUSTOM_CVAR (Int, vid_renderer, 0, CVAR_ARCHIVE | CVAR_GLOBALCONFIG | CVAR_NOINITCALL) { // 0: Software renderer // 1: 
OpenGL renderer @@ -166,7 +167,7 @@ DFrameBuffer *I_SetMode (int &width, int &height, DFrameBuffer *old) fs = fullscreen; break; } - DFrameBuffer *res = Video->CreateFrameBuffer (width, height, fs, old); + DFrameBuffer *res = Video->CreateFrameBuffer (width, height, swtruecolor, fs, old); /* Right now, CreateFrameBuffer cannot return NULL if (res == NULL) @@ -320,6 +321,16 @@ CUSTOM_CVAR (Int, vid_maxfps, 200, CVAR_ARCHIVE | CVAR_GLOBALCONFIG) extern int NewWidth, NewHeight, NewBits, DisplayBits; +CUSTOM_CVAR(Bool, swtruecolor, true, CVAR_ARCHIVE|CVAR_GLOBALCONFIG|CVAR_NOINITCALL) +{ + // Strictly speaking this doesn't require a mode switch, but it is the easiest + // way to force a CreateFramebuffer call without a lot of refactoring. + NewWidth = screen->GetWidth(); + NewHeight = screen->GetHeight(); + NewBits = DisplayBits; + setmodeneeded = true; +} + CUSTOM_CVAR (Bool, fullscreen, false, CVAR_ARCHIVE|CVAR_GLOBALCONFIG) { NewWidth = screen->GetWidth(); diff --git a/src/posix/sdl/sdlglvideo.cpp b/src/posix/sdl/sdlglvideo.cpp index d8c00f236..e581cfde9 100644 --- a/src/posix/sdl/sdlglvideo.cpp +++ b/src/posix/sdl/sdlglvideo.cpp @@ -163,7 +163,7 @@ bool SDLGLVideo::NextMode (int *width, int *height, bool *letterbox) return false; } -DFrameBuffer *SDLGLVideo::CreateFrameBuffer (int width, int height, bool fullscreen, DFrameBuffer *old) +DFrameBuffer *SDLGLVideo::CreateFrameBuffer (int width, int height, bool bgra, bool fullscreen, DFrameBuffer *old) { static int retry = 0; static int owidth, oheight; @@ -315,7 +315,7 @@ bool SDLGLVideo::InitHardware (bool allowsoftware, int multisample) // FrameBuffer implementation ----------------------------------------------- SDLGLFB::SDLGLFB (void *, int width, int height, int, int, bool fullscreen) - : DFrameBuffer (width, height) + : DFrameBuffer (width, height, false) { int i; diff --git a/src/posix/sdl/sdlglvideo.h b/src/posix/sdl/sdlglvideo.h index d8ce9005d..3b84f83c4 100644 --- a/src/posix/sdl/sdlglvideo.h +++ 
b/src/posix/sdl/sdlglvideo.h @@ -21,7 +21,7 @@ class SDLGLVideo : public IVideo EDisplayType GetDisplayType () { return DISPLAY_Both; } void SetWindowedScale (float scale); - DFrameBuffer *CreateFrameBuffer (int width, int height, bool fs, DFrameBuffer *old); + DFrameBuffer *CreateFrameBuffer (int width, int height, bool bgra, bool fs, DFrameBuffer *old); void StartModeIterator (int bits, bool fs); bool NextMode (int *width, int *height, bool *letterbox); diff --git a/src/posix/sdl/sdlvideo.cpp b/src/posix/sdl/sdlvideo.cpp index 04c3a3f2e..56b883978 100644 --- a/src/posix/sdl/sdlvideo.cpp +++ b/src/posix/sdl/sdlvideo.cpp @@ -28,7 +28,7 @@ class SDLFB : public DFrameBuffer { DECLARE_CLASS(SDLFB, DFrameBuffer) public: - SDLFB (int width, int height, bool fullscreen, SDL_Window *oldwin); + SDLFB (int width, int height, bool bgra, bool fullscreen, SDL_Window *oldwin); ~SDLFB (); bool Lock (bool buffer); @@ -257,7 +257,7 @@ bool SDLVideo::NextMode (int *width, int *height, bool *letterbox) return false; } -DFrameBuffer *SDLVideo::CreateFrameBuffer (int width, int height, bool fullscreen, DFrameBuffer *old) +DFrameBuffer *SDLVideo::CreateFrameBuffer (int width, int height, bool bgra, bool fullscreen, DFrameBuffer *old) { static int retry = 0; static int owidth, oheight; @@ -271,7 +271,8 @@ DFrameBuffer *SDLVideo::CreateFrameBuffer (int width, int height, bool fullscree { // Reuse the old framebuffer if its attributes are the same SDLFB *fb = static_cast (old); if (fb->Width == width && - fb->Height == height) + fb->Height == height && + fb->Bgra == bgra) { bool fsnow = (SDL_GetWindowFlags (fb->Screen) & SDL_WINDOW_FULLSCREEN_DESKTOP) != 0; @@ -296,7 +297,7 @@ DFrameBuffer *SDLVideo::CreateFrameBuffer (int width, int height, bool fullscree flashAmount = 0; } - SDLFB *fb = new SDLFB (width, height, fullscreen, oldwin); + SDLFB *fb = new SDLFB (width, height, bgra, fullscreen, oldwin); // If we could not create the framebuffer, try again with slightly // different 
parameters in this order: @@ -335,7 +336,7 @@ DFrameBuffer *SDLVideo::CreateFrameBuffer (int width, int height, bool fullscree } ++retry; - fb = static_cast(CreateFrameBuffer (width, height, fullscreen, NULL)); + fb = static_cast(CreateFrameBuffer (width, height, bgra, fullscreen, NULL)); } retry = 0; @@ -350,8 +351,8 @@ void SDLVideo::SetWindowedScale (float scale) // FrameBuffer implementation ----------------------------------------------- -SDLFB::SDLFB (int width, int height, bool fullscreen, SDL_Window *oldwin) - : DFrameBuffer (width, height) +SDLFB::SDLFB (int width, int height, bool bgra, bool fullscreen, SDL_Window *oldwin) + : DFrameBuffer (width, height, bgra) { int i; @@ -494,7 +495,11 @@ void SDLFB::Update () pitch = Surface->pitch; } - if (NotPaletted) + if (Bgra) + { + CopyWithGammaBgra(pixels, pitch, GammaTable[0], GammaTable[1], GammaTable[2], Flash, FlashAmount); + } + else if (NotPaletted) { GPfx.Convert (MemBuffer, Pitch, pixels, pitch, Width, Height, @@ -674,13 +679,20 @@ void SDLFB::ResetSDLRenderer () SDL_SetRenderDrawColor(Renderer, 0, 0, 0, 255); Uint32 fmt; - switch(vid_displaybits) + if (Bgra) { - default: fmt = SDL_PIXELFORMAT_ARGB8888; break; - case 30: fmt = SDL_PIXELFORMAT_ARGB2101010; break; - case 24: fmt = SDL_PIXELFORMAT_RGB888; break; - case 16: fmt = SDL_PIXELFORMAT_RGB565; break; - case 15: fmt = SDL_PIXELFORMAT_ARGB1555; break; + fmt = SDL_PIXELFORMAT_ARGB8888; + } + else + { + switch (vid_displaybits) + { + default: fmt = SDL_PIXELFORMAT_ARGB8888; break; + case 30: fmt = SDL_PIXELFORMAT_ARGB2101010; break; + case 24: fmt = SDL_PIXELFORMAT_RGB888; break; + case 16: fmt = SDL_PIXELFORMAT_RGB565; break; + case 15: fmt = SDL_PIXELFORMAT_ARGB1555; break; + } } Texture = SDL_CreateTexture (Renderer, fmt, SDL_TEXTUREACCESS_STREAMING, Width, Height); diff --git a/src/posix/sdl/sdlvideo.h b/src/posix/sdl/sdlvideo.h index 072167b5a..385733bc1 100644 --- a/src/posix/sdl/sdlvideo.h +++ b/src/posix/sdl/sdlvideo.h @@ -10,7 +10,7 @@ class 
SDLVideo : public IVideo EDisplayType GetDisplayType () { return DISPLAY_Both; } void SetWindowedScale (float scale); - DFrameBuffer *CreateFrameBuffer (int width, int height, bool fs, DFrameBuffer *old); + DFrameBuffer *CreateFrameBuffer (int width, int height, bool bgra, bool fs, DFrameBuffer *old); void StartModeIterator (int bits, bool fs); bool NextMode (int *width, int *height, bool *letterbox); diff --git a/src/r_data/colormaps.cpp b/src/r_data/colormaps.cpp index b46342463..3bfc89b4b 100644 --- a/src/r_data/colormaps.cpp +++ b/src/r_data/colormaps.cpp @@ -71,7 +71,7 @@ struct FakeCmap }; TArray fakecmaps; -BYTE *realcolormaps; +FSWColormap realcolormaps; size_t numfakecmaps; @@ -408,7 +408,7 @@ void R_SetDefaultColormap (const char *name) foo.Color = 0xFFFFFF; foo.Fade = 0; - foo.Maps = realcolormaps; + foo.Maps = realcolormaps.Maps; foo.Desaturate = 0; foo.Next = NULL; foo.BuildLights (); @@ -430,7 +430,7 @@ void R_SetDefaultColormap (const char *name) remap[0] = 0; for (i = 0; i < NUMCOLORMAPS; ++i) { - BYTE *map2 = &realcolormaps[i*256]; + BYTE *map2 = &realcolormaps.Maps[i*256]; lumpr.Read (map, 256); for (j = 0; j < 256; ++j) { @@ -454,11 +454,7 @@ void R_DeinitColormaps () { SpecialColormaps.Clear(); fakecmaps.Clear(); - if (realcolormaps != NULL) - { - delete[] realcolormaps; - realcolormaps = NULL; - } + delete[] realcolormaps.Maps; FreeSpecialLights(); } @@ -501,7 +497,7 @@ void R_InitColormaps () } } } - realcolormaps = new BYTE[256*NUMCOLORMAPS*fakecmaps.Size()]; + realcolormaps.Maps = new BYTE[256*NUMCOLORMAPS*fakecmaps.Size()]; R_SetDefaultColormap ("COLORMAP"); if (fakecmaps.Size() > 1) @@ -523,7 +519,7 @@ void R_InitColormaps () { int k, r, g, b; FWadLump lump = Wads.OpenLumpNum (fakecmaps[j].lump); - BYTE *const map = realcolormaps + NUMCOLORMAPS*256*j; + BYTE *const map = realcolormaps.Maps + NUMCOLORMAPS*256*j; for (k = 0; k < NUMCOLORMAPS; ++k) { @@ -550,8 +546,8 @@ void R_InitColormaps () } NormalLight.Color = PalEntry (255, 255, 255); 
NormalLight.Fade = 0; - NormalLight.Maps = realcolormaps; - NormalLightHasFixedLights = R_CheckForFixedLights(realcolormaps); + NormalLight.Maps = realcolormaps.Maps; + NormalLightHasFixedLights = R_CheckForFixedLights(realcolormaps.Maps); numfakecmaps = fakecmaps.Size(); // build default special maps (e.g. invulnerability) diff --git a/src/r_data/colormaps.h b/src/r_data/colormaps.h index 0764191a3..ca1574893 100644 --- a/src/r_data/colormaps.h +++ b/src/r_data/colormaps.h @@ -1,18 +1,26 @@ #ifndef __RES_CMAP_H #define __RES_CMAP_H +struct FSWColormap; + void R_InitColormaps (); void R_DeinitColormaps (); DWORD R_ColormapNumForName(const char *name); // killough 4/4/98 void R_SetDefaultColormap (const char *name); // [RH] change normal fadetable DWORD R_BlendForColormap (DWORD map); // [RH] return calculated blend for a colormap -extern BYTE *realcolormaps; // [RH] make the colormaps externally visible +extern FSWColormap realcolormaps; // [RH] make the colormaps externally visible extern size_t numfakecmaps; +struct FSWColormap +{ + BYTE *Maps = nullptr; + PalEntry Color = 0xffffffff; + PalEntry Fade = 0xff000000; + int Desaturate = 0; +}; - -struct FDynamicColormap +struct FDynamicColormap : FSWColormap { void ChangeFade (PalEntry fadecolor); void ChangeColor (PalEntry lightcolor, int desaturate); @@ -20,10 +28,6 @@ struct FDynamicColormap void BuildLights (); static void RebuildAllLights(); - BYTE *Maps; - PalEntry Color; - PalEntry Fade; - int Desaturate; FDynamicColormap *Next; }; @@ -43,8 +47,13 @@ enum }; -struct FSpecialColormap +struct FSpecialColormap : FSWColormap { + FSpecialColormap() + { + Maps = Colormap; + } + float ColorizeStart[3]; float ColorizeEnd[3]; BYTE Colormap[256]; diff --git a/src/r_defs.h b/src/r_defs.h index f172a84a6..97552fb52 100644 --- a/src/r_defs.h +++ b/src/r_defs.h @@ -1498,11 +1498,14 @@ struct FMiniBSP // typedef BYTE lighttable_t; // This could be wider for >8 bit display. 
+struct FSWColormap; // This encapsulates the fields of vissprite_t that can be altered by AlterWeaponSprite struct visstyle_t { - lighttable_t *colormap; + int ColormapNum; // Which colormap is rendered + FSWColormap *BaseColormap; // Base colormap used together with ColormapNum + lighttable_t *colormap; // [SP] Restored from GZDoom - will this work? float Alpha; FRenderStyle RenderStyle; }; diff --git a/src/r_draw.cpp b/src/r_draw.cpp index 80b91ed2d..f255352f5 100644 --- a/src/r_draw.cpp +++ b/src/r_draw.cpp @@ -38,6 +38,8 @@ #include "r_data/r_translate.h" #include "v_palette.h" #include "r_data/colormaps.h" +#include "r_plane.h" +#include "r_draw_rgba.h" #include "gi.h" #include "stats.h" @@ -70,6 +72,19 @@ int scaledviewwidth; // screen depth and asm/no asm. void (*R_DrawColumnHoriz)(void); void (*R_DrawColumn)(void); +void (*R_FillColumn)(void); +void (*R_FillAddColumn)(void); +void (*R_FillAddClampColumn)(void); +void (*R_FillSubClampColumn)(void); +void (*R_FillRevSubClampColumn)(void); +void (*R_DrawAddColumn)(void); +void (*R_DrawTlatedAddColumn)(void); +void (*R_DrawAddClampColumn)(void); +void (*R_DrawAddClampTranslatedColumn)(void); +void (*R_DrawSubClampColumn)(void); +void (*R_DrawSubClampTranslatedColumn)(void); +void (*R_DrawRevSubClampColumn)(void); +void (*R_DrawRevSubClampTranslatedColumn)(void); void (*R_DrawFuzzColumn)(void); void (*R_DrawTranslatedColumn)(void); void (*R_DrawShadedColumn)(void); @@ -79,7 +94,48 @@ void (*R_DrawSpanTranslucent)(void); void (*R_DrawSpanMaskedTranslucent)(void); void (*R_DrawSpanAddClamp)(void); void (*R_DrawSpanMaskedAddClamp)(void); -void (*rt_map4cols)(int,int,int); +void (*R_FillSpan)(void); +void (*R_FillColumnHoriz)(void); +void (*R_DrawFogBoundary)(int x1, int x2, short *uclip, short *dclip); +void (*R_MapTiltedPlane)(int y, int x1); +void (*R_MapColoredPlane)(int y, int x1); +void (*R_DrawParticle)(vissprite_t *); +void (*R_SetupDrawSlab)(FSWColormap *base_colormap, float light, int shade); +void 
(*R_DrawSlab)(int dx, fixed_t v, int dy, fixed_t vi, const BYTE *vptr, BYTE *p); +fixed_t (*tmvline1_add)(); +void (*tmvline4_add)(); +fixed_t (*tmvline1_addclamp)(); +void (*tmvline4_addclamp)(); +fixed_t (*tmvline1_subclamp)(); +void (*tmvline4_subclamp)(); +fixed_t (*tmvline1_revsubclamp)(); +void (*tmvline4_revsubclamp)(); +void (*rt_copy1col)(int hx, int sx, int yl, int yh); +void (*rt_copy4cols)(int sx, int yl, int yh); +void (*rt_shaded1col)(int hx, int sx, int yl, int yh); +void (*rt_shaded4cols)(int sx, int yl, int yh); +void (*rt_map1col)(int hx, int sx, int yl, int yh); +void (*rt_add1col)(int hx, int sx, int yl, int yh); +void (*rt_addclamp1col)(int hx, int sx, int yl, int yh); +void (*rt_subclamp1col)(int hx, int sx, int yl, int yh); +void (*rt_revsubclamp1col)(int hx, int sx, int yl, int yh); +void (*rt_tlate1col)(int hx, int sx, int yl, int yh); +void (*rt_tlateadd1col)(int hx, int sx, int yl, int yh); +void (*rt_tlateaddclamp1col)(int hx, int sx, int yl, int yh); +void (*rt_tlatesubclamp1col)(int hx, int sx, int yl, int yh); +void (*rt_tlaterevsubclamp1col)(int hx, int sx, int yl, int yh); +void (*rt_map4cols)(int sx, int yl, int yh); +void (*rt_add4cols)(int sx, int yl, int yh); +void (*rt_addclamp4cols)(int sx, int yl, int yh); +void (*rt_subclamp4cols)(int sx, int yl, int yh); +void (*rt_revsubclamp4cols)(int sx, int yl, int yh); +void (*rt_tlate4cols)(int sx, int yl, int yh); +void (*rt_tlateadd4cols)(int sx, int yl, int yh); +void (*rt_tlateaddclamp4cols)(int sx, int yl, int yh); +void (*rt_tlatesubclamp4cols)(int sx, int yl, int yh); +void (*rt_tlaterevsubclamp4cols)(int sx, int yl, int yh); +void (*rt_initcols)(BYTE *buffer); +void (*rt_span_coverage)(int x, int start, int stop); // // R_DrawColumn @@ -90,18 +146,27 @@ extern "C" { int dc_pitch=0xABadCafe; // [RH] Distance between rows lighttable_t* dc_colormap; +FSWColormap *dc_fcolormap; +ShadeConstants dc_shade_constants; +fixed_t dc_light; int dc_x; int dc_yl; int dc_yh; fixed_t 
dc_iscale; fixed_t dc_texturefrac; +uint32_t dc_textureheight; int dc_color; // [RH] Color for column filler DWORD dc_srccolor; +uint32_t dc_srccolor_bgra; DWORD *dc_srcblend; // [RH] Source and destination DWORD *dc_destblend; // blending lookups +fixed_t dc_srcalpha; // Alpha value used by dc_srcblend +fixed_t dc_destalpha; // Alpha value used by dc_destblend // first pixel in a column (possibly virtual) const BYTE* dc_source; +const BYTE* dc_source2; +uint32_t dc_texturefracx; BYTE* dc_dest; int dc_count; @@ -109,7 +174,11 @@ int dc_count; DWORD vplce[4]; DWORD vince[4]; BYTE* palookupoffse[4]; +fixed_t palookuplight[4]; const BYTE* bufplce[4]; +const BYTE* bufplce2[4]; +uint32_t buftexturefracx[4]; +uint32_t bufheight[4]; // just for profiling int dccount; @@ -120,10 +189,10 @@ BYTE *dc_translation; BYTE shadetables[NUMCOLORMAPS*16*256]; FDynamicColormap ShadeFakeColormap[16]; BYTE identitymap[256]; +FDynamicColormap identitycolormap; EXTERN_CVAR (Int, r_columnmethod) - void R_InitShadeMaps() { int i,j; @@ -161,6 +230,10 @@ void R_InitShadeMaps() { identitymap[i] = i; } + identitycolormap.Color = ~0u; + identitycolormap.Desaturate = 0; + identitycolormap.Next = NULL; + identitycolormap.Maps = identitymap; } /************************************/ @@ -223,7 +296,7 @@ void R_DrawColumnP_C (void) #endif // [RH] Just fills a column with a color -void R_FillColumnP (void) +void R_FillColumnP_C (void) { int count; BYTE* dest; @@ -247,7 +320,7 @@ void R_FillColumnP (void) } } -void R_FillAddColumn (void) +void R_FillAddColumn_C (void) { int count; BYTE *dest; @@ -271,10 +344,9 @@ void R_FillAddColumn (void) *dest = RGB32k.All[bg & (bg>>15)]; dest += pitch; } while (--count); - } -void R_FillAddClampColumn (void) +void R_FillAddClampColumn_C (void) { int count; BYTE *dest; @@ -304,10 +376,9 @@ void R_FillAddClampColumn (void) *dest = RGB32k.All[a & (a>>15)]; dest += pitch; } while (--count); - } -void R_FillSubClampColumn (void) +void R_FillSubClampColumn_C (void) { int 
count; BYTE *dest; @@ -336,10 +407,9 @@ void R_FillSubClampColumn (void) *dest = RGB32k.All[a & (a>>15)]; dest += pitch; } while (--count); - } -void R_FillRevSubClampColumn (void) +void R_FillRevSubClampColumn_C (void) { int count; BYTE *dest; @@ -368,13 +438,11 @@ void R_FillRevSubClampColumn (void) *dest = RGB32k.All[a & (a>>15)]; dest += pitch; } while (--count); - } // // Spectre/Invisibility. // -#define FUZZTABLE 50 extern "C" { @@ -647,8 +715,8 @@ void R_DrawTlatedAddColumnP_C (void) fg = fg2rgb[fg]; bg = bg2rgb[bg]; - fg = (fg+bg) | 0x1f07c1f; - *dest = RGB32k.All[fg & (fg>>15)]; + fg = (fg + bg) | 0x1f07c1f; + *dest = RGB32k.All[fg & (fg >> 15)]; dest += pitch; frac += fracstep; } while (--count); @@ -937,8 +1005,6 @@ void R_DrawRevSubClampTranslatedColumnP_C () } } - - // // R_DrawSpan // With DOOM style restrictions on view orientation, @@ -966,7 +1032,10 @@ int ds_y; int ds_x1; int ds_x2; +FSWColormap* ds_fcolormap; lighttable_t* ds_colormap; +ShadeConstants ds_shade_constants; +dsfixed_t ds_light; dsfixed_t ds_xfrac; dsfixed_t ds_yfrac; @@ -977,6 +1046,7 @@ int ds_ybits; // start of a floor/ceiling tile image const BYTE* ds_source; +bool ds_source_mipmapped; // just for profiling int dscount; @@ -997,13 +1067,14 @@ extern "C" BYTE *ds_curcolormap, *ds_cursource, *ds_curtiltedsource; // //========================================================================== -void R_SetSpanSource(const BYTE *pixels) +void R_SetSpanSource(FTexture *tex) { - ds_source = pixels; + ds_source = r_swtruecolor ? 
(const BYTE*)tex->GetPixelsBgra() : tex->GetPixels(); + ds_source_mipmapped = tex->Mipmapped(); #ifdef X86_ASM - if (ds_cursource != ds_source) + if (!r_swtruecolor && ds_cursource != ds_source) { - R_SetSpanSource_ASM(pixels); + R_SetSpanSource_ASM(ds_source); } #endif } @@ -1016,11 +1087,11 @@ void R_SetSpanSource(const BYTE *pixels) // //========================================================================== -void R_SetSpanColormap(BYTE *colormap) +void R_SetSpanColormap(FDynamicColormap *colormap, int shade) { - ds_colormap = colormap; + R_SetDSColorMapLight(colormap, 0, shade); #ifdef X86_ASM - if (ds_colormap != ds_curcolormap) + if (!r_swtruecolor && ds_colormap != ds_curcolormap) { R_SetSpanColormap_ASM (ds_colormap); } @@ -1049,7 +1120,8 @@ void R_SetupSpanBits(FTexture *tex) ds_ybits--; } #ifdef X86_ASM - R_SetSpanSize_ASM (ds_xbits, ds_ybits); + if (!r_swtruecolor) + R_SetSpanSize_ASM (ds_xbits, ds_ybits); #endif } @@ -1090,6 +1162,7 @@ void R_DrawSpanP_C (void) if (ds_xbits == 6 && ds_ybits == 6) { // 64x64 is the most common case by far, so special case it. + do { // Current texture index in u,v. 
@@ -1471,11 +1544,12 @@ void R_DrawSpanMaskedAddClampP_C (void) } // [RH] Just fill a span with a color -void R_FillSpan (void) +void R_FillSpan_C (void) { memset (ylookup[ds_y] + ds_x1 + dc_destorg, ds_color, ds_x2 - ds_x1 + 1); } + // Draw a voxel slab // // "Build Engine & Tools" Copyright (c) 1993-1997 Ken Silverman @@ -1572,17 +1646,19 @@ extern "C" void R_DrawSlabC(int dx, fixed_t v, int dy, fixed_t vi, const BYTE *v // wallscan stuff, in C +int vlinebits; +int mvlinebits; + #ifndef X86_ASM static DWORD vlinec1 (); -static int vlinebits; DWORD (*dovline1)() = vlinec1; DWORD (*doprevline1)() = vlinec1; #ifdef X64_ASM extern "C" void vlinetallasm4(); -#define dovline4 vlinetallasm4 extern "C" void setupvlinetallasm (int); +void (*dovline4)() = vlinetallasm4; #else static void vlinec4 (); void (*dovline4)() = vlinec4; @@ -1590,7 +1666,6 @@ void (*dovline4)() = vlinec4; static DWORD mvlinec1(); static void mvlinec4(); -static int mvlinebits; DWORD (*domvline1)() = mvlinec1; void (*domvline4)() = mvlinec4; @@ -1624,6 +1699,12 @@ void (*domvline4)() = mvlineasm4; void setupvline (int fracbits) { + if (r_swtruecolor) + { + vlinebits = fracbits; + return; + } + #ifdef X86_ASM if (CPU.Family <= 5) { @@ -1679,7 +1760,9 @@ DWORD vlinec1 () return frac; } +#endif +#if !defined(X86_ASM) void vlinec4 () { BYTE *dest = dc_dest; @@ -1700,13 +1783,20 @@ void vlinec4 () void setupmvline (int fracbits) { + if (!r_swtruecolor) + { #if defined(X86_ASM) - setupmvlineasm (fracbits); - domvline1 = mvlineasm1; - domvline4 = mvlineasm4; + setupmvlineasm(fracbits); + domvline1 = mvlineasm1; + domvline4 = mvlineasm4; #else - mvlinebits = fracbits; + mvlinebits = fracbits; #endif + } + else + { + mvlinebits = fracbits; + } } #if !defined(X86_ASM) @@ -1788,7 +1878,7 @@ static void R_DrawFogBoundaryLine (int y, int x) } while (++x <= x2); } -void R_DrawFogBoundary (int x1, int x2, short *uclip, short *dclip) +void R_DrawFogBoundary_C (int x1, int x2, short *uclip, short *dclip) { // This 
is essentially the same as R_MapVisPlane but with an extra step // to create new horizontal spans whenever the light changes enough that @@ -1808,7 +1898,7 @@ void R_DrawFogBoundary (int x1, int x2, short *uclip, short *dclip) clearbufshort (spanend+t2, b2-t2, x); } - dc_colormap = basecolormapdata + (rcolormap << COLORMAPSHIFT); + R_SetColorMapLight(basecolormap, (float)light, wallshade); for (--x; x >= x1; --x) { @@ -1833,7 +1923,7 @@ void R_DrawFogBoundary (int x1, int x2, short *uclip, short *dclip) clearbufshort (spanend+t2, b2-t2, x); } rcolormap = lcolormap; - dc_colormap = basecolormapdata + (lcolormap << COLORMAPSHIFT); + R_SetColorMapLight(basecolormap, (float)light, wallshade); } else { @@ -1884,7 +1974,7 @@ void setuptmvline (int bits) tmvlinebits = bits; } -fixed_t tmvline1_add () +fixed_t tmvline1_add_C () { DWORD fracstep = dc_iscale; DWORD frac = dc_texturefrac; @@ -1915,7 +2005,7 @@ fixed_t tmvline1_add () return frac; } -void tmvline4_add () +void tmvline4_add_C () { BYTE *dest = dc_dest; int count = dc_count; @@ -1942,7 +2032,7 @@ void tmvline4_add () } while (--count); } -fixed_t tmvline1_addclamp () +fixed_t tmvline1_addclamp_C () { DWORD fracstep = dc_iscale; DWORD frac = dc_texturefrac; @@ -1978,7 +2068,7 @@ fixed_t tmvline1_addclamp () return frac; } -void tmvline4_addclamp () +void tmvline4_addclamp_C () { BYTE *dest = dc_dest; int count = dc_count; @@ -2010,7 +2100,7 @@ void tmvline4_addclamp () } while (--count); } -fixed_t tmvline1_subclamp () +fixed_t tmvline1_subclamp_C () { DWORD fracstep = dc_iscale; DWORD frac = dc_texturefrac; @@ -2045,7 +2135,7 @@ fixed_t tmvline1_subclamp () return frac; } -void tmvline4_subclamp () +void tmvline4_subclamp_C () { BYTE *dest = dc_dest; int count = dc_count; @@ -2076,7 +2166,7 @@ void tmvline4_subclamp () } while (--count); } -fixed_t tmvline1_revsubclamp () +fixed_t tmvline1_revsubclamp_C () { DWORD fracstep = dc_iscale; DWORD frac = dc_texturefrac; @@ -2111,7 +2201,7 @@ fixed_t 
tmvline1_revsubclamp () return frac; } -void tmvline4_revsubclamp () +void tmvline4_revsubclamp_C () { BYTE *dest = dc_dest; int count = dc_count; @@ -2142,7 +2232,6 @@ void tmvline4_revsubclamp () } while (--count); } - //========================================================================== // // R_GetColumn @@ -2159,43 +2248,242 @@ const BYTE *R_GetColumn (FTexture *tex, int col) { col = width + (col % width); } - return tex->GetColumn (col, NULL); -} + if (r_swtruecolor) + return (const BYTE *)tex->GetColumnBgra(col, NULL); + else + return tex->GetColumn(col, NULL); +} // [RH] Initialize the column drawer pointers void R_InitColumnDrawers () { -#ifdef X86_ASM - R_DrawColumn = R_DrawColumnP_ASM; - R_DrawColumnHoriz = R_DrawColumnHorizP_ASM; - R_DrawFuzzColumn = R_DrawFuzzColumnP_ASM; - R_DrawTranslatedColumn = R_DrawTranslatedColumnP_C; - R_DrawShadedColumn = R_DrawShadedColumnP_C; - R_DrawSpan = R_DrawSpanP_ASM; - R_DrawSpanMasked = R_DrawSpanMaskedP_ASM; - if (CPU.Family <= 5) + // Save a copy when switching to true color mode as the assembly palette drawers might change them + static bool pointers_saved = false; + static DWORD(*dovline1_saved)(); + static DWORD(*doprevline1_saved)(); + static DWORD(*domvline1_saved)(); + static void(*dovline4_saved)(); + static void(*domvline4_saved)(); + + if (r_swtruecolor) { - rt_map4cols = rt_map4cols_asm2; + if (!pointers_saved) + { + pointers_saved = true; + dovline1_saved = dovline1; + doprevline1_saved = doprevline1; + domvline1_saved = domvline1; + dovline4_saved = dovline4; + domvline4_saved = domvline4; + } + + R_DrawColumnHoriz = R_DrawColumnHoriz_rgba; + R_DrawColumn = R_DrawColumn_rgba; + R_DrawFuzzColumn = R_DrawFuzzColumn_rgba; + R_DrawTranslatedColumn = R_DrawTranslatedColumn_rgba; + R_DrawShadedColumn = R_DrawShadedColumn_rgba; + R_DrawSpanMasked = R_DrawSpanMasked_rgba; + R_DrawSpan = R_DrawSpan_rgba; + + R_DrawSpanTranslucent = R_DrawSpanTranslucent_rgba; + R_DrawSpanMaskedTranslucent = 
R_DrawSpanMaskedTranslucent_rgba; + R_DrawSpanAddClamp = R_DrawSpanAddClamp_rgba; + R_DrawSpanMaskedAddClamp = R_DrawSpanMaskedAddClamp_rgba; + R_FillColumn = R_FillColumn_rgba; + R_FillAddColumn = R_FillAddColumn_rgba; + R_FillAddClampColumn = R_FillAddClampColumn_rgba; + R_FillSubClampColumn = R_FillSubClampColumn_rgba; + R_FillRevSubClampColumn = R_FillRevSubClampColumn_rgba; + R_DrawAddColumn = R_DrawAddColumn_rgba; + R_DrawTlatedAddColumn = R_DrawTlatedAddColumn_rgba; + R_DrawAddClampColumn = R_DrawAddClampColumn_rgba; + R_DrawAddClampTranslatedColumn = R_DrawAddClampTranslatedColumn_rgba; + R_DrawSubClampColumn = R_DrawSubClampColumn_rgba; + R_DrawSubClampTranslatedColumn = R_DrawSubClampTranslatedColumn_rgba; + R_DrawRevSubClampColumn = R_DrawRevSubClampColumn_rgba; + R_DrawRevSubClampTranslatedColumn = R_DrawRevSubClampTranslatedColumn_rgba; + R_FillSpan = R_FillSpan_rgba; + R_DrawFogBoundary = R_DrawFogBoundary_rgba; + R_FillColumnHoriz = R_FillColumnHoriz_rgba; + + R_DrawFogBoundary = R_DrawFogBoundary_rgba; + R_MapTiltedPlane = R_MapTiltedPlane_rgba; + R_MapColoredPlane = R_MapColoredPlane_rgba; + R_DrawParticle = R_DrawParticle_rgba; + + R_SetupDrawSlab = R_SetupDrawSlab_rgba; + R_DrawSlab = R_DrawSlab_rgba; + + tmvline1_add = tmvline1_add_rgba; + tmvline4_add = tmvline4_add_rgba; + tmvline1_addclamp = tmvline1_addclamp_rgba; + tmvline4_addclamp = tmvline4_addclamp_rgba; + tmvline1_subclamp = tmvline1_subclamp_rgba; + tmvline4_subclamp = tmvline4_subclamp_rgba; + tmvline1_revsubclamp = tmvline1_revsubclamp_rgba; + tmvline4_revsubclamp = tmvline4_revsubclamp_rgba; + + rt_copy1col = rt_copy1col_rgba; + rt_copy4cols = rt_copy4cols_rgba; + rt_map1col = rt_map1col_rgba; + rt_map4cols = rt_map4cols_rgba; + rt_shaded1col = rt_shaded1col_rgba; + rt_shaded4cols = rt_shaded4cols_rgba; + rt_add1col = rt_add1col_rgba; + rt_add4cols = rt_add4cols_rgba; + rt_addclamp1col = rt_addclamp1col_rgba; + rt_addclamp4cols = rt_addclamp4cols_rgba; + rt_subclamp1col = 
rt_subclamp1col_rgba; + rt_revsubclamp1col = rt_revsubclamp1col_rgba; + rt_tlate1col = rt_tlate1col_rgba; + rt_tlateadd1col = rt_tlateadd1col_rgba; + rt_tlateaddclamp1col = rt_tlateaddclamp1col_rgba; + rt_tlatesubclamp1col = rt_tlatesubclamp1col_rgba; + rt_tlaterevsubclamp1col = rt_tlaterevsubclamp1col_rgba; + rt_subclamp4cols = rt_subclamp4cols_rgba; + rt_revsubclamp4cols = rt_revsubclamp4cols_rgba; + rt_tlate4cols = rt_tlate4cols_rgba; + rt_tlateadd4cols = rt_tlateadd4cols_rgba; + rt_tlateaddclamp4cols = rt_tlateaddclamp4cols_rgba; + rt_tlatesubclamp4cols = rt_tlatesubclamp4cols_rgba; + rt_tlaterevsubclamp4cols = rt_tlaterevsubclamp4cols_rgba; + rt_initcols = rt_initcols_rgba; + rt_span_coverage = rt_span_coverage_rgba; + + dovline1 = vlinec1_rgba; + doprevline1 = vlinec1_rgba; + domvline1 = mvlinec1_rgba; + + dovline4 = vlinec4_rgba; + domvline4 = mvlinec4_rgba; } else { - rt_map4cols = rt_map4cols_asm1; - } +#ifdef X86_ASM + R_DrawColumn = R_DrawColumnP_ASM; + R_DrawColumnHoriz = R_DrawColumnHorizP_ASM; + R_DrawFuzzColumn = R_DrawFuzzColumnP_ASM; + R_DrawTranslatedColumn = R_DrawTranslatedColumnP_C; + R_DrawShadedColumn = R_DrawShadedColumnP_C; + R_DrawSpan = R_DrawSpanP_ASM; + R_DrawSpanMasked = R_DrawSpanMaskedP_ASM; + if (CPU.Family <= 5) + { + rt_map4cols = rt_map4cols_asm2; + } + else + { + rt_map4cols = rt_map4cols_asm1; + } #else - R_DrawColumnHoriz = R_DrawColumnHorizP_C; - R_DrawColumn = R_DrawColumnP_C; - R_DrawFuzzColumn = R_DrawFuzzColumnP_C; - R_DrawTranslatedColumn = R_DrawTranslatedColumnP_C; - R_DrawShadedColumn = R_DrawShadedColumnP_C; - R_DrawSpan = R_DrawSpanP_C; - R_DrawSpanMasked = R_DrawSpanMaskedP_C; - rt_map4cols = rt_map4cols_c; + R_DrawColumnHoriz = R_DrawColumnHorizP_C; + R_DrawColumn = R_DrawColumnP_C; + R_DrawFuzzColumn = R_DrawFuzzColumnP_C; + R_DrawTranslatedColumn = R_DrawTranslatedColumnP_C; + R_DrawShadedColumn = R_DrawShadedColumnP_C; + R_DrawSpan = R_DrawSpanP_C; + R_DrawSpanMasked = R_DrawSpanMaskedP_C; + rt_map4cols = 
rt_map4cols_c; #endif - R_DrawSpanTranslucent = R_DrawSpanTranslucentP_C; - R_DrawSpanMaskedTranslucent = R_DrawSpanMaskedTranslucentP_C; - R_DrawSpanAddClamp = R_DrawSpanAddClampP_C; - R_DrawSpanMaskedAddClamp = R_DrawSpanMaskedAddClampP_C; + R_DrawSpanTranslucent = R_DrawSpanTranslucentP_C; + R_DrawSpanMaskedTranslucent = R_DrawSpanMaskedTranslucentP_C; + R_DrawSpanAddClamp = R_DrawSpanAddClampP_C; + R_DrawSpanMaskedAddClamp = R_DrawSpanMaskedAddClampP_C; + R_FillColumn = R_FillColumnP_C; + R_FillAddColumn = R_FillAddColumn_C; + R_FillAddClampColumn = R_FillAddClampColumn_C; + R_FillSubClampColumn = R_FillSubClampColumn_C; + R_FillRevSubClampColumn = R_FillRevSubClampColumn_C; + R_DrawAddColumn = R_DrawAddColumnP_C; + R_DrawTlatedAddColumn = R_DrawTlatedAddColumnP_C; + R_DrawAddClampColumn = R_DrawAddClampColumnP_C; + R_DrawAddClampTranslatedColumn = R_DrawAddClampTranslatedColumnP_C; + R_DrawSubClampColumn = R_DrawSubClampColumnP_C; + R_DrawSubClampTranslatedColumn = R_DrawSubClampTranslatedColumnP_C; + R_DrawRevSubClampColumn = R_DrawRevSubClampColumnP_C; + R_DrawRevSubClampTranslatedColumn = R_DrawRevSubClampTranslatedColumnP_C; + R_FillSpan = R_FillSpan_C; + R_DrawFogBoundary = R_DrawFogBoundary_C; + R_FillColumnHoriz = R_FillColumnHorizP_C; + + R_DrawFogBoundary = R_DrawFogBoundary_C; + R_MapTiltedPlane = R_MapTiltedPlane_C; + R_MapColoredPlane = R_MapColoredPlane_C; + R_DrawParticle = R_DrawParticle_C; + +#ifdef X86_ASM + R_SetupDrawSlab = [](FSWColormap *colormap, float light, int shade) { R_SetupDrawSlabA(colormap->Maps + (GETPALOOKUP(light, shade) << COLORMAPSHIFT)); }; + R_DrawSlab = R_DrawSlabA; +#else + R_SetupDrawSlab = [](FSWColormap *colormap, float light, int shade) { R_SetupDrawSlabC(colormap->Maps + (GETPALOOKUP(light, shade) << COLORMAPSHIFT)); }; + R_DrawSlab = R_DrawSlabC; +#endif + + tmvline1_add = tmvline1_add_C; + tmvline4_add = tmvline4_add_C; + tmvline1_addclamp = tmvline1_addclamp_C; + tmvline4_addclamp = tmvline4_addclamp_C; + 
tmvline1_subclamp = tmvline1_subclamp_C; + tmvline4_subclamp = tmvline4_subclamp_C; + tmvline1_revsubclamp = tmvline1_revsubclamp_C; + tmvline4_revsubclamp = tmvline4_revsubclamp_C; + +#ifdef X86_ASM + rt_copy1col = rt_copy1col_asm; + rt_copy4cols = rt_copy4cols_asm; + rt_map1col = rt_map1col_asm; + rt_shaded4cols = rt_shaded4cols_asm; + rt_add4cols = rt_add4cols_asm; + rt_addclamp4cols = rt_addclamp4cols_asm; +#else + rt_copy1col = rt_copy1col_c; + rt_copy4cols = rt_copy4cols_c; + rt_map1col = rt_map1col_c; + rt_shaded4cols = rt_shaded4cols_c; + rt_add4cols = rt_add4cols_c; + rt_addclamp4cols = rt_addclamp4cols_c; +#endif + rt_shaded1col = rt_shaded1col_c; + rt_add1col = rt_add1col_c; + rt_addclamp1col = rt_addclamp1col_c; + rt_subclamp1col = rt_subclamp1col_c; + rt_revsubclamp1col = rt_revsubclamp1col_c; + rt_tlate1col = rt_tlate1col_c; + rt_tlateadd1col = rt_tlateadd1col_c; + rt_tlateaddclamp1col = rt_tlateaddclamp1col_c; + rt_tlatesubclamp1col = rt_tlatesubclamp1col_c; + rt_tlaterevsubclamp1col = rt_tlaterevsubclamp1col_c; + rt_subclamp4cols = rt_subclamp4cols_c; + rt_revsubclamp4cols = rt_revsubclamp4cols_c; + rt_tlate4cols = rt_tlate4cols_c; + rt_tlateadd4cols = rt_tlateadd4cols_c; + rt_tlateaddclamp4cols = rt_tlateaddclamp4cols_c; + rt_tlatesubclamp4cols = rt_tlatesubclamp4cols_c; + rt_tlaterevsubclamp4cols = rt_tlaterevsubclamp4cols_c; + rt_initcols = rt_initcols_pal; + rt_span_coverage = rt_span_coverage_pal; + + if (pointers_saved) + { + pointers_saved = false; + dovline1 = dovline1_saved; + doprevline1 = doprevline1_saved; + domvline1 = domvline1_saved; + dovline4 = dovline4_saved; + domvline4 = domvline4_saved; + } + } + + colfunc = basecolfunc = R_DrawColumn; + fuzzcolfunc = R_DrawFuzzColumn; + transcolfunc = R_DrawTranslatedColumn; + spanfunc = R_DrawSpan; + + // [RH] Horizontal column drawers + hcolfunc_pre = R_DrawColumnHoriz; + hcolfunc_post1 = rt_map1col; + hcolfunc_post4 = rt_map4cols; } // [RH] Choose column drawers in a single place @@ -2213,7 
+2501,7 @@ static bool R_SetBlendFunc (int op, fixed_t fglevel, fixed_t bglevel, int flags) { if (flags & STYLEF_ColorIsFixed) { - colfunc = R_FillColumnP; + colfunc = R_FillColumn; hcolfunc_post1 = rt_copy1col; hcolfunc_post4 = rt_copy4cols; } @@ -2235,16 +2523,22 @@ static bool R_SetBlendFunc (int op, fixed_t fglevel, fixed_t bglevel, int flags) { dc_srcblend = Col2RGB8_Inverse[fglevel>>10]; dc_destblend = Col2RGB8_LessPrecision[bglevel>>10]; + dc_srcalpha = fglevel; + dc_destalpha = bglevel; } else if (op == STYLEOP_Add && fglevel + bglevel <= FRACUNIT) { dc_srcblend = Col2RGB8[fglevel>>10]; dc_destblend = Col2RGB8[bglevel>>10]; + dc_srcalpha = fglevel; + dc_destalpha = bglevel; } else { dc_srcblend = Col2RGB8_LessPrecision[fglevel>>10]; dc_destblend = Col2RGB8_LessPrecision[bglevel>>10]; + dc_srcalpha = fglevel; + dc_destalpha = bglevel; } switch (op) { @@ -2263,13 +2557,13 @@ static bool R_SetBlendFunc (int op, fixed_t fglevel, fixed_t bglevel, int flags) } else if (dc_translation == NULL) { - colfunc = R_DrawAddColumnP_C; + colfunc = R_DrawAddColumn; hcolfunc_post1 = rt_add1col; hcolfunc_post4 = rt_add4cols; } else { - colfunc = R_DrawTlatedAddColumnP_C; + colfunc = R_DrawTlatedAddColumn; hcolfunc_post1 = rt_tlateadd1col; hcolfunc_post4 = rt_tlateadd4cols; } @@ -2284,13 +2578,13 @@ static bool R_SetBlendFunc (int op, fixed_t fglevel, fixed_t bglevel, int flags) } else if (dc_translation == NULL) { - colfunc = R_DrawAddClampColumnP_C; + colfunc = R_DrawAddClampColumn; hcolfunc_post1 = rt_addclamp1col; hcolfunc_post4 = rt_addclamp4cols; } else { - colfunc = R_DrawAddClampTranslatedColumnP_C; + colfunc = R_DrawAddClampTranslatedColumn; hcolfunc_post1 = rt_tlateaddclamp1col; hcolfunc_post4 = rt_tlateaddclamp4cols; } @@ -2306,13 +2600,13 @@ static bool R_SetBlendFunc (int op, fixed_t fglevel, fixed_t bglevel, int flags) } else if (dc_translation == NULL) { - colfunc = R_DrawSubClampColumnP_C; + colfunc = R_DrawSubClampColumn; hcolfunc_post1 = rt_subclamp1col; 
hcolfunc_post4 = rt_subclamp4cols; } else { - colfunc = R_DrawSubClampTranslatedColumnP_C; + colfunc = R_DrawSubClampTranslatedColumn; hcolfunc_post1 = rt_tlatesubclamp1col; hcolfunc_post4 = rt_tlatesubclamp4cols; } @@ -2331,13 +2625,13 @@ static bool R_SetBlendFunc (int op, fixed_t fglevel, fixed_t bglevel, int flags) } else if (dc_translation == NULL) { - colfunc = R_DrawRevSubClampColumnP_C; + colfunc = R_DrawRevSubClampColumn; hcolfunc_post1 = rt_revsubclamp1col; hcolfunc_post4 = rt_revsubclamp4cols; } else { - colfunc = R_DrawRevSubClampTranslatedColumnP_C; + colfunc = R_DrawRevSubClampTranslatedColumn; hcolfunc_post1 = rt_tlaterevsubclamp1col; hcolfunc_post4 = rt_tlaterevsubclamp4cols; } @@ -2412,11 +2706,15 @@ ESPSResult R_SetPatchStyle (FRenderStyle style, fixed_t alpha, int translation, colfunc = R_DrawShadedColumn; hcolfunc_post1 = rt_shaded1col; hcolfunc_post4 = rt_shaded4cols; - dc_color = fixedcolormap ? fixedcolormap[APART(color)] : basecolormap->Maps[APART(color)]; - dc_colormap = (basecolormap = &ShadeFakeColormap[16-alpha])->Maps; + dc_color = fixedcolormap ? fixedcolormap->Maps[APART(color)] : basecolormap->Maps[APART(color)]; + basecolormap = &ShadeFakeColormap[16-alpha]; if (fixedlightlev >= 0 && fixedcolormap == NULL) { - dc_colormap += fixedlightlev; + R_SetColorMapLight(basecolormap, 0, FIXEDLIGHT2SHADE(fixedlightlev)); + } + else + { + R_SetColorMapLight(basecolormap, 0, 0); } return r_columnmethod ? DoDraw1 : DoDraw0; } @@ -2426,10 +2724,10 @@ ESPSResult R_SetPatchStyle (FRenderStyle style, fixed_t alpha, int translation, if (style.Flags & STYLEF_ColorIsFixed) { - int x = fglevel >> 10; - int r = RPART(color); - int g = GPART(color); - int b = BPART(color); + uint32_t x = fglevel >> 10; + uint32_t r = RPART(color); + uint32_t g = GPART(color); + uint32_t b = BPART(color); // dc_color is used by the rt_* routines. It is indexed into dc_srcblend. 
dc_color = RGB32k.RGB[r>>3][g>>3][b>>3]; if (style.Flags & STYLEF_InvertSource) @@ -2438,11 +2736,13 @@ ESPSResult R_SetPatchStyle (FRenderStyle style, fixed_t alpha, int translation, g = 255 - g; b = 255 - b; } + uint32_t alpha = clamp(fglevel >> (FRACBITS - 8), 0, 255); + dc_srccolor_bgra = (alpha << 24) | (r << 16) | (g << 8) | b; // dc_srccolor is used by the R_Fill* routines. It is premultiplied // with the alpha. dc_srccolor = ((((r*x)>>4)<<20) | ((g*x)>>4) | ((((b)*x)>>4)<<10)) & 0x3feffbff; - hcolfunc_pre = R_FillColumnHorizP; - dc_colormap = identitymap; + hcolfunc_pre = R_FillColumnHoriz; + R_SetColorMapLight(&identitycolormap, 0, 0); } if (!R_SetBlendFunc (style.BlendOp, fglevel, bglevel, style.Flags)) @@ -2459,25 +2759,25 @@ void R_FinishSetPatchStyle () bool R_GetTransMaskDrawers (fixed_t (**tmvline1)(), void (**tmvline4)()) { - if (colfunc == R_DrawAddColumnP_C) + if (colfunc == R_DrawAddColumn) { *tmvline1 = tmvline1_add; *tmvline4 = tmvline4_add; return true; } - if (colfunc == R_DrawAddClampColumnP_C) + if (colfunc == R_DrawAddClampColumn) { *tmvline1 = tmvline1_addclamp; *tmvline4 = tmvline4_addclamp; return true; } - if (colfunc == R_DrawSubClampColumnP_C) + if (colfunc == R_DrawSubClampColumn) { *tmvline1 = tmvline1_subclamp; *tmvline4 = tmvline4_subclamp; return true; } - if (colfunc == R_DrawRevSubClampColumnP_C) + if (colfunc == R_DrawRevSubClampColumn) { *tmvline1 = tmvline1_revsubclamp; *tmvline4 = tmvline4_revsubclamp; @@ -2486,3 +2786,70 @@ bool R_GetTransMaskDrawers (fixed_t (**tmvline1)(), void (**tmvline4)()) return false; } +void R_SetTranslationMap(lighttable_t *translation) +{ + dc_fcolormap = nullptr; + dc_colormap = translation; + if (r_swtruecolor) + { + dc_shade_constants.light_red = 256; + dc_shade_constants.light_green = 256; + dc_shade_constants.light_blue = 256; + dc_shade_constants.light_alpha = 256; + dc_shade_constants.fade_red = 0; + dc_shade_constants.fade_green = 0; + dc_shade_constants.fade_blue = 0; + 
dc_shade_constants.fade_alpha = 256; + dc_shade_constants.desaturate = 0; + dc_shade_constants.simple_shade = true; + dc_light = 0; + } +} + +void R_SetColorMapLight(FSWColormap *base_colormap, float light, int shade) +{ + dc_fcolormap = base_colormap; + if (r_swtruecolor) + { + dc_shade_constants.light_red = dc_fcolormap->Color.r * 256 / 255; + dc_shade_constants.light_green = dc_fcolormap->Color.g * 256 / 255; + dc_shade_constants.light_blue = dc_fcolormap->Color.b * 256 / 255; + dc_shade_constants.light_alpha = dc_fcolormap->Color.a * 256 / 255; + dc_shade_constants.fade_red = dc_fcolormap->Fade.r; + dc_shade_constants.fade_green = dc_fcolormap->Fade.g; + dc_shade_constants.fade_blue = dc_fcolormap->Fade.b; + dc_shade_constants.fade_alpha = dc_fcolormap->Fade.a; + dc_shade_constants.desaturate = MIN(abs(dc_fcolormap->Desaturate), 255) * 255 / 256; + dc_shade_constants.simple_shade = (dc_fcolormap->Color.d == 0x00ffffff && dc_fcolormap->Fade.d == 0x00000000 && dc_fcolormap->Desaturate == 0); + dc_colormap = base_colormap->Maps; + dc_light = LIGHTSCALE(light, shade); + } + else + { + dc_colormap = base_colormap->Maps + (GETPALOOKUP(light, shade) << COLORMAPSHIFT); + } +} + +void R_SetDSColorMapLight(FSWColormap *base_colormap, float light, int shade) +{ + ds_fcolormap = base_colormap; + if (r_swtruecolor) + { + ds_shade_constants.light_red = ds_fcolormap->Color.r * 256 / 255; + ds_shade_constants.light_green = ds_fcolormap->Color.g * 256 / 255; + ds_shade_constants.light_blue = ds_fcolormap->Color.b * 256 / 255; + ds_shade_constants.light_alpha = ds_fcolormap->Color.a * 256 / 255; + ds_shade_constants.fade_red = ds_fcolormap->Fade.r; + ds_shade_constants.fade_green = ds_fcolormap->Fade.g; + ds_shade_constants.fade_blue = ds_fcolormap->Fade.b; + ds_shade_constants.fade_alpha = ds_fcolormap->Fade.a; + ds_shade_constants.desaturate = MIN(abs(ds_fcolormap->Desaturate), 255) * 255 / 256; + ds_shade_constants.simple_shade = (ds_fcolormap->Color.d == 0x00ffffff && 
ds_fcolormap->Fade.d == 0x00000000 && ds_fcolormap->Desaturate == 0); + ds_colormap = base_colormap->Maps; + ds_light = LIGHTSCALE(light, shade); + } + else + { + ds_colormap = base_colormap->Maps + (GETPALOOKUP(light, shade) << COLORMAPSHIFT); + } +} diff --git a/src/r_draw.h b/src/r_draw.h index cb2f68f33..204f2a493 100644 --- a/src/r_draw.h +++ b/src/r_draw.h @@ -25,24 +25,55 @@ #include "r_defs.h" +// Spectre/Invisibility. +#define FUZZTABLE 50 +extern "C" int fuzzoffset[FUZZTABLE + 1]; // [RH] +1 for the assembly routine +extern "C" int fuzzpos; +extern "C" int fuzzviewheight; + +struct FSWColormap; + +struct ShadeConstants +{ + uint16_t light_alpha; + uint16_t light_red; + uint16_t light_green; + uint16_t light_blue; + uint16_t fade_alpha; + uint16_t fade_red; + uint16_t fade_green; + uint16_t fade_blue; + uint16_t desaturate; + bool simple_shade; +}; + extern "C" int ylookup[MAXHEIGHT]; extern "C" int dc_pitch; // [RH] Distance between rows extern "C" lighttable_t*dc_colormap; +extern "C" FSWColormap *dc_fcolormap; +extern "C" ShadeConstants dc_shade_constants; +extern "C" fixed_t dc_light; extern "C" int dc_x; extern "C" int dc_yl; extern "C" int dc_yh; extern "C" fixed_t dc_iscale; extern double dc_texturemid; extern "C" fixed_t dc_texturefrac; +extern "C" uint32_t dc_textureheight; extern "C" int dc_color; // [RH] For flat colors (no texturing) extern "C" DWORD dc_srccolor; +extern "C" uint32_t dc_srccolor_bgra; extern "C" DWORD *dc_srcblend; extern "C" DWORD *dc_destblend; +extern "C" fixed_t dc_srcalpha; +extern "C" fixed_t dc_destalpha; // first pixel in a column extern "C" const BYTE* dc_source; +extern "C" const BYTE* dc_source2; +extern "C" uint32_t dc_texturefracx; extern "C" BYTE *dc_dest, *dc_destorg; extern "C" int dc_count; @@ -50,7 +81,11 @@ extern "C" int dc_count; extern "C" DWORD vplce[4]; extern "C" DWORD vince[4]; extern "C" BYTE* palookupoffse[4]; +extern "C" fixed_t palookuplight[4]; extern "C" const BYTE* bufplce[4]; +extern "C" const 
BYTE* bufplce2[4]; +extern "C" uint32_t buftexturefracx[4]; +extern "C" uint32_t bufheight[4]; // [RH] Temporary buffer for column drawing extern "C" BYTE *dc_temp; @@ -58,7 +93,6 @@ extern "C" unsigned int dc_tspans[4][MAXHEIGHT]; extern "C" unsigned int *dc_ctspan[4]; extern "C" unsigned int horizspans[4]; - // [RH] Pointers to the different column and span drawers... // The span blitting interface. @@ -67,12 +101,7 @@ extern void (*R_DrawColumn)(void); extern DWORD (*dovline1) (); extern DWORD (*doprevline1) (); -#ifdef X64_ASM -#define dovline4 vlinetallasm4 -extern "C" void vlinetallasm4(); -#else extern void (*dovline4) (); -#endif extern void setupvline (int); extern DWORD (*domvline1) (); @@ -94,8 +123,8 @@ extern void (*R_DrawTranslatedColumn)(void); // Span drawing for rows, floor/ceiling. No Spectre effect needed. extern void (*R_DrawSpan)(void); void R_SetupSpanBits(FTexture *tex); -void R_SetSpanColormap(BYTE *colormap); -void R_SetSpanSource(const BYTE *pixels); +void R_SetSpanColormap(FDynamicColormap *colormap, int shade); +void R_SetSpanSource(FTexture *tex); // Span drawing for masked textures. 
extern void (*R_DrawSpanMasked)(void); @@ -125,33 +154,33 @@ extern "C" void rt_copy1col_c (int hx, int sx, int yl, int yh); void rt_copy4cols_c (int sx, int yl, int yh); -void rt_shaded1col (int hx, int sx, int yl, int yh); +void rt_shaded1col_c (int hx, int sx, int yl, int yh); void rt_shaded4cols_c (int sx, int yl, int yh); void rt_shaded4cols_asm (int sx, int yl, int yh); void rt_map1col_c (int hx, int sx, int yl, int yh); -void rt_add1col (int hx, int sx, int yl, int yh); -void rt_addclamp1col (int hx, int sx, int yl, int yh); -void rt_subclamp1col (int hx, int sx, int yl, int yh); -void rt_revsubclamp1col (int hx, int sx, int yl, int yh); +void rt_add1col_c (int hx, int sx, int yl, int yh); +void rt_addclamp1col_c (int hx, int sx, int yl, int yh); +void rt_subclamp1col_c (int hx, int sx, int yl, int yh); +void rt_revsubclamp1col_c (int hx, int sx, int yl, int yh); -void rt_tlate1col (int hx, int sx, int yl, int yh); -void rt_tlateadd1col (int hx, int sx, int yl, int yh); -void rt_tlateaddclamp1col (int hx, int sx, int yl, int yh); -void rt_tlatesubclamp1col (int hx, int sx, int yl, int yh); -void rt_tlaterevsubclamp1col (int hx, int sx, int yl, int yh); +void rt_tlate1col_c (int hx, int sx, int yl, int yh); +void rt_tlateadd1col_c (int hx, int sx, int yl, int yh); +void rt_tlateaddclamp1col_c (int hx, int sx, int yl, int yh); +void rt_tlatesubclamp1col_c (int hx, int sx, int yl, int yh); +void rt_tlaterevsubclamp1col_c (int hx, int sx, int yl, int yh); void rt_map4cols_c (int sx, int yl, int yh); void rt_add4cols_c (int sx, int yl, int yh); void rt_addclamp4cols_c (int sx, int yl, int yh); -void rt_subclamp4cols (int sx, int yl, int yh); -void rt_revsubclamp4cols (int sx, int yl, int yh); +void rt_subclamp4cols_c (int sx, int yl, int yh); +void rt_revsubclamp4cols_c (int sx, int yl, int yh); -void rt_tlate4cols (int sx, int yl, int yh); -void rt_tlateadd4cols (int sx, int yl, int yh); -void rt_tlateaddclamp4cols (int sx, int yl, int yh); -void 
rt_tlatesubclamp4cols (int sx, int yl, int yh); -void rt_tlaterevsubclamp4cols (int sx, int yl, int yh); +void rt_tlate4cols_c (int sx, int yl, int yh); +void rt_tlateadd4cols_c (int sx, int yl, int yh); +void rt_tlateaddclamp4cols_c (int sx, int yl, int yh); +void rt_tlatesubclamp4cols_c (int sx, int yl, int yh); +void rt_tlaterevsubclamp4cols_c (int sx, int yl, int yh); void rt_copy1col_asm (int hx, int sx, int yl, int yh); void rt_map1col_asm (int hx, int sx, int yl, int yh); @@ -163,30 +192,49 @@ void rt_add4cols_asm (int sx, int yl, int yh); void rt_addclamp4cols_asm (int sx, int yl, int yh); } -extern void (*rt_map4cols)(int sx, int yl, int yh); +extern void (*rt_copy1col)(int hx, int sx, int yl, int yh); +extern void (*rt_copy4cols)(int sx, int yl, int yh); -#ifdef X86_ASM -#define rt_copy1col rt_copy1col_asm -#define rt_copy4cols rt_copy4cols_asm -#define rt_map1col rt_map1col_asm -#define rt_shaded4cols rt_shaded4cols_asm -#define rt_add4cols rt_add4cols_asm -#define rt_addclamp4cols rt_addclamp4cols_asm -#else -#define rt_copy1col rt_copy1col_c -#define rt_copy4cols rt_copy4cols_c -#define rt_map1col rt_map1col_c -#define rt_shaded4cols rt_shaded4cols_c -#define rt_add4cols rt_add4cols_c -#define rt_addclamp4cols rt_addclamp4cols_c -#endif +extern void (*rt_shaded1col)(int hx, int sx, int yl, int yh); +extern void (*rt_shaded4cols)(int sx, int yl, int yh); + +extern void (*rt_map1col)(int hx, int sx, int yl, int yh); +extern void (*rt_add1col)(int hx, int sx, int yl, int yh); +extern void (*rt_addclamp1col)(int hx, int sx, int yl, int yh); +extern void (*rt_subclamp1col)(int hx, int sx, int yl, int yh); +extern void (*rt_revsubclamp1col)(int hx, int sx, int yl, int yh); + +extern void (*rt_tlate1col)(int hx, int sx, int yl, int yh); +extern void (*rt_tlateadd1col)(int hx, int sx, int yl, int yh); +extern void (*rt_tlateaddclamp1col)(int hx, int sx, int yl, int yh); +extern void (*rt_tlatesubclamp1col)(int hx, int sx, int yl, int yh); +extern void 
(*rt_tlaterevsubclamp1col)(int hx, int sx, int yl, int yh); + +extern void (*rt_map4cols)(int sx, int yl, int yh); +extern void (*rt_add4cols)(int sx, int yl, int yh); +extern void (*rt_addclamp4cols)(int sx, int yl, int yh); +extern void (*rt_subclamp4cols)(int sx, int yl, int yh); +extern void (*rt_revsubclamp4cols)(int sx, int yl, int yh); + +extern void (*rt_tlate4cols)(int sx, int yl, int yh); +extern void (*rt_tlateadd4cols)(int sx, int yl, int yh); +extern void (*rt_tlateaddclamp4cols)(int sx, int yl, int yh); +extern void (*rt_tlatesubclamp4cols)(int sx, int yl, int yh); +extern void (*rt_tlaterevsubclamp4cols)(int sx, int yl, int yh); + +extern void (*rt_initcols)(BYTE *buffer); +extern void (*rt_span_coverage)(int x, int start, int stop); void rt_draw4cols (int sx); // [RH] Preps the temporary horizontal buffer. -void rt_initcols (BYTE *buffer=NULL); +void rt_initcols_pal (BYTE *buffer); -void R_DrawFogBoundary (int x1, int x2, short *uclip, short *dclip); +void rt_span_coverage_pal(int x, int start, int stop); + +extern void (*R_DrawFogBoundary)(int x1, int x2, short *uclip, short *dclip); + +void R_DrawFogBoundary_C (int x1, int x2, short *uclip, short *dclip); #ifdef X86_ASM @@ -218,26 +266,47 @@ void R_DrawSpanMaskedTranslucentP_C (void); void R_DrawTlatedLucentColumnP_C (void); #define R_DrawTlatedLucentColumn R_DrawTlatedLucentColumnP_C -void R_FillColumnP (void); -void R_FillColumnHorizP (void); -void R_FillSpan (void); +extern void(*R_FillColumn)(void); +extern void(*R_FillAddColumn)(void); +extern void(*R_FillAddClampColumn)(void); +extern void(*R_FillSubClampColumn)(void); +extern void(*R_FillRevSubClampColumn)(void); +extern void(*R_DrawAddColumn)(void); +extern void(*R_DrawTlatedAddColumn)(void); +extern void(*R_DrawAddClampColumn)(void); +extern void(*R_DrawAddClampTranslatedColumn)(void); +extern void(*R_DrawSubClampColumn)(void); +extern void(*R_DrawSubClampTranslatedColumn)(void); +extern void(*R_DrawRevSubClampColumn)(void); +extern 
void(*R_DrawRevSubClampTranslatedColumn)(void); + +extern void(*R_FillSpan)(void); +extern void(*R_FillColumnHoriz)(void); + +void R_FillColumnP_C (void); + +void R_FillColumnHorizP_C (void); +void R_FillSpan_C (void); + +extern void(*R_SetupDrawSlab)(FSWColormap *base_colormap, float light, int shade); +extern void(*R_DrawSlab)(int dx, fixed_t v, int dy, fixed_t vi, const BYTE *vptr, BYTE *p); #ifdef X86_ASM -#define R_SetupDrawSlab R_SetupDrawSlabA -#define R_DrawSlab R_DrawSlabA +extern "C" void R_SetupDrawSlabA(const BYTE *colormap); +extern "C" void R_DrawSlabA(int dx, fixed_t v, int dy, fixed_t vi, const BYTE *vptr, BYTE *p); #else -#define R_SetupDrawSlab R_SetupDrawSlabC -#define R_DrawSlab R_DrawSlabC +extern "C" void R_SetupDrawSlabC(const BYTE *colormap); +extern "C" void R_DrawSlabC(int dx, fixed_t v, int dy, fixed_t vi, const BYTE *vptr, BYTE *p); #endif -extern "C" void R_SetupDrawSlab(const BYTE *colormap); -extern "C" void R_DrawSlab(int dx, fixed_t v, int dy, fixed_t vi, const BYTE *vptr, BYTE *p); - extern "C" int ds_y; extern "C" int ds_x1; extern "C" int ds_x2; +extern "C" FSWColormap* ds_fcolormap; extern "C" lighttable_t* ds_colormap; +extern "C" ShadeConstants ds_shade_constants; +extern "C" dsfixed_t ds_light; extern "C" dsfixed_t ds_xfrac; extern "C" dsfixed_t ds_yfrac; @@ -249,12 +318,14 @@ extern "C" fixed_t ds_alpha; // start of a 64*64 tile image extern "C" const BYTE* ds_source; +extern "C" bool ds_source_mipmapped; extern "C" int ds_color; // [RH] For flat color (no texturing) extern BYTE shadetables[/*NUMCOLORMAPS*16*256*/]; extern FDynamicColormap ShadeFakeColormap[16]; extern BYTE identitymap[256]; +extern FDynamicColormap identitycolormap; extern BYTE *dc_translation; // [RH] Added for muliresolution support @@ -278,6 +349,15 @@ inline ESPSResult R_SetPatchStyle(FRenderStyle style, float alpha, int translati // style was STYLE_Shade void R_FinishSetPatchStyle (); +extern fixed_t(*tmvline1_add)(); +extern void(*tmvline4_add)(); 
+extern fixed_t(*tmvline1_addclamp)(); +extern void(*tmvline4_addclamp)(); +extern fixed_t(*tmvline1_subclamp)(); +extern void(*tmvline4_subclamp)(); +extern fixed_t(*tmvline1_revsubclamp)(); +extern void(*tmvline4_revsubclamp)(); + // transmaskwallscan calls this to find out what column drawers to use bool R_GetTransMaskDrawers (fixed_t (**tmvline1)(), void (**tmvline4)()); @@ -293,4 +373,19 @@ void maskwallscan (int x1, int x2, short *uwal, short *dwal, float *swal, fixed_ // transmaskwallscan is like maskwallscan, but it can also blend to the background void transmaskwallscan (int x1, int x2, short *uwal, short *dwal, float *swal, fixed_t *lwal, double yrepeat, const BYTE *(*getcol)(FTexture *tex, int col)=R_GetColumn); +// Sets dc_colormap and dc_light to their appropriate values depending on the output format (pal vs true color) +void R_SetColorMapLight(FSWColormap *base_colormap, float light, int shade); + +// Same as R_SetColorMapLight, but for ds_colormap and ds_light +void R_SetDSColorMapLight(FSWColormap *base_colormap, float light, int shade); + +void R_SetTranslationMap(lighttable_t *translation); + +extern bool r_swtruecolor; + +EXTERN_CVAR(Bool, r_multithreaded); +EXTERN_CVAR(Bool, r_magfilter); +EXTERN_CVAR(Bool, r_minfilter); +EXTERN_CVAR(Bool, r_mipmap); + #endif diff --git a/src/r_draw_rgba.cpp b/src/r_draw_rgba.cpp new file mode 100644 index 000000000..0d86ead47 --- /dev/null +++ b/src/r_draw_rgba.cpp @@ -0,0 +1,2969 @@ +// Emacs style mode select -*- C++ -*- +//----------------------------------------------------------------------------- +// +// $Id:$ +// +// Copyright (C) 1993-1996 by id Software, Inc. +// +// This source is available for distribution and/or modification +// only under the terms of the DOOM Source Code License as +// published by id Software. All rights reserved. 
+// +// The source is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// FITNESS FOR A PARTICULAR PURPOSE. See the DOOM Source Code License +// for more details. +// +// $Log:$ +// +// DESCRIPTION: +// True color span/column drawing functions. +// +//----------------------------------------------------------------------------- + +#include + +#include "templates.h" +#include "doomdef.h" +#include "i_system.h" +#include "w_wad.h" +#include "r_local.h" +#include "v_video.h" +#include "doomstat.h" +#include "st_stuff.h" +#include "g_game.h" +#include "g_level.h" +#include "r_data/r_translate.h" +#include "v_palette.h" +#include "r_data/colormaps.h" +#include "r_plane.h" +#include "r_draw_rgba.h" + +#include "gi.h" +#include "stats.h" +#include "x86.h" +#ifndef NO_SSE +#include +#include +#endif +#include + +extern "C" short spanend[MAXHEIGHT]; +extern float rw_light; +extern float rw_lightstep; +extern int wallshade; + +// Use multiple threads when drawing +CVAR(Bool, r_multithreaded, true, 0); + +// Use linear filtering when scaling up +CVAR(Bool, r_magfilter, false, CVAR_ARCHIVE | CVAR_GLOBALCONFIG); + +// Use linear filtering when scaling down +CVAR(Bool, r_minfilter, true, CVAR_ARCHIVE | CVAR_GLOBALCONFIG); + +// Use mipmapped textures +CVAR(Bool, r_mipmap, true, CVAR_ARCHIVE | CVAR_GLOBALCONFIG); + +#ifndef NO_SSE + +#ifdef _MSC_VER +#pragma warning(disable: 4101) // warning C4101: unreferenced local variable +#endif + +// Generate SSE drawers: +#define VecCommand(name) name##_SSE_Command +#define VEC_SHADE_VARS SSE_SHADE_VARS +#define VEC_SHADE_SIMPLE_INIT SSE_SHADE_SIMPLE_INIT +#define VEC_SHADE_SIMPLE_INIT4 SSE_SHADE_SIMPLE_INIT4 +#define VEC_SHADE_SIMPLE SSE_SHADE_SIMPLE +#define VEC_SHADE_INIT SSE_SHADE_INIT +#define VEC_SHADE_INIT4 SSE_SHADE_INIT4 +#define VEC_SHADE SSE_SHADE +#include "r_draw_rgba_sse.h" +/* +// Generate AVX drawers: +#undef VecCommand +#undef VEC_SHADE_SIMPLE_INIT +#undef 
VEC_SHADE_SIMPLE_INIT4 +#undef VEC_SHADE_SIMPLE +#undef VEC_SHADE_INIT +#undef VEC_SHADE_INIT4 +#undef VEC_SHADE +#define VecCommand(name) name##_AVX_Command +#define VEC_SHADE_SIMPLE_INIT AVX_LINEAR_SHADE_SIMPLE_INIT +#define VEC_SHADE_SIMPLE_INIT4 AVX_LINEAR_SHADE_SIMPLE_INIT4 +#define VEC_SHADE_SIMPLE AVX_LINEAR_SHADE_SIMPLE +#define VEC_SHADE_INIT AVX_LINEAR_SHADE_INIT +#define VEC_SHADE_INIT4 AVX_LINEAR_SHADE_INIT4 +#define VEC_SHADE AVX_LINEAR_SHADE +#include "r_draw_rgba_sse.h" +*/ +#endif + +///////////////////////////////////////////////////////////////////////////// + +#ifndef NO_SSE +__m128i SampleBgra::samplertable[256 * 2]; +#endif + +DrawerCommandQueue *DrawerCommandQueue::Instance() +{ + static DrawerCommandQueue queue; + return &queue; +} + +DrawerCommandQueue::DrawerCommandQueue() +{ +#ifndef NO_SSE + for (int inv_b = 0; inv_b < 16; inv_b++) + { + for (int inv_a = 0; inv_a < 16; inv_a++) + { + int a = 16 - inv_a; + int b = 16 - inv_b; + + int ab = a * b; + int invab = inv_a * b; + int ainvb = a * inv_b; + int invainvb = inv_a * inv_b; + + __m128i ab_invab = _mm_set_epi16(invab, invab, invab, invab, ab, ab, ab, ab); + __m128i ainvb_invainvb = _mm_set_epi16(invainvb, invainvb, invainvb, invainvb, ainvb, ainvb, ainvb, ainvb); + + _mm_store_si128(SampleBgra::samplertable + inv_b * 32 + inv_a * 2, ab_invab); + _mm_store_si128(SampleBgra::samplertable + inv_b * 32 + inv_a * 2 + 1, ainvb_invainvb); + } + } +#endif +} + +DrawerCommandQueue::~DrawerCommandQueue() +{ + StopThreads(); +} + +void* DrawerCommandQueue::AllocMemory(size_t size) +{ + // Make sure allocations remain 16-byte aligned + size = (size + 15) / 16 * 16; + + auto queue = Instance(); + if (queue->memorypool_pos + size > memorypool_size) + return nullptr; + + void *data = queue->memorypool + queue->memorypool_pos; + queue->memorypool_pos += size; + return data; +} + +void DrawerCommandQueue::Begin() +{ + auto queue = Instance(); + queue->Finish(); + queue->threaded_render++; +} + +void 
DrawerCommandQueue::End() +{ + auto queue = Instance(); + queue->Finish(); + if (queue->threaded_render > 0) + queue->threaded_render--; +} + +void DrawerCommandQueue::WaitForWorkers() +{ + Instance()->Finish(); +} + +void DrawerCommandQueue::Finish() +{ + auto queue = Instance(); + if (queue->commands.empty()) + return; + + // Give worker threads something to do: + + std::unique_lock start_lock(queue->start_mutex); + queue->active_commands.swap(queue->commands); + queue->run_id++; + start_lock.unlock(); + + queue->StartThreads(); + queue->start_condition.notify_all(); + + // Do one thread ourselves: + + DrawerThread thread; + thread.core = 0; + thread.num_cores = queue->threads.size() + 1; + + for (int pass = 0; pass < queue->num_passes; pass++) + { + thread.pass_start_y = pass * queue->rows_in_pass; + thread.pass_end_y = (pass + 1) * queue->rows_in_pass; + if (pass + 1 == queue->num_passes) + thread.pass_end_y = MAX(thread.pass_end_y, MAXHEIGHT); + + size_t size = queue->active_commands.size(); + for (size_t i = 0; i < size; i++) + { + auto &command = queue->active_commands[i]; + command->Execute(&thread); + } + } + + // Wait for everyone to finish: + + std::unique_lock end_lock(queue->end_mutex); + queue->end_condition.wait(end_lock, [&]() { return queue->finished_threads == queue->threads.size(); }); + + // Clean up batch: + + for (auto &command : queue->active_commands) + command->~DrawerCommand(); + queue->active_commands.clear(); + queue->memorypool_pos = 0; + queue->finished_threads = 0; +} + +void DrawerCommandQueue::StartThreads() +{ + if (!threads.empty()) + return; + + int num_threads = std::thread::hardware_concurrency(); + if (num_threads == 0) + num_threads = 4; + + threads.resize(num_threads - 1); + + for (int i = 0; i < num_threads - 1; i++) + { + DrawerCommandQueue *queue = this; + DrawerThread *thread = &threads[i]; + thread->core = i + 1; + thread->num_cores = num_threads; + thread->thread = std::thread([=]() + { + int run_id = 0; + while (true) 
+ { + // Wait until we are signalled to run: + std::unique_lock start_lock(queue->start_mutex); + queue->start_condition.wait(start_lock, [&]() { return queue->run_id != run_id || queue->shutdown_flag; }); + if (queue->shutdown_flag) + break; + run_id = queue->run_id; + start_lock.unlock(); + + // Do the work: + for (int pass = 0; pass < queue->num_passes; pass++) + { + thread->pass_start_y = pass * queue->rows_in_pass; + thread->pass_end_y = (pass + 1) * queue->rows_in_pass; + if (pass + 1 == queue->num_passes) + thread->pass_end_y = MAX(thread->pass_end_y, MAXHEIGHT); + + size_t size = queue->active_commands.size(); + for (size_t i = 0; i < size; i++) + { + auto &command = queue->active_commands[i]; + command->Execute(thread); + } + } + + // Notify main thread that we finished: + std::unique_lock end_lock(queue->end_mutex); + queue->finished_threads++; + end_lock.unlock(); + queue->end_condition.notify_all(); + } + }); + } +} + +void DrawerCommandQueue::StopThreads() +{ + std::unique_lock lock(start_mutex); + shutdown_flag = true; + lock.unlock(); + start_condition.notify_all(); + for (auto &thread : threads) + thread.thread.join(); + threads.clear(); + lock.lock(); + shutdown_flag = false; +} + +///////////////////////////////////////////////////////////////////////////// + +class DrawerColumnCommand : public DrawerCommand +{ +public: + int _count; + BYTE * RESTRICT _dest; + int _pitch; + DWORD _iscale; + DWORD _texturefrac; + + DrawerColumnCommand() + { + _count = dc_count; + _dest = dc_dest; + _iscale = dc_iscale; + _texturefrac = dc_texturefrac; + _pitch = dc_pitch; + } + + class LoopIterator + { + public: + int count; + uint32_t *dest; + int pitch; + fixed_t fracstep; + fixed_t frac; + + LoopIterator(DrawerColumnCommand *command, DrawerThread *thread) + { + count = thread->count_for_thread(command->_dest_y, command->_count); + if (count <= 0) + return; + + dest = thread->dest_for_thread(command->_dest_y, command->_pitch, (uint32_t*)command->_dest); + pitch = 
command->_pitch * thread->num_cores; + + fracstep = command->_iscale * thread->num_cores; + frac = command->_texturefrac + command->_iscale * thread->skipped_by_thread(command->_dest_y); + } + + uint32_t sample_index() + { + return frac >> FRACBITS; + } + + explicit operator bool() + { + return count > 0; + } + + bool next() + { + dest += pitch; + frac += fracstep; + return (--count) != 0; + } + }; +}; + +class DrawColumnRGBACommand : public DrawerColumnCommand +{ + uint32_t _light; + const BYTE * RESTRICT _source; + ShadeConstants _shade_constants; + BYTE * RESTRICT _colormap; + +public: + DrawColumnRGBACommand() + { + _light = LightBgra::calc_light_multiplier(dc_light); + _shade_constants = dc_shade_constants; + _source = dc_source; + _colormap = dc_colormap; + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + uint32_t fg = LightBgra::shade_pal_index(_colormap[_source[loop.sample_index()]], _light, _shade_constants); + *loop.dest = BlendBgra::copy(fg); + } while (loop.next()); + } +}; + +class FillColumnRGBACommand : public DrawerColumnCommand +{ + uint32_t _color; + +public: + FillColumnRGBACommand() + { + uint32_t light = LightBgra::calc_light_multiplier(dc_light); + _color = LightBgra::shade_pal_index_simple(dc_color, light); + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + *loop.dest = BlendBgra::copy(_color); + } while (loop.next()); + } +}; + +class FillAddColumnRGBACommand : public DrawerColumnCommand +{ + uint32_t _srccolor; + +public: + FillAddColumnRGBACommand() + { + _srccolor = dc_srccolor_bgra; + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + + uint32_t alpha = APART(_srccolor); + alpha += alpha >> 7; + + do + { + *loop.dest = BlendBgra::add(_srccolor, *loop.dest, alpha, 256 - alpha); + } while (loop.next()); + } +}; + +class 
FillAddClampColumnRGBACommand : public DrawerColumnCommand +{ + int _color; + uint32_t _srccolor; + uint32_t _srcalpha; + uint32_t _destalpha; + +public: + FillAddClampColumnRGBACommand() + { + _color = dc_color; + _srccolor = dc_srccolor_bgra; + _srcalpha = dc_srcalpha >> (FRACBITS - 8); + _destalpha = dc_destalpha >> (FRACBITS - 8); + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + *loop.dest = BlendBgra::add(_srccolor, *loop.dest, _srcalpha, _destalpha); + } while (loop.next()); + } +}; + +class FillSubClampColumnRGBACommand : public DrawerColumnCommand +{ + uint32_t _srccolor; + uint32_t _srcalpha; + uint32_t _destalpha; + +public: + FillSubClampColumnRGBACommand() + { + _srccolor = dc_srccolor_bgra; + _srcalpha = dc_srcalpha >> (FRACBITS - 8); + _destalpha = dc_destalpha >> (FRACBITS - 8); + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + *loop.dest = BlendBgra::sub(_srccolor, *loop.dest, _srcalpha, _destalpha); + } while (loop.next()); + } +}; + +class FillRevSubClampColumnRGBACommand : public DrawerColumnCommand +{ + uint32_t _srccolor; + uint32_t _srcalpha; + uint32_t _destalpha; + +public: + FillRevSubClampColumnRGBACommand() + { + _srccolor = dc_srccolor_bgra; + _srcalpha = dc_srcalpha >> (FRACBITS - 8); + _destalpha = dc_destalpha >> (FRACBITS - 8); + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + *loop.dest = BlendBgra::revsub(_srccolor, *loop.dest, _srcalpha, _destalpha); + } while (loop.next()); + } +}; + +class DrawAddColumnRGBACommand : public DrawerColumnCommand +{ + const BYTE * RESTRICT _source; + uint32_t _light; + ShadeConstants _shade_constants; + uint32_t _srcalpha; + uint32_t _destalpha; + BYTE * RESTRICT _colormap; + +public: + DrawAddColumnRGBACommand() + { + _source = dc_source; + _light = 
LightBgra::calc_light_multiplier(dc_light); + _shade_constants = dc_shade_constants; + _srcalpha = dc_srcalpha >> (FRACBITS - 8); + _destalpha = dc_destalpha >> (FRACBITS - 8); + _colormap = dc_colormap; + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + uint32_t fg = LightBgra::shade_pal_index(_colormap[_source[loop.sample_index()]], _light, _shade_constants); + *loop.dest = BlendBgra::add(fg, *loop.dest, _srcalpha, _destalpha); + } while (loop.next()); + } +}; + +class DrawTranslatedColumnRGBACommand : public DrawerColumnCommand +{ + fixed_t _light; + ShadeConstants _shade_constants; + BYTE * RESTRICT _translation; + const BYTE * RESTRICT _source; + +public: + DrawTranslatedColumnRGBACommand() + { + _light = LightBgra::calc_light_multiplier(dc_light); + _shade_constants = dc_shade_constants; + _translation = dc_translation; + _source = dc_source; + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + uint32_t fg = LightBgra::shade_pal_index(_translation[_source[loop.sample_index()]], _light, _shade_constants); + *loop.dest = BlendBgra::copy(fg); + } while (loop.next()); + } +}; + +class DrawTlatedAddColumnRGBACommand : public DrawerColumnCommand +{ + fixed_t _light; + ShadeConstants _shade_constants; + BYTE * RESTRICT _translation; + const BYTE * RESTRICT _source; + uint32_t _srcalpha; + uint32_t _destalpha; + +public: + DrawTlatedAddColumnRGBACommand() + { + _light = LightBgra::calc_light_multiplier(dc_light); + _shade_constants = dc_shade_constants; + _translation = dc_translation; + _source = dc_source; + _srcalpha = dc_srcalpha >> (FRACBITS - 8); + _destalpha = dc_destalpha >> (FRACBITS - 8); + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + uint32_t fg = LightBgra::shade_pal_index(_translation[_source[loop.sample_index()]], _light, 
_shade_constants); + *loop.dest = BlendBgra::add(fg, *loop.dest, _srcalpha, _destalpha); + } while (loop.next()); + } +}; + +class DrawShadedColumnRGBACommand : public DrawerColumnCommand +{ +private: + const BYTE * RESTRICT _source; + lighttable_t * RESTRICT _colormap; + uint32_t _color; + +public: + DrawShadedColumnRGBACommand() + { + _source = dc_source; + _colormap = dc_colormap; + _color = LightBgra::shade_pal_index_simple(dc_color, LightBgra::calc_light_multiplier(dc_light)); + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + uint32_t alpha = clamp(_colormap[_source[loop.sample_index()]], 0, 64) * 4; + uint32_t inv_alpha = 256 - alpha; + *loop.dest = BlendBgra::add(_color, *loop.dest, alpha, inv_alpha); + } while (loop.next()); + } +}; + +class DrawAddClampColumnRGBACommand : public DrawerColumnCommand +{ + const BYTE * RESTRICT _source; + uint32_t _light; + ShadeConstants _shade_constants; + uint32_t _srcalpha; + uint32_t _destalpha; + +public: + DrawAddClampColumnRGBACommand() + { + _source = dc_source; + _light = LightBgra::calc_light_multiplier(dc_light); + _shade_constants = dc_shade_constants; + _srcalpha = dc_srcalpha >> (FRACBITS - 8); + _destalpha = dc_destalpha >> (FRACBITS - 8); + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + uint32_t fg = LightBgra::shade_pal_index(_source[loop.sample_index()], _light, _shade_constants); + *loop.dest = BlendBgra::add(fg, *loop.dest, _srcalpha, _destalpha); + } while (loop.next()); + } +}; + +class DrawAddClampTranslatedColumnRGBACommand : public DrawerColumnCommand +{ + BYTE * RESTRICT _translation; + const BYTE * RESTRICT _source; + uint32_t _light; + ShadeConstants _shade_constants; + uint32_t _srcalpha; + uint32_t _destalpha; + +public: + DrawAddClampTranslatedColumnRGBACommand() + { + _translation = dc_translation; + _source = dc_source; + _light = 
LightBgra::calc_light_multiplier(dc_light); + _shade_constants = dc_shade_constants; + _srcalpha = dc_srcalpha >> (FRACBITS - 8); + _destalpha = dc_destalpha >> (FRACBITS - 8); + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + uint32_t fg = LightBgra::shade_pal_index(_translation[_source[loop.sample_index()]], _light, _shade_constants); + *loop.dest = BlendBgra::add(fg, *loop.dest, _srcalpha, _destalpha); + } while (loop.next()); + } +}; + +class DrawSubClampColumnRGBACommand : public DrawerColumnCommand +{ + const BYTE * RESTRICT _source; + uint32_t _light; + ShadeConstants _shade_constants; + uint32_t _srcalpha; + uint32_t _destalpha; + +public: + DrawSubClampColumnRGBACommand() + { + _source = dc_source; + _light = LightBgra::calc_light_multiplier(dc_light); + _shade_constants = dc_shade_constants; + _srcalpha = dc_srcalpha >> (FRACBITS - 8); + _destalpha = dc_destalpha >> (FRACBITS - 8); + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + uint32_t fg = LightBgra::shade_pal_index(_source[loop.sample_index()], _light, _shade_constants); + *loop.dest = BlendBgra::sub(fg, *loop.dest, _srcalpha, _destalpha); + } while (loop.next()); + } +}; + +class DrawSubClampTranslatedColumnRGBACommand : public DrawerColumnCommand +{ + const BYTE * RESTRICT _source; + uint32_t _light; + ShadeConstants _shade_constants; + uint32_t _srcalpha; + uint32_t _destalpha; + BYTE * RESTRICT _translation; + +public: + DrawSubClampTranslatedColumnRGBACommand() + { + _source = dc_source; + _light = LightBgra::calc_light_multiplier(dc_light); + _shade_constants = dc_shade_constants; + _srcalpha = dc_srcalpha >> (FRACBITS - 8); + _destalpha = dc_destalpha >> (FRACBITS - 8); + _translation = dc_translation; + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + uint32_t fg = 
LightBgra::shade_pal_index(_translation[_source[loop.sample_index()]], _light, _shade_constants); + *loop.dest = BlendBgra::sub(fg, *loop.dest, _srcalpha, _destalpha); + } while (loop.next()); + } +}; + +class DrawRevSubClampColumnRGBACommand : public DrawerColumnCommand +{ + const BYTE * RESTRICT _source; + uint32_t _light; + ShadeConstants _shade_constants; + uint32_t _srcalpha; + uint32_t _destalpha; + +public: + DrawRevSubClampColumnRGBACommand() + { + _source = dc_source; + _light = LightBgra::calc_light_multiplier(dc_light); + _shade_constants = dc_shade_constants; + _srcalpha = dc_srcalpha >> (FRACBITS - 8); + _destalpha = dc_destalpha >> (FRACBITS - 8); + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + uint32_t fg = LightBgra::shade_pal_index(_source[loop.sample_index()], _light, _shade_constants); + *loop.dest = BlendBgra::revsub(fg, *loop.dest, _srcalpha, _destalpha); + } while (loop.next()); + } +}; + +class DrawRevSubClampTranslatedColumnRGBACommand : public DrawerColumnCommand +{ + const BYTE * RESTRICT _source; + uint32_t _light; + ShadeConstants _shade_constants; + uint32_t _srcalpha; + uint32_t _destalpha; + BYTE * RESTRICT _translation; + +public: + DrawRevSubClampTranslatedColumnRGBACommand() + { + _source = dc_source; + _light = LightBgra::calc_light_multiplier(dc_light); + _shade_constants = dc_shade_constants; + _srcalpha = dc_srcalpha >> (FRACBITS - 8); + _destalpha = dc_destalpha >> (FRACBITS - 8); + _translation = dc_translation; + } + + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + uint32_t fg = LightBgra::shade_pal_index(_translation[_source[loop.sample_index()]], _light, _shade_constants); + *loop.dest = BlendBgra::revsub(fg, *loop.dest, _srcalpha, _destalpha); + } while (loop.next()); + } +}; + +class DrawFuzzColumnRGBACommand : public DrawerCommand +{ + int _x; + int _yl; + int _yh; + BYTE * 
RESTRICT _destorg; + int _pitch; + int _fuzzpos; + int _fuzzviewheight; + +public: + DrawFuzzColumnRGBACommand() + { + _x = dc_x; + _yl = dc_yl; + _yh = dc_yh; + _destorg = dc_destorg; + _pitch = dc_pitch; + _fuzzpos = fuzzpos; + _fuzzviewheight = fuzzviewheight; + } + + void Execute(DrawerThread *thread) override + { + int yl = MAX(_yl, 1); + int yh = MIN(_yh, _fuzzviewheight); + + int count = thread->count_for_thread(yl, yh - yl + 1); + + // Zero length. + if (count <= 0) + return; + + uint32_t *dest = thread->dest_for_thread(yl, _pitch, ylookup[yl] + _x + (uint32_t*)_destorg); + + int pitch = _pitch * thread->num_cores; + int fuzzstep = thread->num_cores; + int fuzz = (_fuzzpos + thread->skipped_by_thread(yl)) % FUZZTABLE; + + yl += thread->skipped_by_thread(yl); + + // Handle the case where we would go out of bounds at the top: + if (yl < fuzzstep) + { + uint32_t *srcdest = dest + fuzzoffset[fuzz] * fuzzstep + pitch; + //assert(static_cast((srcdest - (uint32_t*)dc_destorg) / (_pitch)) < viewheight); + + uint32_t bg = *srcdest; + + uint32_t red = RPART(bg) * 3 / 4; + uint32_t green = GPART(bg) * 3 / 4; + uint32_t blue = BPART(bg) * 3 / 4; + + *dest = 0xff000000 | (red << 16) | (green << 8) | blue; + dest += pitch; + fuzz += fuzzstep; + fuzz %= FUZZTABLE; + + count--; + if (count == 0) + return; + } + + bool lowerbounds = (yl + (count + fuzzstep - 1) * fuzzstep > _fuzzviewheight); + if (lowerbounds) + count--; + + // Fuzz where fuzzoffset stays within bounds + while (count > 0) + { + int available = (FUZZTABLE - fuzz); + int next_wrap = available / fuzzstep; + if (available % fuzzstep != 0) + next_wrap++; + + int cnt = MIN(count, next_wrap); + count -= cnt; + do + { + uint32_t *srcdest = dest + fuzzoffset[fuzz] * fuzzstep; + //assert(static_cast((srcdest - (uint32_t*)dc_destorg) / (_pitch)) < viewheight); + + uint32_t bg = *srcdest; + + uint32_t red = RPART(bg) * 3 / 4; + uint32_t green = GPART(bg) * 3 / 4; + uint32_t blue = BPART(bg) * 3 / 4; + + *dest = 
0xff000000 | (red << 16) | (green << 8) | blue; + dest += pitch; + fuzz += fuzzstep; + } while (--cnt); + + fuzz %= FUZZTABLE; + } + + // Handle the case where we would go out of bounds at the bottom + if (lowerbounds) + { + uint32_t *srcdest = dest + fuzzoffset[fuzz] * fuzzstep - pitch; + //assert(static_cast((srcdest - (uint32_t*)dc_destorg) / (_pitch)) < viewheight); + + uint32_t bg = *srcdest; + + uint32_t red = RPART(bg) * 3 / 4; + uint32_t green = GPART(bg) * 3 / 4; + uint32_t blue = BPART(bg) * 3 / 4; + + *dest = 0xff000000 | (red << 16) | (green << 8) | blue; + } + } +}; + +///////////////////////////////////////////////////////////////////////////// + +class DrawerSpanCommand : public DrawerCommand +{ +public: + fixed_t _xfrac; + fixed_t _yfrac; + fixed_t _xstep; + fixed_t _ystep; + int _x1; + int _x2; + int _y; + int _xbits; + int _ybits; + BYTE * RESTRICT _destorg; + + const uint32_t * RESTRICT _source; + uint32_t _light; + ShadeConstants _shade_constants; + bool _nearest_filter; + + uint32_t _srcalpha; + uint32_t _destalpha; + + DrawerSpanCommand() + { + _xfrac = ds_xfrac; + _yfrac = ds_yfrac; + _xstep = ds_xstep; + _ystep = ds_ystep; + _x1 = ds_x1; + _x2 = ds_x2; + _y = ds_y; + _xbits = ds_xbits; + _ybits = ds_ybits; + _destorg = dc_destorg; + + _source = (const uint32_t*)ds_source; + _light = LightBgra::calc_light_multiplier(ds_light); + _shade_constants = ds_shade_constants; + _nearest_filter = !SampleBgra::span_sampler_setup(_source, _xbits, _ybits, _xstep, _ystep, ds_source_mipmapped); + + _srcalpha = dc_srcalpha >> (FRACBITS - 8); + _destalpha = dc_destalpha >> (FRACBITS - 8); + } + + class LoopIterator + { + public: + uint32_t *dest; + int count; + dsfixed_t xfrac; + dsfixed_t yfrac; + dsfixed_t xstep; + dsfixed_t ystep; + BYTE yshift; + BYTE xshift; + int xmask; + bool is_64x64; + bool skipped; + + LoopIterator(DrawerSpanCommand *command, DrawerThread *thread) + { + dest = ylookup[command->_y] + command->_x1 + (uint32_t*)command->_destorg; + 
count = command->_x2 - command->_x1 + 1; + xfrac = command->_xfrac; + yfrac = command->_yfrac; + xstep = command->_xstep; + ystep = command->_ystep; + yshift = 32 - command->_ybits; + xshift = yshift - command->_xbits; + xmask = ((1 << command->_xbits) - 1) << command->_ybits; + is_64x64 = command->_xbits == 6 && command->_ybits == 6; + skipped = thread->line_skipped_by_thread(command->_y); + } + + // 64x64 is the most common case by far, so special case it. + int spot64() + { + return ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); + } + + int spot() + { + return ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + } + + explicit operator bool() + { + return !skipped && count > 0; + } + + bool next() + { + dest++; + xfrac += xstep; + yfrac += ystep; + return (--count) != 0; + } + }; +}; + +class DrawSpanRGBACommand : public DrawerSpanCommand +{ +public: + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + + if (_nearest_filter) + { + if (loop.is_64x64) + { + do + { + *loop.dest = LightBgra::shade_bgra(_source[loop.spot64()], _light, _shade_constants); + } while (loop.next()); + } + else + { + do + { + *loop.dest = LightBgra::shade_bgra(_source[loop.spot()], _light, _shade_constants); + } while (loop.next()); + } + } + else + { + if (loop.is_64x64) + { + do + { + *loop.dest = LightBgra::shade_bgra(SampleBgra::sample_bilinear(_source, loop.xfrac, loop.yfrac, 26, 26), _light, _shade_constants); + } while (loop.next()); + } + else + { + do + { + *loop.dest = LightBgra::shade_bgra(SampleBgra::sample_bilinear(_source, loop.xfrac, loop.yfrac, 32 - _xbits, 32 - _ybits), _light, _shade_constants); + } while (loop.next()); + } + } + } +}; + +class DrawSpanMaskedRGBACommand : public DrawerSpanCommand +{ +public: + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + + if (_nearest_filter) + { + if (loop.is_64x64) + { + do + { + uint32_t fg = 
LightBgra::shade_bgra(_source[loop.spot64()], _light, _shade_constants); + *loop.dest = BlendBgra::alpha_blend(fg, *loop.dest); + } while (loop.next()); + } + else + { + do + { + uint32_t fg = LightBgra::shade_bgra(_source[loop.spot()], _light, _shade_constants); + *loop.dest = BlendBgra::alpha_blend(fg, *loop.dest); + } while (loop.next()); + } + } + else + { + if (loop.is_64x64) + { + do + { + uint32_t fg = LightBgra::shade_bgra(SampleBgra::sample_bilinear(_source, loop.xfrac, loop.yfrac, 26, 26), _light, _shade_constants); + *loop.dest = BlendBgra::alpha_blend(fg, *loop.dest); + } while (loop.next()); + } + else + { + do + { + uint32_t fg = LightBgra::shade_bgra(SampleBgra::sample_bilinear(_source, loop.xfrac, loop.yfrac, 32 - _xbits, 32 - _ybits), _light, _shade_constants); + *loop.dest = BlendBgra::alpha_blend(fg, *loop.dest); + } while (loop.next()); + } + } + } +}; + +class DrawSpanTranslucentRGBACommand : public DrawerSpanCommand +{ +public: + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + + if (loop.is_64x64) + { + do + { + uint32_t fg = LightBgra::shade_bgra(_source[loop.spot64()], _light, _shade_constants); + *loop.dest = BlendBgra::add(fg, *loop.dest, _srcalpha, _destalpha); + } while (loop.next()); + } + else + { + do + { + uint32_t fg = LightBgra::shade_bgra(_source[loop.spot()], _light, _shade_constants); + *loop.dest = BlendBgra::add(fg, *loop.dest, _srcalpha, _destalpha); + } while (loop.next()); + } + } +}; + +class DrawSpanMaskedTranslucentRGBACommand : public DrawerSpanCommand +{ +public: + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + + if (loop.is_64x64) + { + do + { + uint32_t fg = LightBgra::shade_bgra(_source[loop.spot64()], _light, _shade_constants); + *loop.dest = BlendBgra::add(fg, *loop.dest, _srcalpha, calc_blend_bgalpha(fg, _destalpha)); + } while (loop.next()); + } + else + { + do + { + uint32_t fg = 
LightBgra::shade_bgra(_source[loop.spot()], _light, _shade_constants); + *loop.dest = BlendBgra::add(fg, *loop.dest, _srcalpha, calc_blend_bgalpha(fg, _destalpha)); + } while (loop.next()); + } + } +}; + +class DrawSpanAddClampRGBACommand : public DrawerSpanCommand +{ +public: + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + + if (loop.is_64x64) + { + do + { + uint32_t fg = LightBgra::shade_bgra(_source[loop.spot64()], _light, _shade_constants); + *loop.dest = BlendBgra::add(fg, *loop.dest, _srcalpha, _destalpha); + } while (loop.next()); + } + else + { + do + { + uint32_t fg = LightBgra::shade_bgra(_source[loop.spot()], _light, _shade_constants); + *loop.dest = BlendBgra::add(fg, *loop.dest, _srcalpha, _destalpha); + } while (loop.next()); + } + } +}; + +class DrawSpanMaskedAddClampRGBACommand : public DrawerSpanCommand +{ +public: + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + + if (loop.is_64x64) + { + do + { + uint32_t fg = LightBgra::shade_bgra(_source[loop.spot64()], _light, _shade_constants); + *loop.dest = BlendBgra::add(fg, *loop.dest, _srcalpha, calc_blend_bgalpha(fg, _destalpha)); + } while (loop.next()); + } + else + { + do + { + uint32_t fg = LightBgra::shade_bgra(_source[loop.spot()], _light, _shade_constants); + *loop.dest = BlendBgra::add(fg, *loop.dest, _srcalpha, calc_blend_bgalpha(fg, _destalpha)); + } while (loop.next()); + } + } +}; + +class FillSpanRGBACommand : public DrawerCommand +{ + int _x1; + int _x2; + int _y; + BYTE * RESTRICT _destorg; + fixed_t _light; + int _color; + +public: + FillSpanRGBACommand() + { + _x1 = ds_x1; + _x2 = ds_x2; + _y = ds_y; + _destorg = dc_destorg; + _light = ds_light; + _color = ds_color; + } + + void Execute(DrawerThread *thread) override + { + if (thread->line_skipped_by_thread(_y)) + return; + + uint32_t *dest = ylookup[_y] + _x1 + (uint32_t*)_destorg; + int count = (_x2 - _x1 + 1); + 
uint32_t light = LightBgra::calc_light_multiplier(_light); + uint32_t color = LightBgra::shade_pal_index_simple(_color, light); + for (int i = 0; i < count; i++) + dest[i] = color; + } +}; + +///////////////////////////////////////////////////////////////////////////// + +class DrawSlabRGBACommand : public DrawerCommand +{ + int _dx; + fixed_t _v; + int _dy; + fixed_t _vi; + const BYTE *_voxelptr; + uint32_t *_p; + ShadeConstants _shade_constants; + const BYTE *_colormap; + fixed_t _light; + int _pitch; + int _start_y; + +public: + DrawSlabRGBACommand(int dx, fixed_t v, int dy, fixed_t vi, const BYTE *vptr, BYTE *p, ShadeConstants shade_constants, const BYTE *colormap, fixed_t light) + { + _dx = dx; + _v = v; + _dy = dy; + _vi = vi; + _voxelptr = vptr; + _p = (uint32_t *)p; + _shade_constants = shade_constants; + _colormap = colormap; + _light = light; + _pitch = dc_pitch; + _start_y = static_cast((p - dc_destorg) / (dc_pitch * 4)); + assert(dx > 0); + } + + void Execute(DrawerThread *thread) override + { + int dx = _dx; + fixed_t v = _v; + int dy = _dy; + fixed_t vi = _vi; + const BYTE *vptr = _voxelptr; + uint32_t *p = _p; + ShadeConstants shade_constants = _shade_constants; + const BYTE *colormap = _colormap; + uint32_t light = LightBgra::calc_light_multiplier(_light); + int pitch = _pitch; + int x; + + dy = thread->count_for_thread(_start_y, dy); + p = thread->dest_for_thread(_start_y, pitch, p); + v += vi * thread->skipped_by_thread(_start_y); + vi *= thread->num_cores; + pitch *= thread->num_cores; + + if (dx == 1) + { + while (dy > 0) + { + *p = LightBgra::shade_pal_index(colormap[vptr[v >> FRACBITS]], light, shade_constants); + p += pitch; + v += vi; + dy--; + } + } + else if (dx == 2) + { + while (dy > 0) + { + uint32_t color = LightBgra::shade_pal_index(colormap[vptr[v >> FRACBITS]], light, shade_constants); + p[0] = color; + p[1] = color; + p += pitch; + v += vi; + dy--; + } + } + else if (dx == 3) + { + while (dy > 0) + { + uint32_t color = 
LightBgra::shade_pal_index(colormap[vptr[v >> FRACBITS]], light, shade_constants); + p[0] = color; + p[1] = color; + p[2] = color; + p += pitch; + v += vi; + dy--; + } + } + else if (dx == 4) + { + while (dy > 0) + { + uint32_t color = LightBgra::shade_pal_index(colormap[vptr[v >> FRACBITS]], light, shade_constants); + p[0] = color; + p[1] = color; + p[2] = color; + p[3] = color; + p += pitch; + v += vi; + dy--; + } + } + else while (dy > 0) + { + uint32_t color = LightBgra::shade_pal_index(colormap[vptr[v >> FRACBITS]], light, shade_constants); + // The optimizer will probably turn this into a memset call. + // Since dx is not likely to be large, I'm not sure that's a good thing, + // hence the alternatives above. + for (x = 0; x < dx; x++) + { + p[x] = color; + } + p += pitch; + v += vi; + dy--; + } + } +}; + +///////////////////////////////////////////////////////////////////////////// + +class DrawerWall1Command : public DrawerCommand +{ +public: + BYTE * RESTRICT _dest; + int _pitch; + int _count; + DWORD _texturefrac; + uint32_t _texturefracx; + DWORD _iscale; + uint32_t _textureheight; + + const uint32 * RESTRICT _source; + const uint32 * RESTRICT _source2; + uint32_t _light; + ShadeConstants _shade_constants; + + uint32_t _srcalpha; + uint32_t _destalpha; + + DrawerWall1Command() + { + _dest = dc_dest; + _pitch = dc_pitch; + _count = dc_count; + _texturefrac = dc_texturefrac; + _texturefracx = dc_texturefracx; + _iscale = dc_iscale; + _textureheight = dc_textureheight; + + _source = (const uint32 *)dc_source; + _source2 = (const uint32 *)dc_source2; + _light = LightBgra::calc_light_multiplier(dc_light); + _shade_constants = dc_shade_constants; + + _srcalpha = dc_srcalpha >> (FRACBITS - 8); + _destalpha = dc_destalpha >> (FRACBITS - 8); + } + + class LoopIterator + { + public: + uint32_t *dest; + int pitch; + int count; + uint32_t fracstep; + uint32_t frac; + uint32_t texturefracx; + uint32_t height; + uint32_t one; + + LoopIterator(DrawerWall1Command 
*command, DrawerThread *thread) + { + count = thread->count_for_thread(command->_dest_y, command->_count); + if (count <= 0) + return; + + fracstep = command->_iscale * thread->num_cores; + frac = command->_texturefrac + command->_iscale * thread->skipped_by_thread(command->_dest_y); + texturefracx = command->_texturefracx; + dest = thread->dest_for_thread(command->_dest_y, command->_pitch, (uint32_t*)command->_dest); + pitch = command->_pitch * thread->num_cores; + + height = command->_textureheight; + one = ((0x80000000 + height - 1) / height) * 2 + 1; + } + + explicit operator bool() + { + return count > 0; + } + + int sample_index() + { + return ((frac >> FRACBITS) * height) >> FRACBITS; + } + + bool next() + { + frac += fracstep; + dest += pitch; + return (--count) != 0; + } + }; +}; + +class DrawerWall4Command : public DrawerCommand +{ +public: + BYTE * RESTRICT _dest; + int _count; + int _pitch; + ShadeConstants _shade_constants; + uint32_t _vplce[4]; + uint32_t _vince[4]; + uint32_t _buftexturefracx[4]; + uint32_t _bufheight[4]; + const uint32_t * RESTRICT _bufplce[4]; + const uint32_t * RESTRICT _bufplce2[4]; + uint32_t _light[4]; + + uint32_t _srcalpha; + uint32_t _destalpha; + + DrawerWall4Command() + { + _dest = dc_dest; + _count = dc_count; + _pitch = dc_pitch; + _shade_constants = dc_shade_constants; + for (int i = 0; i < 4; i++) + { + _vplce[i] = vplce[i]; + _vince[i] = vince[i]; + _buftexturefracx[i] = buftexturefracx[i]; + _bufheight[i] = bufheight[i]; + _bufplce[i] = (const uint32_t *)bufplce[i]; + _bufplce2[i] = (const uint32_t *)bufplce2[i]; + _light[i] = LightBgra::calc_light_multiplier(palookuplight[i]); + } + _srcalpha = dc_srcalpha >> (FRACBITS - 8); + _destalpha = dc_destalpha >> (FRACBITS - 8); + } + + class LoopIterator + { + public: + uint32_t *dest; + int pitch; + int count; + uint32_t vplce[4]; + uint32_t vince[4]; + uint32_t height[4]; + uint32_t one[4]; + + LoopIterator(DrawerWall4Command *command, DrawerThread *thread) + { + count = 
thread->count_for_thread(command->_dest_y, command->_count); + if (count <= 0) + return; + + dest = thread->dest_for_thread(command->_dest_y, command->_pitch, (uint32_t*)command->_dest); + pitch = command->_pitch * thread->num_cores; + + int skipped = thread->skipped_by_thread(command->_dest_y); + for (int i = 0; i < 4; i++) + { + vplce[i] = command->_vplce[i] + command->_vince[i] * skipped; + vince[i] = command->_vince[i] * thread->num_cores; + height[i] = command->_bufheight[i]; + one[i] = ((0x80000000 + height[i] - 1) / height[i]) * 2 + 1; + } + } + + explicit operator bool() + { + return count > 0; + } + + int sample_index(int col) + { + return ((vplce[col] >> FRACBITS) * height[col]) >> FRACBITS; + } + + bool next() + { + vplce[0] += vince[0]; + vplce[1] += vince[1]; + vplce[2] += vince[2]; + vplce[3] += vince[3]; + dest += pitch; + return (--count) != 0; + } + }; + +#ifdef NO_SSE + struct NearestSampler + { + FORCEINLINE static uint32_t Sample1(DrawerWall4Command &cmd, LoopIterator &loop, int index) + { + return cmd._bufplce[index][loop.sample_index(index)]; + } + }; + struct LinearSampler + { + FORCEINLINE static uint32_t Sample1(DrawerWall4Command &cmd, LoopIterator &loop, int index) + { + return SampleBgra::sample_bilinear(cmd._bufplce[index], cmd._bufplce2[index], cmd._buftexturefracx[index], loop.vplce[index], loop.one[index], loop.height[index]); + } + }; +#else + struct NearestSampler + { + FORCEINLINE static __m128i Sample4(DrawerWall4Command &cmd, LoopIterator &loop) + { + return _mm_set_epi32(cmd._bufplce[3][loop.sample_index(3)], cmd._bufplce[2][loop.sample_index(2)], cmd._bufplce[1][loop.sample_index(1)], cmd._bufplce[0][loop.sample_index(0)]); + } + }; + + struct LinearSampler + { + FORCEINLINE static __m128i Sample4(DrawerWall4Command &cmd, LoopIterator &loop) + { + __m128i fg; + VEC_SAMPLE_BILINEAR4_COLUMN(fg, cmd._bufplce, cmd._bufplce2, cmd._buftexturefracx, loop.vplce, loop.one, loop.height); + return fg; + } + }; +#endif + +#ifdef NO_SSE + 
template + struct Copy + { + Copy(DrawerWall4Command &cmd, LoopIterator &loop) + { + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + for (int i = 0; i < 4; i++) + { + uint32_t fg = LightBgra::shade_bgra(Sampler::Sample1(cmd, loop, i), cmd._light[i], cmd._shade_constants); + loop.dest[i] = BlendBgra::copy(fg); + } + } + }; + + template + struct Mask + { + Mask(DrawerWall4Command &cmd, LoopIterator &loop) + { + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + for (int i = 0; i < 4; i++) + { + uint32_t fg = LightBgra::shade_bgra(Sampler::Sample1(cmd, loop, i), cmd._light[i], cmd._shade_constants); + loop.dest[i] = BlendBgra::alpha_blend(fg, loop.dest[i]); + } + } + }; + + template + struct TMaskAdd + { + TMaskAdd(DrawerWall4Command &cmd, LoopIterator &loop) + { + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + for (int i = 0; i < 4; i++) + { + uint32_t fg = LightBgra::shade_bgra(Sampler::Sample1(cmd, loop, i), cmd._light[i], cmd._shade_constants); + loop.dest[i] = BlendBgra::add(fg, loop.dest[i], cmd._srcalpha, calc_blend_bgalpha(fg, cmd._destalpha)); + } + } + }; + + template + struct TMaskSub + { + TMaskSub(DrawerWall4Command &cmd, LoopIterator &loop) + { + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + for (int i = 0; i < 4; i++) + { + uint32_t fg = LightBgra::shade_bgra(Sampler::Sample1(cmd, loop, i), cmd._light[i], cmd._shade_constants); + loop.dest[i] = BlendBgra::sub(fg, loop.dest[i], cmd._srcalpha, calc_blend_bgalpha(fg, cmd._destalpha)); + } + } + }; + + template + struct TMaskRevSub + { + TMaskRevSub(DrawerWall4Command &cmd, LoopIterator &loop) + { + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + for (int i = 0; i < 4; i++) + { + uint32_t fg = LightBgra::shade_bgra(Sampler::Sample1(cmd, loop, i), cmd._light[i], cmd._shade_constants); + loop.dest[i] = BlendBgra::revsub(fg, loop.dest[i], cmd._srcalpha, calc_blend_bgalpha(fg, cmd._destalpha)); + } + } + }; + + typedef 
Copy CopyNearestSimple; + typedef Copy CopyLinearSimple; + typedef Copy CopyNearest; + typedef Copy CopyLinear; + typedef Mask MaskNearestSimple; + typedef Mask MaskLinearSimple; + typedef Mask MaskNearest; + typedef Mask MaskLinear; + typedef TMaskAdd TMaskAddNearestSimple; + typedef TMaskAdd TMaskAddLinearSimple; + typedef TMaskAdd TMaskAddNearest; + typedef TMaskAdd TMaskAddLinear; + typedef TMaskSub TMaskSubNearestSimple; + typedef TMaskSub TMaskSubLinearSimple; + typedef TMaskSub TMaskSubNearest; + typedef TMaskSub TMaskSubLinear; + typedef TMaskRevSub TMaskRevSubNearestSimple; + typedef TMaskRevSub TMaskRevSubLinearSimple; + typedef TMaskRevSub TMaskRevSubNearest; + typedef TMaskRevSub TMaskRevSubLinear; +#else + template + struct CopySimple + { + VEC_SHADE_VARS(); + CopySimple(DrawerWall4Command &cmd, LoopIterator &loop) + { + VEC_SHADE_SIMPLE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0]); + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + __m128i fg = Sampler::Sample4(cmd, loop); + VEC_SHADE_SIMPLE(fg); + _mm_storeu_si128((__m128i*)loop.dest, fg); + } + }; + + template + struct Copy + { + VEC_SHADE_VARS(); + Copy(DrawerWall4Command &cmd, LoopIterator &loop) + { + VEC_SHADE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0], cmd._shade_constants); + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + __m128i fg = Sampler::Sample4(cmd, loop); + VEC_SHADE(fg, cmd._shade_constants); + _mm_storeu_si128((__m128i*)loop.dest, fg); + } + }; + + template + struct MaskSimple + { + VEC_SHADE_VARS(); + MaskSimple(DrawerWall4Command &cmd, LoopIterator &loop) + { + VEC_SHADE_SIMPLE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0]); + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + __m128i fg = Sampler::Sample4(cmd, loop); + __m128i bg = _mm_loadu_si128((const __m128i*)loop.dest); + VEC_SHADE_SIMPLE(fg); + VEC_ALPHA_BLEND(fg, bg); + _mm_storeu_si128((__m128i*)loop.dest, fg); 
+ } + }; + + template + struct Mask + { + VEC_SHADE_VARS(); + Mask(DrawerWall4Command &cmd, LoopIterator &loop) + { + VEC_SHADE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0], cmd._shade_constants); + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + __m128i fg = Sampler::Sample4(cmd, loop); + __m128i bg = _mm_loadu_si128((const __m128i*)loop.dest); + VEC_SHADE(fg, cmd._shade_constants); + VEC_ALPHA_BLEND(fg, bg); + _mm_storeu_si128((__m128i*)loop.dest, fg); + } + }; + + template + struct TMaskAddSimple + { + VEC_SHADE_VARS(); + VEC_CALC_BLEND_ALPHA_VARS(); + TMaskAddSimple(DrawerWall4Command &cmd, LoopIterator &loop) + { + VEC_SHADE_SIMPLE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0]); + VEC_CALC_BLEND_ALPHA_INIT(cmd._srcalpha, cmd._destalpha); + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + __m128i fg = Sampler::Sample4(cmd, loop); + __m128i bg = _mm_loadu_si128((const __m128i*)loop.dest); + + VEC_CALC_BLEND_ALPHA(fg); + VEC_SHADE_SIMPLE(fg); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + __m128i out_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8); + __m128i out_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8); + __m128i out = _mm_packus_epi16(out_lo, out_hi); + + _mm_storeu_si128((__m128i*)loop.dest, out); + } + }; + + template + struct TMaskAdd + { + VEC_SHADE_VARS(); + VEC_CALC_BLEND_ALPHA_VARS(); + TMaskAdd(DrawerWall4Command &cmd, LoopIterator &loop) + { + VEC_SHADE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0], cmd._shade_constants); + VEC_CALC_BLEND_ALPHA_INIT(cmd._srcalpha, cmd._destalpha); + } + void Blend(DrawerWall4Command &cmd, 
LoopIterator &loop) + { + __m128i fg = Sampler::Sample4(cmd, loop); + __m128i bg = _mm_loadu_si128((const __m128i*)loop.dest); + + VEC_CALC_BLEND_ALPHA(fg); + /* Non-simple variant: shade with cmd._shade_constants like Copy/Mask and the NO_SSE path (VEC_SHADE_SIMPLE takes no shade constants). */ + VEC_SHADE(fg, cmd._shade_constants); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + __m128i out_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8); + __m128i out_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8); + __m128i out = _mm_packus_epi16(out_lo, out_hi); + + _mm_storeu_si128((__m128i*)loop.dest, out); + } + }; + + template + struct TMaskSubSimple + { + VEC_SHADE_VARS(); + VEC_CALC_BLEND_ALPHA_VARS(); + TMaskSubSimple(DrawerWall4Command &cmd, LoopIterator &loop) + { + VEC_SHADE_SIMPLE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0]); + VEC_CALC_BLEND_ALPHA_INIT(cmd._srcalpha, cmd._destalpha); + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + __m128i fg = Sampler::Sample4(cmd, loop); + __m128i bg = _mm_loadu_si128((const __m128i*)loop.dest); + + VEC_CALC_BLEND_ALPHA(fg); + VEC_SHADE_SIMPLE(fg); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + __m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_hi, bg_alpha_hi), _mm_mullo_epi16(fg_hi, fg_alpha_hi)), 8); + __m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_lo, bg_alpha_lo), _mm_mullo_epi16(fg_lo, fg_alpha_lo)), 8); + __m128i out = _mm_packus_epi16(out_lo, out_hi); + + _mm_storeu_si128((__m128i*)loop.dest, out); + } + }; + + template + struct TMaskSub + { + VEC_SHADE_VARS(); + 
VEC_CALC_BLEND_ALPHA_VARS(); + TMaskSub(DrawerWall4Command &cmd, LoopIterator &loop) + { + VEC_SHADE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0], cmd._shade_constants); + VEC_CALC_BLEND_ALPHA_INIT(cmd._srcalpha, cmd._destalpha); + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + __m128i fg = Sampler::Sample4(cmd, loop); + __m128i bg = _mm_loadu_si128((const __m128i*)loop.dest); + + VEC_CALC_BLEND_ALPHA(fg); + /* Non-simple variant: shade with cmd._shade_constants like Copy/Mask and the NO_SSE path (VEC_SHADE_SIMPLE takes no shade constants). */ + VEC_SHADE(fg, cmd._shade_constants); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + __m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_hi, bg_alpha_hi), _mm_mullo_epi16(fg_hi, fg_alpha_hi)), 8); + __m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_lo, bg_alpha_lo), _mm_mullo_epi16(fg_lo, fg_alpha_lo)), 8); + __m128i out = _mm_packus_epi16(out_lo, out_hi); + + _mm_storeu_si128((__m128i*)loop.dest, out); + } + }; + + template + struct TMaskRevSubSimple + { + VEC_SHADE_VARS(); + VEC_CALC_BLEND_ALPHA_VARS(); + TMaskRevSubSimple(DrawerWall4Command &cmd, LoopIterator &loop) + { + VEC_SHADE_SIMPLE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0]); + VEC_CALC_BLEND_ALPHA_INIT(cmd._srcalpha, cmd._destalpha); + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + __m128i fg = Sampler::Sample4(cmd, loop); + __m128i bg = _mm_loadu_si128((const __m128i*)loop.dest); + + VEC_CALC_BLEND_ALPHA(fg); + VEC_SHADE_SIMPLE(fg); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + __m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 
8); + __m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8); + __m128i out = _mm_packus_epi16(out_lo, out_hi); + + _mm_storeu_si128((__m128i*)loop.dest, out); + } + }; + + template + struct TMaskRevSub + { + VEC_SHADE_VARS(); + VEC_CALC_BLEND_ALPHA_VARS(); + TMaskRevSub(DrawerWall4Command &cmd, LoopIterator &loop) + { + VEC_SHADE_INIT4(cmd._light[3], cmd._light[2], cmd._light[1], cmd._light[0], cmd._shade_constants); + VEC_CALC_BLEND_ALPHA_INIT(cmd._srcalpha, cmd._destalpha); + } + void Blend(DrawerWall4Command &cmd, LoopIterator &loop) + { + __m128i fg = Sampler::Sample4(cmd, loop); + __m128i bg = _mm_loadu_si128((const __m128i*)loop.dest); + + VEC_CALC_BLEND_ALPHA(fg); + /* Non-simple variant: shade with cmd._shade_constants like Copy/Mask and the NO_SSE path (VEC_SHADE_SIMPLE takes no shade constants). */ + VEC_SHADE(fg, cmd._shade_constants); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + __m128i out_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_hi, fg_alpha_hi), _mm_mullo_epi16(bg_hi, bg_alpha_hi)), 8); + __m128i out_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_lo, fg_alpha_lo), _mm_mullo_epi16(bg_lo, bg_alpha_lo)), 8); + __m128i out = _mm_packus_epi16(out_lo, out_hi); + + _mm_storeu_si128((__m128i*)loop.dest, out); + } + }; + + typedef CopySimple CopyNearestSimple; + typedef CopySimple CopyLinearSimple; + typedef Copy CopyNearest; + typedef Copy CopyLinear; + typedef MaskSimple MaskNearestSimple; + typedef MaskSimple MaskLinearSimple; + typedef Mask MaskNearest; + typedef Mask MaskLinear; + typedef TMaskAddSimple TMaskAddNearestSimple; + typedef TMaskAddSimple TMaskAddLinearSimple; + typedef TMaskAdd TMaskAddNearest; + typedef TMaskAdd TMaskAddLinear; + typedef TMaskSubSimple TMaskSubNearestSimple; + typedef TMaskSubSimple TMaskSubLinearSimple; + typedef TMaskSub TMaskSubNearest; + typedef TMaskSub TMaskSubLinear; + 
typedef TMaskRevSubSimple TMaskRevSubNearestSimple; + typedef TMaskRevSubSimple TMaskRevSubLinearSimple; + typedef TMaskRevSub TMaskRevSubNearest; + typedef TMaskRevSub TMaskRevSubLinear; +#endif +}; + +typedef DrawerBlendCommand Vlinec4NearestSimpleRGBACommand; +typedef DrawerBlendCommand Vlinec4NearestRGBACommand; +typedef DrawerBlendCommand Vlinec4LinearSimpleRGBACommand; +typedef DrawerBlendCommand Vlinec4LinearRGBACommand; +typedef DrawerBlendCommand Mvlinec4NearestSimpleRGBACommand; +typedef DrawerBlendCommand Mvlinec4NearestRGBACommand; +typedef DrawerBlendCommand Mvlinec4LinearSimpleRGBACommand; +typedef DrawerBlendCommand Mvlinec4LinearRGBACommand; +typedef DrawerBlendCommand Tmvline4AddNearestSimpleRGBACommand; +typedef DrawerBlendCommand Tmvline4AddNearestRGBACommand; +typedef DrawerBlendCommand Tmvline4AddLinearSimpleRGBACommand; +typedef DrawerBlendCommand Tmvline4AddLinearRGBACommand; +typedef DrawerBlendCommand Tmvline4AddClampNearestSimpleRGBACommand; +typedef DrawerBlendCommand Tmvline4AddClampNearestRGBACommand; +typedef DrawerBlendCommand Tmvline4AddClampLinearSimpleRGBACommand; +typedef DrawerBlendCommand Tmvline4AddClampLinearRGBACommand; +typedef DrawerBlendCommand Tmvline4SubClampNearestSimpleRGBACommand; +typedef DrawerBlendCommand Tmvline4SubClampNearestRGBACommand; +typedef DrawerBlendCommand Tmvline4SubClampLinearSimpleRGBACommand; +typedef DrawerBlendCommand Tmvline4SubClampLinearRGBACommand; +typedef DrawerBlendCommand Tmvline4RevSubClampNearestSimpleRGBACommand; +typedef DrawerBlendCommand Tmvline4RevSubClampNearestRGBACommand; +typedef DrawerBlendCommand Tmvline4RevSubClampLinearSimpleRGBACommand; +typedef DrawerBlendCommand Tmvline4RevSubClampLinearRGBACommand; + +class Vlinec1RGBACommand : public DrawerWall1Command +{ +public: + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + + if (_source2 == nullptr) + { + do + { + uint32_t fg = 
LightBgra::shade_bgra(_source[loop.sample_index()], _light, _shade_constants); + *loop.dest = BlendBgra::copy(fg); + } while (loop.next()); + } + else + { + do + { + uint32_t fg = LightBgra::shade_bgra(SampleBgra::sample_bilinear(_source, _source2, loop.texturefracx, loop.frac, loop.one, loop.height), _light, _shade_constants); + *loop.dest = BlendBgra::copy(fg); + } while (loop.next()); + } + } +}; + +class Mvlinec1RGBACommand : public DrawerWall1Command +{ +public: + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + + if (_source2 == nullptr) + { + do + { + uint32_t fg = LightBgra::shade_bgra(_source[loop.sample_index()], _light, _shade_constants); + *loop.dest = BlendBgra::alpha_blend(fg, *loop.dest); + } while (loop.next()); + } + else + { + do + { + uint32_t fg = LightBgra::shade_bgra(SampleBgra::sample_bilinear(_source, _source2, loop.texturefracx, loop.frac, loop.one, loop.height), _light, _shade_constants); + *loop.dest = BlendBgra::alpha_blend(fg, *loop.dest); + } while (loop.next()); + } + } +}; + +class Tmvline1AddRGBACommand : public DrawerWall1Command +{ +public: + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + uint32_t fg = LightBgra::shade_bgra(_source[loop.sample_index()], _light, _shade_constants); + *loop.dest = BlendBgra::add(fg, *loop.dest, _srcalpha, calc_blend_bgalpha(fg, _destalpha)); + } while (loop.next()); + } +}; + +class Tmvline1AddClampRGBACommand : public DrawerWall1Command +{ +public: + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + uint32_t fg = LightBgra::shade_bgra(_source[loop.sample_index()], _light, _shade_constants); + *loop.dest = BlendBgra::add(fg, *loop.dest, _srcalpha, calc_blend_bgalpha(fg, _destalpha)); + } while (loop.next()); + } +}; + +class Tmvline1SubClampRGBACommand : public DrawerWall1Command +{ +public: + void 
Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + uint32_t fg = LightBgra::shade_bgra(_source[loop.sample_index()], _light, _shade_constants); + *loop.dest = BlendBgra::sub(fg, *loop.dest, _srcalpha, calc_blend_bgalpha(fg, _destalpha)); + } while (loop.next()); + } +}; + +class Tmvline1RevSubClampRGBACommand : public DrawerWall1Command +{ +public: + void Execute(DrawerThread *thread) override + { + LoopIterator loop(this, thread); + if (!loop) return; + do + { + uint32_t fg = LightBgra::shade_bgra(_source[loop.sample_index()], _light, _shade_constants); + *loop.dest = BlendBgra::revsub(fg, *loop.dest, _srcalpha, calc_blend_bgalpha(fg, _destalpha)); + } while (loop.next()); + } +}; + +///////////////////////////////////////////////////////////////////////////// + +class DrawFogBoundaryLineRGBACommand : public DrawerCommand +{ + int _y; + int _x; + int _x2; + BYTE * RESTRICT _destorg; + fixed_t _light; + ShadeConstants _shade_constants; + +public: + DrawFogBoundaryLineRGBACommand(int y, int x, int x2) + { + _y = y; + _x = x; + _x2 = x2; + + _destorg = dc_destorg; + _light = dc_light; + _shade_constants = dc_shade_constants; + } + + void Execute(DrawerThread *thread) override + { + if (thread->line_skipped_by_thread(_y)) + return; + + int y = _y; + int x = _x; + int x2 = _x2; + + uint32_t *dest = ylookup[y] + (uint32_t*)_destorg; + + uint32_t light = LightBgra::calc_light_multiplier(_light); + ShadeConstants constants = _shade_constants; + + do + { + uint32_t red = (dest[x] >> 16) & 0xff; + uint32_t green = (dest[x] >> 8) & 0xff; + uint32_t blue = dest[x] & 0xff; + + if (constants.simple_shade) + { + red = red * light / 256; + green = green * light / 256; + blue = blue * light / 256; + } + else + { + uint32_t inv_light = 256 - light; + uint32_t inv_desaturate = 256 - constants.desaturate; + + uint32_t intensity = ((red * 77 + green * 143 + blue * 37) >> 8) * constants.desaturate; + + red = (red * 
inv_desaturate + intensity) / 256; + green = (green * inv_desaturate + intensity) / 256; + blue = (blue * inv_desaturate + intensity) / 256; + + red = (constants.fade_red * inv_light + red * light) / 256; + green = (constants.fade_green * inv_light + green * light) / 256; + blue = (constants.fade_blue * inv_light + blue * light) / 256; + + red = (red * constants.light_red) / 256; + green = (green * constants.light_green) / 256; + blue = (blue * constants.light_blue) / 256; + } + + dest[x] = 0xff000000 | (red << 16) | (green << 8) | blue; + } while (++x <= x2); + } +}; + +class DrawTiltedSpanRGBACommand : public DrawerCommand +{ + int _x1; + int _x2; + int _y; + BYTE * RESTRICT _destorg; + fixed_t _light; + ShadeConstants _shade_constants; + FVector3 _plane_sz; + FVector3 _plane_su; + FVector3 _plane_sv; + bool _plane_shade; + int _planeshade; + float _planelightfloat; + fixed_t _pviewx; + fixed_t _pviewy; + int _xbits; + int _ybits; + const uint32_t * RESTRICT _source; + +public: + DrawTiltedSpanRGBACommand(int y, int x1, int x2, const FVector3 &plane_sz, const FVector3 &plane_su, const FVector3 &plane_sv, bool plane_shade, int planeshade, float planelightfloat, fixed_t pviewx, fixed_t pviewy) + { + _x1 = x1; + _x2 = x2; + _y = y; + _destorg = dc_destorg; + _light = ds_light; + _shade_constants = ds_shade_constants; + _plane_sz = plane_sz; + _plane_su = plane_su; + _plane_sv = plane_sv; + _plane_shade = plane_shade; + _planeshade = planeshade; + _planelightfloat = planelightfloat; + _pviewx = pviewx; + _pviewy = pviewy; + _source = (const uint32_t*)ds_source; + _xbits = ds_xbits; + _ybits = ds_ybits; + } + + void Execute(DrawerThread *thread) override + { + if (thread->line_skipped_by_thread(_y)) + return; + + //#define SPANSIZE 32 + //#define INVSPAN 0.03125f + //#define SPANSIZE 8 + //#define INVSPAN 0.125f + #define SPANSIZE 16 + #define INVSPAN 0.0625f + + int source_width = 1 << _xbits; + int source_height = 1 << _ybits; + + uint32_t *dest = ylookup[_y] + _x1 
+ (uint32_t*)_destorg; + int count = _x2 - _x1 + 1; + + // Depth (Z) change across the span + double iz = _plane_sz[2] + _plane_sz[1] * (centery - _y) + _plane_sz[0] * (_x1 - centerx); + + // Light change across the span + fixed_t lightstart = _light; + fixed_t lightend = lightstart; + if (_plane_shade) + { + double vis_start = iz * _planelightfloat; + double vis_end = (iz + _plane_sz[0] * count) * _planelightfloat; + + lightstart = LIGHTSCALE(vis_start, _planeshade); + lightend = LIGHTSCALE(vis_end, _planeshade); + } + fixed_t light = lightstart; + fixed_t steplight = (lightend - lightstart) / count; + + // Texture coordinates + double uz = _plane_su[2] + _plane_su[1] * (centery - _y) + _plane_su[0] * (_x1 - centerx); + double vz = _plane_sv[2] + _plane_sv[1] * (centery - _y) + _plane_sv[0] * (_x1 - centerx); + double startz = 1.f / iz; + double startu = uz*startz; + double startv = vz*startz; + double izstep = _plane_sz[0] * SPANSIZE; + double uzstep = _plane_su[0] * SPANSIZE; + double vzstep = _plane_sv[0] * SPANSIZE; + + // Linear interpolate in sizes of SPANSIZE to increase speed + while (count >= SPANSIZE) + { + iz += izstep; + uz += uzstep; + vz += vzstep; + + double endz = 1.f / iz; + double endu = uz*endz; + double endv = vz*endz; + uint32_t stepu = (uint32_t)(SQWORD((endu - startu) * INVSPAN)); + uint32_t stepv = (uint32_t)(SQWORD((endv - startv) * INVSPAN)); + uint32_t u = (uint32_t)(SQWORD(startu) + _pviewx); + uint32_t v = (uint32_t)(SQWORD(startv) + _pviewy); + + for (int i = 0; i < SPANSIZE; i++) + { + uint32_t sx = ((u >> 16) * source_width) >> 16; + uint32_t sy = ((v >> 16) * source_height) >> 16; + uint32_t fg = _source[sy + sx * source_height]; + + if (_shade_constants.simple_shade) + *(dest++) = LightBgra::shade_bgra_simple(fg, LightBgra::calc_light_multiplier(light)); + else + *(dest++) = LightBgra::shade_bgra(fg, LightBgra::calc_light_multiplier(light), _shade_constants); + + u += stepu; + v += stepv; + light += steplight; + } + startu = endu; 
+ startv = endv; + count -= SPANSIZE; + } + + // The last few pixels at the end + while (count > 0) + { + double endz = 1.f / iz; + startu = uz*endz; + startv = vz*endz; + uint32_t u = (uint32_t)(SQWORD(startu) + _pviewx); + uint32_t v = (uint32_t)(SQWORD(startv) + _pviewy); + + uint32_t sx = ((u >> 16) * source_width) >> 16; + uint32_t sy = ((v >> 16) * source_height) >> 16; + uint32_t fg = _source[sy + sx * source_height]; + + if (_shade_constants.simple_shade) + *(dest++) = LightBgra::shade_bgra_simple(fg, LightBgra::calc_light_multiplier(light)); + else + *(dest++) = LightBgra::shade_bgra(fg, LightBgra::calc_light_multiplier(light), _shade_constants); + + iz += _plane_sz[0]; + uz += _plane_su[0]; + vz += _plane_sv[0]; + light += steplight; + count--; + } + } +}; + +class DrawColoredSpanRGBACommand : public DrawerCommand +{ + int _y; + int _x1; + int _x2; + BYTE * RESTRICT _destorg; + fixed_t _light; + int _color; + +public: + DrawColoredSpanRGBACommand(int y, int x1, int x2) + { + _y = y; + _x1 = x1; + _x2 = x2; + + _destorg = dc_destorg; + _light = ds_light; + _color = ds_color; + } + + void Execute(DrawerThread *thread) override + { + if (thread->line_skipped_by_thread(_y)) + return; + + int y = _y; + int x1 = _x1; + int x2 = _x2; + + uint32_t *dest = ylookup[y] + x1 + (uint32_t*)_destorg; + int count = (x2 - x1 + 1); + uint32_t light = LightBgra::calc_light_multiplier(_light); + uint32_t color = LightBgra::shade_pal_index_simple(_color, light); + for (int i = 0; i < count; i++) + dest[i] = color; + } +}; + +class FillTransColumnRGBACommand : public DrawerCommand +{ + int _x; + int _y1; + int _y2; + int _color; + int _a; + BYTE * RESTRICT _destorg; + int _pitch; + fixed_t _light; + +public: + FillTransColumnRGBACommand(int x, int y1, int y2, int color, int a) + { + _x = x; + _y1 = y1; + _y2 = y2; + _color = color; + _a = a; + + _destorg = dc_destorg; + _pitch = dc_pitch; + } + + void Execute(DrawerThread *thread) override + { + int x = _x; + int y1 = _y1; + 
int y2 = _y2; + int color = _color; + int a = _a; + + int ycount = thread->count_for_thread(y1, y2 - y1 + 1); + if (ycount <= 0) + return; + + uint32_t fg = GPalette.BaseColors[color].d; + uint32_t fg_red = (fg >> 16) & 0xff; + uint32_t fg_green = (fg >> 8) & 0xff; + uint32_t fg_blue = fg & 0xff; + + uint32_t alpha = a + 1; + uint32_t inv_alpha = 256 - alpha; + + fg_red *= alpha; + fg_green *= alpha; + fg_blue *= alpha; + + int spacing = _pitch * thread->num_cores; + uint32_t *dest = thread->dest_for_thread(y1, _pitch, ylookup[y1] + x + (uint32_t*)_destorg); + + for (int y = 0; y < ycount; y++) + { + uint32_t bg_red = (*dest >> 16) & 0xff; + uint32_t bg_green = (*dest >> 8) & 0xff; + uint32_t bg_blue = (*dest) & 0xff; + + uint32_t red = (fg_red + bg_red * inv_alpha) / 256; + uint32_t green = (fg_green + bg_green * inv_alpha) / 256; + uint32_t blue = (fg_blue + bg_blue * inv_alpha) / 256; + + *dest = 0xff000000 | (red << 16) | (green << 8) | blue; + dest += spacing; + } + } +}; + +ApplySpecialColormapRGBACommand::ApplySpecialColormapRGBACommand(FSpecialColormap *colormap, DFrameBuffer *screen) +{ + buffer = screen->GetBuffer(); + pitch = screen->GetPitch(); + width = screen->GetWidth(); + height = screen->GetHeight(); + + start_red = (int)(colormap->ColorizeStart[0] * 255); + start_green = (int)(colormap->ColorizeStart[1] * 255); + start_blue = (int)(colormap->ColorizeStart[2] * 255); + end_red = (int)(colormap->ColorizeEnd[0] * 255); + end_green = (int)(colormap->ColorizeEnd[1] * 255); + end_blue = (int)(colormap->ColorizeEnd[2] * 255); +} + +#ifdef NO_SSE +void ApplySpecialColormapRGBACommand::Execute(DrawerThread *thread) +{ + int y = thread->skipped_by_thread(0); + int count = thread->count_for_thread(0, height); + while (count > 0) + { + BYTE *pixels = buffer + y * pitch * 4; + for (int x = 0; x < width; x++) + { + int fg_red = pixels[2]; + int fg_green = pixels[1]; + int fg_blue = pixels[0]; + + int gray = (fg_red * 77 + fg_green * 143 + fg_blue * 37) >> 8; + 
gray += (gray >> 7); // gray*=256/255 + int inv_gray = 256 - gray; + + int red = clamp((start_red * inv_gray + end_red * gray) >> 8, 0, 255); + int green = clamp((start_green * inv_gray + end_green * gray) >> 8, 0, 255); + int blue = clamp((start_blue * inv_gray + end_blue * gray) >> 8, 0, 255); + + pixels[0] = (BYTE)blue; + pixels[1] = (BYTE)green; + pixels[2] = (BYTE)red; + pixels[3] = 0xff; + + pixels += 4; + } + y += thread->num_cores; + count--; + } +} +#else +void ApplySpecialColormapRGBACommand::Execute(DrawerThread *thread) +{ + int y = thread->skipped_by_thread(0); + int count = thread->count_for_thread(0, height); + __m128i gray_weight = _mm_set_epi16(256, 77, 143, 37, 256, 77, 143, 37); + __m128i start_end = _mm_set_epi16(255, start_red, start_green, start_blue, 255, end_red, end_green, end_blue); + while (count > 0) + { + BYTE *pixels = buffer + y * pitch * 4; + int sse_length = width / 4; + for (int x = 0; x < sse_length; x++) + { + // Unpack to integers: + __m128i p = _mm_loadu_si128((const __m128i*)pixels); + + __m128i p16_0 = _mm_unpacklo_epi8(p, _mm_setzero_si128()); + __m128i p16_1 = _mm_unpackhi_epi8(p, _mm_setzero_si128()); + + // Add gray weighting to colors + __m128i mullo0 = _mm_mullo_epi16(p16_0, gray_weight); + __m128i mullo1 = _mm_mullo_epi16(p16_1, gray_weight); + __m128i p32_0 = _mm_unpacklo_epi16(mullo0, _mm_setzero_si128()); + __m128i p32_1 = _mm_unpackhi_epi16(mullo0, _mm_setzero_si128()); + __m128i p32_2 = _mm_unpacklo_epi16(mullo1, _mm_setzero_si128()); + __m128i p32_3 = _mm_unpackhi_epi16(mullo1, _mm_setzero_si128()); + + // Transpose to get color components in individual vectors: + __m128 tmpx = _mm_castsi128_ps(p32_0); + __m128 tmpy = _mm_castsi128_ps(p32_1); + __m128 tmpz = _mm_castsi128_ps(p32_2); + __m128 tmpw = _mm_castsi128_ps(p32_3); + _MM_TRANSPOSE4_PS(tmpx, tmpy, tmpz, tmpw); + __m128i blue = _mm_castps_si128(tmpx); + __m128i green = _mm_castps_si128(tmpy); + __m128i red = _mm_castps_si128(tmpz); + __m128i alpha = 
_mm_castps_si128(tmpw); + + // Calculate gray and 256-gray values: + __m128i gray = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(red, green), blue), 8); + __m128i inv_gray = _mm_sub_epi32(_mm_set1_epi32(256), gray); + + // p32 = start * inv_gray + end * gray: + __m128i gray0 = _mm_shuffle_epi32(gray, _MM_SHUFFLE(0, 0, 0, 0)); + __m128i gray1 = _mm_shuffle_epi32(gray, _MM_SHUFFLE(1, 1, 1, 1)); + __m128i gray2 = _mm_shuffle_epi32(gray, _MM_SHUFFLE(2, 2, 2, 2)); + __m128i gray3 = _mm_shuffle_epi32(gray, _MM_SHUFFLE(3, 3, 3, 3)); + __m128i inv_gray0 = _mm_shuffle_epi32(inv_gray, _MM_SHUFFLE(0, 0, 0, 0)); + __m128i inv_gray1 = _mm_shuffle_epi32(inv_gray, _MM_SHUFFLE(1, 1, 1, 1)); + __m128i inv_gray2 = _mm_shuffle_epi32(inv_gray, _MM_SHUFFLE(2, 2, 2, 2)); + __m128i inv_gray3 = _mm_shuffle_epi32(inv_gray, _MM_SHUFFLE(3, 3, 3, 3)); + __m128i gray16_0 = _mm_packs_epi32(gray0, inv_gray0); + __m128i gray16_1 = _mm_packs_epi32(gray1, inv_gray1); + __m128i gray16_2 = _mm_packs_epi32(gray2, inv_gray2); + __m128i gray16_3 = _mm_packs_epi32(gray3, inv_gray3); + __m128i gray16_0_mullo = _mm_mullo_epi16(gray16_0, start_end); + __m128i gray16_1_mullo = _mm_mullo_epi16(gray16_1, start_end); + __m128i gray16_2_mullo = _mm_mullo_epi16(gray16_2, start_end); + __m128i gray16_3_mullo = _mm_mullo_epi16(gray16_3, start_end); + __m128i gray16_0_mulhi = _mm_mulhi_epi16(gray16_0, start_end); + __m128i gray16_1_mulhi = _mm_mulhi_epi16(gray16_1, start_end); + __m128i gray16_2_mulhi = _mm_mulhi_epi16(gray16_2, start_end); + __m128i gray16_3_mulhi = _mm_mulhi_epi16(gray16_3, start_end); + p32_0 = _mm_srli_epi32(_mm_add_epi32(_mm_unpacklo_epi16(gray16_0_mullo, gray16_0_mulhi), _mm_unpackhi_epi16(gray16_0_mullo, gray16_0_mulhi)), 8); + p32_1 = _mm_srli_epi32(_mm_add_epi32(_mm_unpacklo_epi16(gray16_1_mullo, gray16_1_mulhi), _mm_unpackhi_epi16(gray16_1_mullo, gray16_1_mulhi)), 8); + p32_2 = _mm_srli_epi32(_mm_add_epi32(_mm_unpacklo_epi16(gray16_2_mullo, gray16_2_mulhi), _mm_unpackhi_epi16(gray16_2_mullo, 
gray16_2_mulhi)), 8); + p32_3 = _mm_srli_epi32(_mm_add_epi32(_mm_unpacklo_epi16(gray16_3_mullo, gray16_3_mulhi), _mm_unpackhi_epi16(gray16_3_mullo, gray16_3_mulhi)), 8); + + p16_0 = _mm_packs_epi32(p32_0, p32_1); + p16_1 = _mm_packs_epi32(p32_2, p32_3); + p = _mm_packus_epi16(p16_0, p16_1); + + _mm_storeu_si128((__m128i*)pixels, p); + pixels += 16; + } + + for (int x = sse_length * 4; x < width; x++) + { + int fg_red = pixels[2]; + int fg_green = pixels[1]; + int fg_blue = pixels[0]; + + int gray = (fg_red * 77 + fg_green * 143 + fg_blue * 37) >> 8; + gray += (gray >> 7); // gray*=256/255 + int inv_gray = 256 - gray; + + int red = clamp((start_red * inv_gray + end_red * gray) >> 8, 0, 255); + int green = clamp((start_green * inv_gray + end_green * gray) >> 8, 0, 255); + int blue = clamp((start_blue * inv_gray + end_blue * gray) >> 8, 0, 255); + + pixels[0] = (BYTE)blue; + pixels[1] = (BYTE)green; + pixels[2] = (BYTE)red; + pixels[3] = 0xff; + + pixels += 4; + } + + y += thread->num_cores; + count--; + } +} +#endif + +///////////////////////////////////////////////////////////////////////////// + +void R_BeginDrawerCommands() +{ + DrawerCommandQueue::Begin(); +} + +void R_EndDrawerCommands() +{ + DrawerCommandQueue::End(); +} + +void R_DrawColumn_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_FillColumn_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_FillAddColumn_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_FillAddClampColumn_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_FillSubClampColumn_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_FillRevSubClampColumn_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_DrawFuzzColumn_rgba() +{ + DrawerCommandQueue::QueueCommand(); + + dc_yl = MAX(dc_yl, 1); + dc_yh = MIN(dc_yh, fuzzviewheight); + if (dc_yl <= dc_yh) + fuzzpos = (fuzzpos + dc_yh - dc_yl + 1) % FUZZTABLE; +} + +void R_DrawAddColumn_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} 
+ +void R_DrawTranslatedColumn_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_DrawTlatedAddColumn_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_DrawShadedColumn_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_DrawAddClampColumn_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_DrawAddClampTranslatedColumn_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_DrawSubClampColumn_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_DrawSubClampTranslatedColumn_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_DrawRevSubClampColumn_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_DrawRevSubClampTranslatedColumn_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_DrawSpan_rgba() +{ +#ifdef NO_SSE + DrawerCommandQueue::QueueCommand(); +#else + DrawerCommandQueue::QueueCommand(); +#endif +} + +void R_DrawSpanMasked_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_DrawSpanTranslucent_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_DrawSpanMaskedTranslucent_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_DrawSpanAddClamp_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_DrawSpanMaskedAddClamp_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_FillSpan_rgba() +{ + DrawerCommandQueue::QueueCommand(); +} + +void R_DrawTiltedSpan_rgba(int y, int x1, int x2, const FVector3 &plane_sz, const FVector3 &plane_su, const FVector3 &plane_sv, bool plane_shade, int planeshade, float planelightfloat, fixed_t pviewx, fixed_t pviewy) +{ + DrawerCommandQueue::QueueCommand(y, x1, x2, plane_sz, plane_su, plane_sv, plane_shade, planeshade, planelightfloat, pviewx, pviewy); +} + +void R_DrawColoredSpan_rgba(int y, int x1, int x2) +{ + DrawerCommandQueue::QueueCommand(y, x1, x2); +} + +static ShadeConstants slab_rgba_shade_constants; +static const BYTE *slab_rgba_colormap; +static fixed_t slab_rgba_light; + +void 
R_SetupDrawSlab_rgba(FSWColormap *base_colormap, float light, int shade) +{ + slab_rgba_shade_constants.light_red = base_colormap->Color.r * 256 / 255; + slab_rgba_shade_constants.light_green = base_colormap->Color.g * 256 / 255; + slab_rgba_shade_constants.light_blue = base_colormap->Color.b * 256 / 255; + slab_rgba_shade_constants.light_alpha = base_colormap->Color.a * 256 / 255; + slab_rgba_shade_constants.fade_red = base_colormap->Fade.r; + slab_rgba_shade_constants.fade_green = base_colormap->Fade.g; + slab_rgba_shade_constants.fade_blue = base_colormap->Fade.b; + slab_rgba_shade_constants.fade_alpha = base_colormap->Fade.a; + slab_rgba_shade_constants.desaturate = MIN(abs(base_colormap->Desaturate), 255) * 255 / 256; + slab_rgba_shade_constants.simple_shade = (base_colormap->Color.d == 0x00ffffff && base_colormap->Fade.d == 0x00000000 && base_colormap->Desaturate == 0); + slab_rgba_colormap = base_colormap->Maps; + slab_rgba_light = LIGHTSCALE(light, shade); +} + +void R_DrawSlab_rgba(int dx, fixed_t v, int dy, fixed_t vi, const BYTE *vptr, BYTE *p) +{ + DrawerCommandQueue::QueueCommand(dx, v, dy, vi, vptr, p, slab_rgba_shade_constants, slab_rgba_colormap, slab_rgba_light); +} + +DWORD vlinec1_rgba() +{ + DrawerCommandQueue::QueueCommand(); + return dc_texturefrac + dc_count * dc_iscale; +} + +template +void queue_wallcommand() +{ + if (bufplce2[0] == nullptr && dc_shade_constants.simple_shade) + DrawerCommandQueue::QueueCommand(); + else if (bufplce2[0] == nullptr) + DrawerCommandQueue::QueueCommand(); + else if (dc_shade_constants.simple_shade) + DrawerCommandQueue::QueueCommand(); + else + DrawerCommandQueue::QueueCommand(); +} + +void vlinec4_rgba() +{ + queue_wallcommand(); + for (int i = 0; i < 4; i++) + vplce[i] += vince[i] * dc_count; +} + +DWORD mvlinec1_rgba() +{ + DrawerCommandQueue::QueueCommand(); + return dc_texturefrac + dc_count * dc_iscale; +} + +void mvlinec4_rgba() +{ + queue_wallcommand(); + for (int i = 0; i < 4; i++) + vplce[i] += 
vince[i] * dc_count; +} + +fixed_t tmvline1_add_rgba() +{ + DrawerCommandQueue::QueueCommand(); + return dc_texturefrac + dc_count * dc_iscale; +} + +void tmvline4_add_rgba() +{ + queue_wallcommand(); + for (int i = 0; i < 4; i++) + vplce[i] += vince[i] * dc_count; +} + +fixed_t tmvline1_addclamp_rgba() +{ + DrawerCommandQueue::QueueCommand(); + return dc_texturefrac + dc_count * dc_iscale; +} + +void tmvline4_addclamp_rgba() +{ + queue_wallcommand(); + for (int i = 0; i < 4; i++) + vplce[i] += vince[i] * dc_count; +} + +fixed_t tmvline1_subclamp_rgba() +{ + DrawerCommandQueue::QueueCommand(); + return dc_texturefrac + dc_count * dc_iscale; +} + +void tmvline4_subclamp_rgba() +{ + queue_wallcommand(); + for (int i = 0; i < 4; i++) + vplce[i] += vince[i] * dc_count; +} + +fixed_t tmvline1_revsubclamp_rgba() +{ + DrawerCommandQueue::QueueCommand(); + return dc_texturefrac + dc_count * dc_iscale; +} + +void tmvline4_revsubclamp_rgba() +{ + queue_wallcommand(); + for (int i = 0; i < 4; i++) + vplce[i] += vince[i] * dc_count; +} + +void R_DrawFogBoundarySection_rgba(int y, int y2, int x1) +{ + for (; y < y2; ++y) + { + int x2 = spanend[y]; + DrawerCommandQueue::QueueCommand(y, x1, x2); + } +} + +void R_DrawFogBoundary_rgba(int x1, int x2, short *uclip, short *dclip) +{ + // To do: we do not need to create new spans when using rgba output - instead we should calculate light on a per pixel basis + + // This is essentially the same as R_MapVisPlane but with an extra step + // to create new horizontal spans whenever the light changes enough that + // we need to use a new colormap. 
+ + double lightstep = rw_lightstep; + double light = rw_light + rw_lightstep*(x2 - x1 - 1); + int x = x2 - 1; + int t2 = uclip[x]; + int b2 = dclip[x]; + int rcolormap = GETPALOOKUP(light, wallshade); + int lcolormap; + BYTE *basecolormapdata = basecolormap->Maps; + + if (b2 > t2) + { + clearbufshort(spanend + t2, b2 - t2, x); + } + + R_SetColorMapLight(basecolormap, (float)light, wallshade); + + BYTE *fake_dc_colormap = basecolormap->Maps + (GETPALOOKUP(light, wallshade) << COLORMAPSHIFT); + + for (--x; x >= x1; --x) + { + int t1 = uclip[x]; + int b1 = dclip[x]; + const int xr = x + 1; + int stop; + + light -= rw_lightstep; + lcolormap = GETPALOOKUP(light, wallshade); + if (lcolormap != rcolormap) + { + if (t2 < b2 && rcolormap != 0) + { // Colormap 0 is always the identity map, so rendering it is + // just a waste of time. + R_DrawFogBoundarySection_rgba(t2, b2, xr); + } + if (t1 < t2) t2 = t1; + if (b1 > b2) b2 = b1; + if (t2 < b2) + { + clearbufshort(spanend + t2, b2 - t2, x); + } + rcolormap = lcolormap; + R_SetColorMapLight(basecolormap, (float)light, wallshade); + fake_dc_colormap = basecolormap->Maps + (GETPALOOKUP(light, wallshade) << COLORMAPSHIFT); + } + else + { + if (fake_dc_colormap != basecolormapdata) + { + stop = MIN(t1, b2); + while (t2 < stop) + { + int y = t2++; + DrawerCommandQueue::QueueCommand(y, xr, spanend[y]); + } + stop = MAX(b1, t2); + while (b2 > stop) + { + int y = --b2; + DrawerCommandQueue::QueueCommand(y, xr, spanend[y]); + } + } + else + { + t2 = MAX(t2, MIN(t1, b2)); + b2 = MIN(b2, MAX(b1, t2)); + } + + stop = MIN(t2, b1); + while (t1 < stop) + { + spanend[t1++] = x; + } + stop = MAX(b2, t2); + while (b1 > stop) + { + spanend[--b1] = x; + } + } + + t2 = uclip[x]; + b2 = dclip[x]; + } + if (t2 < b2 && rcolormap != 0) + { + R_DrawFogBoundarySection_rgba(t2, b2, x1); + } +} diff --git a/src/r_draw_rgba.h b/src/r_draw_rgba.h new file mode 100644 index 000000000..df3d0f233 --- /dev/null +++ b/src/r_draw_rgba.h @@ -0,0 +1,994 @@ +// 
Emacs style mode select -*- C++ -*- +//----------------------------------------------------------------------------- +// +// $Id:$ +// +// Copyright (C) 1993-1996 by id Software, Inc. +// +// This source is available for distribution and/or modification +// only under the terms of the DOOM Source Code License as +// published by id Software. All rights reserved. +// +// The source is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// FITNESS FOR A PARTICULAR PURPOSE. See the DOOM Source Code License +// for more details. +// +// DESCRIPTION: +// System specific interface stuff. +// +//----------------------------------------------------------------------------- + + +#ifndef __R_DRAW_RGBA__ +#define __R_DRAW_RGBA__ + +#include "r_draw.h" +#include "v_palette.h" +#include +#include +#include +#include +#include + +#ifndef NO_SSE +#include +#endif + +///////////////////////////////////////////////////////////////////////////// +// Drawer functions: + +void rt_initcols_rgba(BYTE *buffer); +void rt_span_coverage_rgba(int x, int start, int stop); + +void rt_copy1col_rgba(int hx, int sx, int yl, int yh); +void rt_copy4cols_rgba(int sx, int yl, int yh); +void rt_shaded1col_rgba(int hx, int sx, int yl, int yh); +void rt_shaded4cols_rgba(int sx, int yl, int yh); +void rt_map1col_rgba(int hx, int sx, int yl, int yh); +void rt_add1col_rgba(int hx, int sx, int yl, int yh); +void rt_addclamp1col_rgba(int hx, int sx, int yl, int yh); +void rt_subclamp1col_rgba(int hx, int sx, int yl, int yh); +void rt_revsubclamp1col_rgba(int hx, int sx, int yl, int yh); +void rt_tlate1col_rgba(int hx, int sx, int yl, int yh); +void rt_tlateadd1col_rgba(int hx, int sx, int yl, int yh); +void rt_tlateaddclamp1col_rgba(int hx, int sx, int yl, int yh); +void rt_tlatesubclamp1col_rgba(int hx, int sx, int yl, int yh); +void rt_tlaterevsubclamp1col_rgba(int hx, int sx, int yl, int yh); +void rt_map4cols_rgba(int sx, int yl, int yh); 
+void rt_add4cols_rgba(int sx, int yl, int yh); +void rt_addclamp4cols_rgba(int sx, int yl, int yh); +void rt_subclamp4cols_rgba(int sx, int yl, int yh); +void rt_revsubclamp4cols_rgba(int sx, int yl, int yh); +void rt_tlate4cols_rgba(int sx, int yl, int yh); +void rt_tlateadd4cols_rgba(int sx, int yl, int yh); +void rt_tlateaddclamp4cols_rgba(int sx, int yl, int yh); +void rt_tlatesubclamp4cols_rgba(int sx, int yl, int yh); +void rt_tlaterevsubclamp4cols_rgba(int sx, int yl, int yh); + +void R_DrawColumnHoriz_rgba(); +void R_DrawColumn_rgba(); +void R_DrawFuzzColumn_rgba(); +void R_DrawTranslatedColumn_rgba(); +void R_DrawShadedColumn_rgba(); + +void R_FillColumn_rgba(); +void R_FillAddColumn_rgba(); +void R_FillAddClampColumn_rgba(); +void R_FillSubClampColumn_rgba(); +void R_FillRevSubClampColumn_rgba(); +void R_DrawAddColumn_rgba(); +void R_DrawTlatedAddColumn_rgba(); +void R_DrawAddClampColumn_rgba(); +void R_DrawAddClampTranslatedColumn_rgba(); +void R_DrawSubClampColumn_rgba(); +void R_DrawSubClampTranslatedColumn_rgba(); +void R_DrawRevSubClampColumn_rgba(); +void R_DrawRevSubClampTranslatedColumn_rgba(); + +void R_DrawSpan_rgba(void); +void R_DrawSpanMasked_rgba(void); +void R_DrawSpanTranslucent_rgba(); +void R_DrawSpanMaskedTranslucent_rgba(); +void R_DrawSpanAddClamp_rgba(); +void R_DrawSpanMaskedAddClamp_rgba(); +void R_FillSpan_rgba(); + +void R_DrawTiltedSpan_rgba(int y, int x1, int x2, const FVector3 &plane_sz, const FVector3 &plane_su, const FVector3 &plane_sv, bool plane_shade, int planeshade, float planelightfloat, fixed_t pviewx, fixed_t pviewy); +void R_DrawColoredSpan_rgba(int y, int x1, int x2); + +void R_SetupDrawSlab_rgba(FSWColormap *base_colormap, float light, int shade); +void R_DrawSlab_rgba(int dx, fixed_t v, int dy, fixed_t vi, const BYTE *vptr, BYTE *p); + +void R_DrawFogBoundary_rgba(int x1, int x2, short *uclip, short *dclip); + +DWORD vlinec1_rgba(); +void vlinec4_rgba(); +DWORD mvlinec1_rgba(); +void mvlinec4_rgba(); +fixed_t 
tmvline1_add_rgba();
+void tmvline4_add_rgba();
+fixed_t tmvline1_addclamp_rgba();
+void tmvline4_addclamp_rgba();
+fixed_t tmvline1_subclamp_rgba();
+void tmvline4_subclamp_rgba();
+fixed_t tmvline1_revsubclamp_rgba();
+void tmvline4_revsubclamp_rgba();
+
+void R_FillColumnHoriz_rgba();
+void R_FillSpan_rgba();
+
+/////////////////////////////////////////////////////////////////////////////
+// Multithreaded rendering infrastructure:
+
+// Redirect drawer commands to worker threads
+void R_BeginDrawerCommands();
+
+// Wait until all drawers finished executing
+void R_EndDrawerCommands();
+
+struct FSpecialColormap;
+class DrawerCommandQueue;
+
+// Worker data for each thread executing drawer commands.
+//
+// Screen rows are interleaved between threads: a thread only touches rows
+// inside [pass_start_y, pass_end_y) for which (row % num_cores) == core.
+class DrawerThread
+{
+public:
+    std::thread thread;
+
+    // Thread line index of this thread
+    int core = 0;
+
+    // Number of active threads
+    int num_cores = 1;
+
+    // Range of rows processed this pass
+    int pass_start_y = 0;
+    int pass_end_y = MAXHEIGHT;
+
+    // NOTE(review): presumably per-thread scratch storage for the rt_* column
+    // drawers - the users are outside this header, confirm there.
+    uint32_t dc_temp_rgbabuff_rgba[MAXHEIGHT * 4];
+    // Nothing in this class assigns the pointer, so start it at nullptr rather
+    // than leaving it indeterminate like no other member of the class.
+    uint32_t *dc_temp_rgba = nullptr;
+
+    // Returns true when the line must NOT be drawn by this thread: it lies
+    // outside the current pass or belongs to another thread's interleave slot.
+    bool line_skipped_by_thread(int line)
+    {
+        return line < pass_start_y || line >= pass_end_y || line % num_cores != core;
+    }
+
+    // The number of lines to skip to reach the first line to be rendered by this thread.
+    // pass_skip moves first_line up to the start of the pass; core_skip then
+    // advances to the next row whose index is congruent to core modulo num_cores.
+    int skipped_by_thread(int first_line)
+    {
+        int pass_skip = MAX(pass_start_y - first_line, 0);
+        int core_skip = (num_cores - (first_line + pass_skip - core) % num_cores) % num_cores;
+        return pass_skip + core_skip;
+    }
+
+    // The number of lines to be rendered by this thread out of the `count`
+    // lines starting at first_line. The range is clipped at pass_end_y, the
+    // skipped lead-in is removed, and the rest is divided (rounding up) by
+    // num_cores. Never returns a negative value.
+    int count_for_thread(int first_line, int count)
+    {
+        int lines_until_pass_end = MAX(pass_end_y - first_line, 0);
+        count = MIN(count, lines_until_pass_end);
+        int c = (count - skipped_by_thread(first_line) + num_cores - 1) / num_cores;
+        return MAX(c, 0);
+    }
+
+    // Calculate the dest address for the first line to be rendered by this thread
+    uint32_t *dest_for_thread(int first_line, int pitch,
uint32_t *dest)
+    {
+        return dest + skipped_by_thread(first_line) * pitch;
+    }
+};
+
+// Task to be executed by each worker thread
+class DrawerCommand
+{
+protected:
+    // Row index of dc_dest relative to dc_destorg, captured when the command
+    // is created. (dc_pitch * 4) is used as the byte stride of one row.
+    int _dest_y;
+
+public:
+    DrawerCommand()
+    {
+        _dest_y = static_cast<int>((dc_dest - dc_destorg) / (dc_pitch * 4));
+    }
+
+    // Performs this thread's share of the work for the command.
+    virtual void Execute(DrawerThread *thread) = 0;
+};
+
+EXTERN_CVAR(Bool, r_multithreaded)
+EXTERN_CVAR(Bool, r_mipmap)
+
+// Manages queueing up commands and executing them on worker threads
+class DrawerCommandQueue
+{
+    // Fixed-size pool that queued command objects are placement-constructed into.
+    enum { memorypool_size = 16 * 1024 * 1024 };
+    char memorypool[memorypool_size];
+    size_t memorypool_pos = 0;
+
+    // Commands queued since the last flush; QueueCommand pushes T* for any
+    // command type T, so the element type is the common base pointer.
+    std::vector<DrawerCommand *> commands;
+
+    // NOTE(review): element type reconstructed (the template argument was lost
+    // in this copy of the patch) - confirm against the StartThreads() code.
+    std::vector<DrawerThread> threads;
+
+    std::mutex start_mutex;
+    std::condition_variable start_condition;
+    // NOTE(review): element type reconstructed, assumed to mirror `commands`.
+    std::vector<DrawerCommand *> active_commands;
+    bool shutdown_flag = false;
+    int run_id = 0;
+
+    std::mutex end_mutex;
+    std::condition_variable end_condition;
+    size_t finished_threads = 0;
+
+    int threaded_render = 0;
+    DrawerThread single_core_thread;
+    int num_passes = 1;
+    int rows_in_pass = MAXHEIGHT;
+
+    void StartThreads();
+    void StopThreads();
+    void Finish();
+
+    static DrawerCommandQueue *Instance();
+
+    DrawerCommandQueue();
+    ~DrawerCommandQueue();
+
+public:
+    // Allocate memory valid for the duration of a command execution
+    static void* AllocMemory(size_t size);
+
+    // Queue command to be executed by drawer worker threads.
+    //
+    // T is the command class, args are forwarded to its constructor.
+    // When threaded rendering is not active (threaded_render == 0) or the
+    // r_multithreaded cvar is off, the command is built on the stack and run
+    // immediately on single_core_thread. Otherwise it is constructed inside
+    // the memory pool and appended to `commands` for later execution.
+    template<typename T, typename... Types>
+    static void QueueCommand(Types &&... args)
+    {
+        auto queue = Instance();
+        if (queue->threaded_render == 0 || !r_multithreaded)
+        {
+            T command(std::forward<Types>(args)...);
+            command.Execute(&queue->single_core_thread);
+        }
+        else
+        {
+            void *ptr = AllocMemory(sizeof(T));
+            if (!ptr) // Out of memory - render what we got
+            {
+                queue->Finish();
+                ptr = AllocMemory(sizeof(T));
+                // Still no room after flushing: the command is dropped.
+                if (!ptr)
+                    return;
+            }
+            T *command = new (ptr)T(std::forward<Types>(args)...);
+            queue->commands.push_back(command);
+        }
+    }
+
+    // Redirects all drawing commands to worker threads until End is called
+    // Begin/End blocks can be nested.
+ static void Begin(); + + // End redirection and wait until all worker threads finished executing + static void End(); + + // Waits until all worker threads finished executing + static void WaitForWorkers(); +}; + +///////////////////////////////////////////////////////////////////////////// +// Drawer commands: + +class ApplySpecialColormapRGBACommand : public DrawerCommand +{ + BYTE *buffer; + int pitch; + int width; + int height; + int start_red; + int start_green; + int start_blue; + int end_red; + int end_green; + int end_blue; + +public: + ApplySpecialColormapRGBACommand(FSpecialColormap *colormap, DFrameBuffer *screen); + void Execute(DrawerThread *thread) override; +}; + +template +class DrawerBlendCommand : public CommandType +{ +public: + void Execute(DrawerThread *thread) override + { + typename CommandType::LoopIterator loop(this, thread); + if (!loop) return; + BlendMode blend(*this, loop); + do + { + blend.Blend(*this, loop); + } while (loop.next()); + } +}; + +///////////////////////////////////////////////////////////////////////////// +// Pixel shading inline functions: + +// Give the compiler a strong hint we want these functions inlined: +#ifndef FORCEINLINE +#if defined(_MSC_VER) +#define FORCEINLINE __forceinline +#elif defined(__GNUC__) +#define FORCEINLINE __attribute__((always_inline)) inline +#else +#define FORCEINLINE inline +#endif +#endif + +// Promise compiler we have no aliasing of this pointer +#ifndef RESTRICT +#if defined(_MSC_VER) +#define RESTRICT __restrict +#elif defined(__GNUC__) +#define RESTRICT __restrict__ +#else +#define RESTRICT +#endif +#endif + +class LightBgra +{ +public: + // calculates the light constant passed to the shade_pal_index function + FORCEINLINE static uint32_t calc_light_multiplier(dsfixed_t light) + { + return 256 - (light >> (FRACBITS - 8)); + } + + // Calculates a ARGB8 color for the given palette index and light multiplier + FORCEINLINE static uint32_t shade_pal_index_simple(uint32_t index, uint32_t 
light) + { + const PalEntry &color = GPalette.BaseColors[index]; + uint32_t red = color.r; + uint32_t green = color.g; + uint32_t blue = color.b; + + red = red * light / 256; + green = green * light / 256; + blue = blue * light / 256; + + return 0xff000000 | (red << 16) | (green << 8) | blue; + } + + // Calculates a ARGB8 color for the given palette index, light multiplier and dynamic colormap + FORCEINLINE static uint32_t shade_pal_index(uint32_t index, uint32_t light, const ShadeConstants &constants) + { + const PalEntry &color = GPalette.BaseColors[index]; + uint32_t alpha = color.d & 0xff000000; + uint32_t red = color.r; + uint32_t green = color.g; + uint32_t blue = color.b; + if (constants.simple_shade) + { + red = red * light / 256; + green = green * light / 256; + blue = blue * light / 256; + } + else + { + uint32_t inv_light = 256 - light; + uint32_t inv_desaturate = 256 - constants.desaturate; + + uint32_t intensity = ((red * 77 + green * 143 + blue * 37) >> 8) * constants.desaturate; + + red = (red * inv_desaturate + intensity) / 256; + green = (green * inv_desaturate + intensity) / 256; + blue = (blue * inv_desaturate + intensity) / 256; + + red = (constants.fade_red * inv_light + red * light) / 256; + green = (constants.fade_green * inv_light + green * light) / 256; + blue = (constants.fade_blue * inv_light + blue * light) / 256; + + red = (red * constants.light_red) / 256; + green = (green * constants.light_green) / 256; + blue = (blue * constants.light_blue) / 256; + } + return alpha | (red << 16) | (green << 8) | blue; + } + + FORCEINLINE static uint32_t shade_bgra_simple(uint32_t color, uint32_t light) + { + uint32_t red = RPART(color) * light / 256; + uint32_t green = GPART(color) * light / 256; + uint32_t blue = BPART(color) * light / 256; + return 0xff000000 | (red << 16) | (green << 8) | blue; + } + + FORCEINLINE static uint32_t shade_bgra(uint32_t color, uint32_t light, const ShadeConstants &constants) + { + uint32_t alpha = color & 0xff000000; 
+ uint32_t red = (color >> 16) & 0xff; + uint32_t green = (color >> 8) & 0xff; + uint32_t blue = color & 0xff; + if (constants.simple_shade) + { + red = red * light / 256; + green = green * light / 256; + blue = blue * light / 256; + } + else + { + uint32_t inv_light = 256 - light; + uint32_t inv_desaturate = 256 - constants.desaturate; + + uint32_t intensity = ((red * 77 + green * 143 + blue * 37) >> 8) * constants.desaturate; + + red = (red * inv_desaturate + intensity) / 256; + green = (green * inv_desaturate + intensity) / 256; + blue = (blue * inv_desaturate + intensity) / 256; + + red = (constants.fade_red * inv_light + red * light) / 256; + green = (constants.fade_green * inv_light + green * light) / 256; + blue = (constants.fade_blue * inv_light + blue * light) / 256; + + red = (red * constants.light_red) / 256; + green = (green * constants.light_green) / 256; + blue = (blue * constants.light_blue) / 256; + } + return alpha | (red << 16) | (green << 8) | blue; + } +}; + +class BlendBgra +{ +public: + FORCEINLINE static uint32_t copy(uint32_t fg) + { + return fg; + } + + FORCEINLINE static uint32_t add(uint32_t fg, uint32_t bg, uint32_t srcalpha, uint32_t destalpha) + { + uint32_t red = MIN((RPART(fg) * srcalpha + RPART(bg) * destalpha) >> 8, 255); + uint32_t green = MIN((GPART(fg) * srcalpha + GPART(bg) * destalpha) >> 8, 255); + uint32_t blue = MIN((BPART(fg) * srcalpha + BPART(bg) * destalpha) >> 8, 255); + return 0xff000000 | (red << 16) | (green << 8) | blue; + } + + FORCEINLINE static uint32_t sub(uint32_t fg, uint32_t bg, uint32_t srcalpha, uint32_t destalpha) + { + uint32_t red = clamp((0x10000 - RPART(fg) * srcalpha + RPART(bg) * destalpha) >> 8, 256, 256 + 255) - 256; + uint32_t green = clamp((0x10000 - GPART(fg) * srcalpha + GPART(bg) * destalpha) >> 8, 256, 256 + 255) - 256; + uint32_t blue = clamp((0x10000 - BPART(fg) * srcalpha + BPART(bg) * destalpha) >> 8, 256, 256 + 255) - 256; + return 0xff000000 | (red << 16) | (green << 8) | blue; + } + + 
FORCEINLINE static uint32_t revsub(uint32_t fg, uint32_t bg, uint32_t srcalpha, uint32_t destalpha)
+    {
+        // Per channel: fg * srcalpha - bg * destalpha, clamped to 0..255.
+        // The 0x10000 bias (256 after the shift) is removed again after clamping;
+        // it keeps the unsigned intermediate non-negative for alphas up to 256.
+        uint32_t red = clamp((0x10000 + RPART(fg) * srcalpha - RPART(bg) * destalpha) >> 8, 256, 256 + 255) - 256;
+        uint32_t green = clamp((0x10000 + GPART(fg) * srcalpha - GPART(bg) * destalpha) >> 8, 256, 256 + 255) - 256;
+        uint32_t blue = clamp((0x10000 + BPART(fg) * srcalpha - BPART(bg) * destalpha) >> 8, 256, 256 + 255) - 256;
+        return 0xff000000 | (red << 16) | (green << 8) | blue;
+    }
+
+    // Blends fg over bg using fg's own alpha channel and returns an opaque pixel.
+    // fg is treated as non-premultiplied: each channel is
+    // (fg * alpha + bg * (256 - alpha)) / 256 with alpha rescaled to 0..256.
+    FORCEINLINE static uint32_t alpha_blend(uint32_t fg, uint32_t bg)
+    {
+        uint32_t alpha = APART(fg) + (APART(fg) >> 7); // 255 -> 256
+        uint32_t inv_alpha = 256 - alpha;
+        // Both weighted terms must be summed BEFORE dividing by 256. Dividing only
+        // the background term left the foreground term unscaled, which saturated
+        // the channel to 255 for any non-zero fg channel with non-zero alpha.
+        // Since alpha + inv_alpha == 256 the result cannot exceed 255, so no
+        // clamp is needed.
+        uint32_t red = (RPART(fg) * alpha + RPART(bg) * inv_alpha) / 256;
+        uint32_t green = (GPART(fg) * alpha + GPART(bg) * inv_alpha) / 256;
+        uint32_t blue = (BPART(fg) * alpha + BPART(bg) * inv_alpha) / 256;
+        return 0xff000000 | (red << 16) | (green << 8) | blue;
+    }
+};
+
+class SampleBgra
+{
+public:
+    inline static bool span_sampler_setup(const uint32_t * RESTRICT &source, int &xbits, int &ybits, fixed_t xstep, fixed_t ystep, bool mipmapped)
+    {
+        // Is this a magfilter or minfilter?
+ fixed_t xmagnitude = abs(xstep) >> (32 - xbits - FRACBITS); + fixed_t ymagnitude = abs(ystep) >> (32 - ybits - FRACBITS); + fixed_t magnitude = (xmagnitude + ymagnitude) * 2 + (1 << (FRACBITS - 1)); + bool magnifying = (magnitude >> FRACBITS == 0); + + if (r_mipmap && mipmapped) + { + int level = magnitude >> (FRACBITS + 1); + while (level != 0) + { + if (xbits <= 2 || ybits <= 2) + break; + + source += (1 << (xbits)) * (1 << (ybits)); + xbits -= 1; + ybits -= 1; + level >>= 1; + } + } + + return (magnifying && r_magfilter) || (!magnifying && r_minfilter); + } + + FORCEINLINE static uint32_t sample_bilinear(const uint32_t *col0, const uint32_t *col1, uint32_t texturefracx, uint32_t texturefracy, uint32_t one, uint32_t height) + { + uint32_t frac_y0 = (texturefracy >> FRACBITS) * height; + uint32_t frac_y1 = ((texturefracy + one) >> FRACBITS) * height; + uint32_t y0 = frac_y0 >> FRACBITS; + uint32_t y1 = frac_y1 >> FRACBITS; + + uint32_t p00 = col0[y0]; + uint32_t p01 = col0[y1]; + uint32_t p10 = col1[y0]; + uint32_t p11 = col1[y1]; + + uint32_t inv_b = texturefracx; + uint32_t inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t red = (RPART(p00) * a * b + RPART(p01) * inv_a * b + RPART(p10) * a * inv_b + RPART(p11) * inv_a * inv_b + 127) >> 8; + uint32_t green = (GPART(p00) * a * b + GPART(p01) * inv_a * b + GPART(p10) * a * inv_b + GPART(p11) * inv_a * inv_b + 127) >> 8; + uint32_t blue = (BPART(p00) * a * b + BPART(p01) * inv_a * b + BPART(p10) * a * inv_b + BPART(p11) * inv_a * inv_b + 127) >> 8; + uint32_t alpha = (APART(p00) * a * b + APART(p01) * inv_a * b + APART(p10) * a * inv_b + APART(p11) * inv_a * inv_b + 127) >> 8; + + return (alpha << 24) | (red << 16) | (green << 8) | blue; + } + + FORCEINLINE static uint32_t sample_bilinear(const uint32_t *texture, dsfixed_t xfrac, dsfixed_t yfrac, int xbits, int ybits) + { + int xshift = (32 - xbits); + int yshift = (32 - ybits); + int xmask = (1 << xshift) 
- 1; + int ymask = (1 << yshift) - 1; + uint32_t x = xfrac >> xbits; + uint32_t y = yfrac >> ybits; + + uint32_t p00 = texture[(y & ymask) + ((x & xmask) << yshift)]; + uint32_t p01 = texture[((y + 1) & ymask) + ((x & xmask) << yshift)]; + uint32_t p10 = texture[(y & ymask) + (((x + 1) & xmask) << yshift)]; + uint32_t p11 = texture[((y + 1) & ymask) + (((x + 1) & xmask) << yshift)]; + + uint32_t inv_b = (xfrac >> (xbits - 4)) & 15; + uint32_t inv_a = (yfrac >> (ybits - 4)) & 15; + uint32_t a = 16 - inv_a; + uint32_t b = 16 - inv_b; + + uint32_t red = (RPART(p00) * a * b + RPART(p01) * inv_a * b + RPART(p10) * a * inv_b + RPART(p11) * inv_a * inv_b + 127) >> 8; + uint32_t green = (GPART(p00) * a * b + GPART(p01) * inv_a * b + GPART(p10) * a * inv_b + GPART(p11) * inv_a * inv_b + 127) >> 8; + uint32_t blue = (BPART(p00) * a * b + BPART(p01) * inv_a * b + BPART(p10) * a * inv_b + BPART(p11) * inv_a * inv_b + 127) >> 8; + uint32_t alpha = (APART(p00) * a * b + APART(p01) * inv_a * b + APART(p10) * a * inv_b + APART(p11) * inv_a * inv_b + 127) >> 8; + + return (alpha << 24) | (red << 16) | (green << 8) | blue; + } + +#ifndef NO_SSE + static __m128i samplertable[256 * 2]; +#endif +}; + +///////////////////////////////////////////////////////////////////////////// +// SSE/AVX shading macros: + +#define AVX2_SAMPLE_BILINEAR4_COLUMN_INIT(col0, col1, one, height, texturefracx) \ + const uint32_t *baseptr = col0[0]; \ + __m128i coloffsets0 = _mm_setr_epi32(col0[0] - baseptr, col0[1] - baseptr, col0[2] - baseptr, col0[3] - baseptr); \ + __m128i coloffsets1 = _mm_setr_epi32(col1[0] - baseptr, col1[1] - baseptr, col1[2] - baseptr, col1[3] - baseptr); \ + __m128i mone = _mm_loadu_si128((const __m128i*)one); \ + __m128i m127 = _mm_set1_epi16(127); \ + __m128i m16 = _mm_set1_epi32(16); \ + __m128i m15 = _mm_set1_epi32(15); \ + __m128i mheight = _mm_loadu_si128((const __m128i*)height); \ + __m128i mtexturefracx = _mm_loadu_si128((const __m128i*)texturefracx); + +#define 
AVX2_SAMPLE_BILINEAR4_COLUMN(fg, texturefracy) { \ + __m128i mtexturefracy = _mm_loadu_si128((const __m128i*)texturefracy); \ + __m128i multmp0 = _mm_srli_epi32(mtexturefracy, FRACBITS); \ + __m128i multmp1 = _mm_srli_epi32(_mm_add_epi32(mtexturefracy, mone), FRACBITS); \ + __m128i frac_y0 = _mm_or_si128(_mm_mul_epu32(multmp0, mheight), _mm_slli_si128(_mm_mul_epu32(_mm_srli_si128(multmp0, 4), _mm_srli_si128(mheight, 4)), 4)); \ + __m128i frac_y1 = _mm_or_si128(_mm_mul_epu32(multmp1, mheight), _mm_slli_si128(_mm_mul_epu32(_mm_srli_si128(multmp1, 4), _mm_srli_si128(mheight, 4)), 4)); \ + __m128i y0 = _mm_srli_epi32(frac_y0, FRACBITS); \ + __m128i y1 = _mm_srli_epi32(frac_y1, FRACBITS); \ + __m128i inv_b = mtexturefracx; \ + __m128i inv_a = _mm_and_si128(_mm_srli_epi32(frac_y1, FRACBITS - 4), m15); \ + __m128i a = _mm_sub_epi32(m16, inv_a); \ + __m128i b = _mm_sub_epi32(m16, inv_b); \ + __m128i ab = _mm_mullo_epi16(a, b); \ + __m128i invab = _mm_mullo_epi16(inv_a, b); \ + __m128i ainvb = _mm_mullo_epi16(a, inv_b); \ + __m128i invainvb = _mm_mullo_epi16(inv_a, inv_b); \ + __m128i ab_lo = _mm_shuffle_epi32(ab, _MM_SHUFFLE(1, 1, 0, 0)); \ + __m128i ab_hi = _mm_shuffle_epi32(ab, _MM_SHUFFLE(3, 3, 2, 2)); \ + __m128i invab_lo = _mm_shuffle_epi32(invab, _MM_SHUFFLE(1, 1, 0, 0)); \ + __m128i invab_hi = _mm_shuffle_epi32(invab, _MM_SHUFFLE(3, 3, 2, 2)); \ + __m128i ainvb_lo = _mm_shuffle_epi32(ainvb, _MM_SHUFFLE(1, 1, 0, 0)); \ + __m128i ainvb_hi = _mm_shuffle_epi32(ainvb, _MM_SHUFFLE(3, 3, 2, 2)); \ + __m128i invainvb_lo = _mm_shuffle_epi32(invainvb, _MM_SHUFFLE(1, 1, 0, 0)); \ + __m128i invainvb_hi = _mm_shuffle_epi32(invainvb, _MM_SHUFFLE(3, 3, 2, 2)); \ + ab_lo = _mm_or_si128(ab_lo, _mm_slli_epi32(ab_lo, 16)); \ + ab_hi = _mm_or_si128(ab_hi, _mm_slli_epi32(ab_hi, 16)); \ + invab_lo = _mm_or_si128(invab_lo, _mm_slli_epi32(invab_lo, 16)); \ + invab_hi = _mm_or_si128(invab_hi, _mm_slli_epi32(invab_hi, 16)); \ + ainvb_lo = _mm_or_si128(ainvb_lo, _mm_slli_epi32(ainvb_lo, 16)); 
\ + ainvb_hi = _mm_or_si128(ainvb_hi, _mm_slli_epi32(ainvb_hi, 16)); \ + invainvb_lo = _mm_or_si128(invainvb_lo, _mm_slli_epi32(invainvb_lo, 16)); \ + invainvb_hi = _mm_or_si128(invainvb_hi, _mm_slli_epi32(invainvb_hi, 16)); \ + __m128i p00 = _mm_i32gather_epi32((const int *)baseptr, _mm_add_epi32(y0, coloffsets0), 4); \ + __m128i p01 = _mm_i32gather_epi32((const int *)baseptr, _mm_add_epi32(y1, coloffsets0), 4); \ + __m128i p10 = _mm_i32gather_epi32((const int *)baseptr, _mm_add_epi32(y0, coloffsets1), 4); \ + __m128i p11 = _mm_i32gather_epi32((const int *)baseptr, _mm_add_epi32(y1, coloffsets1), 4); \ + __m128i p00_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(p00, _mm_setzero_si128()), ab_lo); \ + __m128i p01_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(p01, _mm_setzero_si128()), invab_lo); \ + __m128i p10_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(p10, _mm_setzero_si128()), ainvb_lo); \ + __m128i p11_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(p11, _mm_setzero_si128()), invainvb_lo); \ + __m128i p00_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(p00, _mm_setzero_si128()), ab_hi); \ + __m128i p01_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(p01, _mm_setzero_si128()), invab_hi); \ + __m128i p10_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(p10, _mm_setzero_si128()), ainvb_hi); \ + __m128i p11_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(p11, _mm_setzero_si128()), invainvb_hi); \ + __m128i fg_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_adds_epu16(_mm_adds_epu16(p00_lo, p01_lo), _mm_adds_epu16(p10_lo, p11_lo)), m127), 8); \ + __m128i fg_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_adds_epu16(_mm_adds_epu16(p00_hi, p01_hi), _mm_adds_epu16(p10_hi, p11_hi)), m127), 8); \ + fg = _mm_packus_epi16(fg_lo, fg_hi); \ +} + +#define VEC_SAMPLE_BILINEAR4_COLUMN(fg, col0, col1, texturefracx, texturefracy, one, height) { \ + __m128i m127 = _mm_set1_epi16(127); \ + fg = _mm_setzero_si128(); \ + for (int i = 0; i < 4; i++) \ + { \ + uint32_t frac_y0 = (texturefracy[i] >> FRACBITS) * height[i]; \ + uint32_t frac_y1 = ((texturefracy[i] + 
one[i]) >> FRACBITS) * height[i]; \
+		uint32_t y0 = (frac_y0 >> FRACBITS); \
+		uint32_t y1 = (frac_y1 >> FRACBITS); \
+		\
+		uint32_t inv_b = texturefracx[i]; \
+		uint32_t inv_a = (frac_y1 >> (FRACBITS - 4)) & 15; \
+		\
+		__m128i ab_invab = _mm_load_si128(SampleBgra::samplertable + inv_b * 32 + inv_a * 2); \
+		__m128i ainvb_invainvb = _mm_load_si128(SampleBgra::samplertable + inv_b * 32 + inv_a * 2 + 1); \
+		\
+		__m128i gather = _mm_set_epi32(col1[i][y1], col1[i][y0], col0[i][y1], col0[i][y0]); \
+		__m128i p0 = _mm_unpacklo_epi8(gather, _mm_setzero_si128()); \
+		__m128i p1 = _mm_unpackhi_epi8(gather, _mm_setzero_si128()); \
+		\
+		__m128i tmp = _mm_adds_epu16(_mm_mullo_epi16(p0, ab_invab), _mm_mullo_epi16(p1, ainvb_invainvb)); \
+		__m128i color = _mm_srli_epi16(_mm_adds_epu16(_mm_adds_epu16(_mm_srli_si128(tmp, 8), tmp), m127), 8); \
+		\
+		fg = _mm_or_si128(_mm_srli_si128(fg, 4), _mm_slli_si128(_mm_packus_epi16(color, _mm_setzero_si128()), 12)); \
+	} \
+}
+
+// Fetches four texels from col0 (at offsets y0[i]) and four from col1 (at offsets y1[i])
+// and mixes them as (p0 * t + p1 * (256 - t)) >> 8, where t is loaded from mipfrac.
+//   fg                             - __m128i lvalue that receives the four packed pixels
+//   col0, col1                     - indexable texel sources, one 32-bit value per texel
+//   mipfrac                        - address of 128 bits of blend weights (unaligned load)
+//   texturefracy, height0, height1 - arrays of four values used to derive y0[i] / y1[i]
+// inv_t is derived from the loaded vector t; the mipfrac argument itself is only an address.
+// NOTE(review): t and inv_t are built as 32-bit lanes but are multiplied with _mm_mullo_epi16
+// against 16-bit channel lanes, and y0/y1 are not shifted down by FRACBITS the way
+// VEC_SAMPLE_BILINEAR4_COLUMN does - confirm both before relying on this macro.
+#define VEC_SAMPLE_MIP_NEAREST4_COLUMN(fg, col0, col1, mipfrac, texturefracy, height0, height1) { \
+	uint32_t y0[4], y1[4]; \
+	for (int i = 0; i < 4; i++) \
+	{ \
+		y0[i] = (texturefracy[i] >> FRACBITS) * height0[i]; \
+		y1[i] = (texturefracy[i] >> FRACBITS) * height1[i]; \
+	} \
+	__m128i p0 = _mm_set_epi32(col0[y0[3]], col0[y0[2]], col0[y0[1]], col0[y0[0]]); \
+	__m128i p1 = _mm_set_epi32(col1[y1[3]], col1[y1[2]], col1[y1[1]], col1[y1[0]]); \
+	__m128i t = _mm_loadu_si128((const __m128i*)mipfrac); \
+	__m128i inv_t = _mm_sub_epi32(_mm_set1_epi32(256), t); \
+	__m128i p0_lo = _mm_unpacklo_epi8(p0, _mm_setzero_si128()); \
+	__m128i p0_hi = _mm_unpackhi_epi8(p0, _mm_setzero_si128()); \
+	__m128i p1_lo = _mm_unpacklo_epi8(p1, _mm_setzero_si128()); \
+	__m128i p1_hi = _mm_unpackhi_epi8(p1, _mm_setzero_si128()); \
+	__m128i fg_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(p0_lo, t), _mm_mullo_epi16(p1_lo, inv_t)), 8); \
+	__m128i fg_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(p0_hi, t), _mm_mullo_epi16(p1_hi, inv_t)), 8); \
+	fg = _mm_packus_epi16(fg_lo, fg_hi); \
+}
+
+// Bilinear-samples four successive span pixels from texture into fg; the first pixel ends
+// up in the lowest 32 bits. xfrac and yfrac must be lvalues: they are advanced by
+// xstep / ystep once per pixel. xbits / ybits are used as right-shift amounts (the span
+// drawer passes 32 minus its bit count, e.g. 26 for the 64x64 case), and the wrap masks are
+// derived from 32 - xbits / 32 - ybits. Blend weights are read from SampleBgra::samplertable,
+// indexed by the four bits just below the integer part of each coordinate.
+#define VEC_SAMPLE_BILINEAR4_SPAN(fg, texture, xfrac, yfrac, xstep, ystep, xbits, ybits) { \
+	int xshift = (32 - xbits); \
+	int yshift = (32 - ybits); \
+	int xmask = (1 << xshift) - 1; \
+	int ymask = (1 << yshift) - 1; \
+	\
+	__m128i m127 = _mm_set1_epi16(127); \
+	fg = _mm_setzero_si128(); \
+	for (int i = 0; i < 4; i++) \
+	{ \
+		uint32_t x = xfrac >> xbits; \
+		uint32_t y = yfrac >> ybits; \
+		\
+		uint32_t p00 = texture[(y & ymask) + ((x & xmask) << yshift)]; \
+		uint32_t p01 = texture[((y + 1) & ymask) + ((x & xmask) << yshift)]; \
+		uint32_t p10 = texture[(y & ymask) + (((x + 1) & xmask) << yshift)]; \
+		uint32_t p11 = texture[((y + 1) & ymask) + (((x + 1) & xmask) << yshift)]; \
+		\
+		uint32_t inv_b = (xfrac >> (xbits - 4)) & 15; \
+		uint32_t inv_a = (yfrac >> (ybits - 4)) & 15; \
+		\
+		__m128i ab_invab = _mm_load_si128(SampleBgra::samplertable + inv_b * 32 + inv_a * 2); \
+		__m128i ainvb_invainvb = _mm_load_si128(SampleBgra::samplertable + inv_b * 32 + inv_a * 2 + 1); \
+		\
+		__m128i p0 = _mm_unpacklo_epi8(_mm_set_epi32(0, 0, p01, p00), _mm_setzero_si128()); \
+		__m128i p1 = _mm_unpacklo_epi8(_mm_set_epi32(0, 0, p11, p10), _mm_setzero_si128()); \
+		\
+		__m128i tmp = _mm_adds_epu16(_mm_mullo_epi16(p0, ab_invab), _mm_mullo_epi16(p1, ainvb_invainvb)); \
+		__m128i color = _mm_srli_epi16(_mm_adds_epu16(_mm_adds_epu16(_mm_srli_si128(tmp, 8), tmp), m127), 8); \
+		\
+		fg = _mm_or_si128(_mm_srli_si128(fg, 4), _mm_slli_si128(_mm_packus_epi16(color, _mm_setzero_si128()), 12)); \
+		\
+		xfrac += xstep; \
+		yfrac += ystep; \
+	} \
+}
+
+// Calculate constants for a simple shade with gamma correction
+#define AVX_LINEAR_SHADE_SIMPLE_INIT(light) \
+	__m256 mlight_hi = _mm256_set_ps(1.0f, light * (1.0f/256.0f), light * (1.0f/256.0f), light * (1.0f/256.0f), 1.0f, light * (1.0f/256.0f), light * (1.0f/256.0f), light * (1.0f/256.0f)); \
+	mlight_hi = _mm256_mul_ps(mlight_hi, mlight_hi); \
+	
__m256 mlight_lo = mlight_hi; \
+	__m256 mrcp_255 = _mm256_set1_ps(1.0f/255.0f); \
+	__m256 m255 = _mm256_set1_ps(255.0f);
+
+// Calculate constants for a simple shade with different light levels for each pixel and gamma correction
+// Each light value is scaled by 1/256 and then squared; the alpha lanes are set to 1.0.
+// Declares mlight_hi, mlight_lo, mrcp_255 and m255 in the enclosing scope.
+// NOTE(review): AVX_LINEAR_SHADE_SIMPLE applies mlight_hi to pixels 1 and 3 and mlight_lo to
+// pixels 0 and 2 (its 256-bit unpacks work per 128-bit lane), so light0/light1/light2/light3
+// end up on pixels 1/3/0/2. SSE_SHADE_SIMPLE_INIT4 together with SSE_SHADE_SIMPLE puts the
+// same argument names on pixels 2/3/0/1 - confirm which ordering the callers rely on.
+#define AVX_LINEAR_SHADE_SIMPLE_INIT4(light3, light2, light1, light0) \
+	__m256 mlight_hi = _mm256_set_ps(1.0f, light1 * (1.0f/256.0f), light1 * (1.0f/256.0f), light1 * (1.0f/256.0f), 1.0f, light0 * (1.0f/256.0f), light0 * (1.0f/256.0f), light0 * (1.0f/256.0f)); \
+	__m256 mlight_lo = _mm256_set_ps(1.0f, light3 * (1.0f/256.0f), light3 * (1.0f/256.0f), light3 * (1.0f/256.0f), 1.0f, light2 * (1.0f/256.0f), light2 * (1.0f/256.0f), light2 * (1.0f/256.0f)); \
+	mlight_hi = _mm256_mul_ps(mlight_hi, mlight_hi); \
+	mlight_lo = _mm256_mul_ps(mlight_lo, mlight_lo); \
+	__m256 mrcp_255 = _mm256_set1_ps(1.0f/255.0f); \
+	__m256 m255 = _mm256_set1_ps(255.0f);
+
+// Simple shade 4 pixels with gamma correction
+// Per channel: out = sqrt((c / 255)^2 * mlight) * 255, converted back to 8 bits with
+// saturating packs. Relies on mlight_hi, mlight_lo, mrcp_255 and m255 having been declared
+// by one of the AVX_LINEAR_SHADE_SIMPLE_INIT macros. fg must be an __m128i lvalue; it is
+// updated in place.
+#define AVX_LINEAR_SHADE_SIMPLE(fg) { \
+	__m256i fg_16 = _mm256_set_m128i(_mm_unpackhi_epi8(fg, _mm_setzero_si128()), _mm_unpacklo_epi8(fg, _mm_setzero_si128())); \
+	__m256 fg_hi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(fg_16, _mm256_setzero_si256())); \
+	__m256 fg_lo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(fg_16, _mm256_setzero_si256())); \
+	fg_hi = _mm256_mul_ps(fg_hi, mrcp_255); \
+	fg_hi = _mm256_mul_ps(fg_hi, fg_hi); \
+	fg_hi = _mm256_mul_ps(fg_hi, mlight_hi); \
+	fg_hi = _mm256_sqrt_ps(fg_hi); \
+	fg_hi = _mm256_mul_ps(fg_hi, m255); \
+	fg_lo = _mm256_mul_ps(fg_lo, mrcp_255); \
+	fg_lo = _mm256_mul_ps(fg_lo, fg_lo); \
+	fg_lo = _mm256_mul_ps(fg_lo, mlight_lo); \
+	fg_lo = _mm256_sqrt_ps(fg_lo); \
+	fg_lo = _mm256_mul_ps(fg_lo, m255); \
+	fg_16 = _mm256_packus_epi32(_mm256_cvtps_epi32(fg_lo), _mm256_cvtps_epi32(fg_hi)); \
+	fg = _mm_packus_epi16(_mm256_extractf128_si256(fg_16, 0), _mm256_extractf128_si256(fg_16, 1)); \
+}
+
+// Calculate constants for a complex shade with gamma correction
+#define AVX_LINEAR_SHADE_INIT(light, 
shade_constants) \ + __m256 mlight_hi = _mm256_set_ps(1.0f, light * (1.0f/256.0f), light * (1.0f/256.0f), light * (1.0f/256.0f), 1.0f, light * (1.0f/256.0f), light * (1.0f/256.0f), light * (1.0f/256.0f)); \ + mlight_hi = _mm256_mul_ps(mlight_hi, mlight_hi); \ + __m256 mlight_lo = mlight_hi; \ + __m256 mrcp_255 = _mm256_set1_ps(1.0f/255.0f); \ + __m256 m255 = _mm256_set1_ps(255.0f); \ + __m256 color = _mm256_set_ps( \ + 1.0f, shade_constants.light_red * (1.0f/256.0f), shade_constants.light_green * (1.0f/256.0f), shade_constants.light_blue * (1.0f/256.0f), \ + 1.0f, shade_constants.light_red * (1.0f/256.0f), shade_constants.light_green * (1.0f/256.0f), shade_constants.light_blue * (1.0f/256.0f)); \ + __m256 fade = _mm256_set_ps( \ + 0.0f, shade_constants.fade_red * (1.0f/256.0f), shade_constants.fade_green * (1.0f/256.0f), shade_constants.fade_blue * (1.0f/256.0f), \ + 0.0f, shade_constants.fade_red * (1.0f/256.0f), shade_constants.fade_green * (1.0f/256.0f), shade_constants.fade_blue * (1.0f/256.0f)); \ + __m256 fade_amount_hi = _mm256_mul_ps(fade, _mm256_sub_ps(_mm256_set1_ps(1.0f), mlight_hi)); \ + __m256 fade_amount_lo = _mm256_mul_ps(fade, _mm256_sub_ps(_mm256_set1_ps(1.0f), mlight_lo)); \ + __m256 inv_desaturate = _mm256_set1_ps((256 - shade_constants.desaturate) * (1.0f/256.0f)); \ + __m128 ss_desaturate = _mm_set_ss(shade_constants.desaturate * (1.0f/256.0f)); \ + __m128 intensity_weight = _mm_set_ps(0.0f, 77.0f/256.0f, 143.0f/256.0f, 37.0f/256.0f); + +// Calculate constants for a complex shade with different light levels for each pixel and gamma correction +#define AVX_LINEAR_SHADE_INIT4(light3, light2, light1, light0, shade_constants) \ + __m256 mlight_hi = _mm256_set_ps(1.0f, light1 * (1.0f/256.0f), light1 * (1.0f/256.0f), light1 * (1.0f/256.0f), 1.0f, light0 * (1.0f/256.0f), light0 * (1.0f/256.0f), light0 * (1.0f/256.0f)); \ + __m256 mlight_lo = _mm256_set_ps(1.0f, light3 * (1.0f/256.0f), light3 * (1.0f/256.0f), light3 * (1.0f/256.0f), 1.0f, light2 * 
(1.0f/256.0f), light2 * (1.0f/256.0f), light2 * (1.0f/256.0f)); \ + mlight_hi = _mm256_mul_ps(mlight_hi, mlight_hi); \ + mlight_lo = _mm256_mul_ps(mlight_lo, mlight_lo); \ + __m256 mrcp_255 = _mm256_set1_ps(1.0f/255.0f); \ + __m256 m255 = _mm256_set1_ps(255.0f); \ + __m256 color = _mm256_set_ps( \ + 1.0f, shade_constants.light_red * (1.0f/256.0f), shade_constants.light_green * (1.0f/256.0f), shade_constants.light_blue * (1.0f/256.0f), \ + 1.0f, shade_constants.light_red * (1.0f/256.0f), shade_constants.light_green * (1.0f/256.0f), shade_constants.light_blue * (1.0f/256.0f)); \ + __m256 fade = _mm256_set_ps( \ + 0.0f, shade_constants.fade_red * (1.0f/256.0f), shade_constants.fade_green * (1.0f/256.0f), shade_constants.fade_blue * (1.0f/256.0f), \ + 0.0f, shade_constants.fade_red * (1.0f/256.0f), shade_constants.fade_green * (1.0f/256.0f), shade_constants.fade_blue * (1.0f/256.0f)); \ + __m256 fade_amount_hi = _mm256_mul_ps(fade, _mm256_sub_ps(_mm256_set1_ps(1.0f), mlight_hi)); \ + __m256 fade_amount_lo = _mm256_mul_ps(fade, _mm256_sub_ps(_mm256_set1_ps(1.0f), mlight_lo)); \ + __m256 inv_desaturate = _mm256_set1_ps((256 - shade_constants.desaturate) * (1.0f/256.0f)); \ + __m128 ss_desaturate = _mm_set_ss(shade_constants.desaturate * (1.0f/256.0f)); \ + __m128 intensity_weight = _mm_set_ps(0.0f, 77.0f/256.0f, 143.0f/256.0f, 37.0f/256.0f); + +// Complex shade 4 pixels with gamma correction +#define AVX_LINEAR_SHADE(fg, shade_constants) { \ + __m256i fg_16 = _mm256_set_m128i(_mm_unpackhi_epi8(fg, _mm_setzero_si128()), _mm_unpacklo_epi8(fg, _mm_setzero_si128())); \ + __m256 fg_hi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(fg_16, _mm256_setzero_si256())); \ + __m256 fg_lo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(fg_16, _mm256_setzero_si256())); \ + fg_hi = _mm256_mul_ps(fg_hi, mrcp_255); \ + fg_hi = _mm256_mul_ps(fg_hi, fg_hi); \ + fg_lo = _mm256_mul_ps(fg_lo, mrcp_255); \ + fg_lo = _mm256_mul_ps(fg_lo, fg_lo); \ + \ + __m128 intensity_hi0 = 
_mm_mul_ps(_mm256_extractf128_ps(fg_hi, 0), intensity_weight); \ + __m128 intensity_hi1 = _mm_mul_ps(_mm256_extractf128_ps(fg_hi, 1), intensity_weight); \ + intensity_hi0 = _mm_mul_ss(_mm_add_ss(_mm_add_ss(intensity_hi0, _mm_shuffle_ps(intensity_hi0, intensity_hi0, _MM_SHUFFLE(1,1,1,1))), _mm_shuffle_ps(intensity_hi0, intensity_hi0, _MM_SHUFFLE(2,2,2,2))), ss_desaturate); \ + intensity_hi0 = _mm_shuffle_ps(intensity_hi0, intensity_hi0, _MM_SHUFFLE(0,0,0,0)); \ + intensity_hi1 = _mm_mul_ss(_mm_add_ss(_mm_add_ss(intensity_hi1, _mm_shuffle_ps(intensity_hi1, intensity_hi1, _MM_SHUFFLE(1,1,1,1))), _mm_shuffle_ps(intensity_hi1, intensity_hi1, _MM_SHUFFLE(2,2,2,2))), ss_desaturate); \ + intensity_hi1 = _mm_shuffle_ps(intensity_hi1, intensity_hi1, _MM_SHUFFLE(0,0,0,0)); \ + __m256 intensity_hi = _mm256_set_m128(intensity_hi1, intensity_hi0); \ + \ + fg_hi = _mm256_add_ps(_mm256_mul_ps(fg_hi, inv_desaturate), intensity_hi); \ + fg_hi = _mm256_add_ps(_mm256_mul_ps(fg_hi, mlight_hi), fade_amount_hi); \ + fg_hi = _mm256_mul_ps(fg_hi, color); \ + \ + __m128 intensity_lo0 = _mm_mul_ps(_mm256_extractf128_ps(fg_lo, 0), intensity_weight); \ + __m128 intensity_lo1 = _mm_mul_ps(_mm256_extractf128_ps(fg_lo, 1), intensity_weight); \ + intensity_lo0 = _mm_mul_ss(_mm_add_ss(_mm_add_ss(intensity_lo0, _mm_shuffle_ps(intensity_lo0, intensity_lo0, _MM_SHUFFLE(1,1,1,1))), _mm_shuffle_ps(intensity_lo0, intensity_lo0, _MM_SHUFFLE(2,2,2,2))), ss_desaturate); \ + intensity_lo0 = _mm_shuffle_ps(intensity_lo0, intensity_lo0, _MM_SHUFFLE(0,0,0,0)); \ + intensity_lo1 = _mm_mul_ss(_mm_add_ss(_mm_add_ss(intensity_lo1, _mm_shuffle_ps(intensity_lo1, intensity_lo1, _MM_SHUFFLE(1,1,1,1))), _mm_shuffle_ps(intensity_lo1, intensity_lo1, _MM_SHUFFLE(2,2,2,2))), ss_desaturate); \ + intensity_lo1 = _mm_shuffle_ps(intensity_lo1, intensity_lo1, _MM_SHUFFLE(0,0,0,0)); \ + __m256 intensity_lo = _mm256_set_m128(intensity_lo1, intensity_lo0); \ + \ + fg_lo = _mm256_add_ps(_mm256_mul_ps(fg_lo, inv_desaturate), 
intensity_lo); \ + fg_lo = _mm256_add_ps(_mm256_mul_ps(fg_lo, mlight_lo), fade_amount_lo); \ + fg_lo = _mm256_mul_ps(fg_lo, color); \ + \ + fg_hi = _mm256_sqrt_ps(fg_hi); \ + fg_hi = _mm256_mul_ps(fg_hi, m255); \ + fg_lo = _mm256_sqrt_ps(fg_lo); \ + fg_lo = _mm256_mul_ps(fg_lo, m255); \ + fg_16 = _mm256_packus_epi32(_mm256_cvtps_epi32(fg_lo), _mm256_cvtps_epi32(fg_hi)); \ + fg = _mm_packus_epi16(_mm256_extractf128_si256(fg_16, 0), _mm256_extractf128_si256(fg_16, 1)); \ +} + +/* +// Complex shade 8 pixels +#define AVX_SHADE(fg, shade_constants) { \ + __m256i fg_hi = _mm256_unpackhi_epi8(fg, _mm256_setzero_si256()); \ + __m256i fg_lo = _mm256_unpacklo_epi8(fg, _mm256_setzero_si256()); \ + \ + __m256i intensity_hi = _mm256_mullo_epi16(fg_hi, _mm256_set_epi16(0, 77, 143, 37, 0, 77, 143, 37, 0, 77, 143, 37, 0, 77, 143, 37)); \ + __m256i intensity_lo = _mm256_mullo_epi16(fg_lo, _mm256_set_epi16(0, 77, 143, 37, 0, 77, 143, 37, 0, 77, 143, 37, 0, 77, 143, 37)); \ + __m256i intensity = _mm256_mullo_epi16(_mm256_srli_epi16(_mm256_hadd_epi16(_mm256_hadd_epi16(intensity_lo, intensity_hi), _mm256_setzero_si256()), 8), desaturate); \ + intensity = _mm256_unpacklo_epi16(intensity, intensity); \ + intensity_hi = _mm256_unpackhi_epi32(intensity, intensity); \ + intensity_lo = _mm256_unpacklo_epi32(intensity, intensity); \ + \ + fg_hi = _mm256_srli_epi16(_mm256_adds_epu16(_mm256_mullo_epi16(fg_hi, inv_desaturate), intensity_hi), 8); \ + fg_hi = _mm256_srli_epi16(_mm256_adds_epu16(_mm256_mullo_epi16(fg_hi, mlight), fade_amount), 8); \ + fg_hi = _mm256_srli_epi16(_mm256_mullo_epi16(fg_hi, color), 8); \ + \ + fg_lo = _mm256_srli_epi16(_mm256_adds_epu16(_mm256_mullo_epi16(fg_lo, inv_desaturate), intensity_lo), 8); \ + fg_lo = _mm256_srli_epi16(_mm256_adds_epu16(_mm256_mullo_epi16(fg_lo, mlight), fade_amount), 8); \ + fg_lo = _mm256_srli_epi16(_mm256_mullo_epi16(fg_lo, color), 8); \ + \ + fg = _mm256_packus_epi16(fg_lo, fg_hi); \ +} +*/ + +// Normal premultiplied alpha blend using the 
alpha from fg
+// For each of the four pixels, every channel becomes
+//   sat8((fg * a) >> 8) + sat8((bg * (256 - a)) >> 8)   (final add saturates at 255)
+// where a is that pixel's own alpha byte after a += a >> 7, so 255 becomes 256.
+// fg and bg must both be __m128i lvalues: the result is left in fg and bg is overwritten
+// with its scaled value.
+// NOTE(review): the macro multiplies fg by its alpha itself, so fg is treated as
+// non-premultiplied input - the "premultiplied" wording above may need confirming.
+#define VEC_ALPHA_BLEND(fg,bg) { \
+	__m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); \
+	__m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); \
+	__m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); \
+	__m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); \
+	__m128i m256 = _mm_set1_epi16(256); \
+	__m128i alpha_hi = _mm_shufflehi_epi16(_mm_shufflelo_epi16(fg_hi, _MM_SHUFFLE(3,3,3,3)), _MM_SHUFFLE(3,3,3,3)); \
+	__m128i alpha_lo = _mm_shufflehi_epi16(_mm_shufflelo_epi16(fg_lo, _MM_SHUFFLE(3,3,3,3)), _MM_SHUFFLE(3,3,3,3)); \
+	alpha_hi = _mm_add_epi16(alpha_hi, _mm_srli_epi16(alpha_hi, 7)); \
+	alpha_lo = _mm_add_epi16(alpha_lo, _mm_srli_epi16(alpha_lo, 7)); \
+	__m128i inv_alpha_hi = _mm_sub_epi16(m256, alpha_hi); \
+	__m128i inv_alpha_lo = _mm_sub_epi16(m256, alpha_lo); \
+	fg_hi = _mm_mullo_epi16(fg_hi, alpha_hi); \
+	fg_hi = _mm_srli_epi16(fg_hi, 8); \
+	fg_lo = _mm_mullo_epi16(fg_lo, alpha_lo); \
+	fg_lo = _mm_srli_epi16(fg_lo, 8); \
+	fg = _mm_packus_epi16(fg_lo, fg_hi); \
+	bg_hi = _mm_mullo_epi16(bg_hi, inv_alpha_hi); \
+	bg_hi = _mm_srli_epi16(bg_hi, 8); \
+	bg_lo = _mm_mullo_epi16(bg_lo, inv_alpha_lo); \
+	bg_lo = _mm_srli_epi16(bg_lo, 8); \
+	bg = _mm_packus_epi16(bg_lo, bg_hi); \
+	fg = _mm_adds_epu8(fg, bg); \
+}
+
+// Calculates the final alpha values to be used when combined with the source texture alpha channel
+//   fg         - 32-bit pixel; only its top byte (alpha) is read
+//   dest_alpha - destination blend factor (presumably on a 0..256 scale - confirm against callers)
+// Returns (dest_alpha * a + 256 * (256 - a) + 128) >> 8, where a is the pixel alpha after the
+// a += a >> 7 adjustment.
+FORCEINLINE uint32_t calc_blend_bgalpha(uint32_t fg, uint32_t dest_alpha)
+{
+	uint32_t alpha = fg >> 24;
+	alpha += alpha >> 7; // adds 1 for alpha >= 128, so 255 becomes 256
+	uint32_t inv_alpha = 256 - alpha;
+	return (dest_alpha * alpha + 256 * inv_alpha + 128) >> 8;
+}
+
+// Declares the __m128i variables that VEC_CALC_BLEND_ALPHA_INIT assigns and that
+// VEC_CALC_BLEND_ALPHA reads. The expansion already ends with a semicolon.
+#define VEC_CALC_BLEND_ALPHA_VARS() __m128i msrc_alpha, mdest_alpha, m256, m255, m128;
+
+// Fills the variables declared by VEC_CALC_BLEND_ALPHA_VARS: src_alpha is broadcast as is,
+// dest_alpha is stored as dest_alpha * 255 / 256 (integer arithmetic), and m256 / m255 / m128
+// hold the 16-bit constants 256, 255 and 128.
+#define VEC_CALC_BLEND_ALPHA_INIT(src_alpha, dest_alpha) \
+	msrc_alpha = _mm_set1_epi16(src_alpha); \
+	mdest_alpha = _mm_set1_epi16(dest_alpha * 255 / 256); \
+	m256 = _mm_set1_epi16(256); \
+	m255 = _mm_set1_epi16(255); \
+	m128 = _mm_set1_epi16(128);
+
+// Calculates the final 
alpha values to be used when combined with the source texture alpha channel +#define VEC_CALC_BLEND_ALPHA(fg) \ + __m128i fg_alpha_hi, fg_alpha_lo, bg_alpha_hi, bg_alpha_lo; { \ + __m128i alpha_hi = _mm_shufflehi_epi16(_mm_shufflelo_epi16(_mm_unpackhi_epi8(fg, _mm_setzero_si128()), _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)); \ + __m128i alpha_lo = _mm_shufflehi_epi16(_mm_shufflelo_epi16(_mm_unpacklo_epi8(fg, _mm_setzero_si128()), _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)); \ + alpha_hi = _mm_add_epi16(alpha_hi, _mm_srli_epi16(alpha_hi, 7)); \ + alpha_lo = _mm_add_epi16(alpha_lo, _mm_srli_epi16(alpha_lo, 7)); \ + bg_alpha_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_adds_epu16(_mm_mullo_epi16(mdest_alpha, alpha_hi), _mm_mullo_epi16(m255, _mm_sub_epi16(m256, alpha_hi))), m128), 8); \ + bg_alpha_hi = _mm_add_epi16(bg_alpha_hi, _mm_srli_epi16(bg_alpha_hi, 7)); \ + bg_alpha_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_adds_epu16(_mm_mullo_epi16(mdest_alpha, alpha_lo), _mm_mullo_epi16(m255, _mm_sub_epi16(m256, alpha_lo))), m128), 8); \ + bg_alpha_lo = _mm_add_epi16(bg_alpha_lo, _mm_srli_epi16(bg_alpha_lo, 7)); \ + fg_alpha_hi = msrc_alpha; \ + fg_alpha_lo = msrc_alpha; \ + } + +#define SSE_SHADE_VARS() __m128i mlight_hi, mlight_lo, color, fade, fade_amount_hi, fade_amount_lo, inv_desaturate; + +// Calculate constants for a simple shade +#define SSE_SHADE_SIMPLE_INIT(light) \ + mlight_hi = _mm_set_epi16(256, light, light, light, 256, light, light, light); \ + mlight_lo = mlight_hi; + +// Calculate constants for a simple shade with different light levels for each pixel +#define SSE_SHADE_SIMPLE_INIT4(light3, light2, light1, light0) \ + mlight_hi = _mm_set_epi16(256, light1, light1, light1, 256, light0, light0, light0); \ + mlight_lo = _mm_set_epi16(256, light3, light3, light3, 256, light2, light2, light2); + +// Simple shade 4 pixels +#define SSE_SHADE_SIMPLE(fg) { \ + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); \ + __m128i fg_lo = _mm_unpacklo_epi8(fg, 
_mm_setzero_si128()); \ + fg_hi = _mm_mullo_epi16(fg_hi, mlight_hi); \ + fg_hi = _mm_srli_epi16(fg_hi, 8); \ + fg_lo = _mm_mullo_epi16(fg_lo, mlight_lo); \ + fg_lo = _mm_srli_epi16(fg_lo, 8); \ + fg = _mm_packus_epi16(fg_lo, fg_hi); \ +} + +// Calculate constants for a complex shade +#define SSE_SHADE_INIT(light, shade_constants) \ + mlight_hi = _mm_set_epi16(256, light, light, light, 256, light, light, light); \ + mlight_lo = mlight_hi; \ + color = _mm_set_epi16( \ + 256, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, \ + 256, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); \ + fade = _mm_set_epi16( \ + 0, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, \ + 0, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); \ + fade_amount_hi = _mm_mullo_epi16(fade, _mm_subs_epu16(_mm_set1_epi16(256), mlight_hi)); \ + fade_amount_lo = fade_amount_hi; \ + inv_desaturate = _mm_set1_epi16(256 - shade_constants.desaturate); \ + +// Calculate constants for a complex shade with different light levels for each pixel +#define SSE_SHADE_INIT4(light3, light2, light1, light0, shade_constants) \ + mlight_hi = _mm_set_epi16(256, light1, light1, light1, 256, light0, light0, light0); \ + mlight_lo = _mm_set_epi16(256, light3, light3, light3, 256, light2, light2, light2); \ + color = _mm_set_epi16( \ + 256, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue, \ + 256, shade_constants.light_red, shade_constants.light_green, shade_constants.light_blue); \ + fade = _mm_set_epi16( \ + 0, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue, \ + 0, shade_constants.fade_red, shade_constants.fade_green, shade_constants.fade_blue); \ + fade_amount_hi = _mm_mullo_epi16(fade, _mm_subs_epu16(_mm_set1_epi16(256), mlight_hi)); \ + fade_amount_lo = _mm_mullo_epi16(fade, _mm_subs_epu16(_mm_set1_epi16(256), 
mlight_lo)); \ + inv_desaturate = _mm_set1_epi16(256 - shade_constants.desaturate); \ + +// Complex shade 4 pixels +#define SSE_SHADE(fg, shade_constants) { \ + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); \ + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); \ + \ + __m128i intensity_hi = _mm_mullo_epi16(fg_hi, _mm_set_epi16(0, 77, 143, 37, 0, 77, 143, 37)); \ + uint16_t intensity_hi0 = ((_mm_extract_epi16(intensity_hi, 2) + _mm_extract_epi16(intensity_hi, 1) + _mm_extract_epi16(intensity_hi, 0)) >> 8) * shade_constants.desaturate; \ + uint16_t intensity_hi1 = ((_mm_extract_epi16(intensity_hi, 6) + _mm_extract_epi16(intensity_hi, 5) + _mm_extract_epi16(intensity_hi, 4)) >> 8) * shade_constants.desaturate; \ + intensity_hi = _mm_set_epi16(intensity_hi1, intensity_hi1, intensity_hi1, intensity_hi1, intensity_hi0, intensity_hi0, intensity_hi0, intensity_hi0); \ + \ + fg_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, inv_desaturate), intensity_hi), 8); \ + fg_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, mlight_hi), fade_amount_hi), 8); \ + fg_hi = _mm_srli_epi16(_mm_mullo_epi16(fg_hi, color), 8); \ + \ + __m128i intensity_lo = _mm_mullo_epi16(fg_lo, _mm_set_epi16(0, 77, 143, 37, 0, 77, 143, 37)); \ + uint16_t intensity_lo0 = ((_mm_extract_epi16(intensity_lo, 2) + _mm_extract_epi16(intensity_lo, 1) + _mm_extract_epi16(intensity_lo, 0)) >> 8) * shade_constants.desaturate; \ + uint16_t intensity_lo1 = ((_mm_extract_epi16(intensity_lo, 6) + _mm_extract_epi16(intensity_lo, 5) + _mm_extract_epi16(intensity_lo, 4)) >> 8) * shade_constants.desaturate; \ + intensity_lo = _mm_set_epi16(intensity_lo1, intensity_lo1, intensity_lo1, intensity_lo1, intensity_lo0, intensity_lo0, intensity_lo0, intensity_lo0); \ + \ + fg_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, inv_desaturate), intensity_lo), 8); \ + fg_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, mlight_lo), fade_amount_lo), 8); \ + fg_lo = 
_mm_srli_epi16(_mm_mullo_epi16(fg_lo, color), 8); \ + \ + fg = _mm_packus_epi16(fg_lo, fg_hi); \ +} + +#endif diff --git a/src/r_draw_rgba_sse.h b/src/r_draw_rgba_sse.h new file mode 100644 index 000000000..4ee557693 --- /dev/null +++ b/src/r_draw_rgba_sse.h @@ -0,0 +1,367 @@ +// +// SSE/AVX intrinsics based drawers for the r_draw family of drawers. +// +// Note: This header file is intentionally not guarded by a __R_DRAW_RGBA_SSE__ define. +// It is because the code is nearly identical for SSE vs AVX. The file is included +// multiple times by r_draw_rgba.cpp with different defines that changes the class +// names outputted and the type of intrinsics used. + +#ifdef _MSC_VER +#pragma warning(disable: 4752) // warning C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX +#endif + +class VecCommand(DrawSpanRGBA) : public DrawerCommand +{ + const uint32_t * RESTRICT _source; + fixed_t _xfrac; + fixed_t _yfrac; + fixed_t _xstep; + fixed_t _ystep; + int _x1; + int _x2; + int _y; + int _xbits; + int _ybits; + BYTE * RESTRICT _destorg; + fixed_t _light; + ShadeConstants _shade_constants; + bool _nearest_filter; + +public: + VecCommand(DrawSpanRGBA)() + { + _source = (const uint32_t*)ds_source; + _xfrac = ds_xfrac; + _yfrac = ds_yfrac; + _xstep = ds_xstep; + _ystep = ds_ystep; + _x1 = ds_x1; + _x2 = ds_x2; + _y = ds_y; + _xbits = ds_xbits; + _ybits = ds_ybits; + _destorg = dc_destorg; + _light = ds_light; + _shade_constants = ds_shade_constants; + _nearest_filter = !SampleBgra::span_sampler_setup(_source, _xbits, _ybits, _xstep, _ystep, ds_source_mipmapped); + } + + void Execute(DrawerThread *thread) override + { + if (thread->line_skipped_by_thread(_y)) + return; + + dsfixed_t xfrac; + dsfixed_t yfrac; + dsfixed_t xstep; + dsfixed_t ystep; + uint32_t* dest; + const uint32_t* source = _source; + int count; + int spot; + + xfrac = _xfrac; + yfrac = _yfrac; + + dest = ylookup[_y] + _x1 + (uint32_t*)_destorg; + + count = _x2 - _x1 + 1; + + xstep = 
_xstep; + ystep = _ystep; + + uint32_t light = LightBgra::calc_light_multiplier(_light); + ShadeConstants shade_constants = _shade_constants; + + if (_nearest_filter) + { + if (_xbits == 6 && _ybits == 6) + { + // 64x64 is the most common case by far, so special case it. + + int sse_count = count / 4; + count -= sse_count * 4; + + if (shade_constants.simple_shade) + { + VEC_SHADE_VARS(); + VEC_SHADE_SIMPLE_INIT(light); + + while (sse_count--) + { + // Current texture index in u,v. + spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); + uint32_t p0 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); + uint32_t p1 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); + uint32_t p2 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); + uint32_t p3 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + // Lookup pixel from flat texture tile, + // re-index using light/colormap. + __m128i fg = _mm_set_epi32(p3, p2, p1, p0); + VEC_SHADE_SIMPLE(fg); + _mm_storeu_si128((__m128i*)dest, fg); + + // Next step in u,v. + dest += 4; + } + } + else + { + VEC_SHADE_VARS(); + VEC_SHADE_INIT(light, shade_constants); + + while (sse_count--) + { + // Current texture index in u,v. 
+ spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); + uint32_t p0 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); + uint32_t p1 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); + uint32_t p2 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); + uint32_t p3 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + // Lookup pixel from flat texture tile, + // re-index using light/colormap. + __m128i fg = _mm_set_epi32(p3, p2, p1, p0); + VEC_SHADE(fg, shade_constants); + _mm_storeu_si128((__m128i*)dest, fg); + + // Next step in u,v. + dest += 4; + } + } + + if (count == 0) + return; + + do + { + // Current texture index in u,v. + spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); + + // Lookup pixel from flat texture tile + *dest++ = LightBgra::shade_bgra(source[spot], light, shade_constants); + + // Next step in u,v. 
+ xfrac += xstep; + yfrac += ystep; + } while (--count); + } + else + { + BYTE yshift = 32 - _ybits; + BYTE xshift = yshift - _xbits; + int xmask = ((1 << _xbits) - 1) << _ybits; + + int sse_count = count / 4; + count -= sse_count * 4; + + if (shade_constants.simple_shade) + { + VEC_SHADE_VARS(); + VEC_SHADE_SIMPLE_INIT(light); + + while (sse_count--) + { + spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + uint32_t p0 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + uint32_t p1 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + uint32_t p2 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + uint32_t p3 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + // Lookup pixel from flat texture tile + __m128i fg = _mm_set_epi32(p3, p2, p1, p0); + VEC_SHADE_SIMPLE(fg); + _mm_storeu_si128((__m128i*)dest, fg); + dest += 4; + } + } + else + { + VEC_SHADE_VARS(); + VEC_SHADE_INIT(light, shade_constants); + + while (sse_count--) + { + spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + uint32_t p0 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + uint32_t p1 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + uint32_t p2 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + uint32_t p3 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + // Lookup pixel from flat texture tile + __m128i fg = _mm_set_epi32(p3, p2, p1, p0); + VEC_SHADE(fg, shade_constants); + _mm_storeu_si128((__m128i*)dest, fg); + dest += 4; + } + } + + if (count == 0) + return; + + do + { + // Current texture index in u,v. 
+ spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + + // Lookup pixel from flat texture tile + *dest++ = LightBgra::shade_bgra(source[spot], light, shade_constants); + + // Next step in u,v. + xfrac += xstep; + yfrac += ystep; + } while (--count); + } + } + else + { + if (_xbits == 6 && _ybits == 6) + { + // 64x64 is the most common case by far, so special case it. + + int sse_count = count / 4; + count -= sse_count * 4; + + if (shade_constants.simple_shade) + { + VEC_SHADE_VARS(); + VEC_SHADE_SIMPLE_INIT(light); + while (sse_count--) + { + __m128i fg; + VEC_SAMPLE_BILINEAR4_SPAN(fg, source, xfrac, yfrac, xstep, ystep, 26, 26); + VEC_SHADE_SIMPLE(fg); + _mm_storeu_si128((__m128i*)dest, fg); + dest += 4; + } + } + else + { + VEC_SHADE_VARS(); + VEC_SHADE_INIT(light, shade_constants); + while (sse_count--) + { + __m128i fg; + VEC_SAMPLE_BILINEAR4_SPAN(fg, source, xfrac, yfrac, xstep, ystep, 26, 26); + VEC_SHADE(fg, shade_constants); + _mm_storeu_si128((__m128i*)dest, fg); + dest += 4; + } + } + + if (count == 0) + return; + + do + { + *dest++ = LightBgra::shade_bgra(SampleBgra::sample_bilinear(source, xfrac, yfrac, 26, 26), light, shade_constants); + xfrac += xstep; + yfrac += ystep; + } while (--count); + } + else + { + int sse_count = count / 4; + count -= sse_count * 4; + + if (shade_constants.simple_shade) + { + VEC_SHADE_VARS(); + VEC_SHADE_SIMPLE_INIT(light); + while (sse_count--) + { + __m128i fg; + int tmpx = 32 - _xbits; + int tmpy = 32 - _ybits; + VEC_SAMPLE_BILINEAR4_SPAN(fg, source, xfrac, yfrac, xstep, ystep, tmpx, tmpy); + VEC_SHADE_SIMPLE(fg); + _mm_storeu_si128((__m128i*)dest, fg); + dest += 4; + } + } + else + { + VEC_SHADE_VARS(); + VEC_SHADE_INIT(light, shade_constants); + while (sse_count--) + { + __m128i fg; + int tmpx = 32 - _xbits; + int tmpy = 32 - _ybits; + VEC_SAMPLE_BILINEAR4_SPAN(fg, source, xfrac, yfrac, xstep, ystep, tmpx, tmpy); + VEC_SHADE(fg, shade_constants); + _mm_storeu_si128((__m128i*)dest, fg); + dest += 4; + } + } + + if 
(count == 0) + return; + + do + { + *dest++ = LightBgra::shade_bgra(SampleBgra::sample_bilinear(source, xfrac, yfrac, 32 - _xbits, 32 - _ybits), light, shade_constants); + xfrac += xstep; + yfrac += ystep; + } while (--count); + } + } + } +}; diff --git a/src/r_drawt.cpp b/src/r_drawt.cpp index e8faff0ce..837093044 100644 --- a/src/r_drawt.cpp +++ b/src/r_drawt.cpp @@ -313,21 +313,21 @@ void rt_Translate4cols(const BYTE *translation, int yl, int yh) } // Translates one span at hx to the screen at sx. -void rt_tlate1col (int hx, int sx, int yl, int yh) +void rt_tlate1col_c (int hx, int sx, int yl, int yh) { rt_Translate1col(dc_translation, hx, yl, yh); rt_map1col(hx, sx, yl, yh); } // Translates all four spans to the screen starting at sx. -void rt_tlate4cols (int sx, int yl, int yh) +void rt_tlate4cols_c (int sx, int yl, int yh) { rt_Translate4cols(dc_translation, yl, yh); rt_map4cols(sx, yl, yh); } // Adds one span at hx to the screen at sx without clamping. -void rt_add1col (int hx, int sx, int yl, int yh) +void rt_add1col_c (int hx, int sx, int yl, int yh) { BYTE *colormap; BYTE *source; @@ -417,21 +417,21 @@ void rt_add4cols_c (int sx, int yl, int yh) } // Translates and adds one span at hx to the screen at sx without clamping. -void rt_tlateadd1col (int hx, int sx, int yl, int yh) +void rt_tlateadd1col_c (int hx, int sx, int yl, int yh) { rt_Translate1col(dc_translation, hx, yl, yh); rt_add1col(hx, sx, yl, yh); } // Translates and adds all four spans to the screen starting at sx without clamping. -void rt_tlateadd4cols (int sx, int yl, int yh) +void rt_tlateadd4cols_c (int sx, int yl, int yh) { rt_Translate4cols(dc_translation, yl, yh); rt_add4cols(sx, yl, yh); } // Shades one span at hx to the screen at sx. 
-void rt_shaded1col (int hx, int sx, int yl, int yh) +void rt_shaded1col_c (int hx, int sx, int yl, int yh) { DWORD *fgstart; BYTE *colormap; @@ -507,7 +507,7 @@ void rt_shaded4cols_c (int sx, int yl, int yh) } // Adds one span at hx to the screen at sx with clamping. -void rt_addclamp1col (int hx, int sx, int yl, int yh) +void rt_addclamp1col_c (int hx, int sx, int yl, int yh) { BYTE *colormap; BYTE *source; @@ -556,13 +556,14 @@ void rt_addclamp4cols_c (int sx, int yl, int yh) return; count++; - DWORD *fg2rgb = dc_srcblend; - DWORD *bg2rgb = dc_destblend; dest = ylookup[yl] + sx + dc_destorg; source = &dc_temp[yl*4]; pitch = dc_pitch; colormap = dc_colormap; + DWORD *fg2rgb = dc_srcblend; + DWORD *bg2rgb = dc_destblend; + do { DWORD a = fg2rgb[colormap[source[0]]] + bg2rgb[dest[0]]; DWORD b = a; @@ -607,21 +608,21 @@ void rt_addclamp4cols_c (int sx, int yl, int yh) } // Translates and adds one span at hx to the screen at sx with clamping. -void rt_tlateaddclamp1col (int hx, int sx, int yl, int yh) +void rt_tlateaddclamp1col_c (int hx, int sx, int yl, int yh) { rt_Translate1col(dc_translation, hx, yl, yh); rt_addclamp1col(hx, sx, yl, yh); } // Translates and adds all four spans to the screen starting at sx with clamping. -void rt_tlateaddclamp4cols (int sx, int yl, int yh) +void rt_tlateaddclamp4cols_c (int sx, int yl, int yh) { rt_Translate4cols(dc_translation, yl, yh); rt_addclamp4cols(sx, yl, yh); } // Subtracts one span at hx to the screen at sx with clamping. -void rt_subclamp1col (int hx, int sx, int yl, int yh) +void rt_subclamp1col_c (int hx, int sx, int yl, int yh) { BYTE *colormap; BYTE *source; @@ -656,7 +657,7 @@ void rt_subclamp1col (int hx, int sx, int yl, int yh) } // Subtracts all four spans to the screen starting at sx with clamping. 
-void rt_subclamp4cols (int sx, int yl, int yh) +void rt_subclamp4cols_c (int sx, int yl, int yh) { BYTE *colormap; BYTE *source; @@ -716,21 +717,21 @@ void rt_subclamp4cols (int sx, int yl, int yh) } // Translates and subtracts one span at hx to the screen at sx with clamping. -void rt_tlatesubclamp1col (int hx, int sx, int yl, int yh) +void rt_tlatesubclamp1col_c (int hx, int sx, int yl, int yh) { rt_Translate1col(dc_translation, hx, yl, yh); rt_subclamp1col(hx, sx, yl, yh); } // Translates and subtracts all four spans to the screen starting at sx with clamping. -void rt_tlatesubclamp4cols (int sx, int yl, int yh) +void rt_tlatesubclamp4cols_c (int sx, int yl, int yh) { rt_Translate4cols(dc_translation, yl, yh); rt_subclamp4cols(sx, yl, yh); } // Subtracts one span at hx from the screen at sx with clamping. -void rt_revsubclamp1col (int hx, int sx, int yl, int yh) +void rt_revsubclamp1col_c (int hx, int sx, int yl, int yh) { BYTE *colormap; BYTE *source; @@ -765,7 +766,7 @@ void rt_revsubclamp1col (int hx, int sx, int yl, int yh) } // Subtracts all four spans from the screen starting at sx with clamping. -void rt_revsubclamp4cols (int sx, int yl, int yh) +void rt_revsubclamp4cols_c (int sx, int yl, int yh) { BYTE *colormap; BYTE *source; @@ -825,14 +826,14 @@ void rt_revsubclamp4cols (int sx, int yl, int yh) } // Translates and subtracts one span at hx from the screen at sx with clamping. -void rt_tlaterevsubclamp1col (int hx, int sx, int yl, int yh) +void rt_tlaterevsubclamp1col_c (int hx, int sx, int yl, int yh) { rt_Translate1col(dc_translation, hx, yl, yh); rt_revsubclamp1col(hx, sx, yl, yh); } // Translates and subtracts all four spans from the screen starting at sx with clamping. 
-void rt_tlaterevsubclamp4cols (int sx, int yl, int yh) +void rt_tlaterevsubclamp4cols_c (int sx, int yl, int yh) { rt_Translate4cols(dc_translation, yl, yh); rt_revsubclamp4cols(sx, yl, yh); @@ -855,18 +856,21 @@ void rt_draw4cols (int sx) } #ifdef X86_ASM - // Setup assembly routines for changed colormaps or other parameters. - if (hcolfunc_post4 == rt_shaded4cols) + if (!r_swtruecolor) { - R_SetupShadedCol(); - } - else if (hcolfunc_post4 == rt_addclamp4cols || hcolfunc_post4 == rt_tlateaddclamp4cols) - { - R_SetupAddClampCol(); - } - else if (hcolfunc_post4 == rt_add4cols || hcolfunc_post4 == rt_tlateadd4cols) - { - R_SetupAddCol(); + // Setup assembly routines for changed colormaps or other parameters. + if (hcolfunc_post4 == rt_shaded4cols) + { + R_SetupShadedCol(); + } + else if (hcolfunc_post4 == rt_addclamp4cols || hcolfunc_post4 == rt_tlateaddclamp4cols) + { + R_SetupAddClampCol(); + } + else if (hcolfunc_post4 == rt_add4cols || hcolfunc_post4 == rt_tlateadd4cols) + { + R_SetupAddCol(); + } } #endif @@ -1002,7 +1006,7 @@ void rt_draw4cols (int sx) // Before each pass through a rendering loop that uses these routines, // call this function to set up the span pointers. -void rt_initcols (BYTE *buff) +void rt_initcols_pal (BYTE *buff) { int y; @@ -1011,6 +1015,14 @@ void rt_initcols (BYTE *buff) horizspan[y] = dc_ctspan[y] = &dc_tspans[y][0]; } +void rt_span_coverage_pal(int x, int start, int stop) +{ + unsigned int **tspan = &dc_ctspan[x & 3]; + (*tspan)[0] = start; + (*tspan)[1] = stop; + *tspan += 2; +} + // Stretches a column into a temporary buffer which is later // drawn to the screen along with up to three other columns. 
void R_DrawColumnHorizP_C (void) @@ -1073,7 +1085,7 @@ void R_DrawColumnHorizP_C (void) } // [RH] Just fills a column with a given color -void R_FillColumnHorizP (void) +void R_FillColumnHorizP_C (void) { int count = dc_count; BYTE color = dc_color; @@ -1108,6 +1120,7 @@ void R_FillColumnHorizP (void) void R_DrawMaskedColumnHoriz (const BYTE *column, const FTexture::Span *span) { + int pixelsize = r_swtruecolor ? 4 : 1; const fixed_t texturemid = FLOAT2FIXED(dc_texturemid); while (span->Length != 0) { @@ -1177,7 +1190,7 @@ void R_DrawMaskedColumnHoriz (const BYTE *column, const FTexture::Span *span) } } dc_source = column + top; - dc_dest = ylookup[dc_yl] + dc_x + dc_destorg; + dc_dest = (ylookup[dc_yl] + dc_x) * pixelsize + dc_destorg; dc_count = dc_yh - dc_yl + 1; hcolfunc_pre (); } diff --git a/src/r_drawt_rgba.cpp b/src/r_drawt_rgba.cpp new file mode 100644 index 000000000..45bd5c029 --- /dev/null +++ b/src/r_drawt_rgba.cpp @@ -0,0 +1,995 @@ +/* +** r_drawt_rgba.cpp +** Faster column drawers for modern processors, true color edition +** +**--------------------------------------------------------------------------- +** Copyright 1998-2006 Randy Heit +** All rights reserved. +** +** Redistribution and use in source and binary forms, with or without +** modification, are permitted provided that the following conditions +** are met: +** +** 1. Redistributions of source code must retain the above copyright +** notice, this list of conditions and the following disclaimer. +** 2. Redistributions in binary form must reproduce the above copyright +** notice, this list of conditions and the following disclaimer in the +** documentation and/or other materials provided with the distribution. +** 3. The name of the author may not be used to endorse or promote products +** derived from this software without specific prior written permission. 
+**
+** THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+** IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+** OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+** IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+** INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+** NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+** DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+** THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+** (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+** THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**---------------------------------------------------------------------------
+**
+** True color versions of the similar functions in r_drawt.cpp
+** Please see r_drawt.cpp for a description of the globals used.
+*/
+
+#include "templates.h"
+#include "doomtype.h"
+#include "doomdef.h"
+#include "r_defs.h"
+#include "r_draw.h"
+#include "r_main.h"
+#include "r_things.h"
+#include "v_video.h"
+#include "r_draw_rgba.h"
+#ifndef NO_SSE
+#include <emmintrin.h> // NOTE(review): header name was missing here; SSE2 header assumed from the __m128i / _mm_* usage - confirm
+#endif
+
+extern unsigned int dc_tspans[4][MAXHEIGHT];
+extern unsigned int *dc_ctspan[4];
+extern unsigned int *horizspan[4];
+
+#ifndef NO_SSE
+
+#ifdef _MSC_VER
+#pragma warning(disable: 4101) // warning C4101: unreferenced local variable
+#endif
+
+// Generate SSE drawers:
+#define VecCommand(name) name##_SSE_Command
+#define VEC_SHADE_VARS SSE_SHADE_VARS
+#define VEC_SHADE_SIMPLE_INIT SSE_SHADE_SIMPLE_INIT
+#define VEC_SHADE_SIMPLE_INIT4 SSE_SHADE_SIMPLE_INIT4
+#define VEC_SHADE_SIMPLE SSE_SHADE_SIMPLE
+#define VEC_SHADE_INIT SSE_SHADE_INIT
+#define VEC_SHADE_INIT4 SSE_SHADE_INIT4
+#define VEC_SHADE SSE_SHADE
+#include "r_drawt_rgba_sse.h"
+/*
+// Generate AVX drawers:
+#undef VecCommand
+#undef VEC_SHADE_SIMPLE_INIT
+#undef VEC_SHADE_SIMPLE_INIT4
+#undef VEC_SHADE_SIMPLE
+#undef
VEC_SHADE_INIT
+#undef VEC_SHADE_INIT4
+#undef VEC_SHADE
+#define VecCommand(name) name##_AVX_Command
+#define VEC_SHADE_SIMPLE_INIT AVX_LINEAR_SHADE_SIMPLE_INIT
+#define VEC_SHADE_SIMPLE_INIT4 AVX_LINEAR_SHADE_SIMPLE_INIT4
+#define VEC_SHADE_SIMPLE AVX_LINEAR_SHADE_SIMPLE
+#define VEC_SHADE_INIT AVX_LINEAR_SHADE_INIT
+#define VEC_SHADE_INIT4 AVX_LINEAR_SHADE_INIT4
+#define VEC_SHADE AVX_LINEAR_SHADE
+#include "r_drawt_rgba_sse.h"
+*/
+#endif
+
+/////////////////////////////////////////////////////////////////////////////
+
+class DrawerRt1colCommand : public DrawerCommand // Shared state + row iterator for commands that move temp-buffer column hx to screen column sx.
+{
+public:
+	int hx;
+	int sx;
+	int yl;
+	int yh;
+	BYTE * RESTRICT _destorg;
+	int _pitch;
+
+	uint32_t _light;
+	ShadeConstants _shade_constants;
+	BYTE * RESTRICT _colormap;
+
+	uint32_t _srcalpha;
+	uint32_t _destalpha;
+
+	DrawerRt1colCommand(int hx, int sx, int yl, int yh) // Copies the dc_* globals it needs at construction.
+	{
+		this->hx = hx;
+		this->sx = sx;
+		this->yl = yl;
+		this->yh = yh;
+
+		_destorg = dc_destorg;
+		_pitch = dc_pitch;
+
+		_light = LightBgra::calc_light_multiplier(dc_light);
+		_shade_constants = dc_shade_constants;
+		_colormap = dc_colormap;
+
+		_srcalpha = dc_srcalpha >> (FRACBITS - 8); // presumably fixed-point alpha reduced to 8 fractional bits - confirm
+		_destalpha = dc_destalpha >> (FRACBITS - 8);
+	}
+
+	class LoopIterator // Steps over the rows of [yl, yh] that belong to one DrawerThread.
+	{
+	public:
+		uint32_t *source;
+		uint32_t *dest;
+		int count;
+		int pitch, sincr;
+
+		LoopIterator(DrawerRt1colCommand *command, DrawerThread *thread)
+		{
+			count = thread->count_for_thread(command->yl, (command->yh - command->yl + 1));
+			if (count <= 0)
+				return; // remaining members stay unset; callers test operator bool before use
+
+			dest = thread->dest_for_thread(command->yl, command->_pitch, ylookup[command->yl] + command->sx + (uint32_t*)command->_destorg);
+			source = &thread->dc_temp_rgba[command->yl * 4 + command->hx] + thread->skipped_by_thread(command->yl) * 4;
+			pitch = command->_pitch * thread->num_cores;
+			sincr = thread->num_cores * 4; // temp buffer stores 4 entries per row
+		}
+
+		explicit operator bool()
+		{
+			return count > 0;
+		}
+
+		bool next() // Advances one thread-row; returns false once the last row was consumed.
+		{
+			dest += pitch;
+			source += sincr;
+			return (--count) != 0;
+		}
+	};
+};
+
+class DrawerRt4colsCommand : public DrawerCommand // Same as DrawerRt1colCommand but for all four temp-buffer columns starting at sx.
+{
+public:
+	int sx;
+	int yl;
+	int yh;
+	uint32_t _light;
+	ShadeConstants _shade_constants;
+	BYTE * RESTRICT _destorg;
+	int _pitch;
+	BYTE * RESTRICT _colormap;
+	uint32_t _srcalpha;
+	uint32_t _destalpha;
+
+	DrawerRt4colsCommand(int sx, int yl, int yh) // Copies the dc_* globals it needs at construction.
+	{
+		this->sx = sx;
+		this->yl = yl;
+		this->yh = yh;
+
+		_light = LightBgra::calc_light_multiplier(dc_light);
+		_shade_constants = dc_shade_constants;
+		_destorg = dc_destorg;
+		_pitch = dc_pitch;
+		_colormap = dc_colormap;
+
+		_srcalpha = dc_srcalpha >> (FRACBITS - 8);
+		_destalpha = dc_destalpha >> (FRACBITS - 8);
+	}
+
+	class LoopIterator // Steps over the rows of [yl, yh] that belong to one DrawerThread.
+	{
+	public:
+		uint32_t *source;
+		uint32_t *dest;
+		int count;
+		int pitch;
+		int sincr;
+
+		LoopIterator(DrawerRt4colsCommand *command, DrawerThread *thread)
+		{
+			count = thread->count_for_thread(command->yl, command->yh - command->yl + 1);
+			if (count <= 0)
+				return; // remaining members stay unset; callers test operator bool before use
+
+			dest = thread->dest_for_thread(command->yl, command->_pitch, ylookup[command->yl] + command->sx + (uint32_t*)command->_destorg);
+			source = &thread->dc_temp_rgba[command->yl * 4] + thread->skipped_by_thread(command->yl) * 4;
+			pitch = command->_pitch * thread->num_cores;
+			sincr = thread->num_cores * 4;
+		}
+
+		explicit operator bool()
+		{
+			return count > 0;
+		}
+
+		bool next()
+		{
+			dest += pitch;
+			source += sincr;
+			return (--count) != 0;
+		}
+	};
+};
+
+class RtCopy1colRGBACommand : public DrawerRt1colCommand // Writes the unshaded palette colour of each temp entry.
+{
+public:
+	RtCopy1colRGBACommand(int hx, int sx, int yl, int yh) : DrawerRt1colCommand(hx, sx, yl, yh)
+	{
+	}
+
+	void Execute(DrawerThread *thread) override
+	{
+		LoopIterator loop(this, thread);
+		if (!loop) return;
+		do
+		{
+			uint32_t fg = GPalette.BaseColors[*loop.source];
+			*loop.dest = BlendBgra::copy(fg);
+		} while (loop.next());
+	}
+};
+
+class RtMap1colRGBACommand : public DrawerRt1colCommand // Colormap lookup, then light/shade, then plain copy.
+{
+public:
+	RtMap1colRGBACommand(int hx, int sx, int yl, int yh) : DrawerRt1colCommand(hx, sx, yl, yh)
+	{
+	}
+
+	void Execute(DrawerThread *thread) override
+	{
+		LoopIterator loop(this, thread);
+		if (!loop) return;
+		do
+		{
+			uint32_t fg = LightBgra::shade_pal_index(_colormap[*loop.source], _light, _shade_constants);
+			*loop.dest = BlendBgra::copy(fg);
+		} while (loop.next());
+	}
+};
+
+class RtMap4colsRGBACommand : public DrawerRt4colsCommand // Four-column form of RtMap1colRGBACommand.
+{
+public:
+	RtMap4colsRGBACommand(int sx, int yl, int yh) : DrawerRt4colsCommand(sx, yl, yh)
+	{
+	}
+
+	void Execute(DrawerThread *thread) override
+	{
+		LoopIterator loop(this, thread);
+		if (!loop) return;
+		do
+		{
+			for (int i = 0; i < 4; i++)
+			{
+				uint32_t fg = LightBgra::shade_pal_index(_colormap[loop.source[i]], _light, _shade_constants);
+				loop.dest[i] = BlendBgra::copy(fg);
+			}
+		} while (loop.next());
+	}
+};
+
+class RtAdd1colRGBACommand : public DrawerRt1colCommand // Colormapped, shaded source blended with BlendBgra::add.
+{
+public:
+	RtAdd1colRGBACommand(int hx, int sx, int yl, int yh) : DrawerRt1colCommand(hx, sx, yl, yh)
+	{
+	}
+
+	void Execute(DrawerThread *thread) override
+	{
+		LoopIterator loop(this, thread);
+		if (!loop) return;
+		do
+		{
+			uint32_t fg = LightBgra::shade_pal_index(_colormap[*loop.source], _light, _shade_constants);
+			*loop.dest = BlendBgra::add(fg, *loop.dest, _srcalpha, _destalpha);
+		} while (loop.next());
+	}
+};
+
+class RtAdd4colsRGBACommand : public DrawerRt4colsCommand // Four-column form of RtAdd1colRGBACommand.
+{
+public:
+	RtAdd4colsRGBACommand(int sx, int yl, int yh) : DrawerRt4colsCommand(sx, yl, yh)
+	{
+	}
+
+	void Execute(DrawerThread *thread) override
+	{
+		LoopIterator loop(this, thread);
+		if (!loop) return;
+		do
+		{
+			for (int i = 0; i < 4; i++)
+			{
+				uint32_t fg = LightBgra::shade_pal_index(_colormap[loop.source[i]], _light, _shade_constants);
+				loop.dest[i] = BlendBgra::add(fg, loop.dest[i], _srcalpha, _destalpha);
+			}
+		} while (loop.next());
+	}
+};
+
+class RtShaded1colRGBACommand : public DrawerRt1colCommand // Blends one fixed colour (dc_color) using a per-pixel alpha taken from the colormap.
+{
+	uint32_t _color;
+
+public:
+	RtShaded1colRGBACommand(int hx, int sx, int yl, int yh) : DrawerRt1colCommand(hx, sx, yl, yh)
+	{
+		_color = LightBgra::shade_pal_index(dc_color, _light, _shade_constants);
+	}
+
+	void Execute(DrawerThread *thread) override
+	{
+		LoopIterator loop(this, thread);
+		if (!loop) return;
+		do
+		{
+			uint32_t alpha = _colormap[*loop.source] * 4; // NOTE(review): presumably colormap values are 0..64 here, giving 0..256 - confirm
+			uint32_t inv_alpha = 256 - alpha;
+			*loop.dest = BlendBgra::add(_color, *loop.dest, alpha, inv_alpha);
+		} while (loop.next());
+	}
+};
+
+class RtShaded4colsRGBACommand : public DrawerRt4colsCommand // Four-column form of RtShaded1colRGBACommand.
+{
+	uint32_t _color;
+
+public:
+	RtShaded4colsRGBACommand(int sx, int yl, int yh) : DrawerRt4colsCommand(sx, yl, yh)
+	{
+		_color = LightBgra::shade_pal_index(dc_color, _light, _shade_constants);
+	}
+
+	void Execute(DrawerThread *thread) override
+	{
+		LoopIterator loop(this, thread);
+		if (!loop) return;
+		do
+		{
+			for (int i = 0; i < 4; i++)
+			{
+				uint32_t alpha = _colormap[loop.source[i]] * 4;
+				uint32_t inv_alpha = 256 - alpha;
+				loop.dest[i] = BlendBgra::add(_color, loop.dest[i], alpha, inv_alpha);
+			}
+		} while (loop.next());
+	}
+};
+
+class RtAddClamp1colRGBACommand : public DrawerRt1colCommand // Like RtAdd1colRGBACommand, but the temp entry is shaded directly (no _colormap lookup).
+{
+public:
+	RtAddClamp1colRGBACommand(int hx, int sx, int yl, int yh) : DrawerRt1colCommand(hx, sx, yl, yh)
+	{
+	}
+
+	void Execute(DrawerThread *thread) override
+	{
+		LoopIterator loop(this, thread);
+		if (!loop) return;
+		do
+		{
+			uint32_t fg = LightBgra::shade_pal_index(*loop.source, _light, _shade_constants);
+			*loop.dest = BlendBgra::add(fg, *loop.dest, _srcalpha, _destalpha);
+		} while (loop.next());
+	}
+};
+
+class RtAddClamp4colsRGBACommand : public DrawerRt4colsCommand // Four-column form of RtAddClamp1colRGBACommand.
+{
+public:
+	RtAddClamp4colsRGBACommand(int sx, int yl, int yh) : DrawerRt4colsCommand(sx, yl, yh)
+	{
+	}
+
+	void Execute(DrawerThread *thread) override
+	{
+		LoopIterator loop(this, thread);
+		if (!loop) return;
+		do
+		{
+			for (int i = 0; i < 4; i++)
+			{
+				uint32_t fg = LightBgra::shade_pal_index(loop.source[i], _light, _shade_constants);
+				loop.dest[i] = BlendBgra::add(fg, loop.dest[i], _srcalpha, _destalpha);
+			}
+		} while (loop.next());
+	}
+};
+
+class RtSubClamp1colRGBACommand : public DrawerRt1colCommand // Shaded temp entry combined with the screen via BlendBgra::sub.
+{
+public:
+	RtSubClamp1colRGBACommand(int hx, int sx, int yl, int yh) : DrawerRt1colCommand(hx, sx, yl, yh)
+	{
+	}
+
+	void Execute(DrawerThread *thread) override
+	{
+		LoopIterator loop(this, thread);
+		if (!loop) return;
+		do
+		{
+			uint32_t fg = LightBgra::shade_pal_index(*loop.source, _light, _shade_constants);
+			*loop.dest = BlendBgra::sub(fg, *loop.dest, _srcalpha, _destalpha);
+		} while (loop.next());
+	}
+};
+
+class RtSubClamp4colsRGBACommand : public DrawerRt4colsCommand // Four-column form of RtSubClamp1colRGBACommand.
+{
+public:
+	RtSubClamp4colsRGBACommand(int sx, int yl, int yh) : DrawerRt4colsCommand(sx, yl, yh)
+	{
+	}
+
+	void Execute(DrawerThread *thread) override
+	{
+		LoopIterator loop(this, thread);
+		if (!loop) return;
+		do
+		{
+			for (int i = 0; i < 4; i++)
+			{
+				uint32_t fg = LightBgra::shade_pal_index(loop.source[i], _light, _shade_constants);
+				loop.dest[i] = BlendBgra::sub(fg, loop.dest[i], _srcalpha, _destalpha);
+			}
+		} while (loop.next());
+	}
+};
+
+class RtRevSubClamp1colRGBACommand : public DrawerRt1colCommand // Shaded temp entry combined with the screen via BlendBgra::revsub.
+{
+public:
+	RtRevSubClamp1colRGBACommand(int hx, int sx, int yl, int yh) : DrawerRt1colCommand(hx, sx, yl, yh)
+	{
+	}
+
+	void Execute(DrawerThread *thread) override
+	{
+		LoopIterator loop(this, thread);
+		if (!loop) return;
+		do
+		{
+			uint32_t fg = LightBgra::shade_pal_index(*loop.source, _light, _shade_constants);
+			*loop.dest = BlendBgra::revsub(fg, *loop.dest, _srcalpha, _destalpha);
+		} while (loop.next());
+	}
+};
+
+class RtRevSubClamp4colsRGBACommand : public DrawerRt4colsCommand // Four-column form of RtRevSubClamp1colRGBACommand.
+{
+public:
+	RtRevSubClamp4colsRGBACommand(int sx, int yl, int yh) : DrawerRt4colsCommand(sx, yl, yh)
+	{
+	}
+
+	void Execute(DrawerThread *thread) override
+	{
+		LoopIterator loop(this, thread);
+		if (!loop) return;
+		do
+		{
+			for (int i = 0; i < 4; i++)
+			{
+				uint32_t fg = LightBgra::shade_pal_index(loop.source[i], _light, _shade_constants);
+				loop.dest[i] = BlendBgra::revsub(fg, loop.dest[i], _srcalpha, _destalpha);
+			}
+		} while (loop.next());
+	}
+};
+
+class RtTranslate1colRGBACommand : public DrawerCommand // Remaps temp-buffer column hx in place through a translation table, rows yl..yh.
+{
+	const BYTE * RESTRICT translation;
+	int hx;
+	int yl;
+	int yh;
+
+public:
+	RtTranslate1colRGBACommand(const BYTE *translation, int hx, int yl, int yh)
+	{
+		this->translation = translation;
+		this->hx = hx;
+		this->yl = yl;
+		this->yh = yh;
+	}
+
+	void Execute(DrawerThread *thread) override
+	{
+		int count = yh - yl + 1;
+		uint32_t *source = &thread->dc_temp_rgba[yl*4 + hx];
+
+		// Things we do to hit the compiler's optimizer with a clue bat:
+		// 1. Parallelism is explicitly spelled out by using a separate
+		//    C instruction for each assembly instruction. GCC lets me
+		//    have four temporaries, but VC++ spills to the stack with
+		//    more than two. Two is probably optimal, anyway.
+		// 2. The results of the translation lookups are explicitly
+		//    stored in byte-sized variables. This causes the VC++ code
+		//    to use byte mov instructions in most cases; for apparently
+		//    random reasons, it will use movzx for some places. GCC
+		//    ignores this and uses movzx always.
+
+		// Do 8 rows at a time.
+		for (int count8 = count >> 3; count8; --count8)
+		{
+			int c0, c1;
+			BYTE b0, b1;
+
+			c0 = source[0];			c1 = source[4];
+			b0 = translation[c0];	b1 = translation[c1];
+			source[0] = b0;			source[4] = b1;
+
+			c0 = source[8];			c1 = source[12];
+			b0 = translation[c0];	b1 = translation[c1];
+			source[8] = b0;			source[12] = b1;
+
+			c0 = source[16];		c1 = source[20];
+			b0 = translation[c0];	b1 = translation[c1];
+			source[16] = b0;		source[20] = b1;
+
+			c0 = source[24];		c1 = source[28];
+			b0 = translation[c0];	b1 = translation[c1];
+			source[24] = b0;		source[28] = b1;
+
+			source += 32;
+		}
+		// Finish by doing 1 row at a time.
+		for (count &= 7; count; --count, source += 4)
+		{
+			source[0] = translation[source[0]];
+		}
+	}
+};
+
+class RtTranslate4colsRGBACommand : public DrawerCommand // Remaps all four temp-buffer columns in place through a translation table, rows yl..yh.
+{
+	const BYTE * RESTRICT translation;
+	int yl;
+	int yh;
+
+public:
+	RtTranslate4colsRGBACommand(const BYTE *translation, int yl, int yh)
+	{
+		this->translation = translation;
+		this->yl = yl;
+		this->yh = yh;
+	}
+
+	void Execute(DrawerThread *thread) override
+	{
+		int count = yh - yl + 1;
+		uint32_t *source = &thread->dc_temp_rgba[yl*4];
+		int c0, c1;
+		BYTE b0, b1;
+
+		// Do 2 rows at a time.
+		for (int count8 = count >> 1; count8; --count8)
+		{
+			c0 = source[0];			c1 = source[1];
+			b0 = translation[c0];	b1 = translation[c1];
+			source[0] = b0;			source[1] = b1;
+
+			c0 = source[2];			c1 = source[3];
+			b0 = translation[c0];	b1 = translation[c1];
+			source[2] = b0;			source[3] = b1;
+
+			c0 = source[4];			c1 = source[5];
+			b0 = translation[c0];	b1 = translation[c1];
+			source[4] = b0;			source[5] = b1;
+
+			c0 = source[6];			c1 = source[7];
+			b0 = translation[c0];	b1 = translation[c1];
+			source[6] = b0;			source[7] = b1;
+
+			source += 8;
+		}
+		// Do the final row if count was odd.
+		if (count & 1)
+		{
+			c0 = source[0];			c1 = source[1];
+			b0 = translation[c0];	b1 = translation[c1];
+			source[0] = b0;			source[1] = b1;
+
+			c0 = source[2];			c1 = source[3];
+			b0 = translation[c0];	b1 = translation[c1];
+			source[2] = b0;			source[3] = b1;
+		}
+	}
+};
+
+class RtInitColsRGBACommand : public DrawerCommand // Points the thread's dc_temp_rgba at buff, or at the thread's own buffer when buff is NULL.
+{
+	BYTE * RESTRICT buff;
+
+public:
+	RtInitColsRGBACommand(BYTE *buff)
+	{
+		this->buff = buff;
+	}
+
+	void Execute(DrawerThread *thread) override
+	{
+		thread->dc_temp_rgba = buff == NULL ? thread->dc_temp_rgbabuff_rgba : (uint32_t*)buff;
+	}
+};
+
+class DrawColumnHorizRGBACommand : public DrawerCommand // Stretches a texture column into slot (dc_x & 3) of the temp buffer.
+{
+	int _count;
+	fixed_t _iscale;
+	fixed_t _texturefrac;
+	const BYTE * RESTRICT _source;
+	int _x;
+	int _yl;
+	int _yh;
+
+public:
+	DrawColumnHorizRGBACommand()
+	{
+		_count = dc_count;
+		_iscale = dc_iscale;
+		_texturefrac = dc_texturefrac;
+		_source = dc_source;
+		_x = dc_x;
+		_yl = dc_yl;
+		_yh = dc_yh;
+	}
+
+	void Execute(DrawerThread *thread) override
+	{
+		int count = _count;
+		uint32_t *dest;
+		fixed_t fracstep;
+		fixed_t frac;
+
+		if (count <= 0)
+			return;
+
+		{
+			int x = _x & 3;
+			dest = &thread->dc_temp_rgba[x + 4 * _yl];
+		}
+		fracstep = _iscale;
+		frac = _texturefrac;
+
+		const BYTE *source = _source;
+
+		if (count & 1) {
+			*dest = source[frac >> FRACBITS]; dest += 4; frac += fracstep;
+		}
+		if (count & 2) {
+			dest[0] = source[frac >> FRACBITS]; frac += fracstep;
+			dest[4] = source[frac >> FRACBITS]; frac += fracstep;
+			dest += 8;
+		}
+		if (count & 4) {
+			dest[0] = source[frac >> FRACBITS]; frac += fracstep;
+			dest[4] = source[frac >> FRACBITS]; frac += fracstep;
+			dest[8] = source[frac >> FRACBITS]; frac += fracstep;
+			dest[12] = source[frac >> FRACBITS]; frac += fracstep;
+			dest += 16;
+		}
+		count >>= 3;
+		if (!count) return;
+
+		do
+		{
+			dest[0] = source[frac >> FRACBITS]; frac += fracstep;
+			dest[4] = source[frac >> FRACBITS]; frac += fracstep;
+			dest[8] = source[frac >> FRACBITS]; frac += fracstep;
+			dest[12] = source[frac >> FRACBITS]; frac += fracstep;
+			dest[16] = source[frac >> FRACBITS]; frac += fracstep;
+			dest[20] = source[frac >> FRACBITS]; frac += fracstep;
+			dest[24] = source[frac >> FRACBITS]; frac += fracstep;
+			dest[28] = source[frac >> FRACBITS]; frac += fracstep;
+			dest += 32;
+		} while (--count);
+	}
+};
+
+class FillColumnHorizRGBACommand : public DrawerCommand // Fills slot (dc_x & 3) of the temp buffer with dc_color for dc_count rows.
+{
+	int _x;
+	int _yl;
+	int _yh;
+	int _count;
+	int _color;
+
+public:
+	FillColumnHorizRGBACommand()
+	{
+		_x = dc_x;
+		_count = dc_count;
+		_color = dc_color;
+		_yl = dc_yl;
+		_yh = dc_yh;
+	}
+
+	void Execute(DrawerThread *thread) override
+	{
+		int count = _count;
+		int color = _color;
+		uint32_t *dest;
+
+		if (count <= 0)
+			return;
+
+		{
+			int x = _x & 3;
+			dest = &thread->dc_temp_rgba[x + 4 * _yl];
+		}
+
+		if (count & 1) {
+			*dest = color;
+			dest += 4;
+		}
+		if (!(count >>= 1))
+			return;
+		do {
+			dest[0] = color;	dest[4] = color;
+			dest += 8;
+		} while (--count);
+	}
+};
+
+/////////////////////////////////////////////////////////////////////////////
+
+// Copies one span at hx to the screen at sx.
+void rt_copy1col_rgba (int hx, int sx, int yl, int yh)
+{
+	DrawerCommandQueue::QueueCommand<RtCopy1colRGBACommand>(hx, sx, yl, yh);
+}
+
+// Copies all four spans to the screen starting at sx.
+void rt_copy4cols_rgba (int sx, int yl, int yh)
+{
+	// To do: we could do this with SSE using __m128i
+	rt_copy1col_rgba(0, sx, yl, yh);
+	rt_copy1col_rgba(1, sx + 1, yl, yh);
+	rt_copy1col_rgba(2, sx + 2, yl, yh);
+	rt_copy1col_rgba(3, sx + 3, yl, yh);
+}
+
+// Maps one span at hx to the screen at sx.
+void rt_map1col_rgba (int hx, int sx, int yl, int yh)
+{
+	DrawerCommandQueue::QueueCommand<RtMap1colRGBACommand>(hx, sx, yl, yh);
+}
+
+// Maps all four spans to the screen starting at sx.
+void rt_map4cols_rgba (int sx, int yl, int yh)
+{
+#ifdef NO_SSE
+	DrawerCommandQueue::QueueCommand<RtMap4colsRGBACommand>(sx, yl, yh);
+#else // NOTE(review): SSE class name follows VecCommand(name) -> name##_SSE_Command; confirm in r_drawt_rgba_sse.h
+	DrawerCommandQueue::QueueCommand<RtMap4colsRGBA_SSE_Command>(sx, yl, yh);
+#endif
+}
+
+void rt_Translate1col_rgba(const BYTE *translation, int hx, int yl, int yh) // Queues an in-place translation of temp column hx.
+{
+	DrawerCommandQueue::QueueCommand<RtTranslate1colRGBACommand>(translation, hx, yl, yh);
+}
+
+void rt_Translate4cols_rgba(const BYTE *translation, int yl, int yh) // Queues an in-place translation of all four temp columns.
+{
+	DrawerCommandQueue::QueueCommand<RtTranslate4colsRGBACommand>(translation, yl, yh);
+}
+
+// Translates one span at hx to the screen at sx.
+void rt_tlate1col_rgba (int hx, int sx, int yl, int yh)
+{
+	rt_Translate1col_rgba(dc_translation, hx, yl, yh);
+	rt_map1col(hx, sx, yl, yh);
+}
+
+// Translates all four spans to the screen starting at sx.
+void rt_tlate4cols_rgba (int sx, int yl, int yh)
+{
+	rt_Translate4cols_rgba(dc_translation, yl, yh);
+	rt_map4cols(sx, yl, yh);
+}
+
+// Adds one span at hx to the screen at sx without clamping.
+void rt_add1col_rgba (int hx, int sx, int yl, int yh)
+{
+	DrawerCommandQueue::QueueCommand<RtAdd1colRGBACommand>(hx, sx, yl, yh);
+}
+
+// Adds all four spans to the screen starting at sx without clamping.
+void rt_add4cols_rgba (int sx, int yl, int yh)
+{
+#ifdef NO_SSE
+	DrawerCommandQueue::QueueCommand<RtAdd4colsRGBACommand>(sx, yl, yh);
+#else // NOTE(review): SSE class names follow VecCommand(name) -> name##_SSE_Command; confirm in r_drawt_rgba_sse.h
+	DrawerCommandQueue::QueueCommand<RtAdd4colsRGBA_SSE_Command>(sx, yl, yh);
+#endif
+}
+
+// Translates and adds one span at hx to the screen at sx without clamping.
+void rt_tlateadd1col_rgba (int hx, int sx, int yl, int yh)
+{
+	rt_Translate1col_rgba(dc_translation, hx, yl, yh);
+	rt_add1col(hx, sx, yl, yh);
+}
+
+// Translates and adds all four spans to the screen starting at sx without clamping.
+void rt_tlateadd4cols_rgba(int sx, int yl, int yh)
+{
+	rt_Translate4cols_rgba(dc_translation, yl, yh);
+	rt_add4cols(sx, yl, yh);
+}
+
+// Shades one span at hx to the screen at sx.
+void rt_shaded1col_rgba (int hx, int sx, int yl, int yh)
+{
+	DrawerCommandQueue::QueueCommand<RtShaded1colRGBACommand>(hx, sx, yl, yh);
+}
+
+// Shades all four spans to the screen starting at sx.
+void rt_shaded4cols_rgba (int sx, int yl, int yh)
+{
+#ifdef NO_SSE
+	DrawerCommandQueue::QueueCommand<RtShaded4colsRGBACommand>(sx, yl, yh);
+#else
+	DrawerCommandQueue::QueueCommand<RtShaded4colsRGBA_SSE_Command>(sx, yl, yh);
+#endif
+}
+
+// Adds one span at hx to the screen at sx with clamping.
+void rt_addclamp1col_rgba (int hx, int sx, int yl, int yh)
+{
+	DrawerCommandQueue::QueueCommand<RtAddClamp1colRGBACommand>(hx, sx, yl, yh);
+}
+
+// Adds all four spans to the screen starting at sx with clamping.
+void rt_addclamp4cols_rgba (int sx, int yl, int yh)
+{
+#ifdef NO_SSE
+	DrawerCommandQueue::QueueCommand<RtAddClamp4colsRGBACommand>(sx, yl, yh);
+#else
+	DrawerCommandQueue::QueueCommand<RtAddClamp4colsRGBA_SSE_Command>(sx, yl, yh);
+#endif
+}
+
+// Translates and adds one span at hx to the screen at sx with clamping.
+void rt_tlateaddclamp1col_rgba (int hx, int sx, int yl, int yh) +{ + rt_Translate1col_rgba(dc_translation, hx, yl, yh); + rt_addclamp1col_rgba(hx, sx, yl, yh); +} + +// Translates and adds all four spans to the screen starting at sx with clamping. +void rt_tlateaddclamp4cols_rgba (int sx, int yl, int yh) +{ + rt_Translate4cols_rgba(dc_translation, yl, yh); + rt_addclamp4cols(sx, yl, yh); +} + +// Subtracts one span at hx to the screen at sx with clamping. +void rt_subclamp1col_rgba (int hx, int sx, int yl, int yh) +{ + DrawerCommandQueue::QueueCommand(hx, sx, yl, yh); +} + +// Subtracts all four spans to the screen starting at sx with clamping. +void rt_subclamp4cols_rgba (int sx, int yl, int yh) +{ +#ifdef NO_SSE + DrawerCommandQueue::QueueCommand(sx, yl, yh); +#else + DrawerCommandQueue::QueueCommand(sx, yl, yh); +#endif +} + +// Translates and subtracts one span at hx to the screen at sx with clamping. +void rt_tlatesubclamp1col_rgba (int hx, int sx, int yl, int yh) +{ + rt_Translate1col_rgba(dc_translation, hx, yl, yh); + rt_subclamp1col_rgba(hx, sx, yl, yh); +} + +// Translates and subtracts all four spans to the screen starting at sx with clamping. +void rt_tlatesubclamp4cols_rgba (int sx, int yl, int yh) +{ + rt_Translate4cols_rgba(dc_translation, yl, yh); + rt_subclamp4cols_rgba(sx, yl, yh); +} + +// Subtracts one span at hx from the screen at sx with clamping. +void rt_revsubclamp1col_rgba (int hx, int sx, int yl, int yh) +{ + DrawerCommandQueue::QueueCommand(hx, sx, yl, yh); +} + +// Subtracts all four spans from the screen starting at sx with clamping. +void rt_revsubclamp4cols_rgba (int sx, int yl, int yh) +{ +#ifdef NO_SSE + DrawerCommandQueue::QueueCommand(sx, yl, yh); +#else + DrawerCommandQueue::QueueCommand(sx, yl, yh); +#endif +} + +// Translates and subtracts one span at hx from the screen at sx with clamping. 
+void rt_tlaterevsubclamp1col_rgba (int hx, int sx, int yl, int yh) +{ + rt_Translate1col_rgba(dc_translation, hx, yl, yh); + rt_revsubclamp1col_rgba(hx, sx, yl, yh); +} + +// Translates and subtracts all four spans from the screen starting at sx with clamping. +void rt_tlaterevsubclamp4cols_rgba (int sx, int yl, int yh) +{ + rt_Translate4cols_rgba(dc_translation, yl, yh); + rt_revsubclamp4cols_rgba(sx, yl, yh); +} + +// Before each pass through a rendering loop that uses these routines, +// call this function to set up the span pointers. +void rt_initcols_rgba (BYTE *buff) +{ + for (int y = 3; y >= 0; y--) + horizspan[y] = dc_ctspan[y] = &dc_tspans[y][0]; + + DrawerCommandQueue::QueueCommand(buff); +} + +void rt_span_coverage_rgba(int x, int start, int stop) +{ + unsigned int **tspan = &dc_ctspan[x & 3]; + (*tspan)[0] = start; + (*tspan)[1] = stop; + *tspan += 2; +} + +// Stretches a column into a temporary buffer which is later +// drawn to the screen along with up to three other columns. +void R_DrawColumnHoriz_rgba (void) +{ + if (dc_count <= 0) + return; + + int x = dc_x & 3; + unsigned int **span = &dc_ctspan[x]; + (*span)[0] = dc_yl; + (*span)[1] = dc_yh; + *span += 2; + + DrawerCommandQueue::QueueCommand(); +} + +// [RH] Just fills a column with a given color +void R_FillColumnHoriz_rgba (void) +{ + if (dc_count <= 0) + return; + + int x = dc_x & 3; + unsigned int **span = &dc_ctspan[x]; + (*span)[0] = dc_yl; + (*span)[1] = dc_yh; + *span += 2; + + DrawerCommandQueue::QueueCommand(); +} diff --git a/src/r_drawt_rgba_sse.h b/src/r_drawt_rgba_sse.h new file mode 100644 index 000000000..7a02f2282 --- /dev/null +++ b/src/r_drawt_rgba_sse.h @@ -0,0 +1,757 @@ +// +// SSE/AVX intrinsics based drawers for the r_drawt family of drawers. +// +// Note: This header file is intentionally not guarded by a __R_DRAWT_RGBA_SSE__ define. +// It is because the code is nearly identical for SSE vs AVX. 
The file is included +// multiple times by r_drawt_rgba.cpp with different defines that changes the class +// names outputted and the type of intrinsics used. + +#ifdef _MSC_VER +#pragma warning(disable: 4752) // warning C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX +#endif + +class VecCommand(RtMap4colsRGBA) : public DrawerCommand +{ + int sx; + int yl; + int yh; + fixed_t _light; + ShadeConstants _shade_constants; + BYTE * RESTRICT _destorg; + int _pitch; + BYTE * RESTRICT _colormap; + +public: + VecCommand(RtMap4colsRGBA)(int sx, int yl, int yh) + { + this->sx = sx; + this->yl = yl; + this->yh = yh; + + _light = dc_light; + _shade_constants = dc_shade_constants; + _destorg = dc_destorg; + _pitch = dc_pitch; + _colormap = dc_colormap; + } + + void Execute(DrawerThread *thread) override + { + uint32_t *source; + uint32_t *dest; + int count; + int pitch; + int sincr; + + count = thread->count_for_thread(yl, yh - yl + 1); + if (count <= 0) + return; + + ShadeConstants shade_constants = _shade_constants; + uint32_t light = LightBgra::calc_light_multiplier(_light); + uint32_t *palette = (uint32_t*)GPalette.BaseColors; + + dest = thread->dest_for_thread(yl, _pitch, ylookup[yl] + sx + (uint32_t*)_destorg); + source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4; + pitch = _pitch * thread->num_cores; + sincr = thread->num_cores * 4; + + BYTE *colormap = _colormap; + + if (shade_constants.simple_shade) + { + VEC_SHADE_VARS(); + VEC_SHADE_SIMPLE_INIT(light); + + if (count & 1) { + uint32_t p0 = colormap[source[0]]; + uint32_t p1 = colormap[source[1]]; + uint32_t p2 = colormap[source[2]]; + uint32_t p3 = colormap[source[3]]; + + // shade_pal_index: + __m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]); + VEC_SHADE_SIMPLE(fg); + _mm_storeu_si128((__m128i*)dest, fg); + + source += sincr; + dest += pitch; + } + if (!(count >>= 1)) + return; + + do { + // shade_pal_index 0-3 + { + uint32_t p0 = 
colormap[source[0]]; + uint32_t p1 = colormap[source[1]]; + uint32_t p2 = colormap[source[2]]; + uint32_t p3 = colormap[source[3]]; + + __m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]); + VEC_SHADE_SIMPLE(fg); + _mm_storeu_si128((__m128i*)dest, fg); + } + + // shade_pal_index 4-7 (pitch) + { + uint32_t p0 = colormap[source[sincr]]; + uint32_t p1 = colormap[source[sincr + 1]]; + uint32_t p2 = colormap[source[sincr + 2]]; + uint32_t p3 = colormap[source[sincr + 3]]; + + __m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]); + VEC_SHADE_SIMPLE(fg); + _mm_storeu_si128((__m128i*)(dest + pitch), fg); + } + + source += sincr * 2; + dest += pitch * 2; + } while (--count); + } + else + { + VEC_SHADE_VARS(); + VEC_SHADE_INIT(light, shade_constants); + + if (count & 1) { + uint32_t p0 = colormap[source[0]]; + uint32_t p1 = colormap[source[1]]; + uint32_t p2 = colormap[source[2]]; + uint32_t p3 = colormap[source[3]]; + + // shade_pal_index: + __m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]); + VEC_SHADE(fg, shade_constants); + _mm_storeu_si128((__m128i*)dest, fg); + + source += sincr; + dest += pitch; + } + if (!(count >>= 1)) + return; + + do { + // shade_pal_index 0-3 + { + uint32_t p0 = colormap[source[0]]; + uint32_t p1 = colormap[source[1]]; + uint32_t p2 = colormap[source[2]]; + uint32_t p3 = colormap[source[3]]; + + __m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]); + VEC_SHADE(fg, shade_constants); + _mm_storeu_si128((__m128i*)dest, fg); + } + + // shade_pal_index 4-7 (pitch) + { + uint32_t p0 = colormap[source[sincr]]; + uint32_t p1 = colormap[source[sincr + 1]]; + uint32_t p2 = colormap[source[sincr + 2]]; + uint32_t p3 = colormap[source[sincr + 3]]; + + __m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]); + VEC_SHADE(fg, shade_constants); + _mm_storeu_si128((__m128i*)(dest + pitch), fg); + } + + source += sincr * 2; + dest += 
pitch * 2; + } while (--count); + } + } +}; + +class VecCommand(RtAdd4colsRGBA) : public DrawerCommand +{ + int sx; + int yl; + int yh; + BYTE * RESTRICT _destorg; + int _pitch; + fixed_t _light; + ShadeConstants _shade_constants; + BYTE * RESTRICT _colormap; + fixed_t _srcalpha; + fixed_t _destalpha; + +public: + VecCommand(RtAdd4colsRGBA)(int sx, int yl, int yh) + { + this->sx = sx; + this->yl = yl; + this->yh = yh; + + _destorg = dc_destorg; + _pitch = dc_pitch; + _light = dc_light; + _shade_constants = dc_shade_constants; + _colormap = dc_colormap; + _srcalpha = dc_srcalpha; + _destalpha = dc_destalpha; + } + + void Execute(DrawerThread *thread) override + { + uint32_t *source; + uint32_t *dest; + int count; + int pitch; + int sincr; + + count = thread->count_for_thread(yl, yh - yl + 1); + if (count <= 0) + return; + + dest = thread->dest_for_thread(yl, _pitch, ylookup[yl] + sx + (uint32_t*)_destorg); + source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4; + pitch = _pitch * thread->num_cores; + sincr = 4 * thread->num_cores; + + uint32_t light = LightBgra::calc_light_multiplier(_light); + uint32_t *palette = (uint32_t*)GPalette.BaseColors; + BYTE *colormap = _colormap; + + uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8); + uint32_t bg_alpha = _destalpha >> (FRACBITS - 8); + + ShadeConstants shade_constants = _shade_constants; + + if (shade_constants.simple_shade) + { + VEC_SHADE_VARS(); + VEC_SHADE_SIMPLE_INIT(light); + + __m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha); + __m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha); + + do { + uint32_t p0 = colormap[source[0]]; + uint32_t p1 = colormap[source[1]]; + uint32_t p2 = colormap[source[2]]; + uint32_t p3 = colormap[source[3]]; + + // shade_pal_index: + __m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]); + VEC_SHADE_SIMPLE(fg); + + __m128i fg_hi = 
_mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + + // unpack bg: + __m128i bg = _mm_loadu_si128((const __m128i*)dest); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + // (fg_red * fg_alpha + bg_red * bg_alpha) / 256: + __m128i color_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, mfg_alpha), _mm_mullo_epi16(bg_hi, mbg_alpha)), 8); + __m128i color_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, mfg_alpha), _mm_mullo_epi16(bg_lo, mbg_alpha)), 8); + + __m128i color = _mm_packus_epi16(color_lo, color_hi); + _mm_storeu_si128((__m128i*)dest, color); + + source += sincr; + dest += pitch; + } while (--count); + } + else + { + VEC_SHADE_VARS(); + VEC_SHADE_INIT(light, shade_constants); + + __m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha); + __m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha); + + do { + uint32_t p0 = colormap[source[0]]; + uint32_t p1 = colormap[source[1]]; + uint32_t p2 = colormap[source[2]]; + uint32_t p3 = colormap[source[3]]; + + // shade_pal_index: + __m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]); + VEC_SHADE(fg, shade_constants); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + + // unpack bg: + __m128i bg = _mm_loadu_si128((const __m128i*)dest); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + // (fg_red * fg_alpha + bg_red * bg_alpha) / 256: + __m128i color_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, mfg_alpha), _mm_mullo_epi16(bg_hi, mbg_alpha)), 8); + __m128i color_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, mfg_alpha), _mm_mullo_epi16(bg_lo, mbg_alpha)), 8); + + __m128i 
color = _mm_packus_epi16(color_lo, color_hi); + _mm_storeu_si128((__m128i*)dest, color); + + source += sincr; + dest += pitch; + } while (--count); + } + } +}; + +class VecCommand(RtShaded4colsRGBA) : public DrawerCommand +{ + int sx; + int yl; + int yh; + lighttable_t * RESTRICT _colormap; + int _color; + BYTE * RESTRICT _destorg; + int _pitch; + fixed_t _light; + +public: + VecCommand(RtShaded4colsRGBA)(int sx, int yl, int yh) + { + this->sx = sx; + this->yl = yl; + this->yh = yh; + + _colormap = dc_colormap; + _color = dc_color; + _destorg = dc_destorg; + _pitch = dc_pitch; + _light = dc_light; + } + + void Execute(DrawerThread *thread) override + { + BYTE *colormap; + uint32_t *source; + uint32_t *dest; + int count; + int pitch; + int sincr; + + count = thread->count_for_thread(yl, yh - yl + 1); + if (count <= 0) + return; + + colormap = _colormap; + dest = thread->dest_for_thread(yl, _pitch, ylookup[yl] + sx + (uint32_t*)_destorg); + source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4; + pitch = _pitch * thread->num_cores; + sincr = 4 * thread->num_cores; + + __m128i fg = _mm_unpackhi_epi8(_mm_set1_epi32(LightBgra::shade_pal_index_simple(_color, LightBgra::calc_light_multiplier(_light))), _mm_setzero_si128()); + __m128i alpha_one = _mm_set1_epi16(64); + + do { + uint32_t p0 = colormap[source[0]]; + uint32_t p1 = colormap[source[1]]; + uint32_t p2 = colormap[source[2]]; + uint32_t p3 = colormap[source[3]]; + + __m128i alpha_hi = _mm_set_epi16(64, p3, p3, p3, 64, p2, p2, p2); + __m128i alpha_lo = _mm_set_epi16(64, p1, p1, p1, 64, p0, p0, p0); + __m128i inv_alpha_hi = _mm_subs_epu16(alpha_one, alpha_hi); + __m128i inv_alpha_lo = _mm_subs_epu16(alpha_one, alpha_lo); + + // unpack bg: + __m128i bg = _mm_loadu_si128((const __m128i*)dest); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + // (fg_red * alpha + bg_red * inv_alpha) / 64: + __m128i color_hi = 
_mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg, alpha_hi), _mm_mullo_epi16(bg_hi, inv_alpha_hi)), 6); + __m128i color_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg, alpha_lo), _mm_mullo_epi16(bg_lo, inv_alpha_lo)), 6); + + __m128i color = _mm_packus_epi16(color_lo, color_hi); + _mm_storeu_si128((__m128i*)dest, color); + + source += sincr; + dest += pitch; + } while (--count); + } +}; + +class VecCommand(RtAddClamp4colsRGBA) : public DrawerCommand +{ + int sx; + int yl; + int yh; + BYTE * RESTRICT _destorg; + int _pitch; + fixed_t _light; + fixed_t _srcalpha; + fixed_t _destalpha; + ShadeConstants _shade_constants; + +public: + VecCommand(RtAddClamp4colsRGBA)(int sx, int yl, int yh) + { + this->sx = sx; + this->yl = yl; + this->yh = yh; + + _destorg = dc_destorg; + _pitch = dc_pitch; + _light = dc_light; + _srcalpha = dc_srcalpha; + _destalpha = dc_destalpha; + _shade_constants = dc_shade_constants; + } + + void Execute(DrawerThread *thread) override + { + uint32_t *source; + uint32_t *dest; + int count; + int pitch; + int sincr; + + count = thread->count_for_thread(yl, yh - yl + 1); + if (count <= 0) + return; + + dest = thread->dest_for_thread(yl, _pitch, ylookup[yl] + sx + (uint32_t*)_destorg); + source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4; + pitch = _pitch * thread->num_cores; + sincr = 4 * thread->num_cores; + + uint32_t light = LightBgra::calc_light_multiplier(_light); + uint32_t *palette = (uint32_t*)GPalette.BaseColors; + + uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8); + uint32_t bg_alpha = _destalpha >> (FRACBITS - 8); + + ShadeConstants shade_constants = _shade_constants; + + if (shade_constants.simple_shade) + { + VEC_SHADE_VARS(); + VEC_SHADE_SIMPLE_INIT(light); + + __m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha); + __m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha); + + do { + uint32_t p0 = source[0]; + 
uint32_t p1 = source[1]; + uint32_t p2 = source[2]; + uint32_t p3 = source[3]; + + // shade_pal_index: + __m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]); + VEC_SHADE_SIMPLE(fg); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + + // unpack bg: + __m128i bg = _mm_loadu_si128((const __m128i*)dest); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + // (fg_red * fg_alpha + bg_red * bg_alpha) / 256: + __m128i color_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, mfg_alpha), _mm_mullo_epi16(bg_hi, mbg_alpha)), 8); + __m128i color_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, mfg_alpha), _mm_mullo_epi16(bg_lo, mbg_alpha)), 8); + + __m128i color = _mm_packus_epi16(color_lo, color_hi); + _mm_storeu_si128((__m128i*)dest, color); + + source += sincr; + dest += pitch; + } while (--count); + } + else + { + VEC_SHADE_VARS(); + VEC_SHADE_INIT(light, shade_constants); + + __m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha); + __m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha); + + do { + uint32_t p0 = source[0]; + uint32_t p1 = source[1]; + uint32_t p2 = source[2]; + uint32_t p3 = source[3]; + + // shade_pal_index: + __m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]); + VEC_SHADE(fg, shade_constants); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + + // unpack bg: + __m128i bg = _mm_loadu_si128((const __m128i*)dest); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + // (fg_red * fg_alpha + bg_red * bg_alpha) / 256: + __m128i color_hi = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_hi, 
mfg_alpha), _mm_mullo_epi16(bg_hi, mbg_alpha)), 8); + __m128i color_lo = _mm_srli_epi16(_mm_adds_epu16(_mm_mullo_epi16(fg_lo, mfg_alpha), _mm_mullo_epi16(bg_lo, mbg_alpha)), 8); + + __m128i color = _mm_packus_epi16(color_lo, color_hi); + _mm_storeu_si128((__m128i*)dest, color); + + source += sincr; + dest += pitch; + } while (--count); + } + } +}; + +class VecCommand(RtSubClamp4colsRGBA) : public DrawerCommand +{ + int sx; + int yl; + int yh; + BYTE * RESTRICT _destorg; + int _pitch; + fixed_t _light; + fixed_t _srcalpha; + fixed_t _destalpha; + ShadeConstants _shade_constants; + +public: + VecCommand(RtSubClamp4colsRGBA)(int sx, int yl, int yh) + { + this->sx = sx; + this->yl = yl; + this->yh = yh; + + _destorg = dc_destorg; + _pitch = dc_pitch; + _light = dc_light; + _srcalpha = dc_srcalpha; + _destalpha = dc_destalpha; + _shade_constants = dc_shade_constants; + } + + void Execute(DrawerThread *thread) override + { + uint32_t *source; + uint32_t *dest; + int count; + int pitch; + int sincr; + + count = thread->count_for_thread(yl, yh - yl + 1); + if (count <= 0) + return; + + dest = thread->dest_for_thread(yl, _pitch, ylookup[yl] + sx + (uint32_t*)_destorg); + source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4; + pitch = _pitch * thread->num_cores; + sincr = 4 * thread->num_cores; + + uint32_t light = LightBgra::calc_light_multiplier(_light); + uint32_t *palette = (uint32_t*)GPalette.BaseColors; + ShadeConstants shade_constants = _shade_constants; + + uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8); + uint32_t bg_alpha = _destalpha >> (FRACBITS - 8); + + if (shade_constants.simple_shade) + { + VEC_SHADE_VARS(); + VEC_SHADE_SIMPLE_INIT(light); + + __m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha); + __m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha); + + do { + uint32_t p0 = source[0]; + uint32_t p1 = source[1]; + uint32_t p2 = 
source[2]; + uint32_t p3 = source[3]; + + // shade_pal_index: + __m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]); + VEC_SHADE_SIMPLE(fg); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + + // unpack bg: + __m128i bg = _mm_loadu_si128((const __m128i*)dest); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + // (bg_red * bg_alpha - fg_red * fg_alpha) / 256: + __m128i color_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_hi, mbg_alpha), _mm_mullo_epi16(fg_hi, mfg_alpha)), 8); + __m128i color_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_lo, mbg_alpha), _mm_mullo_epi16(fg_lo, mfg_alpha)), 8); + + __m128i color = _mm_packus_epi16(color_lo, color_hi); + _mm_storeu_si128((__m128i*)dest, color); + + source += sincr; + dest += pitch; + } while (--count); + } + else + { + VEC_SHADE_VARS(); + VEC_SHADE_INIT(light, shade_constants); + + __m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha); + __m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha); + + do { + uint32_t p0 = source[0]; + uint32_t p1 = source[1]; + uint32_t p2 = source[2]; + uint32_t p3 = source[3]; + + // shade_pal_index: + __m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]); + VEC_SHADE(fg, shade_constants); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + + // unpack bg: + __m128i bg = _mm_loadu_si128((const __m128i*)dest); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + // (bg_red * bg_alpha - fg_red * fg_alpha) / 256: + __m128i color_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_hi, mbg_alpha), _mm_mullo_epi16(fg_hi, 
mfg_alpha)), 8); + __m128i color_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(bg_lo, mbg_alpha), _mm_mullo_epi16(fg_lo, mfg_alpha)), 8); + + __m128i color = _mm_packus_epi16(color_lo, color_hi); + _mm_storeu_si128((__m128i*)dest, color); + + source += sincr; + dest += pitch; + } while (--count); + } + } +}; + +class VecCommand(RtRevSubClamp4colsRGBA) : public DrawerCommand +{ + int sx; + int yl; + int yh; + BYTE * RESTRICT _destorg; + int _pitch; + fixed_t _light; + fixed_t _srcalpha; + fixed_t _destalpha; + ShadeConstants _shade_constants; + +public: + VecCommand(RtRevSubClamp4colsRGBA)(int sx, int yl, int yh) + { + this->sx = sx; + this->yl = yl; + this->yh = yh; + + _destorg = dc_destorg; + _pitch = dc_pitch; + _light = dc_light; + _srcalpha = dc_srcalpha; + _destalpha = dc_destalpha; + _shade_constants = dc_shade_constants; + } + + void Execute(DrawerThread *thread) override + { + uint32_t *source; + uint32_t *dest; + int count; + int pitch; + int sincr; + + count = thread->count_for_thread(yl, yh - yl + 1); + if (count <= 0) + return; + + dest = thread->dest_for_thread(yl, _pitch, ylookup[yl] + sx + (uint32_t*)_destorg); + source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4; + pitch = _pitch * thread->num_cores; + sincr = 4 * thread->num_cores; + + uint32_t light = LightBgra::calc_light_multiplier(_light); + uint32_t *palette = (uint32_t*)GPalette.BaseColors; + ShadeConstants shade_constants = _shade_constants; + + uint32_t fg_alpha = _srcalpha >> (FRACBITS - 8); + uint32_t bg_alpha = _destalpha >> (FRACBITS - 8); + + if (shade_constants.simple_shade) + { + VEC_SHADE_VARS(); + VEC_SHADE_SIMPLE_INIT(light); + + __m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha); + __m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha); + + do { + uint32_t p0 = source[0]; + uint32_t p1 = source[1]; + uint32_t p2 = source[2]; + uint32_t p3 = 
source[3]; + + // shade_pal_index: + __m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]); + VEC_SHADE_SIMPLE(fg); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + + // unpack bg: + __m128i bg = _mm_loadu_si128((const __m128i*)dest); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + // (fg_red * fg_alpha - bg_red * bg_alpha) / 256: + __m128i color_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_hi, mfg_alpha), _mm_mullo_epi16(bg_hi, mbg_alpha)), 8); + __m128i color_lo = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_lo, mfg_alpha), _mm_mullo_epi16(bg_lo, mbg_alpha)), 8); + + __m128i color = _mm_packus_epi16(color_lo, color_hi); + _mm_storeu_si128((__m128i*)dest, color); + + source += sincr; + dest += pitch; + } while (--count); + } + else + { + VEC_SHADE_VARS(); + VEC_SHADE_INIT(light, shade_constants); + + __m128i mfg_alpha = _mm_set_epi16(256, fg_alpha, fg_alpha, fg_alpha, 256, fg_alpha, fg_alpha, fg_alpha); + __m128i mbg_alpha = _mm_set_epi16(256, bg_alpha, bg_alpha, bg_alpha, 256, bg_alpha, bg_alpha, bg_alpha); + + do { + uint32_t p0 = source[0]; + uint32_t p1 = source[1]; + uint32_t p2 = source[2]; + uint32_t p3 = source[3]; + + // shade_pal_index: + __m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]); + VEC_SHADE(fg, shade_constants); + + __m128i fg_hi = _mm_unpackhi_epi8(fg, _mm_setzero_si128()); + __m128i fg_lo = _mm_unpacklo_epi8(fg, _mm_setzero_si128()); + + // unpack bg: + __m128i bg = _mm_loadu_si128((const __m128i*)dest); + __m128i bg_hi = _mm_unpackhi_epi8(bg, _mm_setzero_si128()); + __m128i bg_lo = _mm_unpacklo_epi8(bg, _mm_setzero_si128()); + + // (fg_red * fg_alpha - bg_red * bg_alpha) / 256: + __m128i color_hi = _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_hi, mfg_alpha), _mm_mullo_epi16(bg_hi, mbg_alpha)), 8); + __m128i color_lo 
= _mm_srli_epi16(_mm_subs_epu16(_mm_mullo_epi16(fg_lo, mfg_alpha), _mm_mullo_epi16(bg_lo, mbg_alpha)), 8); + + __m128i color = _mm_packus_epi16(color_lo, color_hi); + _mm_storeu_si128((__m128i*)dest, color); + + source += sincr; + dest += pitch; + } while (--count); + } + } +}; diff --git a/src/r_main.cpp b/src/r_main.cpp index 1e0de7ecc..e86ffe3b3 100644 --- a/src/r_main.cpp +++ b/src/r_main.cpp @@ -40,6 +40,7 @@ #include "r_segs.h" #include "r_3dfloors.h" #include "r_sky.h" +#include "r_draw_rgba.h" #include "st_stuff.h" #include "c_cvars.h" #include "c_dispatch.h" @@ -103,6 +104,8 @@ bool r_dontmaplines; CVAR (String, r_viewsize, "", CVAR_NOSET) CVAR (Bool, r_shadercolormaps, true, CVAR_ARCHIVE) +bool r_swtruecolor; + double r_BaseVisibility; double r_WallVisibility; double r_FloorVisibility; @@ -116,7 +119,7 @@ double FocalLengthX; double FocalLengthY; FDynamicColormap*basecolormap; // [RH] colormap currently drawing with int fixedlightlev; -lighttable_t *fixedcolormap; +FSWColormap *fixedcolormap; FSpecialColormap *realfixedcolormap; double WallTMapScale2; @@ -396,16 +399,6 @@ void R_InitRenderer() R_InitPlanes (); R_InitShadeMaps(); R_InitColumnDrawers (); - - colfunc = basecolfunc = R_DrawColumn; - fuzzcolfunc = R_DrawFuzzColumn; - transcolfunc = R_DrawTranslatedColumn; - spanfunc = R_DrawSpan; - - // [RH] Horizontal column drawers - hcolfunc_pre = R_DrawColumnHoriz; - hcolfunc_post1 = rt_map1col; - hcolfunc_post4 = rt_map4cols; } //========================================================================== @@ -466,16 +459,16 @@ void R_SetupColormap(player_t *player) if (player->fixedcolormap >= 0 && player->fixedcolormap < (int)SpecialColormaps.Size()) { realfixedcolormap = &SpecialColormaps[player->fixedcolormap]; - if (RenderTarget == screen && (DFrameBuffer *)screen->Accel2D && r_shadercolormaps) + if (RenderTarget == screen && (r_swtruecolor || ((DFrameBuffer *)screen->Accel2D && r_shadercolormaps))) { // Render everything fullbright. 
The copy to video memory will // apply the special colormap, so it won't be restricted to the // palette. - fixedcolormap = realcolormaps; + fixedcolormap = &realcolormaps; } else { - fixedcolormap = SpecialColormaps[player->fixedcolormap].Colormap; + fixedcolormap = &SpecialColormaps[player->fixedcolormap]; } } else if (player->fixedlightlevel >= 0 && player->fixedlightlevel < NUMCOLORMAPS) @@ -486,7 +479,7 @@ void R_SetupColormap(player_t *player) // [RH] Inverse light for shooting the Sigil if (fixedcolormap == NULL && extralight == INT_MIN) { - fixedcolormap = SpecialColormaps[INVERSECOLORMAP].Colormap; + fixedcolormap = &SpecialColormaps[INVERSECOLORMAP]; extralight = 0; } } @@ -575,6 +568,9 @@ void R_HighlightPortal (PortalDrawseg* pds) // [ZZ] NO OVERFLOW CHECKS HERE // I believe it won't break. if it does, blame me. :( + if (r_swtruecolor) // Assuming this is just a debug function + return; + BYTE color = (BYTE)BestColor((DWORD *)GPalette.BaseColors, 255, 0, 0, 0, 255); BYTE* pixels = RenderTarget->GetBuffer(); @@ -622,12 +618,26 @@ void R_EnterPortal (PortalDrawseg* pds, int depth) int Ytop = pds->ceilingclip[x-pds->x1]; int Ybottom = pds->floorclip[x-pds->x1]; - BYTE *dest = RenderTarget->GetBuffer() + x + Ytop * spacing; - - for (int y = Ytop; y <= Ybottom; y++) + if (r_swtruecolor) { - *dest = color; - dest += spacing; + uint32_t *dest = (uint32_t*)RenderTarget->GetBuffer() + x + Ytop * spacing; + + uint32_t c = GPalette.BaseColors[color].d; + for (int y = Ytop; y <= Ybottom; y++) + { + *dest = c; + dest += spacing; + } + } + else + { + BYTE *dest = RenderTarget->GetBuffer() + x + Ytop * spacing; + + for (int y = Ytop; y <= Ybottom; y++) + { + *dest = color; + dest += spacing; + } } } @@ -796,7 +806,8 @@ void R_SetupBuffer () static BYTE *lastbuff = NULL; int pitch = RenderTarget->GetPitch(); - BYTE *lineptr = RenderTarget->GetBuffer() + viewwindowy*pitch + viewwindowx; + int pixelsize = r_swtruecolor ? 
4 : 1; + BYTE *lineptr = RenderTarget->GetBuffer() + (viewwindowy*pitch + viewwindowx) * pixelsize; if (dc_pitch != pitch || lineptr != lastbuff) { @@ -846,10 +857,10 @@ void R_RenderActorView (AActor *actor, bool dontmaplines) // [RH] Show off segs if r_drawflat is 1 if (r_drawflat) { - hcolfunc_pre = R_FillColumnHorizP; + hcolfunc_pre = R_FillColumnHoriz; hcolfunc_post1 = rt_copy1col; hcolfunc_post4 = rt_copy4cols; - colfunc = R_FillColumnP; + colfunc = R_FillColumn; spanfunc = R_FillSpan; } else @@ -924,7 +935,7 @@ void R_RenderActorView (AActor *actor, bool dontmaplines) // If we don't want shadered colormaps, NULL it now so that the // copy to the screen does not use a special colormap shader. - if (!r_shadercolormaps) + if (!r_shadercolormaps && !r_swtruecolor) { realfixedcolormap = NULL; } @@ -942,6 +953,15 @@ void R_RenderViewToCanvas (AActor *actor, DCanvas *canvas, int x, int y, int width, int height, bool dontmaplines) { const bool savedviewactive = viewactive; + const bool savedoutputformat = r_swtruecolor; + + if (r_swtruecolor != canvas->IsBgra()) + { + r_swtruecolor = canvas->IsBgra(); + R_InitColumnDrawers(); + } + + R_BeginDrawerCommands(); viewwidth = width; RenderTarget = canvas; @@ -954,13 +974,22 @@ void R_RenderViewToCanvas (AActor *actor, DCanvas *canvas, R_RenderActorView (actor, dontmaplines); + R_EndDrawerCommands(); + RenderTarget = screen; bRenderingToCanvas = false; R_ExecuteSetViewSize (); screen->Lock (true); R_SetupBuffer (); screen->Unlock (); + viewactive = savedviewactive; + r_swtruecolor = savedoutputformat; + + if (r_swtruecolor != canvas->IsBgra()) + { + R_InitColumnDrawers(); + } } //========================================================================== diff --git a/src/r_main.h b/src/r_main.h index 24103393d..8d1867526 100644 --- a/src/r_main.h +++ b/src/r_main.h @@ -82,6 +82,16 @@ extern bool r_dontmaplines; // Change R_CalcTiltedLighting() when this changes. 
#define GETPALOOKUP(vis,shade) (clamp (((shade)-FLOAT2FIXED(MIN(MAXLIGHTVIS,double(vis))))>>FRACBITS, 0, NUMCOLORMAPS-1)) +// Calculate the light multiplier for dc_light/ds_light +// This is used instead of GETPALOOKUP when ds_colormap/dc_colormap is set to the base colormap +// Returns a value between 0 and 1 in fixed point +#define LIGHTSCALE(vis,shade) FLOAT2FIXED(clamp((FIXED2DBL(shade) - (MIN(MAXLIGHTVIS,double(vis)))) / NUMCOLORMAPS, 0.0, (NUMCOLORMAPS-1)/(double)NUMCOLORMAPS)) + +// Converts fixedlightlev into a shade value +#define FIXEDLIGHT2SHADE(lightlev) (((lightlev) >> COLORMAPSHIFT) << FRACBITS) + +extern bool r_swtruecolor; + extern double GlobVis; void R_SetVisibility(double visibility); @@ -96,7 +106,7 @@ extern double r_SpriteVisibility; extern int r_actualextralight; extern bool foggy; extern int fixedlightlev; -extern lighttable_t* fixedcolormap; +extern FSWColormap* fixedcolormap; extern FSpecialColormap*realfixedcolormap; diff --git a/src/r_plane.cpp b/src/r_plane.cpp index ff23492ab..706d6fad7 100644 --- a/src/r_plane.cpp +++ b/src/r_plane.cpp @@ -58,6 +58,7 @@ #include "r_3dfloors.h" #include "v_palette.h" #include "r_data/colormaps.h" +#include "r_draw_rgba.h" #ifdef _MSC_VER #pragma warning(disable:4244) @@ -227,12 +228,11 @@ void R_MapPlane (int y, int x1) if (plane_shade) { // Determine lighting based on the span's distance from the viewer. 
- ds_colormap = basecolormap->Maps + (GETPALOOKUP ( - GlobVis * fabs(CenterY - y), planeshade) << COLORMAPSHIFT); + R_SetDSColorMapLight(basecolormap, GlobVis * fabs(CenterY - y), planeshade); } #ifdef X86_ASM - if (ds_colormap != ds_curcolormap) + if (!r_swtruecolor && ds_colormap != ds_curcolormap) R_SetSpanColormap_ASM (ds_colormap); #endif @@ -355,7 +355,7 @@ void R_CalcTiltedLighting (double lval, double lend, int width) // //========================================================================== -void R_MapTiltedPlane (int y, int x1) +void R_MapTiltedPlane_C (int y, int x1) { int x2 = spanend[y]; int width = x2 - x1; @@ -392,7 +392,7 @@ void R_MapTiltedPlane (int y, int x1) u = SQWORD(uz*z) + pviewx; v = SQWORD(vz*z) + pviewy; - ds_colormap = tiltlighting[i]; + R_SetDSColorMapLight(tiltlighting[i], 0, 0); fb[i++] = ds_colormap[ds_source[(v >> vshift) | ((u >> ushift) & umask)]]; iz += plane_sz[0]; uz += plane_su[0]; @@ -478,17 +478,27 @@ void R_MapTiltedPlane (int y, int x1) #endif } +void R_MapTiltedPlane_rgba (int y, int x1) +{ + R_DrawTiltedSpan_rgba(y, x1, spanend[y], plane_sz, plane_su, plane_sv, plane_shade, planeshade, planelightfloat, pviewx, pviewy); +} + //========================================================================== // // R_MapColoredPlane // //========================================================================== -void R_MapColoredPlane (int y, int x1) +void R_MapColoredPlane_C (int y, int x1) { memset (ylookup[y] + x1 + dc_destorg, ds_color, spanend[y] - x1 + 1); } +void R_MapColoredPlane_rgba(int y, int x1) +{ + R_DrawColoredSpan_rgba(y, x1, spanend[y]); +} + //========================================================================== // // R_ClearPlanes @@ -841,15 +851,24 @@ extern FTexture *rw_pic; // Allow for layer skies up to 512 pixels tall. This is overkill, // since the most anyone can ever see of the sky is 500 pixels. // We need 4 skybufs because wallscan can draw up to 4 columns at a time. 
+// Need two versions - one for true color and one for palette static BYTE skybuf[4][512]; +static uint32_t skybuf_bgra[4][512]; static DWORD lastskycol[4]; +static DWORD lastskycol_bgra[4]; static int skycolplace; +static int skycolplace_bgra; // Get a column of sky when there is only one sky texture. static const BYTE *R_GetOneSkyColumn (FTexture *fronttex, int x) { angle_t column = (skyangle + xtoviewangle[x]) ^ skyflip; - return fronttex->GetColumn((UMulScale16(column, frontcyl) + frontpos) >> FRACBITS, NULL); + int tx = (UMulScale16(column, frontcyl) + frontpos) >> FRACBITS; + + if (!r_swtruecolor) + return fronttex->GetColumn(tx, NULL); + else + return (const BYTE *)fronttex->GetColumnBgra(tx, NULL); } // Get a column of sky when there are two overlapping sky textures @@ -864,38 +883,77 @@ static const BYTE *R_GetTwoSkyColumns (FTexture *fronttex, int x) DWORD skycol = (angle1 << 16) | angle2; int i; - for (i = 0; i < 4; ++i) + if (!r_swtruecolor) { - if (lastskycol[i] == skycol) + for (i = 0; i < 4; ++i) { - return skybuf[i]; + if (lastskycol[i] == skycol) + { + return skybuf[i]; + } } + + lastskycol[skycolplace] = skycol; + BYTE *composite = skybuf[skycolplace]; + skycolplace = (skycolplace + 1) & 3; + + // The ordering of the following code has been tuned to allow VC++ to optimize + // it well. In particular, this arrangement lets it keep count in a register + // instead of on the stack. + const BYTE *front = fronttex->GetColumn(angle1, NULL); + const BYTE *back = backskytex->GetColumn(angle2, NULL); + + int count = MIN(512, MIN(backskytex->GetHeight(), fronttex->GetHeight())); + i = 0; + do + { + if (front[i]) + { + composite[i] = front[i]; + } + else + { + composite[i] = back[i]; + } + } while (++i, --count); + return composite; } - - lastskycol[skycolplace] = skycol; - BYTE *composite = skybuf[skycolplace]; - skycolplace = (skycolplace + 1) & 3; - - // The ordering of the following code has been tuned to allow VC++ to optimize - // it well. 
In particular, this arrangement lets it keep count in a register - // instead of on the stack. - const BYTE *front = fronttex->GetColumn (angle1, NULL); - const BYTE *back = backskytex->GetColumn (angle2, NULL); - - int count = MIN (512, MIN (backskytex->GetHeight(), fronttex->GetHeight())); - i = 0; - do + else { - if (front[i]) + return R_GetOneSkyColumn(fronttex, x); + for (i = 0; i < 4; ++i) { - composite[i] = front[i]; + if (lastskycol_bgra[i] == skycol) + { + return (BYTE*)(skybuf_bgra[i]); + } } - else + + lastskycol_bgra[skycolplace_bgra] = skycol; + uint32_t *composite = skybuf_bgra[skycolplace_bgra]; + skycolplace_bgra = (skycolplace_bgra + 1) & 3; + + // The ordering of the following code has been tuned to allow VC++ to optimize + // it well. In particular, this arrangement lets it keep count in a register + // instead of on the stack. + const uint32_t *front = (const uint32_t *)fronttex->GetColumnBgra(angle1, NULL); + const uint32_t *back = (const uint32_t *)backskytex->GetColumnBgra(angle2, NULL); + + int count = MIN(512, MIN(backskytex->GetHeight(), fronttex->GetHeight())); + i = 0; + do { - composite[i] = back[i]; - } - } while (++i, --count); - return composite; + if (front[i]) + { + composite[i] = front[i]; + } + else + { + composite[i] = back[i]; + } + } while (++i, --count); + return (BYTE*)composite; + } } static void R_DrawSky (visplane_t *pl) @@ -930,6 +988,7 @@ static void R_DrawSky (visplane_t *pl) for (x = 0; x < 4; ++x) { lastskycol[x] = 0xffffffff; + lastskycol_bgra[x] = 0xffffffff; } rw_pic = frontskytex; @@ -943,6 +1002,7 @@ static void R_DrawSky (visplane_t *pl) for (x = 0; x < 4; ++x) { lastskycol[x] = 0xffffffff; + lastskycol_bgra[x] = 0xffffffff; } wallscan (pl->left, pl->right, (short *)pl->top, (short *)pl->bottom, swall, lwall, frontyScale, backskytex == NULL ? 
R_GetOneSkyColumn : R_GetTwoSkyColumns); @@ -951,7 +1011,7 @@ static void R_DrawSky (visplane_t *pl) { // The texture does not tile nicely frontyScale *= skyscale; frontiScale = 1 / frontyScale; - R_DrawSkyStriped (pl); + //R_DrawSkyStriped (pl); } } @@ -980,6 +1040,7 @@ static void R_DrawSkyStriped (visplane_t *pl) for (x = 0; x < 4; ++x) { lastskycol[x] = 0xffffffff; + lastskycol_bgra[x] = 0xffffffff; } wallscan (pl->left, pl->right, top, bot, swall, lwall, rw_pic->Scale.Y, backskytex == NULL ? R_GetOneSkyColumn : R_GetTwoSkyColumns); @@ -1098,7 +1159,7 @@ void R_DrawSinglePlane (visplane_t *pl, fixed_t alpha, bool additive, bool maske R_SetupSpanBits(tex); double xscale = pl->xform.xScale * tex->Scale.X; double yscale = pl->xform.yScale * tex->Scale.Y; - ds_source = tex->GetPixels (); + R_SetSpanSource(tex); basecolormap = pl->colormap; planeshade = LIGHT2SHADE(pl->lightlevel); @@ -1461,12 +1522,13 @@ void R_DrawSkyPlane (visplane_t *pl) bool fakefixed = false; if (fixedcolormap) { - dc_colormap = fixedcolormap; + R_SetColorMapLight(fixedcolormap, 0, 0); } else { fakefixed = true; - fixedcolormap = dc_colormap = NormalLight.Maps; + fixedcolormap = &NormalLight; + R_SetColorMapLight(fixedcolormap, 0, 0); } R_DrawSky (pl); @@ -1484,7 +1546,7 @@ void R_DrawSkyPlane (visplane_t *pl) void R_DrawNormalPlane (visplane_t *pl, double _xscale, double _yscale, fixed_t alpha, bool additive, bool masked) { #ifdef X86_ASM - if (ds_source != ds_cursource) + if (!r_swtruecolor && ds_source != ds_cursource) { R_SetSpanSource_ASM (ds_source); } @@ -1547,12 +1609,21 @@ void R_DrawNormalPlane (visplane_t *pl, double _xscale, double _yscale, fixed_t planeheight = fabs(pl->height.Zat0() - ViewPos.Z); GlobVis = r_FloorVisibility / planeheight; + ds_light = 0; if (fixedlightlev >= 0) - ds_colormap = basecolormap->Maps + fixedlightlev, plane_shade = false; + { + R_SetDSColorMapLight(basecolormap, 0, FIXEDLIGHT2SHADE(fixedlightlev)); + plane_shade = false; + } else if (fixedcolormap) - 
ds_colormap = fixedcolormap, plane_shade = false; + { + R_SetDSColorMapLight(fixedcolormap, 0, 0); + plane_shade = false; + } else + { plane_shade = true; + } if (spanfunc != R_FillSpan) { @@ -1565,12 +1636,16 @@ void R_DrawNormalPlane (visplane_t *pl, double _xscale, double _yscale, fixed_t spanfunc = R_DrawSpanMaskedTranslucent; dc_srcblend = Col2RGB8[alpha>>10]; dc_destblend = Col2RGB8[(OPAQUE-alpha)>>10]; + dc_srcalpha = alpha; + dc_destalpha = OPAQUE - alpha; } else { spanfunc = R_DrawSpanMaskedAddClamp; dc_srcblend = Col2RGB8_LessPrecision[alpha>>10]; dc_destblend = Col2RGB8_LessPrecision[FRACUNIT>>10]; + dc_srcalpha = alpha; + dc_destalpha = OPAQUE - alpha; } } else @@ -1587,12 +1662,16 @@ void R_DrawNormalPlane (visplane_t *pl, double _xscale, double _yscale, fixed_t spanfunc = R_DrawSpanTranslucent; dc_srcblend = Col2RGB8[alpha>>10]; dc_destblend = Col2RGB8[(OPAQUE-alpha)>>10]; + dc_srcalpha = alpha; + dc_destalpha = OPAQUE - alpha; } else { spanfunc = R_DrawSpanAddClamp; dc_srcblend = Col2RGB8_LessPrecision[alpha>>10]; dc_destblend = Col2RGB8_LessPrecision[FRACUNIT>>10]; + dc_srcalpha = alpha; + dc_destalpha = OPAQUE - alpha; } } else @@ -1708,11 +1787,20 @@ void R_DrawTiltedPlane (visplane_t *pl, double _xscale, double _yscale, fixed_t planelightfloat = -planelightfloat; if (fixedlightlev >= 0) - ds_colormap = basecolormap->Maps + fixedlightlev, plane_shade = false; + { + R_SetDSColorMapLight(basecolormap, 0, FIXEDLIGHT2SHADE(fixedlightlev)); + plane_shade = false; + } else if (fixedcolormap) - ds_colormap = fixedcolormap, plane_shade = false; + { + R_SetDSColorMapLight(fixedcolormap, 0, 0); + plane_shade = false; + } else - ds_colormap = basecolormap->Maps, plane_shade = true; + { + R_SetDSColorMapLight(basecolormap, 0, 0); + plane_shade = true; + } if (!plane_shade) { @@ -1723,9 +1811,16 @@ void R_DrawTiltedPlane (visplane_t *pl, double _xscale, double _yscale, fixed_t } #if defined(X86_ASM) - if (ds_source != ds_curtiltedsource) - 
R_SetTiltedSpanSource_ASM (ds_source); - R_MapVisPlane (pl, R_DrawTiltedPlane_ASM); + if (!r_swtruecolor) + { + if (ds_source != ds_curtiltedsource) + R_SetTiltedSpanSource_ASM(ds_source); + R_MapVisPlane(pl, R_DrawTiltedPlane_ASM); + } + else + { + R_MapVisPlane(pl, R_MapTiltedPlane); + } #else R_MapVisPlane (pl, R_MapTiltedPlane); #endif diff --git a/src/r_plane.h b/src/r_plane.h index d4db3dc09..b199d3477 100644 --- a/src/r_plane.h +++ b/src/r_plane.h @@ -93,6 +93,14 @@ void R_DrawNormalPlane (visplane_t *pl, double xscale, double yscale, fixed_t al void R_DrawTiltedPlane (visplane_t *pl, double xscale, double yscale, fixed_t alpha, bool additive, bool masked); void R_MapVisPlane (visplane_t *pl, void (*mapfunc)(int y, int x1)); +extern void(*R_MapColoredPlane)(int y, int x1); +extern void(*R_MapTiltedPlane)(int y, int x1); + +void R_MapTiltedPlane_C(int y, int x1); +void R_MapTiltedPlane_rgba(int y, int x); +void R_MapColoredPlane_C(int y, int x1); +void R_MapColoredPlane_rgba(int y, int x1); + visplane_t *R_FindPlane ( const secplane_t &height, FTextureID picnum, diff --git a/src/r_segs.cpp b/src/r_segs.cpp index edb1949b6..078f1d921 100644 --- a/src/r_segs.cpp +++ b/src/r_segs.cpp @@ -50,6 +50,7 @@ #include "r_plane.h" #include "r_segs.h" #include "r_3dfloors.h" +#include "r_draw.h" #include "v_palette.h" #include "r_data/colormaps.h" @@ -177,7 +178,7 @@ static void BlastMaskedColumn (void (*blastfunc)(const BYTE *pixels, const FText // calculate lighting if (fixedcolormap == NULL && fixedlightlev < 0) { - dc_colormap = basecolormap->Maps + (GETPALOOKUP (rw_light, wallshade) << COLORMAPSHIFT); + R_SetColorMapLight(basecolormap, rw_light, wallshade); } dc_iscale = xs_Fix<16>::ToFix(MaskedSWall[dc_x] * MaskedScaleY); @@ -313,9 +314,9 @@ void R_RenderMaskedSegRange (drawseg_t *ds, int x1, int x2) rw_scalestep = ds->iscalestep; if (fixedlightlev >= 0) - dc_colormap = basecolormap->Maps + fixedlightlev; + R_SetColorMapLight(basecolormap, 0, 
FIXEDLIGHT2SHADE(fixedlightlev)); else if (fixedcolormap != NULL) - dc_colormap = fixedcolormap; + R_SetColorMapLight(fixedcolormap, 0, 0); // find positioning texheight = tex->GetScaledHeightDouble(); @@ -461,7 +462,7 @@ void R_RenderMaskedSegRange (drawseg_t *ds, int x1, int x2) while (dc_x < stop) { - rt_initcols(); + rt_initcols(nullptr); BlastMaskedColumn (R_DrawMaskedColumnHoriz, tex); dc_x++; BlastMaskedColumn (R_DrawMaskedColumnHoriz, tex); dc_x++; BlastMaskedColumn (R_DrawMaskedColumnHoriz, tex); dc_x++; @@ -630,9 +631,9 @@ void R_RenderFakeWall(drawseg_t *ds, int x1, int x2, F3DFloor *rover) } if (fixedlightlev >= 0) - dc_colormap = basecolormap->Maps + fixedlightlev; + R_SetColorMapLight(basecolormap, 0, FIXEDLIGHT2SHADE(fixedlightlev)); else if (fixedcolormap != NULL) - dc_colormap = fixedcolormap; + R_SetColorMapLight(fixedcolormap, 0, 0); WallC.sz1 = ds->sz1; WallC.sz2 = ds->sz2; @@ -1065,50 +1066,308 @@ void R_RenderFakeWallRange (drawseg_t *ds, int x1, int x2) return; } -// prevlineasm1 is like vlineasm1 but skips the loop if only drawing one pixel -inline fixed_t prevline1 (fixed_t vince, BYTE *colormap, int count, fixed_t vplce, const BYTE *bufplce, BYTE *dest) +EXTERN_CVAR(Bool, r_mipmap) + +struct WallscanSampler { - dc_iscale = vince; - dc_colormap = colormap; - dc_count = count; - dc_texturefrac = vplce; - dc_source = bufplce; - dc_dest = dest; - return doprevline1 (); + WallscanSampler() { } + WallscanSampler(int y1, float swal, double yrepeat, fixed_t xoffset, FTexture *texture, const BYTE*(*getcol)(FTexture *texture, int x)); + + uint32_t uv_pos; + uint32_t uv_step; + uint32_t uv_max; + + const BYTE *source; + const BYTE *source2; + uint32_t texturefracx; + uint32_t height; +}; + +WallscanSampler::WallscanSampler(int y1, float swal, double yrepeat, fixed_t xoffset, FTexture *texture, const BYTE*(*getcol)(FTexture *texture, int x)) +{ + if (!r_swtruecolor) + { + height = texture->GetHeight(); + int uv_fracbits = 32 - texture->HeightBits; + 
uv_max = height << uv_fracbits; + + // Find start uv in [0-base_height[ range. + // Not using xs_ToFixed because it rounds the result and we need something that always rounds down to stay within the range. + double uv_stepd = swal * yrepeat; + double v = (dc_texturemid + uv_stepd * (y1 - CenterY + 0.5)) / height; + v = v - floor(v); + v *= height; + v *= (1 << uv_fracbits); + + uv_pos = (uint32_t)v; + uv_step = xs_ToFixed(uv_fracbits, uv_stepd); + if (uv_step == 0) // To prevent divide by zero elsewhere + uv_step = 1; + + source = getcol(texture, xoffset >> FRACBITS); + source2 = nullptr; + texturefracx = 0; + } + else + { + // Normalize to 0-1 range: + double uv_stepd = swal * yrepeat; + double v = (dc_texturemid + uv_stepd * (y1 - CenterY + 0.5)) / texture->GetHeight(); + v = v - floor(v); + double v_step = uv_stepd / texture->GetHeight(); + + if (isnan(v) || isnan(v_step)) // this should never happen, but it apparently does.. + { + uv_stepd = 0.0; + v = 0.0; + v_step = 0.0; + } + + // Convert to uint32: + uv_pos = (uint32_t)(v * 0x100000000LL); + uv_step = (uint32_t)(v_step * 0x100000000LL); + uv_max = 0; + + // Texture mipmap and filter selection: + if (getcol != R_GetColumn) + { + source = getcol(texture, xoffset >> FRACBITS); + source2 = nullptr; + height = texture->GetHeight(); + texturefracx = 0; + } + else + { + double magnitude = abs(uv_stepd * 2); + bool magnifying = magnitude < 1.0f; + + int mipmap_offset = 0; + int mip_width = texture->GetWidth(); + int mip_height = texture->GetHeight(); + if (r_mipmap && texture->Mipmapped()) + { + uint32_t xpos = (uint32_t)((((uint64_t)xoffset) << FRACBITS) / mip_width); + double texture_bias = 1.7f; + double level = MAX(magnitude - 3.0, 0.0); + while (level > texture_bias) + { + mipmap_offset += mip_width * mip_height; + level *= 0.5f; + mip_width = MAX(mip_width >> 1, 1); + mip_height = MAX(mip_height >> 1, 1); + } + xoffset = (xpos >> FRACBITS) * mip_width; + } + + const uint32_t *pixels = texture->GetPixelsBgra() 
+ mipmap_offset; + + bool filter_nearest = (magnifying && !r_magfilter) || (!magnifying && !r_minfilter); + if (filter_nearest) + { + int tx = (xoffset >> FRACBITS) % mip_width; + if (tx < 0) + tx += mip_width; + source = (BYTE*)(pixels + tx * mip_height); + source2 = nullptr; + height = mip_height; + texturefracx = 0; + } + else + { + int tx0 = (xoffset >> FRACBITS) % mip_width; + if (tx0 < 0) + tx0 += mip_width; + int tx1 = (tx0 + 1) % mip_width; + source = (BYTE*)(pixels + tx0 * mip_height); + source2 = (BYTE*)(pixels + tx1 * mip_height); + height = mip_height; + texturefracx = (xoffset >> (FRACBITS - 4)) & 15; + } + } + } } -void wallscan (int x1, int x2, short *uwal, short *dwal, float *swal, fixed_t *lwal, - double yrepeat, const BYTE *(*getcol)(FTexture *tex, int x)) +// Draw a column with support for non-power-of-two ranges +void wallscan_drawcol1(int x, int y1, int y2, WallscanSampler &sampler, DWORD(*draw1column)()) { - int x, fracbits; - int y1ve[4], y2ve[4], u4, d4, z; - char bad; - float light = rw_light - rw_lightstep; - SDWORD xoffset; - BYTE *basecolormapdata; - double iscale; - - // This function also gets used to draw skies. Unlike BUILD, skies are - // drawn by visplane instead of by bunch, so these checks are invalid. 
- //if ((uwal[x1] > viewheight) && (uwal[x2] > viewheight)) return; - //if ((dwal[x1] < 0) && (dwal[x2] < 0)) return; - - if (rw_pic->UseType == FTexture::TEX_Null) + if (r_swtruecolor) { - return; + int count = y2 - y1; + + dc_source = sampler.source; + dc_source2 = sampler.source2; + dc_texturefracx = sampler.texturefracx; + dc_dest = (ylookup[y1] + x) * 4 + dc_destorg; + dc_count = count; + dc_iscale = sampler.uv_step; + dc_texturefrac = sampler.uv_pos; + dc_textureheight = sampler.height; + draw1column(); + + uint64_t step64 = sampler.uv_step; + uint64_t pos64 = sampler.uv_pos; + sampler.uv_pos = (uint32_t)(pos64 + step64 * count); } + else + { + if (sampler.uv_max == 0) // power of two + { + int count = y2 - y1; -//extern cycle_t WallScanCycles; -//clock (WallScanCycles); + dc_source = sampler.source; + dc_source2 = sampler.source2; + dc_texturefracx = sampler.texturefracx; + dc_dest = (ylookup[y1] + x) + dc_destorg; + dc_count = count; + dc_iscale = sampler.uv_step; + dc_texturefrac = sampler.uv_pos; + draw1column(); - rw_pic->GetHeight(); // Make sure texture size is loaded - fracbits = 32 - rw_pic->HeightBits; - setupvline(fracbits); - xoffset = rw_offset; - basecolormapdata = basecolormap->Maps; + uint64_t step64 = sampler.uv_step; + uint64_t pos64 = sampler.uv_pos; + sampler.uv_pos = (uint32_t)(pos64 + step64 * count); + } + else + { + uint32_t uv_pos = sampler.uv_pos; - x = x1; - //while ((umost[x] > dmost[x]) && (x < x2)) x++; + uint32_t left = y2 - y1; + while (left > 0) + { + uint32_t available = sampler.uv_max - uv_pos; + uint32_t next_uv_wrap = available / sampler.uv_step; + if (available % sampler.uv_step != 0) + next_uv_wrap++; + uint32_t count = MIN(left, next_uv_wrap); + + dc_source = sampler.source; + dc_source2 = sampler.source2; + dc_texturefracx = sampler.texturefracx; + dc_dest = (ylookup[y1] + x) + dc_destorg; + dc_count = count; + dc_iscale = sampler.uv_step; + dc_texturefrac = uv_pos; + draw1column(); + + left -= count; + uv_pos += 
sampler.uv_step * count; + if (uv_pos >= sampler.uv_max) + uv_pos -= sampler.uv_max; + } + + sampler.uv_pos = uv_pos; + } + } +} + +// Draw four columns with support for non-power-of-two ranges +void wallscan_drawcol4(int x, int y1, int y2, WallscanSampler *sampler, void(*draw4columns)()) +{ + if (r_swtruecolor) + { + int count = y2 - y1; + for (int i = 0; i < 4; i++) + { + bufplce[i] = sampler[i].source; + bufplce2[i] = sampler[i].source2; + buftexturefracx[i] = sampler[i].texturefracx; + bufheight[i] = sampler[i].height; + vplce[i] = sampler[i].uv_pos; + vince[i] = sampler[i].uv_step; + + uint64_t step64 = sampler[i].uv_step; + uint64_t pos64 = sampler[i].uv_pos; + sampler[i].uv_pos = (uint32_t)(pos64 + step64 * count); + } + dc_dest = (ylookup[y1] + x) * 4 + dc_destorg; + dc_count = count; + draw4columns(); + } + else + { + if (sampler[0].uv_max == 0) // power of two, no wrap handling needed + { + int count = y2 - y1; + for (int i = 0; i < 4; i++) + { + bufplce[i] = sampler[i].source; + bufplce2[i] = sampler[i].source2; + buftexturefracx[i] = sampler[i].texturefracx; + vplce[i] = sampler[i].uv_pos; + vince[i] = sampler[i].uv_step; + + uint64_t step64 = sampler[i].uv_step; + uint64_t pos64 = sampler[i].uv_pos; + sampler[i].uv_pos = (uint32_t)(pos64 + step64 * count); + } + dc_dest = (ylookup[y1] + x) + dc_destorg; + dc_count = count; + draw4columns(); + } + else + { + dc_dest = (ylookup[y1] + x) + dc_destorg; + for (int i = 0; i < 4; i++) + { + bufplce[i] = sampler[i].source; + bufplce2[i] = sampler[i].source2; + buftexturefracx[i] = sampler[i].texturefracx; + } + + uint32_t left = y2 - y1; + while (left > 0) + { + // Find which column wraps first + uint32_t count = left; + for (int i = 0; i < 4; i++) + { + uint32_t available = sampler[i].uv_max - sampler[i].uv_pos; + uint32_t next_uv_wrap = available / sampler[i].uv_step; + if (available % sampler[i].uv_step != 0) + next_uv_wrap++; + count = MIN(next_uv_wrap, count); + } + + // Draw until that column wraps + for 
(int i = 0; i < 4; i++) + { + vplce[i] = sampler[i].uv_pos; + vince[i] = sampler[i].uv_step; + } + dc_count = count; + draw4columns(); + + // Wrap the uv position + for (int i = 0; i < 4; i++) + { + sampler[i].uv_pos += sampler[i].uv_step * count; + if (sampler[i].uv_pos >= sampler[i].uv_max) + sampler[i].uv_pos -= sampler[i].uv_max; + } + + left -= count; + } + } + } +} + +typedef DWORD(*Draw1ColumnFuncPtr)(); +typedef void(*Draw4ColumnsFuncPtr)(); + +void wallscan_any( + int x1, int x2, short *uwal, short *dwal, float *swal, fixed_t *lwal, double yrepeat, + const BYTE *(*getcol)(FTexture *tex, int x), + void(setupwallscan(int bits, Draw1ColumnFuncPtr &draw1, Draw4ColumnsFuncPtr &draw2))) +{ + if (rw_pic->UseType == FTexture::TEX_Null) + return; + + fixed_t xoffset = rw_offset; + rw_pic->GetHeight(); // To ensure that rw_pic->HeightBits has been set + + DWORD(*draw1column)(); + void(*draw4columns)(); + setupwallscan(r_swtruecolor ? FRACBITS : 32 - rw_pic->HeightBits, draw1column, draw4columns); bool fixed = (fixedcolormap != NULL || fixedlightlev >= 0); if (fixed) @@ -1117,132 +1376,195 @@ void wallscan (int x1, int x2, short *uwal, short *dwal, float *swal, fixed_t *l palookupoffse[1] = dc_colormap; palookupoffse[2] = dc_colormap; palookupoffse[3] = dc_colormap; + palookuplight[0] = 0; + palookuplight[1] = 0; + palookuplight[2] = 0; + palookuplight[3] = 0; } - for(; (x < x2) && (x & 3); ++x) + if (fixedcolormap) + R_SetColorMapLight(fixedcolormap, 0, 0); + else + R_SetColorMapLight(basecolormap, 0, 0); + + float light = rw_light; + + // Calculate where 4 column alignment begins and ends: + int aligned_x1 = clamp((x1 + 3) / 4 * 4, x1, x2); + int aligned_x2 = clamp(x2 / 4 * 4, x1, x2); + + // First unaligned columns: + for (int x = x1; x < aligned_x1; x++, light += rw_lightstep) { - light += rw_lightstep; - y1ve[0] = uwal[x];//max(uwal[x],umost[x]); - y2ve[0] = dwal[x];//min(dwal[x],dmost[x]); - if (y2ve[0] <= y1ve[0]) continue; - assert (y1ve[0] < viewheight); - 
assert (y2ve[0] <= viewheight); + int y1 = uwal[x]; + int y2 = dwal[x]; + if (y2 <= y1) + continue; if (!fixed) - { // calculate lighting - dc_colormap = basecolormapdata + (GETPALOOKUP (light, wallshade) << COLORMAPSHIFT); - } + R_SetColorMapLight(basecolormap, light, wallshade); - dc_source = getcol (rw_pic, (lwal[x] + xoffset) >> FRACBITS); - dc_dest = ylookup[y1ve[0]] + x + dc_destorg; - dc_count = y2ve[0] - y1ve[0]; - iscale = swal[x] * yrepeat; - dc_iscale = xs_ToFixed(fracbits, iscale); - dc_texturefrac = xs_ToFixed(fracbits, dc_texturemid + iscale * (y1ve[0] - CenterY + 0.5)); - - dovline1(); + WallscanSampler sampler(y1, swal[x], yrepeat, lwal[x] + xoffset, rw_pic, getcol); + wallscan_drawcol1(x, y1, y2, sampler, draw1column); } - for(; x < x2-3; x += 4) + // The aligned columns + for (int x = aligned_x1; x < aligned_x2; x += 4) { - bad = 0; - for (z = 3; z>= 0; --z) - { - y1ve[z] = uwal[x+z];//max(uwal[x+z],umost[x+z]); - y2ve[z] = dwal[x+z];//min(dwal[x+z],dmost[x+z])-1; - if (y2ve[z] <= y1ve[z]) { bad += 1<> FRACBITS); - iscale = swal[x + z] * yrepeat; - vince[z] = xs_ToFixed(fracbits, iscale); - vplce[z] = xs_ToFixed(fracbits, dc_texturemid + iscale * (y1ve[z] - CenterY + 0.5)); - } - if (bad == 15) + float lights[4]; + for (int i = 0; i < 4; i++) { - light += rw_lightstep * 4; + lights[i] = light; + light += rw_lightstep; + } + + WallscanSampler sampler[4]; + for (int i = 0; i < 4; i++) + sampler[i] = WallscanSampler(y1[i], swal[x + i], yrepeat, lwal[x + i] + xoffset, rw_pic, getcol); + + // Figure out where we vertically can start and stop drawing 4 columns in one go + int middle_y1 = y1[0]; + int middle_y2 = y2[0]; + for (int i = 1; i < 4; i++) + { + middle_y1 = MAX(y1[i], middle_y1); + middle_y2 = MIN(y2[i], middle_y2); + } + + // If we got an empty column in our set we cannot draw 4 columns in one go: + bool empty_column_in_set = false; + int bilinear_count = 0; + for (int i = 0; i < 4; i++) + { + if (y2[i] <= y1[i]) + empty_column_in_set = true; 
+ if (sampler[i].source2) + bilinear_count++; + } + + if (empty_column_in_set || middle_y2 <= middle_y1 || (bilinear_count > 0 && bilinear_count < 4)) + { + for (int i = 0; i < 4; i++) + { + if (y2[i] <= y1[i]) + continue; + + if (!fixed) + R_SetColorMapLight(basecolormap, lights[i], wallshade); + wallscan_drawcol1(x + i, y1[i], y2[i], sampler[i], draw1column); + } continue; } - if (!fixed) + // Draw the first rows where not all 4 columns are active + for (int i = 0; i < 4; i++) { - for (z = 0; z < 4; ++z) - { - light += rw_lightstep; - palookupoffse[z] = basecolormapdata + (GETPALOOKUP (light, wallshade) << COLORMAPSHIFT); - } + if (!fixed) + R_SetColorMapLight(basecolormap, lights[i], wallshade); + + if (y1[i] < middle_y1) + wallscan_drawcol1(x + i, y1[i], middle_y1, sampler[i], draw1column); } - u4 = MAX(MAX(y1ve[0],y1ve[1]),MAX(y1ve[2],y1ve[3])); - d4 = MIN(MIN(y2ve[0],y2ve[1]),MIN(y2ve[2],y2ve[3])); - - if ((bad != 0) || (u4 >= d4)) + // Draw the area where all 4 columns are active + if (!fixed) { - for (z = 0; z < 4; ++z) + for (int i = 0; i < 4; i++) { - if (!(bad & 1)) + if (r_swtruecolor) { - prevline1(vince[z],palookupoffse[z],y2ve[z]-y1ve[z],vplce[z],bufplce[z],ylookup[y1ve[z]]+x+z+dc_destorg); + palookupoffse[i] = basecolormap->Maps; + palookuplight[i] = LIGHTSCALE(lights[i], wallshade); + } + else + { + palookupoffse[i] = basecolormap->Maps + (GETPALOOKUP(lights[i], wallshade) << COLORMAPSHIFT); + palookuplight[i] = 0; } - bad >>= 1; - } - continue; - } - - for (z = 0; z < 4; ++z) - { - if (u4 > y1ve[z]) - { - vplce[z] = prevline1(vince[z],palookupoffse[z],u4-y1ve[z],vplce[z],bufplce[z],ylookup[y1ve[z]]+x+z+dc_destorg); } } + wallscan_drawcol4(x, middle_y1, middle_y2, sampler, draw4columns); - if (d4 > u4) + // Draw the last rows where not all 4 columns are active + for (int i = 0; i < 4; i++) { - dc_count = d4-u4; - dc_dest = ylookup[u4]+x+dc_destorg; - dovline4(); - } + if (!fixed) + R_SetColorMapLight(basecolormap, lights[i], wallshade); - BYTE *i = 
x+ylookup[d4]+dc_destorg; - for (z = 0; z < 4; ++z) - { - if (y2ve[z] > d4) - { - prevline1(vince[z],palookupoffse[0],y2ve[z]-d4,vplce[z],bufplce[z],i+z); - } + if (middle_y2 < y2[i]) + wallscan_drawcol1(x + i, middle_y2, y2[i], sampler[i], draw1column); } } - for(;x> FRACBITS); - dc_dest = ylookup[y1ve[0]] + x + dc_destorg; - dc_count = y2ve[0] - y1ve[0]; - iscale = swal[x] * yrepeat; - dc_iscale = xs_ToFixed(fracbits, iscale); - dc_texturefrac = xs_ToFixed(fracbits, dc_texturemid + iscale * (y1ve[0] - CenterY + 0.5)); - - dovline1(); + WallscanSampler sampler(y1, swal[x], yrepeat, lwal[x] + xoffset, rw_pic, getcol); + wallscan_drawcol1(x, y1, y2, sampler, draw1column); } -//unclock (WallScanCycles); - NetUpdate (); } +void wallscan(int x1, int x2, short *uwal, short *dwal, float *swal, fixed_t *lwal, double yrepeat, const BYTE *(*getcol)(FTexture *tex, int x)) +{ + wallscan_any(x1, x2, uwal, dwal, swal, lwal, yrepeat, getcol, [](int bits, Draw1ColumnFuncPtr &line1, Draw4ColumnsFuncPtr &line4) + { + setupvline(bits); + line1 = dovline1; + line4 = dovline4; + }); +} + +void maskwallscan(int x1, int x2, short *uwal, short *dwal, float *swal, fixed_t *lwal, double yrepeat, const BYTE *(*getcol)(FTexture *tex, int x)) +{ + if (!rw_pic->bMasked) // Textures that aren't masked can use the faster wallscan. + { + wallscan(x1, x2, uwal, dwal, swal, lwal, yrepeat, getcol); + } + else + { + wallscan_any(x1, x2, uwal, dwal, swal, lwal, yrepeat, getcol, [](int bits, Draw1ColumnFuncPtr &line1, Draw4ColumnsFuncPtr &line4) + { + setupmvline(bits); + line1 = domvline1; + line4 = domvline4; + }); + } +} + +void transmaskwallscan(int x1, int x2, short *uwal, short *dwal, float *swal, fixed_t *lwal, double yrepeat, const BYTE *(*getcol)(FTexture *tex, int x)) +{ + static fixed_t(*tmvline1)(); + static void(*tmvline4)(); + if (!R_GetTransMaskDrawers(&tmvline1, &tmvline4)) + { + // The current translucency is unsupported, so draw with regular maskwallscan instead. 
+ maskwallscan(x1, x2, uwal, dwal, swal, lwal, yrepeat, getcol); + } + else + { + wallscan_any(x1, x2, uwal, dwal, swal, lwal, yrepeat, getcol, [](int bits, Draw1ColumnFuncPtr &line1, Draw4ColumnsFuncPtr &line4) + { + setuptmvline(bits); + line1 = reinterpret_cast(tmvline1); + line4 = tmvline4; + }); + } +} + void wallscan_striped (int x1, int x2, short *uwal, short *dwal, float *swal, fixed_t *lwal, double yrepeat) { FDynamicColormap *startcolormap = basecolormap; @@ -1416,358 +1738,6 @@ static void wallscan_np2_ds(drawseg_t *ds, int x1, int x2, short *uwal, short *d } } -inline fixed_t mvline1 (fixed_t vince, BYTE *colormap, int count, fixed_t vplce, const BYTE *bufplce, BYTE *dest) -{ - dc_iscale = vince; - dc_colormap = colormap; - dc_count = count; - dc_texturefrac = vplce; - dc_source = bufplce; - dc_dest = dest; - return domvline1 (); -} - -void maskwallscan (int x1, int x2, short *uwal, short *dwal, float *swal, fixed_t *lwal, - double yrepeat, const BYTE *(*getcol)(FTexture *tex, int x)) -{ - int x, fracbits; - BYTE *p; - int y1ve[4], y2ve[4], u4, d4, startx, dax, z; - char bad; - float light = rw_light - rw_lightstep; - SDWORD xoffset; - BYTE *basecolormapdata; - double iscale; - - if (rw_pic->UseType == FTexture::TEX_Null) - { - return; - } - - if (!rw_pic->bMasked) - { // Textures that aren't masked can use the faster wallscan. 
- wallscan (x1, x2, uwal, dwal, swal, lwal, yrepeat, getcol); - return; - } - -//extern cycle_t WallScanCycles; -//clock (WallScanCycles); - - rw_pic->GetHeight(); // Make sure texture size is loaded - fracbits = 32- rw_pic->HeightBits; - setupmvline(fracbits); - xoffset = rw_offset; - basecolormapdata = basecolormap->Maps; - - x = startx = x1; - p = x + dc_destorg; - - bool fixed = (fixedcolormap != NULL || fixedlightlev >= 0); - if (fixed) - { - palookupoffse[0] = dc_colormap; - palookupoffse[1] = dc_colormap; - palookupoffse[2] = dc_colormap; - palookupoffse[3] = dc_colormap; - } - - for(; (x < x2) && ((size_t)p & 3); ++x, ++p) - { - light += rw_lightstep; - y1ve[0] = uwal[x];//max(uwal[x],umost[x]); - y2ve[0] = dwal[x];//min(dwal[x],dmost[x]); - if (y2ve[0] <= y1ve[0]) continue; - - if (!fixed) - { // calculate lighting - dc_colormap = basecolormapdata + (GETPALOOKUP (light, wallshade) << COLORMAPSHIFT); - } - - dc_source = getcol (rw_pic, (lwal[x] + xoffset) >> FRACBITS); - dc_dest = ylookup[y1ve[0]] + p; - dc_count = y2ve[0] - y1ve[0]; - iscale = swal[x] * yrepeat; - dc_iscale = xs_ToFixed(fracbits, iscale); - dc_texturefrac = xs_ToFixed(fracbits, dc_texturemid + iscale * (y1ve[0] - CenterY + 0.5)); - - domvline1(); - } - - for(; x < x2-3; x += 4, p+= 4) - { - bad = 0; - for (z = 3, dax = x+3; z >= 0; --z, --dax) - { - y1ve[z] = uwal[dax]; - y2ve[z] = dwal[dax]; - if (y2ve[z] <= y1ve[z]) { bad += 1<> FRACBITS); - iscale = swal[dax] * yrepeat; - vince[z] = xs_ToFixed(fracbits, iscale); - vplce[z] = xs_ToFixed(fracbits, dc_texturemid + iscale * (y1ve[z] - CenterY + 0.5)); - } - if (bad == 15) - { - light += rw_lightstep * 4; - continue; - } - - if (!fixed) - { - for (z = 0; z < 4; ++z) - { - light += rw_lightstep; - palookupoffse[z] = basecolormapdata + (GETPALOOKUP (light, wallshade) << COLORMAPSHIFT); - } - } - - u4 = MAX(MAX(y1ve[0],y1ve[1]),MAX(y1ve[2],y1ve[3])); - d4 = MIN(MIN(y2ve[0],y2ve[1]),MIN(y2ve[2],y2ve[3])); - - if ((bad != 0) || (u4 >= d4)) - { - 
for (z = 0; z < 4; ++z) - { - if (!(bad & 1)) - { - mvline1(vince[z],palookupoffse[z],y2ve[z]-y1ve[z],vplce[z],bufplce[z],ylookup[y1ve[z]]+p+z); - } - bad >>= 1; - } - continue; - } - - for (z = 0; z < 4; ++z) - { - if (u4 > y1ve[z]) - { - vplce[z] = mvline1(vince[z],palookupoffse[z],u4-y1ve[z],vplce[z],bufplce[z],ylookup[y1ve[z]]+p+z); - } - } - - if (d4 > u4) - { - dc_count = d4-u4; - dc_dest = ylookup[u4]+p; - domvline4(); - } - - BYTE *i = p+ylookup[d4]; - for (z = 0; z < 4; ++z) - { - if (y2ve[z] > d4) - { - mvline1(vince[z],palookupoffse[0],y2ve[z]-d4,vplce[z],bufplce[z],i+z); - } - } - } - for(; x < x2; ++x, ++p) - { - light += rw_lightstep; - y1ve[0] = uwal[x]; - y2ve[0] = dwal[x]; - if (y2ve[0] <= y1ve[0]) continue; - - if (!fixed) - { // calculate lighting - dc_colormap = basecolormapdata + (GETPALOOKUP (light, wallshade) << COLORMAPSHIFT); - } - - dc_source = getcol (rw_pic, (lwal[x] + xoffset) >> FRACBITS); - dc_dest = ylookup[y1ve[0]] + p; - dc_count = y2ve[0] - y1ve[0]; - iscale = swal[x] * yrepeat; - dc_iscale = xs_ToFixed(fracbits, iscale); - dc_texturefrac = xs_ToFixed(fracbits, dc_texturemid + iscale * (y1ve[0] - CenterY + 0.5)); - - domvline1(); - } - -//unclock(WallScanCycles); - - NetUpdate (); -} - -inline void preptmvline1 (fixed_t vince, BYTE *colormap, int count, fixed_t vplce, const BYTE *bufplce, BYTE *dest) -{ - dc_iscale = vince; - dc_colormap = colormap; - dc_count = count; - dc_texturefrac = vplce; - dc_source = bufplce; - dc_dest = dest; -} - -void transmaskwallscan (int x1, int x2, short *uwal, short *dwal, float *swal, fixed_t *lwal, - double yrepeat, const BYTE *(*getcol)(FTexture *tex, int x)) -{ - fixed_t (*tmvline1)(); - void (*tmvline4)(); - int x, fracbits; - BYTE *p; - int y1ve[4], y2ve[4], u4, d4, startx, dax, z; - char bad; - float light = rw_light - rw_lightstep; - SDWORD xoffset; - BYTE *basecolormapdata; - double iscale; - - if (rw_pic->UseType == FTexture::TEX_Null) - { - return; - } - - if (!R_GetTransMaskDrawers 
(&tmvline1, &tmvline4)) - { - // The current translucency is unsupported, so draw with regular maskwallscan instead. - maskwallscan (x1, x2, uwal, dwal, swal, lwal, yrepeat, getcol); - return; - } - -//extern cycle_t WallScanCycles; -//clock (WallScanCycles); - - rw_pic->GetHeight(); // Make sure texture size is loaded - fracbits = 32 - rw_pic->HeightBits; - setuptmvline(fracbits); - xoffset = rw_offset; - basecolormapdata = basecolormap->Maps; - fixed_t centeryfrac = FLOAT2FIXED(CenterY); - - x = startx = x1; - p = x + dc_destorg; - - bool fixed = (fixedcolormap != NULL || fixedlightlev >= 0); - if (fixed) - { - palookupoffse[0] = dc_colormap; - palookupoffse[1] = dc_colormap; - palookupoffse[2] = dc_colormap; - palookupoffse[3] = dc_colormap; - } - - for(; (x < x2) && ((size_t)p & 3); ++x, ++p) - { - light += rw_lightstep; - y1ve[0] = uwal[x];//max(uwal[x],umost[x]); - y2ve[0] = dwal[x];//min(dwal[x],dmost[x]); - if (y2ve[0] <= y1ve[0]) continue; - - if (!fixed) - { // calculate lighting - dc_colormap = basecolormapdata + (GETPALOOKUP (light, wallshade) << COLORMAPSHIFT); - } - - dc_source = getcol (rw_pic, (lwal[x] + xoffset) >> FRACBITS); - dc_dest = ylookup[y1ve[0]] + p; - dc_count = y2ve[0] - y1ve[0]; - iscale = swal[x] * yrepeat; - dc_iscale = xs_ToFixed(fracbits, iscale); - dc_texturefrac = xs_ToFixed(fracbits, dc_texturemid + iscale * (y1ve[0] - CenterY + 0.5)); - - tmvline1(); - } - - for(; x < x2-3; x += 4, p+= 4) - { - bad = 0; - for (z = 3, dax = x+3; z >= 0; --z, --dax) - { - y1ve[z] = uwal[dax]; - y2ve[z] = dwal[dax]; - if (y2ve[z] <= y1ve[z]) { bad += 1<> FRACBITS); - iscale = swal[dax] * yrepeat; - vince[z] = xs_ToFixed(fracbits, iscale); - vplce[z] = xs_ToFixed(fracbits, dc_texturemid + vince[z] * (y1ve[z] - CenterY + 0.5)); - } - if (bad == 15) - { - light += rw_lightstep * 4; - continue; - } - - if (!fixed) - { - for (z = 0; z < 4; ++z) - { - light += rw_lightstep; - palookupoffse[z] = basecolormapdata + (GETPALOOKUP (light, wallshade) << 
COLORMAPSHIFT); - } - } - - u4 = MAX(MAX(y1ve[0],y1ve[1]),MAX(y1ve[2],y1ve[3])); - d4 = MIN(MIN(y2ve[0],y2ve[1]),MIN(y2ve[2],y2ve[3])); - - if ((bad != 0) || (u4 >= d4)) - { - for (z = 0; z < 4; ++z) - { - if (!(bad & 1)) - { - preptmvline1(vince[z],palookupoffse[z],y2ve[z]-y1ve[z],vplce[z],bufplce[z],ylookup[y1ve[z]]+p+z); - tmvline1(); - } - bad >>= 1; - } - continue; - } - - for (z = 0; z < 4; ++z) - { - if (u4 > y1ve[z]) - { - preptmvline1(vince[z],palookupoffse[z],u4-y1ve[z],vplce[z],bufplce[z],ylookup[y1ve[z]]+p+z); - vplce[z] = tmvline1(); - } - } - - if (d4 > u4) - { - dc_count = d4-u4; - dc_dest = ylookup[u4]+p; - tmvline4(); - } - - BYTE *i = p+ylookup[d4]; - for (z = 0; z < 4; ++z) - { - if (y2ve[z] > d4) - { - preptmvline1(vince[z],palookupoffse[0],y2ve[z]-d4,vplce[z],bufplce[z],i+z); - tmvline1(); - } - } - } - for(; x < x2; ++x, ++p) - { - light += rw_lightstep; - y1ve[0] = uwal[x]; - y2ve[0] = dwal[x]; - if (y2ve[0] <= y1ve[0]) continue; - - if (!fixed) - { // calculate lighting - dc_colormap = basecolormapdata + (GETPALOOKUP (light, wallshade) << COLORMAPSHIFT); - } - - dc_source = getcol (rw_pic, (lwal[x] + xoffset) >> FRACBITS); - dc_dest = ylookup[y1ve[0]] + p; - dc_count = y2ve[0] - y1ve[0]; - iscale = swal[x] * yrepeat; - dc_iscale = xs_ToFixed(fracbits, iscale); - dc_texturefrac = xs_ToFixed(fracbits, dc_texturemid + iscale * (y1ve[0] - CenterY + 0.5)); - - tmvline1(); - } - -//unclock(WallScanCycles); - - NetUpdate (); -} - // // R_RenderSegLoop // Draws zero, one, or two textures for walls. 
@@ -1788,9 +1758,9 @@ void R_RenderSegLoop () fixed_t xoffset = rw_offset; if (fixedlightlev >= 0) - dc_colormap = basecolormap->Maps + fixedlightlev; + R_SetColorMapLight(basecolormap, 0, FIXEDLIGHT2SHADE(fixedlightlev)); else if (fixedcolormap != NULL) - dc_colormap = fixedcolormap; + R_SetColorMapLight(fixedcolormap, 0, 0); // clip wall to the floor and ceiling for (x = x1; x < x2; ++x) @@ -3187,11 +3157,11 @@ static void R_RenderDecal (side_t *wall, DBaseDecal *decal, drawseg_t *clipper, rw_light = rw_lightleft + (x1 - WallC.sx1) * rw_lightstep; if (fixedlightlev >= 0) - dc_colormap = usecolormap->Maps + fixedlightlev; + R_SetColorMapLight(usecolormap, 0, FIXEDLIGHT2SHADE(fixedlightlev)); else if (fixedcolormap != NULL) - dc_colormap = fixedcolormap; + R_SetColorMapLight(fixedcolormap, 0, 0); else if (!foggy && (decal->RenderFlags & RF_FULLBRIGHT)) - dc_colormap = usecolormap->Maps; + R_SetColorMapLight(usecolormap, 0, 0); else calclighting = true; @@ -3242,7 +3212,7 @@ static void R_RenderDecal (side_t *wall, DBaseDecal *decal, drawseg_t *clipper, { if (calclighting) { // calculate lighting - dc_colormap = usecolormap->Maps + (GETPALOOKUP (rw_light, wallshade) << COLORMAPSHIFT); + R_SetColorMapLight(usecolormap, rw_light, wallshade); } R_WallSpriteColumn (R_DrawMaskedColumn); dc_x++; @@ -3252,9 +3222,9 @@ static void R_RenderDecal (side_t *wall, DBaseDecal *decal, drawseg_t *clipper, { if (calclighting) { // calculate lighting - dc_colormap = usecolormap->Maps + (GETPALOOKUP (rw_light, wallshade) << COLORMAPSHIFT); + R_SetColorMapLight(usecolormap, rw_light, wallshade); } - rt_initcols(); + rt_initcols(nullptr); for (int zz = 4; zz; --zz) { R_WallSpriteColumn (R_DrawMaskedColumnHoriz); @@ -3267,7 +3237,7 @@ static void R_RenderDecal (side_t *wall, DBaseDecal *decal, drawseg_t *clipper, { if (calclighting) { // calculate lighting - dc_colormap = usecolormap->Maps + (GETPALOOKUP (rw_light, wallshade) << COLORMAPSHIFT); + R_SetColorMapLight(usecolormap, rw_light, 
wallshade); } R_WallSpriteColumn (R_DrawMaskedColumn); dc_x++; diff --git a/src/r_swrenderer.cpp b/src/r_swrenderer.cpp index 3c3313430..5be847502 100644 --- a/src/r_swrenderer.cpp +++ b/src/r_swrenderer.cpp @@ -42,7 +42,9 @@ #include "r_3dfloors.h" #include "textures/textures.h" #include "r_data/voxels.h" +#include "r_draw_rgba.h" +EXTERN_CVAR(Bool, r_shadercolormaps) void R_SWRSetWindow(int windowSize, int fullWidth, int fullHeight, int stHeight, float trueratio); void R_SetupColormap(player_t *); @@ -57,6 +59,7 @@ void R_InitRenderer(); void FSoftwareRenderer::Init() { + r_swtruecolor = screen->IsBgra(); R_InitRenderer(); } @@ -84,11 +87,17 @@ void FSoftwareRenderer::PrecacheTexture(FTexture *tex, int cache) if (cache & FTextureManager::HIT_Columnmode) { const FTexture::Span *spanp; - tex->GetColumn(0, &spanp); + if (r_swtruecolor) + tex->GetColumnBgra(0, &spanp); + else + tex->GetColumn(0, &spanp); } else if (cache != 0) { - tex->GetPixels (); + if (r_swtruecolor) + tex->GetPixelsBgra(); + else + tex->GetPixels (); } else { @@ -154,9 +163,24 @@ void FSoftwareRenderer::Precache(BYTE *texhitlist, TMap &act void FSoftwareRenderer::RenderView(player_t *player) { + if (r_swtruecolor != screen->IsBgra()) + { + r_swtruecolor = screen->IsBgra(); + R_InitColumnDrawers(); + } + + R_BeginDrawerCommands(); R_RenderActorView (player->mo); // [RH] Let cameras draw onto textures that were visible this frame. 
FCanvasTextureInfo::UpdateAll (); + + // Apply special colormap if the target cannot do it + if (realfixedcolormap && r_swtruecolor && !(r_shadercolormaps && screen->Accel2D)) + { + DrawerCommandQueue::QueueCommand(realfixedcolormap, screen); + } + + R_EndDrawerCommands(); } //========================================================================== @@ -181,7 +205,7 @@ void FSoftwareRenderer::RemapVoxels() void FSoftwareRenderer::WriteSavePic (player_t *player, FileWriter *file, int width, int height) { - DCanvas *pic = new DSimpleCanvas (width, height); + DCanvas *pic = new DSimpleCanvas (width, height, false); PalEntry palette[256]; // Take a snapshot of the player's view @@ -310,27 +334,67 @@ void FSoftwareRenderer::CopyStackedViewParameters() void FSoftwareRenderer::RenderTextureView (FCanvasTexture *tex, AActor *viewpoint, int fov) { - BYTE *Pixels = const_cast(tex->GetPixels()); - DSimpleCanvas *Canvas = tex->GetCanvas(); + BYTE *Pixels = r_swtruecolor ? (BYTE*)tex->GetPixelsBgra() : (BYTE*)tex->GetPixels(); + DSimpleCanvas *Canvas = r_swtruecolor ? tex->GetCanvasBgra() : tex->GetCanvas(); // curse Doom's overuse of global variables in the renderer. // These get clobbered by rendering to a camera texture but they need to be preserved so the final rendering can be done with the correct palette. 
- unsigned char *savecolormap = fixedcolormap; + FSWColormap *savecolormap = fixedcolormap; FSpecialColormap *savecm = realfixedcolormap; DAngle savedfov = FieldOfView; R_SetFOV ((double)fov); R_RenderViewToCanvas (viewpoint, Canvas, 0, 0, tex->GetWidth(), tex->GetHeight(), tex->bFirstUpdate); R_SetFOV (savedfov); - if (Pixels == Canvas->GetBuffer()) + + if (Canvas->IsBgra()) { - FTexture::FlipSquareBlockRemap (Pixels, tex->GetWidth(), tex->GetHeight(), GPalette.Remap); + if (Pixels == Canvas->GetBuffer()) + { + FTexture::FlipSquareBlockBgra((uint32_t*)Pixels, tex->GetWidth(), tex->GetHeight()); + } + else + { + FTexture::FlipNonSquareBlockBgra((uint32_t*)Pixels, (const uint32_t*)Canvas->GetBuffer(), tex->GetWidth(), tex->GetHeight(), Canvas->GetPitch()); + } } else { - FTexture::FlipNonSquareBlockRemap (Pixels, Canvas->GetBuffer(), tex->GetWidth(), tex->GetHeight(), Canvas->GetPitch(), GPalette.Remap); + if (Pixels == Canvas->GetBuffer()) + { + FTexture::FlipSquareBlockRemap(Pixels, tex->GetWidth(), tex->GetHeight(), GPalette.Remap); + } + else + { + FTexture::FlipNonSquareBlockRemap(Pixels, Canvas->GetBuffer(), tex->GetWidth(), tex->GetHeight(), Canvas->GetPitch(), GPalette.Remap); + } } + + if (r_swtruecolor) + { + // True color render still sometimes uses palette textures (for sprites, mostly). 
+ // We need to make sure that both pixel buffers contain data: + int width = tex->GetWidth(); + int height = tex->GetHeight(); + BYTE *palbuffer = (BYTE *)tex->GetPixels(); + uint32_t *bgrabuffer = (uint32_t*)tex->GetPixelsBgra(); + for (int x = 0; x < width; x++) + { + for (int y = 0; y < height; y++) + { + uint32_t color = bgrabuffer[y]; + int r = RPART(color); + int g = GPART(color); + int b = BPART(color); + palbuffer[y] = RGB32k.RGB[r >> 3][g >> 3][b >> 3]; + } + palbuffer += height; + bgrabuffer += height; + } + } + tex->SetUpdated(); + fixedcolormap = savecolormap; realfixedcolormap = savecm; } diff --git a/src/r_things.cpp b/src/r_things.cpp index a62525d08..6f1fb2700 100644 --- a/src/r_things.cpp +++ b/src/r_things.cpp @@ -58,6 +58,7 @@ #include "r_plane.h" #include "r_segs.h" #include "r_3dfloors.h" +#include "r_draw_rgba.h" #include "v_palette.h" #include "r_data/r_translate.h" #include "r_data/colormaps.h" @@ -251,6 +252,7 @@ bool sprflipvert; void R_DrawMaskedColumn (const BYTE *column, const FTexture::Span *span) { + int pixelsize = r_swtruecolor ? 4 : 1; const fixed_t centeryfrac = FLOAT2FIXED(CenterY); const fixed_t texturemid = FLOAT2FIXED(dc_texturemid); while (span->Length != 0) @@ -321,7 +323,7 @@ void R_DrawMaskedColumn (const BYTE *column, const FTexture::Span *span) } } dc_source = column + top; - dc_dest = ylookup[dc_yl] + dc_x + dc_destorg; + dc_dest = (ylookup[dc_yl] + dc_x) * pixelsize + dc_destorg; dc_count = dc_yh - dc_yl + 1; colfunc (); } @@ -414,7 +416,7 @@ void R_DrawVisSprite (vissprite_t *vis) } fixed_t centeryfrac = FLOAT2FIXED(CenterY); - dc_colormap = vis->Style.colormap; + R_SetColorMapLight(vis->Style.BaseColormap, 0, vis->Style.ColormapNum << FRACBITS); mode = R_SetPatchStyle (vis->Style.RenderStyle, vis->Style.Alpha, vis->Translation, vis->FillColor); @@ -422,7 +424,7 @@ void R_DrawVisSprite (vissprite_t *vis) { // For shaded sprites, R_SetPatchStyle sets a dc_colormap to an alpha table, but // it is the brightest one. 
We need to get back to the proper light level for // this sprite. - dc_colormap += vis->ColormapNum << COLORMAPSHIFT; + R_SetColorMapLight(dc_fcolormap, 0, vis->Style.ColormapNum << FRACBITS); } if (mode != DontDraw) @@ -476,7 +478,7 @@ void R_DrawVisSprite (vissprite_t *vis) while (dc_x < stop4) { - rt_initcols(); + rt_initcols(nullptr); for (int zz = 4; zz; --zz) { pixels = tex->GetColumn (frac >> FRACBITS, &spans); @@ -544,11 +546,11 @@ void R_DrawWallSprite(vissprite_t *spr) rw_lightstep = float((GlobVis / spr->wallc.sz2 - rw_lightleft) / (spr->wallc.sx2 - spr->wallc.sx1)); rw_light = rw_lightleft + (x1 - spr->wallc.sx1) * rw_lightstep; if (fixedlightlev >= 0) - dc_colormap = usecolormap->Maps + fixedlightlev; + R_SetColorMapLight(usecolormap, 0, FIXEDLIGHT2SHADE(fixedlightlev)); else if (fixedcolormap != NULL) - dc_colormap = fixedcolormap; + R_SetColorMapLight(fixedcolormap, 0, 0); else if (!foggy && (spr->renderflags & RF_FULLBRIGHT)) - dc_colormap = usecolormap->Maps; + R_SetColorMapLight(usecolormap, 0, 0); else calclighting = true; @@ -599,7 +601,7 @@ void R_DrawWallSprite(vissprite_t *spr) { if (calclighting) { // calculate lighting - dc_colormap = usecolormap->Maps + (GETPALOOKUP (rw_light, shade) << COLORMAPSHIFT); + R_SetColorMapLight(usecolormap, rw_light, shade); } if (!R_ClipSpriteColumnWithPortals(spr)) R_WallSpriteColumn(R_DrawMaskedColumn); @@ -610,9 +612,9 @@ void R_DrawWallSprite(vissprite_t *spr) { if (calclighting) { // calculate lighting - dc_colormap = usecolormap->Maps + (GETPALOOKUP (rw_light, shade) << COLORMAPSHIFT); + R_SetColorMapLight(usecolormap, rw_light, shade); } - rt_initcols(); + rt_initcols(nullptr); for (int zz = 4; zz; --zz) { if (!R_ClipSpriteColumnWithPortals(spr)) @@ -626,7 +628,7 @@ void R_DrawWallSprite(vissprite_t *spr) { if (calclighting) { // calculate lighting - dc_colormap = usecolormap->Maps + (GETPALOOKUP (rw_light, shade) << COLORMAPSHIFT); + R_SetColorMapLight(usecolormap, rw_light, shade); } if 
(!R_ClipSpriteColumnWithPortals(spr)) R_WallSpriteColumn(R_DrawMaskedColumn); @@ -660,14 +662,14 @@ void R_DrawVisVoxel(vissprite_t *spr, int minslabz, int maxslabz, short *cliptop int flags = 0; // Do setup for blending. - dc_colormap = spr->Style.colormap; + R_SetColorMapLight(spr->Style.BaseColormap, 0, spr->Style.ColormapNum << FRACBITS); mode = R_SetPatchStyle(spr->Style.RenderStyle, spr->Style.Alpha, spr->Translation, spr->FillColor); if (mode == DontDraw) { return; } - if (colfunc == fuzzcolfunc || colfunc == R_FillColumnP) + if (colfunc == fuzzcolfunc || colfunc == R_FillColumn) { flags = DVF_OFFSCREEN | DVF_SPANSONLY; } @@ -686,12 +688,13 @@ void R_DrawVisVoxel(vissprite_t *spr, int minslabz, int maxslabz, short *cliptop // Render the voxel, either directly to the screen or offscreen. R_DrawVoxel(spr->pa.vpos, spr->pa.vang, spr->gpos, spr->Angle, - spr->xscale, FLOAT2FIXED(spr->yscale), spr->voxel, spr->Style.colormap, cliptop, clipbot, + spr->xscale, FLOAT2FIXED(spr->yscale), spr->voxel, spr->Style.BaseColormap, spr->Style.ColormapNum, cliptop, clipbot, minslabz, maxslabz, flags); // Blend the voxel, if that's what we need to do. if ((flags & ~DVF_MIRRORED) != 0) { + int pixelsize = r_swtruecolor ? 
4 : 1; for (int x = 0; x < viewwidth; ++x) { if (!(flags & DVF_SPANSONLY) && (x & 3) == 0) @@ -706,15 +709,12 @@ void R_DrawVisVoxel(vissprite_t *spr, int minslabz, int maxslabz, short *cliptop dc_yl = span->Start; dc_yh = span->Stop - 1; dc_count = span->Stop - span->Start; - dc_dest = ylookup[span->Start] + x + dc_destorg; + dc_dest = (ylookup[span->Start] + x) * pixelsize + dc_destorg; colfunc(); } else { - unsigned int **tspan = &dc_ctspan[x & 3]; - (*tspan)[0] = span->Start; - (*tspan)[1] = span->Stop - 1; - *tspan += 2; + rt_span_coverage(x, span->Start, span->Stop - 1); } } if (!(flags & DVF_SPANSONLY) && (x & 3) == 3) @@ -1073,7 +1073,7 @@ void R_ProjectSprite (AActor *thing, int fakeside, F3DFloor *fakefloor, F3DFloor vis->Style.Alpha = float(thing->Alpha); vis->fakefloor = fakefloor; vis->fakeceiling = fakeceiling; - vis->ColormapNum = 0; + vis->Style.ColormapNum = 0; vis->bInMirror = MirrorFlags & RF_XFLIP; vis->bSplitSprite = false; @@ -1125,7 +1125,8 @@ void R_ProjectSprite (AActor *thing, int fakeside, F3DFloor *fakefloor, F3DFloor // get light level if (fixedcolormap != NULL) { // fixed map - vis->Style.colormap = fixedcolormap; + vis->Style.BaseColormap = fixedcolormap; + vis->Style.ColormapNum = 0; } else { @@ -1135,17 +1136,19 @@ void R_ProjectSprite (AActor *thing, int fakeside, F3DFloor *fakefloor, F3DFloor } if (fixedlightlev >= 0) { - vis->Style.colormap = mybasecolormap->Maps + fixedlightlev; + vis->Style.BaseColormap = mybasecolormap; + vis->Style.ColormapNum = fixedlightlev >> COLORMAPSHIFT; } else if (!foggy && ((renderflags & RF_FULLBRIGHT) || (thing->flags5 & MF5_BRIGHT))) { // full bright - vis->Style.colormap = mybasecolormap->Maps; + vis->Style.BaseColormap = mybasecolormap; + vis->Style.ColormapNum = 0; } else { // diminished light - vis->ColormapNum = GETPALOOKUP( + vis->Style.ColormapNum = GETPALOOKUP( r_SpriteVisibility / MAX(tz, MINZ), spriteshade); - vis->Style.colormap = mybasecolormap->Maps + (vis->ColormapNum << 
COLORMAPSHIFT); + vis->Style.BaseColormap = mybasecolormap; } } } @@ -1214,14 +1217,13 @@ static void R_ProjectWallSprite(AActor *thing, const DVector3 &pos, FTextureID p vis->Style.Alpha = float(thing->Alpha); vis->fakefloor = NULL; vis->fakeceiling = NULL; - vis->ColormapNum = 0; vis->bInMirror = MirrorFlags & RF_XFLIP; vis->pic = pic; vis->bIsVoxel = false; vis->bWallSprite = true; - vis->ColormapNum = GETPALOOKUP( + vis->Style.ColormapNum = GETPALOOKUP( r_SpriteVisibility / MAX(tz, MINZ), spriteshade); - vis->Style.colormap = basecolormap->Maps + (vis->ColormapNum << COLORMAPSHIFT); + vis->Style.BaseColormap = basecolormap; vis->wallc = wallc; } @@ -1401,7 +1403,7 @@ void R_DrawPSprite(DPSprite *pspr, AActor *owner, float bobx, float boby, double vis->yscale = float(pspriteyscale / tex->Scale.Y); vis->Translation = 0; // [RH] Use default colors vis->pic = tex; - vis->ColormapNum = 0; + vis->Style.ColormapNum = 0; if (flip) { @@ -1449,9 +1451,10 @@ void R_DrawPSprite(DPSprite *pspr, AActor *owner, float bobx, float boby, double } } - if (realfixedcolormap != nullptr) + if (realfixedcolormap != nullptr && (!r_swtruecolor || (r_shadercolormaps && screen->Accel2D))) { // fixed color - vis->Style.colormap = realfixedcolormap->Colormap; + vis->Style.BaseColormap = realfixedcolormap; + vis->Style.ColormapNum = 0; } else { @@ -1461,35 +1464,38 @@ void R_DrawPSprite(DPSprite *pspr, AActor *owner, float bobx, float boby, double } if (fixedlightlev >= 0) { - vis->Style.colormap = mybasecolormap->Maps + fixedlightlev; + vis->Style.BaseColormap = mybasecolormap; + vis->Style.ColormapNum = fixedlightlev >> COLORMAPSHIFT; } else if (!foggy && pspr->GetState()->GetFullbright()) { // full bright - vis->Style.colormap = mybasecolormap->Maps; // [RH] use basecolormap + vis->Style.BaseColormap = mybasecolormap; // [RH] use basecolormap + vis->Style.ColormapNum = 0; } else { // local light - vis->Style.colormap = mybasecolormap->Maps + (GETPALOOKUP(0, spriteshade) << 
COLORMAPSHIFT); + vis->Style.BaseColormap = mybasecolormap; + vis->Style.ColormapNum = GETPALOOKUP(0, spriteshade); } } if (camera->Inventory != nullptr) { - lighttable_t *oldcolormap = vis->Style.colormap; - camera->Inventory->AlterWeaponSprite(&vis->Style); - if (vis->Style.colormap != oldcolormap) + BYTE oldcolormapnum = vis->Style.ColormapNum; + FSWColormap *oldcolormap = vis->Style.BaseColormap; + camera->Inventory->AlterWeaponSprite (&vis->Style); + if (vis->Style.BaseColormap != oldcolormap || vis->Style.ColormapNum != oldcolormapnum) { // The colormap has changed. Is it one we can easily identify? // If not, then don't bother trying to identify it for // hardware accelerated drawing. - if (vis->Style.colormap < SpecialColormaps[0].Colormap || - vis->Style.colormap > SpecialColormaps.Last().Colormap) + if (vis->Style.BaseColormap < &SpecialColormaps[0] || + vis->Style.BaseColormap > &SpecialColormaps.Last()) { noaccel = true; } // Has the basecolormap changed? If so, we can't hardware accelerate it, // since we don't know what it is anymore. - else if (vis->Style.colormap < mybasecolormap->Maps || - vis->Style.colormap >= mybasecolormap->Maps + NUMCOLORMAPS * 256) + else if (vis->Style.BaseColormap != mybasecolormap) { noaccel = true; } @@ -1497,13 +1503,13 @@ void R_DrawPSprite(DPSprite *pspr, AActor *owner, float bobx, float boby, double } // If we're drawing with a special colormap, but shaders for them are disabled, do // not accelerate. - if (!r_shadercolormaps && (vis->Style.colormap >= SpecialColormaps[0].Colormap && - vis->Style.colormap <= SpecialColormaps.Last().Colormap)) + if (!r_shadercolormaps && (vis->Style.BaseColormap >= &SpecialColormaps[0] && + vis->Style.BaseColormap <= &SpecialColormaps.Last())) { noaccel = true; } // If drawing with a BOOM colormap, disable acceleration. 
- if (mybasecolormap == &NormalLight && NormalLight.Maps != realcolormaps) + if (mybasecolormap == &NormalLight && NormalLight.Maps != realcolormaps.Maps) { noaccel = true; } @@ -1520,7 +1526,8 @@ void R_DrawPSprite(DPSprite *pspr, AActor *owner, float bobx, float boby, double else { colormap_to_use = basecolormap; - vis->Style.colormap = basecolormap->Maps; + vis->Style.BaseColormap = basecolormap; + vis->Style.ColormapNum = 0; vis->Style.RenderStyle = STYLE_Normal; } @@ -1691,18 +1698,16 @@ void R_DrawRemainingPlayerSprites() FColormapStyle colormapstyle; bool usecolormapstyle = false; - if (vis->Style.colormap >= SpecialColormaps[0].Colormap && - vis->Style.colormap < SpecialColormaps[SpecialColormaps.Size()].Colormap) + if (vis->Style.BaseColormap >= &SpecialColormaps[0] && + vis->Style.BaseColormap < &SpecialColormaps[SpecialColormaps.Size()]) { - // Yuck! There needs to be a better way to store colormaps in the vissprite... :( - ptrdiff_t specialmap = (vis->Style.colormap - SpecialColormaps[0].Colormap) / sizeof(FSpecialColormap); - special = &SpecialColormaps[specialmap]; + special = static_cast(vis->Style.BaseColormap); } else if (colormap->Color == PalEntry(255,255,255) && colormap->Desaturate == 0) { overlay = colormap->Fade; - overlay.a = BYTE(((vis->Style.colormap - colormap->Maps) >> 8) * 255 / NUMCOLORMAPS); + overlay.a = BYTE(vis->Style.ColormapNum * 255 / NUMCOLORMAPS); } else { @@ -1710,7 +1715,7 @@ void R_DrawRemainingPlayerSprites() colormapstyle.Color = colormap->Color; colormapstyle.Fade = colormap->Fade; colormapstyle.Desaturate = colormap->Desaturate; - colormapstyle.FadeLevel = ((vis->Style.colormap - colormap->Maps) >> 8) / float(NUMCOLORMAPS); + colormapstyle.FadeLevel = vis->Style.ColormapNum / float(NUMCOLORMAPS); } screen->DrawTexture(vis->pic, viewwindowx + vispsprites[i].x1, @@ -1955,7 +1960,8 @@ void R_DrawSprite (vissprite_t *spr) int r1, r2; short topclip, botclip; short *clip1, *clip2; - lighttable_t *colormap = 
spr->Style.colormap; + FSWColormap *colormap = spr->Style.BaseColormap; + int colormapnum = spr->Style.ColormapNum; F3DFloor *rover; FDynamicColormap *mybasecolormap; @@ -2052,17 +2058,19 @@ void R_DrawSprite (vissprite_t *spr) } if (fixedlightlev >= 0) { - spr->Style.colormap = mybasecolormap->Maps + fixedlightlev; + spr->Style.BaseColormap = mybasecolormap; + spr->Style.ColormapNum = fixedlightlev >> COLORMAPSHIFT; } else if (!foggy && (spr->renderflags & RF_FULLBRIGHT)) { // full bright - spr->Style.colormap = mybasecolormap->Maps; + spr->Style.BaseColormap = mybasecolormap; + spr->Style.ColormapNum = 0; } else { // diminished light spriteshade = LIGHT2SHADE(sec->lightlevel + r_actualextralight); - spr->Style.colormap = mybasecolormap->Maps + (GETPALOOKUP ( - r_SpriteVisibility / MAX(MINZ, (double)spr->depth), spriteshade) << COLORMAPSHIFT); + spr->Style.BaseColormap = mybasecolormap; + spr->Style.ColormapNum = GETPALOOKUP(r_SpriteVisibility / MAX(MINZ, (double)spr->depth), spriteshade); } } } @@ -2210,7 +2218,8 @@ void R_DrawSprite (vissprite_t *spr) if (topclip >= botclip) { - spr->Style.colormap = colormap; + spr->Style.BaseColormap = colormap; + spr->Style.ColormapNum = colormapnum; return; } @@ -2340,7 +2349,8 @@ void R_DrawSprite (vissprite_t *spr) } if (i == x2) { - spr->Style.colormap = colormap; + spr->Style.BaseColormap = colormap; + spr->Style.ColormapNum = colormapnum; return; } } @@ -2358,7 +2368,8 @@ void R_DrawSprite (vissprite_t *spr) int maxvoxely = spr->gzb > hzb ? 
INT_MAX : xs_RoundToInt((spr->gzt - hzb) / spr->yscale); R_DrawVisVoxel(spr, minvoxely, maxvoxely, cliptop, clipbot); } - spr->Style.colormap = colormap; + spr->Style.BaseColormap = colormap; + spr->Style.ColormapNum = colormapnum; } // kg3D: @@ -2475,7 +2486,7 @@ void R_ProjectParticle (particle_t *particle, const sector_t *sector, int shade, int x1, x2, y1, y2; vissprite_t* vis; sector_t* heightsec = NULL; - BYTE* map; + FSWColormap* map; // [ZZ] Particle not visible through the portal plane if (CurrentPortal && !!P_PointOnLineSide(particle->Pos, CurrentPortal->dst)) @@ -2548,7 +2559,7 @@ void R_ProjectParticle (particle_t *particle, const sector_t *sector, int shade, botplane = &heightsec->ceilingplane; toppic = sector->GetTexture(sector_t::ceiling); botpic = heightsec->GetTexture(sector_t::ceiling); - map = heightsec->ColorMap->Maps; + map = heightsec->ColorMap; } else if (fakeside == FAKED_BelowFloor) { @@ -2556,7 +2567,7 @@ void R_ProjectParticle (particle_t *particle, const sector_t *sector, int shade, botplane = §or->floorplane; toppic = heightsec->GetTexture(sector_t::floor); botpic = sector->GetTexture(sector_t::floor); - map = heightsec->ColorMap->Maps; + map = heightsec->ColorMap; } else { @@ -2564,7 +2575,7 @@ void R_ProjectParticle (particle_t *particle, const sector_t *sector, int shade, botplane = &heightsec->floorplane; toppic = heightsec->GetTexture(sector_t::ceiling); botpic = heightsec->GetTexture(sector_t::floor); - map = sector->ColorMap->Maps; + map = sector->ColorMap; } } else @@ -2573,7 +2584,7 @@ void R_ProjectParticle (particle_t *particle, const sector_t *sector, int shade, botplane = §or->floorplane; toppic = sector->GetTexture(sector_t::ceiling); botpic = sector->GetTexture(sector_t::floor); - map = sector->ColorMap->Maps; + map = sector->ColorMap; } if (botpic != skyflatnum && particle->Pos.Z < botplane->ZatPoint (particle->Pos)) @@ -2602,25 +2613,28 @@ void R_ProjectParticle (particle_t *particle, const sector_t *sector, int shade, 
vis->renderflags = particle->trans; vis->FakeFlatStat = fakeside; vis->floorclip = 0; - vis->ColormapNum = 0; + vis->Style.ColormapNum = 0; if (fixedlightlev >= 0) { - vis->Style.colormap = map + fixedlightlev; + vis->Style.BaseColormap = map; + vis->Style.ColormapNum = fixedlightlev >> COLORMAPSHIFT; } else if (fixedcolormap) { - vis->Style.colormap = fixedcolormap; + vis->Style.BaseColormap = fixedcolormap; + vis->Style.ColormapNum = 0; } else if (particle->bright) { - vis->Style.colormap = map; + vis->Style.BaseColormap = map; + vis->Style.ColormapNum = 0; } else { // Particles are slightly more visible than regular sprites. - vis->ColormapNum = GETPALOOKUP(tiz * r_SpriteVisibility * 0.5, shade); - vis->Style.colormap = map + (vis->ColormapNum << COLORMAPSHIFT); + vis->Style.ColormapNum = GETPALOOKUP(tiz * r_SpriteVisibility * 0.5, shade); + vis->Style.BaseColormap = map; } } @@ -2649,13 +2663,13 @@ static void R_DrawMaskedSegsBehindParticle (const vissprite_t *vis) } } -void R_DrawParticle (vissprite_t *vis) +void R_DrawParticle_C (vissprite_t *vis) { DWORD *bg2rgb; int spacing; BYTE *dest; DWORD fg; - BYTE color = vis->Style.colormap[vis->startfrac]; + BYTE color = vis->Style.BaseColormap->Maps[(vis->Style.ColormapNum << COLORMAPSHIFT) + vis->startfrac]; int yl = vis->y1; int ycount = vis->y2 - yl + 1; int x1 = vis->x1; @@ -2714,12 +2728,64 @@ void R_DrawParticle (vissprite_t *vis) } } +void R_DrawParticle_rgba(vissprite_t *vis) +{ + int spacing; + uint32_t *dest; + BYTE color = vis->Style.BaseColormap->Maps[vis->startfrac]; + int yl = vis->y1; + int ycount = vis->y2 - yl + 1; + int x1 = vis->x1; + int countbase = vis->x2 - x1; + + R_DrawMaskedSegsBehindParticle(vis); + + DrawerCommandQueue::WaitForWorkers(); + + uint32_t fg = LightBgra::shade_pal_index_simple(color, LightBgra::calc_light_multiplier(LIGHTSCALE(0, vis->Style.ColormapNum << FRACBITS))); + uint32_t fg_red = (fg >> 16) & 0xff; + uint32_t fg_green = (fg >> 8) & 0xff; + uint32_t fg_blue = fg & 0xff; 
+ + // vis->renderflags holds translucency level (0-255) + fixed_t fglevel = ((vis->renderflags + 1) << 8) & ~0x3ff; + uint32_t alpha = fglevel * 256 / FRACUNIT; + uint32_t inv_alpha = 256 - alpha; + + fg_red *= alpha; + fg_green *= alpha; + fg_blue *= alpha; + + spacing = RenderTarget->GetPitch(); + + for (int x = x1; x < (x1 + countbase); x++) + { + dc_x = x; + if (R_ClipSpriteColumnWithPortals(vis)) + continue; + dest = ylookup[yl] + x + (uint32_t*)dc_destorg; + for (int y = 0; y < ycount; y++) + { + uint32_t bg_red = (*dest >> 16) & 0xff; + uint32_t bg_green = (*dest >> 8) & 0xff; + uint32_t bg_blue = (*dest) & 0xff; + + uint32_t red = (fg_red + bg_red * inv_alpha) / 256; + uint32_t green = (fg_green + bg_green * inv_alpha) / 256; + uint32_t blue = (fg_blue + bg_blue * inv_alpha) / 256; + + *dest = 0xff000000 | (red << 16) | (green << 8) | blue; + dest += spacing; + } + } +} + extern double BaseYaspectMul;; void R_DrawVoxel(const FVector3 &globalpos, FAngle viewangle, const FVector3 &dasprpos, DAngle dasprang, fixed_t daxscale, fixed_t dayscale, FVoxel *voxobj, - lighttable_t *colormap, short *daumost, short *dadmost, int minslabz, int maxslabz, int flags) + FSWColormap *colormap, int colormapnum, short *daumost, short *dadmost, int minslabz, int maxslabz, int flags) { int i, j, k, x, y, syoff, ggxstart, ggystart, nxoff; fixed_t cosang, sinang, sprcosang, sprsinang; @@ -2761,7 +2827,9 @@ void R_DrawVoxel(const FVector3 &globalpos, FAngle viewangle, sprcosang = FLOAT2FIXED(dasprang.Cos()) >> 2; sprsinang = FLOAT2FIXED(-dasprang.Sin()) >> 2; - R_SetupDrawSlab(colormap); + R_SetupDrawSlab(colormap, 0.0f, colormapnum << FRACBITS); + + int pixelsize = r_swtruecolor ? 4 : 1; // Select mip level i = abs(DMulScale6(dasprx - globalposx, cosang, daspry - globalposy, sinang)); @@ -3016,7 +3084,7 @@ void R_DrawVoxel(const FVector3 &globalpos, FAngle viewangle, if (!(flags & DVF_OFFSCREEN)) { // Draw directly to the screen. 
- R_DrawSlab(xxr - xxl, yplc[xxl], z2 - z1, yinc, col, ylookup[z1] + lxt + xxl + dc_destorg); + R_DrawSlab(xxr - xxl, yplc[xxl], z2 - z1, yinc, col, (ylookup[z1] + lxt + xxl) * pixelsize + dc_destorg); } else { @@ -3247,12 +3315,12 @@ void R_CheckOffscreenBuffer(int width, int height, bool spansonly) { if (OffscreenColorBuffer == NULL) { - OffscreenColorBuffer = new BYTE[width * height]; + OffscreenColorBuffer = new BYTE[width * height * 4]; } else if (OffscreenBufferWidth != width || OffscreenBufferHeight != height) { delete[] OffscreenColorBuffer; - OffscreenColorBuffer = new BYTE[width * height]; + OffscreenColorBuffer = new BYTE[width * height * 4]; } } OffscreenBufferWidth = width; diff --git a/src/r_things.h b/src/r_things.h index 29e69d3a5..cbe34015f 100644 --- a/src/r_things.h +++ b/src/r_things.h @@ -86,7 +86,6 @@ struct vissprite_t BYTE bSplitSprite:1; // [RH] Sprite was split by a drawseg BYTE bInMirror:1; // [RH] Sprite is "inside" a mirror BYTE FakeFlatStat; // [RH] which side of fake/floor ceiling sprite is on - BYTE ColormapNum; // Which colormap is rendered (needed for shaded drawer) short renderflags; DWORD Translation; // [RH] for color translation visstyle_t Style; @@ -97,7 +96,10 @@ struct vissprite_t struct particle_t; -void R_DrawParticle (vissprite_t *); +extern void(*R_DrawParticle)(vissprite_t *); +void R_DrawParticle_C (vissprite_t *); +void R_DrawParticle_rgba (vissprite_t *); + void R_ProjectParticle (particle_t *, const sector_t *sector, int shade, int fakeside); extern int MaxVisSprites; @@ -142,7 +144,7 @@ enum { DVF_OFFSCREEN = 1, DVF_SPANSONLY = 2, DVF_MIRRORED = 4 }; void R_DrawVoxel(const FVector3 &viewpos, FAngle viewangle, const FVector3 &sprpos, DAngle dasprang, fixed_t daxscale, fixed_t dayscale, struct FVoxel *voxobj, - lighttable_t *colormap, short *daumost, short *dadmost, int minslabz, int maxslabz, int flags); + FSWColormap *colormap, int colormapnum, short *daumost, short *dadmost, int minslabz, int maxslabz, int flags); 
void R_ClipVisSprite (vissprite_t *vis, int xl, int xh); diff --git a/src/r_utility.cpp b/src/r_utility.cpp index b009912f0..972812821 100644 --- a/src/r_utility.cpp +++ b/src/r_utility.cpp @@ -896,11 +896,11 @@ void R_SetupFrame (AActor *actor) BaseBlendG = GPART(newblend); BaseBlendB = BPART(newblend); BaseBlendA = APART(newblend) / 255.f; - NormalLight.Maps = realcolormaps; + NormalLight.Maps = realcolormaps.Maps; } else { - NormalLight.Maps = realcolormaps + NUMCOLORMAPS*256*newblend; + NormalLight.Maps = realcolormaps.Maps + NUMCOLORMAPS*256*newblend; BaseBlendR = BaseBlendG = BaseBlendB = 0; BaseBlendA = 0.f; } diff --git a/src/textures/automaptexture.cpp b/src/textures/automaptexture.cpp index 67d68b9fe..9aac379ef 100644 --- a/src/textures/automaptexture.cpp +++ b/src/textures/automaptexture.cpp @@ -122,6 +122,7 @@ void FAutomapTexture::Unload () delete[] Pixels; Pixels = NULL; } + FTexture::Unload(); } //========================================================================== diff --git a/src/textures/buildtexture.cpp b/src/textures/buildtexture.cpp index bfcc6333d..1155dacc4 100644 --- a/src/textures/buildtexture.cpp +++ b/src/textures/buildtexture.cpp @@ -56,7 +56,6 @@ public: const BYTE *GetColumn (unsigned int column, const Span **spans_out); const BYTE *GetPixels (); - void Unload (); protected: const BYTE *Pixels; @@ -103,17 +102,6 @@ FBuildTexture::~FBuildTexture () // //========================================================================== -void FBuildTexture::Unload () -{ - // Nothing to do, since the pixels are accessed from memory-mapped files directly -} - -//========================================================================== -// -// -// -//========================================================================== - const BYTE *FBuildTexture::GetPixels () { return Pixels; diff --git a/src/textures/canvastexture.cpp b/src/textures/canvastexture.cpp index 062c3af1d..109d927ab 100644 --- a/src/textures/canvastexture.cpp +++ 
b/src/textures/canvastexture.cpp @@ -53,7 +53,6 @@ FCanvasTexture::FCanvasTexture (const char *name, int width, int height) DummySpans[1].TopOffset = 0; DummySpans[1].Length = 0; UseType = TEX_Wall; - Canvas = NULL; bNeedsUpdate = true; bDidUpdate = false; bHasCanvas = true; @@ -101,11 +100,22 @@ const BYTE *FCanvasTexture::GetPixels () return Pixels; } +const uint32_t *FCanvasTexture::GetPixelsBgra() +{ + bNeedsUpdate = true; + if (CanvasBgra == NULL) + { + MakeTextureBgra(); + } + return PixelsBgra; +} + void FCanvasTexture::MakeTexture () { - Canvas = new DSimpleCanvas (Width, Height); + Canvas = new DSimpleCanvas (Width, Height, false); Canvas->Lock (); GC::AddSoftRoot(Canvas); + if (Width != Height || Width != Canvas->GetPitch()) { Pixels = new BYTE[Width*Height]; @@ -113,29 +123,68 @@ void FCanvasTexture::MakeTexture () } else { - Pixels = Canvas->GetBuffer(); + Pixels = (BYTE*)Canvas->GetBuffer(); bPixelsAllocated = false; } + // Draw a special "unrendered" initial texture into the buffer. memset (Pixels, 0, Width*Height/2); memset (Pixels+Width*Height/2, 255, Width*Height/2); } +void FCanvasTexture::MakeTextureBgra() +{ + CanvasBgra = new DSimpleCanvas(Width, Height, true); + CanvasBgra->Lock(); + GC::AddSoftRoot(CanvasBgra); + + if (Width != Height || Width != CanvasBgra->GetPitch()) + { + PixelsBgra = new uint32_t[Width*Height]; + bPixelsAllocatedBgra = true; + } + else + { + PixelsBgra = (uint32_t*)CanvasBgra->GetBuffer(); + bPixelsAllocatedBgra = false; + } + + // Draw a special "unrendered" initial texture into the buffer. 
+ memset(PixelsBgra, 0, Width*Height / 2 * 4); + memset(PixelsBgra + Width*Height / 2, 255, Width*Height / 2 * 4); +} + void FCanvasTexture::Unload () { if (bPixelsAllocated) { - if (Pixels != NULL) delete [] Pixels; + if (Pixels != NULL) delete[] Pixels; bPixelsAllocated = false; Pixels = NULL; } + if (bPixelsAllocatedBgra) + { + if (PixelsBgra != NULL) delete[] PixelsBgra; + bPixelsAllocatedBgra = false; + PixelsBgra = NULL; + } + if (Canvas != NULL) { GC::DelSoftRoot(Canvas); Canvas->Destroy(); Canvas = NULL; } + + if (CanvasBgra != NULL) + { + GC::DelSoftRoot(CanvasBgra); + CanvasBgra->Destroy(); + CanvasBgra = NULL; + } + + FTexture::Unload(); } bool FCanvasTexture::CheckModified () diff --git a/src/textures/ddstexture.cpp b/src/textures/ddstexture.cpp index 31e748022..fb4de34c5 100644 --- a/src/textures/ddstexture.cpp +++ b/src/textures/ddstexture.cpp @@ -401,6 +401,7 @@ void FDDSTexture::Unload () delete[] Pixels; Pixels = NULL; } + FTexture::Unload(); } //========================================================================== diff --git a/src/textures/flattexture.cpp b/src/textures/flattexture.cpp index 840d53aaf..08e0d1221 100644 --- a/src/textures/flattexture.cpp +++ b/src/textures/flattexture.cpp @@ -138,6 +138,7 @@ void FFlatTexture::Unload () delete[] Pixels; Pixels = NULL; } + FTexture::Unload(); } //========================================================================== diff --git a/src/textures/imgztexture.cpp b/src/textures/imgztexture.cpp index 1c262d707..04932d4bf 100644 --- a/src/textures/imgztexture.cpp +++ b/src/textures/imgztexture.cpp @@ -142,6 +142,7 @@ void FIMGZTexture::Unload () delete[] Pixels; Pixels = NULL; } + FTexture::Unload(); } //========================================================================== diff --git a/src/textures/jpegtexture.cpp b/src/textures/jpegtexture.cpp index 225396598..fc629b37e 100644 --- a/src/textures/jpegtexture.cpp +++ b/src/textures/jpegtexture.cpp @@ -295,11 +295,9 @@ 
FJPEGTexture::~FJPEGTexture () void FJPEGTexture::Unload () { - if (Pixels != NULL) - { - delete[] Pixels; - Pixels = NULL; - } + delete[] Pixels; + Pixels = NULL; + FTexture::Unload(); } //========================================================================== diff --git a/src/textures/multipatchtexture.cpp b/src/textures/multipatchtexture.cpp index 41ba5f0f2..5d9aeb433 100644 --- a/src/textures/multipatchtexture.cpp +++ b/src/textures/multipatchtexture.cpp @@ -362,6 +362,7 @@ void FMultiPatchTexture::Unload () delete[] Pixels; Pixels = NULL; } + FTexture::Unload(); } //========================================================================== diff --git a/src/textures/patchtexture.cpp b/src/textures/patchtexture.cpp index 423ce4deb..8388515c0 100644 --- a/src/textures/patchtexture.cpp +++ b/src/textures/patchtexture.cpp @@ -184,6 +184,7 @@ void FPatchTexture::Unload () delete[] Pixels; Pixels = NULL; } + FTexture::Unload(); } //========================================================================== diff --git a/src/textures/pcxtexture.cpp b/src/textures/pcxtexture.cpp index 0ec5d2933..42a13b85a 100644 --- a/src/textures/pcxtexture.cpp +++ b/src/textures/pcxtexture.cpp @@ -191,6 +191,7 @@ void FPCXTexture::Unload () delete[] Pixels; Pixels = NULL; } + FTexture::Unload(); } //========================================================================== diff --git a/src/textures/pngtexture.cpp b/src/textures/pngtexture.cpp index d34b00607..f904d3408 100644 --- a/src/textures/pngtexture.cpp +++ b/src/textures/pngtexture.cpp @@ -372,11 +372,9 @@ FPNGTexture::~FPNGTexture () void FPNGTexture::Unload () { - if (Pixels != NULL) - { - delete[] Pixels; - Pixels = NULL; - } + delete[] Pixels; + Pixels = NULL; + FTexture::Unload(); } //========================================================================== @@ -449,6 +447,7 @@ const BYTE *FPNGTexture::GetPixels () return Pixels; } + //========================================================================== // // diff 
--git a/src/textures/rawpagetexture.cpp b/src/textures/rawpagetexture.cpp index 1402f8844..69313fd1c 100644 --- a/src/textures/rawpagetexture.cpp +++ b/src/textures/rawpagetexture.cpp @@ -206,6 +206,7 @@ void FRawPageTexture::Unload () delete[] Pixels; Pixels = NULL; } + FTexture::Unload(); } //========================================================================== diff --git a/src/textures/texture.cpp b/src/textures/texture.cpp index 7b90c295f..12e9d8549 100644 --- a/src/textures/texture.cpp +++ b/src/textures/texture.cpp @@ -45,6 +45,7 @@ #include "v_video.h" #include "m_fixed.h" #include "textures/textures.h" +#include "v_palette.h" typedef bool (*CheckFunc)(FileReader & file); typedef FTexture * (*CreateFunc)(FileReader & file, int lumpnum); @@ -175,6 +176,37 @@ FTexture::~FTexture () KillNative(); } +void FTexture::Unload() +{ + PixelsBgra = std::vector(); +} + +const uint32_t *FTexture::GetColumnBgra(unsigned int column, const Span **spans_out) +{ + const uint32_t *pixels = GetPixelsBgra(); + + column %= Width; + + if (spans_out != nullptr) + GetColumn(column, spans_out); + return pixels + column * Height; +} + +const uint32_t *FTexture::GetPixelsBgra() +{ + if (PixelsBgra.empty() || CheckModified()) + { + if (!GetColumn(0, nullptr)) + return nullptr; + + FBitmap bitmap; + bitmap.Create(GetWidth(), GetHeight()); + CopyTrueColorPixels(&bitmap, 0, 0); + GenerateBgraFromBitmap(bitmap); + } + return PixelsBgra.data(); +} + bool FTexture::CheckModified () { return false; @@ -318,6 +350,210 @@ void FTexture::FreeSpans (Span **spans) const M_Free (spans); } +void FTexture::GenerateBgraFromBitmap(const FBitmap &bitmap) +{ + CreatePixelsBgraWithMipmaps(); + + // Transpose + const uint32_t *src = (const uint32_t *)bitmap.GetPixels(); + uint32_t *dest = PixelsBgra.data(); + for (int x = 0; x < Width; x++) + { + for (int y = 0; y < Height; y++) + { + dest[y + x * Height] = src[x + y * Width]; + } + } + + GenerateBgraMipmaps(); +} + +void 
FTexture::CreatePixelsBgraWithMipmaps() +{ + int levels = MipmapLevels(); + int buffersize = 0; + for (int i = 0; i < levels; i++) + { + int w = MAX(Width >> i, 1); + int h = MAX(Height >> i, 1); + buffersize += w * h; + } + PixelsBgra.resize(buffersize, 0xffff0000); +} + +int FTexture::MipmapLevels() const +{ + int widthbits = 0; + while ((Width >> widthbits) != 0) widthbits++; + + int heightbits = 0; + while ((Height >> heightbits) != 0) heightbits++; + + return MAX(widthbits, heightbits); +} + +void FTexture::GenerateBgraMipmaps() +{ + struct Color4f + { + float a, r, g, b; + Color4f operator*(const Color4f &v) const { return Color4f{ a * v.a, r * v.r, g * v.g, b * v.b }; } + Color4f operator/(const Color4f &v) const { return Color4f{ a / v.a, r / v.r, g / v.g, b / v.b }; } + Color4f operator+(const Color4f &v) const { return Color4f{ a + v.a, r + v.r, g + v.g, b + v.b }; } + Color4f operator-(const Color4f &v) const { return Color4f{ a - v.a, r - v.r, g - v.g, b - v.b }; } + Color4f operator*(float s) const { return Color4f{ a * s, r * s, g * s, b * s }; } + Color4f operator/(float s) const { return Color4f{ a / s, r / s, g / s, b / s }; } + Color4f operator+(float s) const { return Color4f{ a + s, r + s, g + s, b + s }; } + Color4f operator-(float s) const { return Color4f{ a - s, r - s, g - s, b - s }; } + }; + + int levels = MipmapLevels(); + std::vector image(PixelsBgra.size()); + + // Convert to normalized linear colorspace + { + for (int x = 0; x < Width; x++) + { + for (int y = 0; y < Height; y++) + { + uint32_t c8 = PixelsBgra[x * Height + y]; + Color4f c; + c.a = powf(APART(c8) * (1.0f / 255.0f), 2.2f); + c.r = powf(RPART(c8) * (1.0f / 255.0f), 2.2f); + c.g = powf(GPART(c8) * (1.0f / 255.0f), 2.2f); + c.b = powf(BPART(c8) * (1.0f / 255.0f), 2.2f); + image[x * Height + y] = c; + } + } + } + + // Generate mipmaps + { + std::vector smoothed(Width * Height); + Color4f *src = image.data(); + Color4f *dest = src + Width * Height; + for (int i = 1; i < 
levels; i++) + { + int srcw = MAX(Width >> (i - 1), 1); + int srch = MAX(Height >> (i - 1), 1); + int w = MAX(Width >> i, 1); + int h = MAX(Height >> i, 1); + + // Downscale: average each 2x2 source block into one destination texel + for (int x = 0; x < w; x++) + { + int sx0 = x * 2; + int sx1 = MIN((x + 1) * 2, srcw - 1); + for (int y = 0; y < h; y++) + { + int sy0 = y * 2; + int sy1 = MIN((y + 1) * 2, srch - 1); + + Color4f src00 = src[sy0 + sx0 * srch]; + Color4f src01 = src[sy1 + sx0 * srch]; + Color4f src10 = src[sy0 + sx1 * srch]; + Color4f src11 = src[sy1 + sx1 * srch]; + Color4f c = (src00 + src01 + src10 + src11) * 0.25f; + + dest[y + x * h] = c; + } + } + + // Sharpen filter with a 3x3 kernel: + for (int x = 0; x < w; x++) + { + for (int y = 0; y < h; y++) + { + Color4f c = { 0.0f, 0.0f, 0.0f, 0.0f }; + for (int kx = -1; kx < 2; kx++) + { + for (int ky = -1; ky < 2; ky++) + { + int a = y + ky; + int b = x + kx; + if (a < 0) a = h - 1; + if (a == h) a = 0; + if (b < 0) b = w - 1; + if (b == w) b = 0; + c = c + dest[a + b * h]; + } + } + c = c * (1.0f / 9.0f); + smoothed[y + x * h] = c; + } + } + float k = 0.04f; + for (int j = 0; j < w * h; j++) + dest[j] = dest[j] + (dest[j] - smoothed[j]) * k; + + src = dest; + dest += w * h; + } + } + + // Convert to bgra8 sRGB colorspace + { + Color4f *src = image.data() + Width * Height; + uint32_t *dest = PixelsBgra.data() + Width * Height; + for (int i = 1; i < levels; i++) + { + int w = MAX(Width >> i, 1); + int h = MAX(Height >> i, 1); + for (int j = 0; j < w * h; j++) + { + uint32_t a = (uint32_t)clamp(powf(src[j].a, 1.0f / 2.2f) * 255.0f + 0.5f, 0.0f, 255.0f); + uint32_t r = (uint32_t)clamp(powf(src[j].r, 1.0f / 2.2f) * 255.0f + 0.5f, 0.0f, 255.0f); + uint32_t g = (uint32_t)clamp(powf(src[j].g, 1.0f / 2.2f) * 255.0f + 0.5f, 0.0f, 255.0f); + uint32_t b = (uint32_t)clamp(powf(src[j].b, 1.0f / 2.2f) * 255.0f + 0.5f, 0.0f, 255.0f); + dest[j] = (a << 24) | (r << 16) | (g << 8) | b; + } + src += w * h; + dest += w * h; + } + } +} + +void 
FTexture::GenerateBgraMipmapsFast() +{ + uint32_t *src = PixelsBgra.data(); + uint32_t *dest = src + Width * Height; + int levels = MipmapLevels(); + for (int i = 1; i < levels; i++) + { + int srcw = MAX(Width >> (i - 1), 1); + int srch = MAX(Height >> (i - 1), 1); + int w = MAX(Width >> i, 1); + int h = MAX(Height >> i, 1); + + for (int x = 0; x < w; x++) + { + int sx0 = x * 2; + int sx1 = MIN((x + 1) * 2, srcw - 1); + + for (int y = 0; y < h; y++) + { + int sy0 = y * 2; + int sy1 = MIN((y + 1) * 2, srch - 1); + + uint32_t src00 = src[sy0 + sx0 * srch]; + uint32_t src01 = src[sy1 + sx0 * srch]; + uint32_t src10 = src[sy0 + sx1 * srch]; + uint32_t src11 = src[sy1 + sx1 * srch]; + + uint32_t alpha = (APART(src00) + APART(src01) + APART(src10) + APART(src11) + 2) / 4; + uint32_t red = (RPART(src00) + RPART(src01) + RPART(src10) + RPART(src11) + 2) / 4; + uint32_t green = (GPART(src00) + GPART(src01) + GPART(src10) + GPART(src11) + 2) / 4; + uint32_t blue = (BPART(src00) + BPART(src01) + BPART(src10) + BPART(src11) + 2) / 4; + + dest[y + x * h] = (alpha << 24) | (red << 16) | (green << 8) | blue; + } + } + + src = dest; + dest += w * h; + } +} + void FTexture::CopyToBlock (BYTE *dest, int dwidth, int dheight, int xpos, int ypos, int rotate, const BYTE *translation) { const BYTE *pixels = GetPixels(); @@ -384,6 +620,29 @@ void FTexture::FlipSquareBlock (BYTE *block, int x, int y) } } +void FTexture::FlipSquareBlockBgra(uint32_t *block, int x, int y) +{ + int i, j; + + if (x != y) return; + + for (i = 0; i < x; ++i) + { + uint32_t *corner = block + x*i + i; + int count = x - i; + if (count & 1) + { + count--; + swapvalues(corner[count], corner[count*x]); + } + for (j = 0; j < count; j += 2) + { + swapvalues(corner[j], corner[j*x]); + swapvalues(corner[j + 1], corner[(j + 1)*x]); + } + } +} + void FTexture::FlipSquareBlockRemap (BYTE *block, int x, int y, const BYTE *remap) { int i, j; @@ -427,6 +686,19 @@ void FTexture::FlipNonSquareBlock (BYTE *dst, const BYTE *src, 
int x, int y, int } } +void FTexture::FlipNonSquareBlockBgra(uint32_t *dst, const uint32_t *src, int x, int y, int srcpitch) +{ + int i, j; + + for (i = 0; i < x; ++i) + { + for (j = 0; j < y; ++j) + { + dst[i*y + j] = src[i + j*srcpitch]; + } + } +} + void FTexture::FlipNonSquareBlockRemap (BYTE *dst, const BYTE *src, int x, int y, int srcpitch, const BYTE *remap) { int i, j; @@ -580,10 +852,6 @@ FDummyTexture::FDummyTexture () UseType = TEX_Null; } -void FDummyTexture::Unload () -{ -} - void FDummyTexture::SetSize (int width, int height) { Width = width; diff --git a/src/textures/textures.h b/src/textures/textures.h index 477c39ecc..58792fc41 100644 --- a/src/textures/textures.h +++ b/src/textures/textures.h @@ -3,6 +3,7 @@ #include "doomtype.h" #include "vectors.h" +#include struct FloatRect { @@ -195,9 +196,18 @@ public: // Returns a single column of the texture virtual const BYTE *GetColumn (unsigned int column, const Span **spans_out) = 0; + // Returns a single column of the texture, in BGRA8 format + virtual const uint32_t *GetColumnBgra(unsigned int column, const Span **spans_out); + // Returns the whole texture, stored in column-major order virtual const BYTE *GetPixels () = 0; - + + // Returns the whole texture, stored in column-major order, in BGRA8 format + virtual const uint32_t *GetPixelsBgra(); + + // Returns true if GetPixelsBgra includes mipmaps + virtual bool Mipmapped() { return true; } + virtual int CopyTrueColorPixels(FBitmap *bmp, int x, int y, int rotate=0, FCopyInfo *inf = NULL); int CopyTrueColorTranslated(FBitmap *bmp, int x, int y, int rotate, FRemapTable *remap, FCopyInfo *inf = NULL); virtual bool UseBasePalette(); @@ -205,7 +215,7 @@ public: virtual FTexture *GetRedirect(bool wantwarped); virtual FTexture *GetRawTexture(); // for FMultiPatchTexture to override - virtual void Unload () = 0; + virtual void Unload (); // Returns the native pixel format for this image virtual FTextureFormat GetFormat(); @@ -285,10 +295,20 @@ protected: 
gl_info.areas = NULL; } + std::vector PixelsBgra; + + void GenerateBgraFromBitmap(const FBitmap &bitmap); + void CreatePixelsBgraWithMipmaps(); + void GenerateBgraMipmaps(); + void GenerateBgraMipmapsFast(); + int MipmapLevels() const; + public: static void FlipSquareBlock (BYTE *block, int x, int y); + static void FlipSquareBlockBgra (uint32_t *block, int x, int y); static void FlipSquareBlockRemap (BYTE *block, int x, int y, const BYTE *remap); static void FlipNonSquareBlock (BYTE *blockto, const BYTE *blockfrom, int x, int y, int srcpitch); + static void FlipNonSquareBlockBgra (uint32_t *blockto, const uint32_t *blockfrom, int x, int y, int srcpitch); static void FlipNonSquareBlockRemap (BYTE *blockto, const BYTE *blockfrom, int x, int y, int srcpitch, const BYTE *remap); friend class D3DTex; @@ -520,7 +540,6 @@ public: FDummyTexture (); const BYTE *GetColumn (unsigned int column, const Span **spans_out); const BYTE *GetPixels (); - void Unload (); void SetSize (int width, int height); }; @@ -534,6 +553,7 @@ public: virtual int CopyTrueColorPixels(FBitmap *bmp, int x, int y, int rotate=0, FCopyInfo *inf = NULL); const BYTE *GetColumn (unsigned int column, const Span **spans_out); const BYTE *GetPixels (); + const uint32_t *GetPixelsBgra() override; void Unload (); bool CheckModified (); @@ -567,21 +587,28 @@ public: const BYTE *GetColumn (unsigned int column, const Span **spans_out); const BYTE *GetPixels (); + const uint32_t *GetPixelsBgra() override; void Unload (); bool CheckModified (); void NeedUpdate() { bNeedsUpdate=true; } void SetUpdated() { bNeedsUpdate = false; bDidUpdate = true; bFirstUpdate = false; } DSimpleCanvas *GetCanvas() { return Canvas; } + DSimpleCanvas *GetCanvasBgra() { return CanvasBgra; } + bool Mipmapped() override { return false; } void MakeTexture (); + void MakeTextureBgra (); protected: - DSimpleCanvas *Canvas; - BYTE *Pixels; + DSimpleCanvas *Canvas = nullptr; + DSimpleCanvas *CanvasBgra = nullptr; + BYTE *Pixels = nullptr; + 
uint32_t *PixelsBgra = nullptr; Span DummySpans[2]; - bool bNeedsUpdate; - bool bDidUpdate; - bool bPixelsAllocated; + bool bNeedsUpdate = true; + bool bDidUpdate = false; + bool bPixelsAllocated = false; + bool bPixelsAllocatedBgra = false; public: bool bFirstUpdate; diff --git a/src/textures/tgatexture.cpp b/src/textures/tgatexture.cpp index b208a51a3..5e76a63b2 100644 --- a/src/textures/tgatexture.cpp +++ b/src/textures/tgatexture.cpp @@ -181,6 +181,7 @@ void FTGATexture::Unload () delete[] Pixels; Pixels = NULL; } + FTexture::Unload(); } //========================================================================== diff --git a/src/textures/warptexture.cpp b/src/textures/warptexture.cpp index a8a2ddb9e..91c7b9fc4 100644 --- a/src/textures/warptexture.cpp +++ b/src/textures/warptexture.cpp @@ -39,6 +39,7 @@ #include "r_utility.h" #include "textures/textures.h" #include "warpbuffer.h" +#include "v_palette.h" FWarpTexture::FWarpTexture (FTexture *source, int warptype) @@ -74,6 +75,7 @@ void FWarpTexture::Unload () Spans = NULL; } SourcePic->Unload (); + FTexture::Unload(); } bool FWarpTexture::CheckModified () @@ -92,6 +94,25 @@ const BYTE *FWarpTexture::GetPixels () return Pixels; } +const uint32_t *FWarpTexture::GetPixelsBgra() +{ + DWORD time = r_FrameTime; + if (Pixels == NULL || time != GenTime) + { + MakeTexture(time); + CreatePixelsBgraWithMipmaps(); + for (int i = 0; i < Width * Height; i++) + { + if (Pixels[i] != 0) + PixelsBgra[i] = 0xff000000 | GPalette.BaseColors[Pixels[i]].d; + else + PixelsBgra[i] = 0; + } + GenerateBgraMipmapsFast(); + } + return PixelsBgra.data(); +} + const BYTE *FWarpTexture::GetColumn (unsigned int column, const Span **spans_out) { DWORD time = r_FrameTime; diff --git a/src/v_draw.cpp b/src/v_draw.cpp index 4677c4b08..37ced09d5 100644 --- a/src/v_draw.cpp +++ b/src/v_draw.cpp @@ -44,6 +44,7 @@ #include "r_utility.h" #ifndef NO_SWRENDER #include "r_draw.h" +#include "r_draw_rgba.h" #include "r_main.h" #include "r_things.h" #endif 
@@ -137,6 +138,12 @@ void DCanvas::DrawTextureParms(FTexture *img, DrawParms &parms) static short bottomclipper[MAXWIDTH], topclipper[MAXWIDTH]; const BYTE *translation = NULL; + if (r_swtruecolor != IsBgra()) + { + r_swtruecolor = IsBgra(); + R_InitColumnDrawers(); + } + if (parms.masked) { spanptr = &spans; @@ -173,14 +180,14 @@ void DCanvas::DrawTextureParms(FTexture *img, DrawParms &parms) if (translation != NULL) { - dc_colormap = (lighttable_t *)translation; + R_SetTranslationMap((lighttable_t *)translation); } else { - dc_colormap = identitymap; + R_SetTranslationMap(identitymap); } - fixedcolormap = dc_colormap; + fixedcolormap = dc_fcolormap; ESPSResult mode = R_SetPatchStyle (parms.style, parms.Alpha, 0, parms.fillcolor); BYTE *destorgsave = dc_destorg; @@ -306,7 +313,7 @@ void DCanvas::DrawTextureParms(FTexture *img, DrawParms &parms) while (dc_x < stop4) { - rt_initcols(); + rt_initcols(nullptr); for (int zz = 4; zz; --zz) { pixels = img->GetColumn(frac >> FRACBITS, spanptr); @@ -1023,13 +1030,35 @@ void DCanvas::PUTTRANSDOT (int xx, int yy, int basecolor, int level) oldyyshifted = yy * GetPitch(); } - BYTE *spot = GetBuffer() + oldyyshifted + xx; - DWORD *bg2rgb = Col2RGB8[1+level]; - DWORD *fg2rgb = Col2RGB8[63-level]; - DWORD fg = fg2rgb[basecolor]; - DWORD bg = bg2rgb[*spot]; - bg = (fg+bg) | 0x1f07c1f; - *spot = RGB32k.All[bg&(bg>>15)]; + if (IsBgra()) + { + uint32_t *spot = (uint32_t*)GetBuffer() + oldyyshifted + xx; + + uint32_t fg = LightBgra::shade_pal_index_simple(basecolor, LightBgra::calc_light_multiplier(0)); + uint32_t fg_red = (fg >> 16) & 0xff; + uint32_t fg_green = (fg >> 8) & 0xff; + uint32_t fg_blue = fg & 0xff; + + uint32_t bg_red = (*spot >> 16) & 0xff; + uint32_t bg_green = (*spot >> 8) & 0xff; + uint32_t bg_blue = (*spot) & 0xff; + + uint32_t red = (fg_red + bg_red + 1) / 2; + uint32_t green = (fg_green + bg_green + 1) / 2; + uint32_t blue = (fg_blue + bg_blue + 1) / 2; + + *spot = 0xff000000 | (red << 16) | (green << 8) | blue; + 
} + else + { + BYTE *spot = GetBuffer() + oldyyshifted + xx; + DWORD *bg2rgb = Col2RGB8[1+level]; + DWORD *fg2rgb = Col2RGB8[63-level]; + DWORD fg = fg2rgb[basecolor]; + DWORD bg = bg2rgb[*spot]; + bg = (fg+bg) | 0x1f07c1f; + *spot = RGB32k.All[bg&(bg>>15)]; + } } void DCanvas::DrawLine(int x0, int y0, int x1, int y1, int palColor, uint32 realcolor) @@ -1073,27 +1102,65 @@ void DCanvas::DrawLine(int x0, int y0, int x1, int y1, int palColor, uint32 real { swapvalues (x0, x1); } - memset (GetBuffer() + y0*GetPitch() + x0, palColor, deltaX+1); + if (IsBgra()) + { + uint32_t fillColor = GPalette.BaseColors[palColor].d; + uint32_t *spot = (uint32_t*)GetBuffer() + y0*GetPitch() + x0; + for (int i = 0; i <= deltaX; i++) + spot[i] = fillColor; + } + else + { + memset (GetBuffer() + y0*GetPitch() + x0, palColor, deltaX+1); + } } else if (deltaX == 0) { // vertical line - BYTE *spot = GetBuffer() + y0*GetPitch() + x0; - int pitch = GetPitch (); - do + if (IsBgra()) { - *spot = palColor; - spot += pitch; - } while (--deltaY != 0); + uint32_t fillColor = GPalette.BaseColors[palColor].d; + uint32_t *spot = (uint32_t*)GetBuffer() + y0*GetPitch() + x0; + int pitch = GetPitch(); + do + { + *spot = fillColor; + spot += pitch; + } while (--deltaY != 0); + } + else + { + BYTE *spot = GetBuffer() + y0*GetPitch() + x0; + int pitch = GetPitch(); + do + { + *spot = palColor; + spot += pitch; + } while (--deltaY != 0); + } } else if (deltaX == deltaY) { // diagonal line. 
- BYTE *spot = GetBuffer() + y0*GetPitch() + x0; - int advance = GetPitch() + xDir; - do + if (IsBgra()) { - *spot = palColor; - spot += advance; - } while (--deltaY != 0); + uint32_t fillColor = GPalette.BaseColors[palColor].d; + uint32_t *spot = (uint32_t*)GetBuffer() + y0*GetPitch() + x0; + int advance = GetPitch() + xDir; + do + { + *spot = fillColor; + spot += advance; + } while (--deltaY != 0); + } + else + { + BYTE *spot = GetBuffer() + y0*GetPitch() + x0; + int advance = GetPitch() + xDir; + do + { + *spot = palColor; + spot += advance; + } while (--deltaY != 0); + } } else { @@ -1213,7 +1280,6 @@ void DCanvas::DrawPixel(int x, int y, int palColor, uint32 realcolor) void DCanvas::Clear (int left, int top, int right, int bottom, int palcolor, uint32 color) { int x, y; - BYTE *dest; if (left == right || top == bottom) { @@ -1243,12 +1309,28 @@ void DCanvas::Clear (int left, int top, int right, int bottom, int palcolor, uin palcolor = PalFromRGB(color); } - dest = Buffer + top * Pitch + left; - x = right - left; - for (y = top; y < bottom; y++) + if (IsBgra()) { - memset(dest, palcolor, x); - dest += Pitch; + uint32_t fill_color = GPalette.BaseColors[palcolor]; + + uint32_t *dest = (uint32_t*)Buffer + top * Pitch + left; + x = right - left; + for (y = top; y < bottom; y++) + { + for (int i = 0; i < x; i++) + dest[i] = fill_color; + dest += Pitch; + } + } + else + { + BYTE *dest = Buffer + top * Pitch + left; + x = right - left; + for (y = top; y < bottom; y++) + { + memset(dest, palcolor, x); + dest += Pitch; + } } } @@ -1339,8 +1421,11 @@ void DCanvas::FillSimplePoly(FTexture *tex, FVector2 *points, int npoints, // Setup constant texture mapping parameters. R_SetupSpanBits(tex); - R_SetSpanColormap(colormap != NULL ? 
&colormap->Maps[clamp(shade >> FRACBITS, 0, NUMCOLORMAPS-1) * 256] : identitymap); - R_SetSpanSource(tex->GetPixels()); + if (colormap) + R_SetSpanColormap(colormap, clamp(shade >> FRACBITS, 0, NUMCOLORMAPS - 1)); + else + R_SetSpanColormap(&identitycolormap, 0); + R_SetSpanSource(tex); scalex = double(1u << (32 - ds_xbits)) / scalex; scaley = double(1u << (32 - ds_ybits)) / scaley; ds_xstep = xs_RoundToInt(cosrot * scalex); @@ -1449,6 +1534,9 @@ void DCanvas::FillSimplePoly(FTexture *tex, FVector2 *points, int npoints, // void DCanvas::DrawBlock (int x, int y, int _width, int _height, const BYTE *src) const { + if (IsBgra()) + return; + int srcpitch = _width; int destpitch; BYTE *dest; @@ -1475,6 +1563,9 @@ void DCanvas::DrawBlock (int x, int y, int _width, int _height, const BYTE *src) // void DCanvas::GetBlock (int x, int y, int _width, int _height, BYTE *dest) const { + if (IsBgra()) + return; + const BYTE *src; #ifdef RANGECHECK diff --git a/src/v_font.cpp b/src/v_font.cpp index 0eb28a67c..248568b6f 100644 --- a/src/v_font.cpp +++ b/src/v_font.cpp @@ -1634,6 +1634,7 @@ void FFontChar1::Unload () delete[] Pixels; Pixels = NULL; } + FTexture::Unload(); } //========================================================================== @@ -1695,6 +1696,7 @@ void FFontChar2::Unload () delete[] Pixels; Pixels = NULL; } + FTexture::Unload(); } //========================================================================== diff --git a/src/v_video.cpp b/src/v_video.cpp index b1f1ced9c..7393324a8 100644 --- a/src/v_video.cpp +++ b/src/v_video.cpp @@ -71,6 +71,7 @@ FRenderer *Renderer; IMPLEMENT_ABSTRACT_CLASS (DCanvas) IMPLEMENT_ABSTRACT_CLASS (DFrameBuffer) +EXTERN_CVAR (Bool, swtruecolor) #if defined(_DEBUG) && defined(_M_IX86) #define DBGBREAK { __asm int 3 } @@ -83,7 +84,7 @@ class DDummyFrameBuffer : public DFrameBuffer DECLARE_CLASS (DDummyFrameBuffer, DFrameBuffer); public: DDummyFrameBuffer (int width, int height) - : DFrameBuffer (0, 0) + : DFrameBuffer (0, 0, false) 
{ Width = width; Height = height; @@ -119,7 +120,6 @@ public: const BYTE *GetColumn(unsigned int column, const Span **spans_out); const BYTE *GetPixels(); - void Unload(); bool CheckModified(); void SetTranslation(int num); @@ -208,13 +208,14 @@ DCanvas *DCanvas::CanvasChain = NULL; // //========================================================================== -DCanvas::DCanvas (int _width, int _height) +DCanvas::DCanvas (int _width, int _height, bool _bgra) { // Init member vars Buffer = NULL; LockCount = 0; Width = _width; Height = _height; + Bgra = _bgra; // Add to list of active canvases Next = CanvasChain; @@ -344,10 +345,7 @@ void DCanvas::Dim (PalEntry color, float damount, int x1, int y1, int w, int h) if (damount == 0.f) return; - DWORD *bg2rgb; - DWORD fg; int gap; - BYTE *spot; int x, y; if (x1 >= Width || y1 >= Height) @@ -367,31 +365,73 @@ void DCanvas::Dim (PalEntry color, float damount, int x1, int y1, int w, int h) return; } - { - int amount; - - amount = (int)(damount * 64); - bg2rgb = Col2RGB8[64-amount]; - - fg = (((color.r * amount) >> 4) << 20) | - ((color.g * amount) >> 4) | - (((color.b * amount) >> 4) << 10); - } - - spot = Buffer + x1 + y1*Pitch; gap = Pitch - w; - for (y = h; y != 0; y--) - { - for (x = w; x != 0; x--) - { - DWORD bg; - bg = bg2rgb[(*spot)&0xff]; - bg = (fg+bg) | 0x1f07c1f; - *spot = RGB32k.All[bg&(bg>>15)]; - spot++; + if (IsBgra()) + { + uint32_t *spot = (uint32_t*)Buffer + x1 + y1*Pitch; + + uint32_t fg = color.d; + uint32_t fg_red = (fg >> 16) & 0xff; + uint32_t fg_green = (fg >> 8) & 0xff; + uint32_t fg_blue = fg & 0xff; + + uint32_t alpha = (uint32_t)clamp(damount * 256 + 0.5f, 0.0f, 256.0f); + uint32_t inv_alpha = 256 - alpha; + + fg_red *= alpha; + fg_green *= alpha; + fg_blue *= alpha; + + for (y = h; y != 0; y--) + { + for (x = w; x != 0; x--) + { + uint32_t bg_red = (*spot >> 16) & 0xff; + uint32_t bg_green = (*spot >> 8) & 0xff; + uint32_t bg_blue = (*spot) & 0xff; + + uint32_t red = (fg_red + bg_red * 
inv_alpha) / 256; + uint32_t green = (fg_green + bg_green * inv_alpha) / 256; + uint32_t blue = (fg_blue + bg_blue * inv_alpha) / 256; + + *spot = 0xff000000 | (red << 16) | (green << 8) | blue; + spot++; + } + spot += gap; + } + } + else + { + BYTE *spot = Buffer + x1 + y1*Pitch; + + DWORD *bg2rgb; + DWORD fg; + + { + int amount; + + amount = (int)(damount * 64); + bg2rgb = Col2RGB8[64-amount]; + + fg = (((color.r * amount) >> 4) << 20) | + ((color.g * amount) >> 4) | + (((color.b * amount) >> 4) << 10); + } + + for (y = h; y != 0; y--) + { + for (x = w; x != 0; x--) + { + DWORD bg; + + bg = bg2rgb[(*spot)&0xff]; + bg = (fg+bg) | 0x1f07c1f; + *spot = RGB32k.All[bg&(bg>>15)]; + spot++; + } + spot += gap; } - spot += gap; } } @@ -408,8 +448,8 @@ void DCanvas::GetScreenshotBuffer(const BYTE *&buffer, int &pitch, ESSType &colo { Lock(true); buffer = GetBuffer(); - pitch = GetPitch(); - color_type = SS_PAL; + pitch = IsBgra() ? GetPitch() * 4 : GetPitch(); + color_type = IsBgra() ? SS_BGRA : SS_PAL; } //========================================================================== @@ -704,13 +744,12 @@ void DCanvas::CalcGamma (float gamma, BYTE gammalookup[256]) // I found this formula on the web at // , // but that page no longer exits. 
- double invgamma = 1.f / gamma; int i; for (i = 0; i < 256; i++) { - gammalookup[i] = (BYTE)(255.0 * pow (i / 255.0, invgamma)); + gammalookup[i] = (BYTE)(255.0 * pow (i / 255.0, invgamma) + 0.5); } } @@ -722,8 +761,8 @@ void DCanvas::CalcGamma (float gamma, BYTE gammalookup[256]) // //========================================================================== -DSimpleCanvas::DSimpleCanvas (int width, int height) - : DCanvas (width, height) +DSimpleCanvas::DSimpleCanvas (int width, int height, bool bgra) + : DCanvas (width, height, bgra) { MemBuffer = nullptr; Resize(width, height); @@ -775,8 +814,9 @@ void DSimpleCanvas::Resize(int width, int height) Pitch = width + MAX(0, CPU.DataL1LineSize - 8); } } - MemBuffer = new BYTE[Pitch * height]; - memset(MemBuffer, 0, Pitch * height); + int bytes_per_pixel = IsBgra() ? 4 : 1; + MemBuffer = new BYTE[Pitch * height * bytes_per_pixel]; + memset (MemBuffer, 0, Pitch * height * bytes_per_pixel); } //========================================================================== @@ -845,8 +885,8 @@ void DSimpleCanvas::Unlock () // //========================================================================== -DFrameBuffer::DFrameBuffer (int width, int height) - : DSimpleCanvas (width, height) +DFrameBuffer::DFrameBuffer (int width, int height, bool bgra) + : DSimpleCanvas (width, height, bgra) { LastMS = LastSec = FrameCount = LastCount = LastTic = 0; Accel2D = false; @@ -855,6 +895,70 @@ DFrameBuffer::DFrameBuffer (int width, int height) VideoHeight = height; } +//========================================================================== +// +// DFrameBuffer :: CopyWithGammaBgra +// +// Copies data to destination buffer while performing gamma and flash. +// This is only needed if a target cannot do this with shaders. 
+// +//========================================================================== + +void DFrameBuffer::CopyWithGammaBgra(void *output, int pitch, const BYTE *gammared, const BYTE *gammagreen, const BYTE *gammablue, PalEntry flash, int flash_amount) +{ + const BYTE *gammatables[3] = { gammared, gammagreen, gammablue }; + + if (flash_amount > 0) + { + uint16_t inv_flash_amount = 256 - flash_amount; + uint16_t flash_red = flash.r * flash_amount; + uint16_t flash_green = flash.g * flash_amount; + uint16_t flash_blue = flash.b * flash_amount; + + for (int y = 0; y < Height; y++) + { + BYTE *dest = (BYTE*)output + y * pitch; + BYTE *src = MemBuffer + y * Pitch * 4; + for (int x = 0; x < Width; x++) + { + uint16_t fg_red = src[2]; + uint16_t fg_green = src[1]; + uint16_t fg_blue = src[0]; + uint16_t red = (fg_red * inv_flash_amount + flash_red) >> 8; + uint16_t green = (fg_green * inv_flash_amount + flash_green) >> 8; + uint16_t blue = (fg_blue * inv_flash_amount + flash_blue) >> 8; + + dest[0] = gammatables[2][blue]; + dest[1] = gammatables[1][green]; + dest[2] = gammatables[0][red]; + dest[3] = 0xff; + + dest += 4; + src += 4; + } + } + } + else + { + for (int y = 0; y < Height; y++) + { + BYTE *dest = (BYTE*)output + y * pitch; + BYTE *src = MemBuffer + y * Pitch * 4; + for (int x = 0; x < Width; x++) + { + dest[0] = gammatables[2][src[0]]; + dest[1] = gammatables[1][src[1]]; + dest[2] = gammatables[0][src[2]]; + dest[3] = 0xff; + + dest += 4; + src += 4; + } + } + } +} + + //========================================================================== // // DFrameBuffer :: DrawRateStuff @@ -916,10 +1020,21 @@ void DFrameBuffer::DrawRateStuff () // Buffer can be NULL if we're doing hardware accelerated 2D if (buffer != NULL) { - buffer += (GetHeight()-1) * GetPitch(); - - for (i = 0; i < tics*2; i += 2) buffer[i] = 0xff; - for ( ; i < 20*2; i += 2) buffer[i] = 0x00; + if (IsBgra()) + { + uint32_t *buffer32 = (uint32_t*)buffer; + buffer32 += (GetHeight() - 1) * 
GetPitch(); + + for (i = 0; i < tics * 2; i += 2) buffer32[i] = 0xffffffff; + for (; i < 20 * 2; i += 2) buffer32[i] = 0xff000000; + } + else + { + buffer += (GetHeight() - 1) * GetPitch(); + + for (i = 0; i < tics * 2; i += 2) buffer[i] = 0xff; + for (; i < 20 * 2; i += 2) buffer[i] = 0x00; + } } else { @@ -992,16 +1107,6 @@ void FPaletteTester::SetTranslation(int num) } } -//========================================================================== -// -// FPaletteTester :: Unload -// -//========================================================================== - -void FPaletteTester::Unload() -{ -} - //========================================================================== // // FPaletteTester :: GetColumn diff --git a/src/v_video.h b/src/v_video.h index d19a3b06e..9769ce06c 100644 --- a/src/v_video.h +++ b/src/v_video.h @@ -187,7 +187,7 @@ class DCanvas : public DObject { DECLARE_ABSTRACT_CLASS (DCanvas, DObject) public: - DCanvas (int width, int height); + DCanvas (int width, int height, bool bgra); virtual ~DCanvas (); // Member variable access @@ -195,6 +195,7 @@ public: inline int GetWidth () const { return Width; } inline int GetHeight () const { return Height; } inline int GetPitch () const { return Pitch; } + inline bool IsBgra() const { return Bgra; } virtual bool IsValid (); @@ -272,6 +273,7 @@ protected: int Height; int Pitch; int LockCount; + bool Bgra; bool ClipBox (int &left, int &top, int &width, int &height, const BYTE *&src, const int srcpitch) const; void DrawTextureV(FTexture *img, double x, double y, uint32 tag, va_list tags) = delete; @@ -294,7 +296,7 @@ class DSimpleCanvas : public DCanvas { DECLARE_CLASS (DSimpleCanvas, DCanvas) public: - DSimpleCanvas (int width, int height); + DSimpleCanvas (int width, int height, bool bgra); ~DSimpleCanvas (); bool IsValid (); @@ -334,7 +336,7 @@ class DFrameBuffer : public DSimpleCanvas { DECLARE_ABSTRACT_CLASS (DFrameBuffer, DSimpleCanvas) public: - DFrameBuffer (int width, int height); + 
DFrameBuffer (int width, int height, bool bgra); // Force the surface to use buffered output if true is passed. virtual bool Lock (bool buffered) = 0; @@ -429,6 +431,7 @@ public: protected: void DrawRateStuff (); void CopyFromBuff (BYTE *src, int srcPitch, int width, int height, BYTE *dest); + void CopyWithGammaBgra(void *output, int pitch, const BYTE *gammared, const BYTE *gammagreen, const BYTE *gammablue, PalEntry flash, int flash_amount); DFrameBuffer () {} diff --git a/src/version.h b/src/version.h index 7742f410a..a88d3e20b 100644 --- a/src/version.h +++ b/src/version.h @@ -41,12 +41,12 @@ const char *GetVersionString(); /** Lots of different version numbers **/ -#define VERSIONSTR "2.3pre" +#define VERSIONSTR "0.0pre" // The version as seen in the Windows resource -#define RC_FILEVERSION 2,3,9999,0 -#define RC_PRODUCTVERSION 2,3,9999,0 -#define RC_PRODUCTVERSION2 "2.3pre" +#define RC_FILEVERSION 0,0,9999,0 +#define RC_PRODUCTVERSION 0,0,9999,0 +#define RC_PRODUCTVERSION2 "0.0pre" // Version identifier for network games. // Bump it every time you do a release unless you're certain you @@ -84,12 +84,12 @@ const char *GetVersionString(); #define DYNLIGHT // This is so that derivates can use the same savegame versions without worrying about engine compatibility -#define GAMESIG "GZDOOM" -#define BASEWAD "gzdoom.pk3" +#define GAMESIG "QZDOOM" +#define BASEWAD "qzdoom.pk3" // More stuff that needs to be different for derivatives. 
-#define GAMENAME "GZDoom" -#define GAMENAMELOWERCASE "gzdoom" +#define GAMENAME "QZDoom" +#define GAMENAMELOWERCASE "qzdoom" #define FORUM_URL "http://forum.drdteam.org" #define BUGS_FORUM_URL "http://forum.drdteam.org/viewforum.php?f=24" diff --git a/src/win32/fb_d3d9.cpp b/src/win32/fb_d3d9.cpp index 06bfa0569..46895028c 100644 --- a/src/win32/fb_d3d9.cpp +++ b/src/win32/fb_d3d9.cpp @@ -242,8 +242,8 @@ CVAR(Bool, vid_hwaalines, true, CVAR_ARCHIVE|CVAR_GLOBALCONFIG) // //========================================================================== -D3DFB::D3DFB (UINT adapter, int width, int height, bool fullscreen) - : BaseWinFB (width, height) +D3DFB::D3DFB (UINT adapter, int width, int height, bool bgra, bool fullscreen) + : BaseWinFB (width, height, bgra) { D3DPRESENT_PARAMETERS d3dpp; @@ -765,14 +765,16 @@ void D3DFB::KillNativeTexs() bool D3DFB::CreateFBTexture () { - if (FAILED(D3DDevice->CreateTexture(Width, Height, 1, D3DUSAGE_DYNAMIC, D3DFMT_L8, D3DPOOL_DEFAULT, &FBTexture, NULL))) + FBFormat = IsBgra() ? 
D3DFMT_A8R8G8B8 : D3DFMT_L8; + + if (FAILED(D3DDevice->CreateTexture(Width, Height, 1, D3DUSAGE_DYNAMIC, FBFormat, D3DPOOL_DEFAULT, &FBTexture, NULL))) { int pow2width, pow2height, i; for (i = 1; i < Width; i <<= 1) {} pow2width = i; for (i = 1; i < Height; i <<= 1) {} pow2height = i; - if (FAILED(D3DDevice->CreateTexture(pow2width, pow2height, 1, D3DUSAGE_DYNAMIC, D3DFMT_L8, D3DPOOL_DEFAULT, &FBTexture, NULL))) + if (FAILED(D3DDevice->CreateTexture(pow2width, pow2height, 1, D3DUSAGE_DYNAMIC, FBFormat, D3DPOOL_DEFAULT, &FBTexture, NULL))) { return false; } @@ -1322,20 +1324,45 @@ void D3DFB::Draw3DPart(bool copy3d) SUCCEEDED(FBTexture->LockRect (0, &lockrect, NULL, D3DLOCK_DISCARD))) || SUCCEEDED(FBTexture->LockRect (0, &lockrect, &texrect, 0))) { - if (lockrect.Pitch == Pitch && Pitch == Width) + if (IsBgra() && FBFormat == D3DFMT_A8R8G8B8) { - memcpy (lockrect.pBits, MemBuffer, Width * Height); + if (lockrect.Pitch == Pitch * sizeof(uint32_t) && Pitch == Width) + { + memcpy(lockrect.pBits, MemBuffer, Width * Height * sizeof(uint32_t)); + } + else + { + uint32_t *dest = (uint32_t *)lockrect.pBits; + uint32_t *src = (uint32_t*)MemBuffer; + for (int y = 0; y < Height; y++) + { + memcpy(dest, src, Width * sizeof(uint32_t)); + dest = reinterpret_cast<uint32_t*>(reinterpret_cast<uint8_t*>(dest) + lockrect.Pitch); /* lockrect.Pitch is in bytes, so advance the row through a byte pointer */ + src += Pitch; + } + } + } + else if (!IsBgra() && FBFormat == D3DFMT_L8) + { + if (lockrect.Pitch == Pitch && Pitch == Width) + { + memcpy(lockrect.pBits, MemBuffer, Width * Height); + } + else + { + BYTE *dest = (BYTE *)lockrect.pBits; + BYTE *src = (BYTE *)MemBuffer; + for (int y = 0; y < Height; y++) + { + memcpy(dest, src, Width); + dest = reinterpret_cast<BYTE*>(reinterpret_cast<uint8_t*>(dest) + lockrect.Pitch); + src += Pitch; + } + } } else { - BYTE *dest = (BYTE *)lockrect.pBits; - BYTE *src = MemBuffer; - for (int y = 0; y < Height; y++) - { - memcpy (dest, src, Width); - dest += lockrect.Pitch; - src += Pitch; - } + memset(lockrect.pBits, 0, lockrect.Pitch * Height); } 
FBTexture->UnlockRect (0); } @@ -1367,7 +1394,10 @@ void D3DFB::Draw3DPart(bool copy3d) memset(Constant, 0, sizeof(Constant)); SetAlphaBlend(D3DBLENDOP(0)); EnableAlphaTest(FALSE); - SetPixelShader(Shaders[SHADER_NormalColorPal]); + if (IsBgra()) + SetPixelShader(Shaders[SHADER_NormalColor]); + else + SetPixelShader(Shaders[SHADER_NormalColorPal]); if (copy3d) { FBVERTEX verts[4]; @@ -1385,7 +1415,10 @@ void D3DFB::Draw3DPart(bool copy3d) realfixedcolormap->ColorizeStart[1]/2, realfixedcolormap->ColorizeStart[2]/2, 0); color1 = D3DCOLOR_COLORVALUE(realfixedcolormap->ColorizeEnd[0]/2, realfixedcolormap->ColorizeEnd[1]/2, realfixedcolormap->ColorizeEnd[2]/2, 1); - SetPixelShader(Shaders[SHADER_SpecialColormapPal]); + if (IsBgra()) + SetPixelShader(Shaders[SHADER_SpecialColormap]); + else + SetPixelShader(Shaders[SHADER_SpecialColormapPal]); } } else @@ -1396,7 +1429,10 @@ void D3DFB::Draw3DPart(bool copy3d) CalcFullscreenCoords(verts, Accel2D, false, color0, color1); D3DDevice->DrawPrimitiveUP(D3DPT_TRIANGLEFAN, 2, verts, sizeof(FBVERTEX)); } - SetPixelShader(Shaders[SHADER_NormalColorPal]); + if (IsBgra()) + SetPixelShader(Shaders[SHADER_NormalColor]); + else + SetPixelShader(Shaders[SHADER_NormalColorPal]); } //========================================================================== diff --git a/src/win32/fb_ddraw.cpp b/src/win32/fb_ddraw.cpp index 7cc603786..5637e9695 100644 --- a/src/win32/fb_ddraw.cpp +++ b/src/win32/fb_ddraw.cpp @@ -32,7 +32,6 @@ ** */ - // HEADER FILES ------------------------------------------------------------ #define DIRECTDRAW_VERSION 0x0300 @@ -120,7 +119,7 @@ cycle_t BlitCycles; // CODE -------------------------------------------------------------------- DDrawFB::DDrawFB (int width, int height, bool fullscreen) - : BaseWinFB (width, height) + : BaseWinFB (width, height, false) { int i; diff --git a/src/win32/hardware.cpp b/src/win32/hardware.cpp index 787c0a4f3..3cf941307 100644 --- a/src/win32/hardware.cpp +++ b/src/win32/hardware.cpp 
@@ -51,6 +51,7 @@ EXTERN_CVAR (Bool, ticker) EXTERN_CVAR (Bool, fullscreen) +EXTERN_CVAR (Bool, swtruecolor) EXTERN_CVAR (Float, vid_winscale) CVAR(Int, win_x, -1, CVAR_ARCHIVE | CVAR_GLOBALCONFIG) @@ -71,7 +72,7 @@ int currentrenderer = -1; bool changerenderer; // [ZDoomGL] -CUSTOM_CVAR (Int, vid_renderer, 1, CVAR_ARCHIVE | CVAR_GLOBALCONFIG | CVAR_NOINITCALL) +CUSTOM_CVAR (Int, vid_renderer, 0, CVAR_ARCHIVE | CVAR_GLOBALCONFIG | CVAR_NOINITCALL) { // 0: Software renderer // 1: OpenGL renderer @@ -190,7 +191,7 @@ DFrameBuffer *I_SetMode (int &width, int &height, DFrameBuffer *old) } break; } - DFrameBuffer *res = Video->CreateFrameBuffer (width, height, fs, old); + DFrameBuffer *res = Video->CreateFrameBuffer (width, height, swtruecolor, fs, old); //* Right now, CreateFrameBuffer cannot return NULL if (res == NULL) @@ -357,6 +358,16 @@ void I_RestoreWindowedPos () extern int NewWidth, NewHeight, NewBits, DisplayBits; +CUSTOM_CVAR(Bool, swtruecolor, true, CVAR_ARCHIVE|CVAR_GLOBALCONFIG|CVAR_NOINITCALL) +{ + // Strictly speaking this doesn't require a mode switch, but it is the easiest + // way to force a CreateFramebuffer call without a lot of refactoring. 
+ NewWidth = screen->GetWidth(); + NewHeight = screen->GetHeight(); + NewBits = DisplayBits; + setmodeneeded = true; +} + CUSTOM_CVAR (Bool, fullscreen, false, CVAR_ARCHIVE|CVAR_GLOBALCONFIG|CVAR_NOINITCALL) { NewWidth = screen->GetWidth(); diff --git a/src/win32/hardware.h b/src/win32/hardware.h index b2bafef32..184eeccf5 100644 --- a/src/win32/hardware.h +++ b/src/win32/hardware.h @@ -45,7 +45,7 @@ class IVideo virtual EDisplayType GetDisplayType () = 0; virtual void SetWindowedScale (float scale) = 0; - virtual DFrameBuffer *CreateFrameBuffer (int width, int height, bool fs, DFrameBuffer *old) = 0; + virtual DFrameBuffer *CreateFrameBuffer (int width, int height, bool bgra, bool fs, DFrameBuffer *old) = 0; virtual void StartModeIterator (int bits, bool fs) = 0; virtual bool NextMode (int *width, int *height, bool *letterbox) = 0; diff --git a/src/win32/win32gliface.cpp b/src/win32/win32gliface.cpp index 7ca001e1e..59ef471d3 100644 --- a/src/win32/win32gliface.cpp +++ b/src/win32/win32gliface.cpp @@ -346,7 +346,8 @@ bool Win32GLVideo::GoFullscreen(bool yes) // //========================================================================== -DFrameBuffer *Win32GLVideo::CreateFrameBuffer(int width, int height, bool fs, DFrameBuffer *old) + +DFrameBuffer *Win32GLVideo::CreateFrameBuffer(int width, int height, bool bgra, bool fs, DFrameBuffer *old) { Win32GLFrameBuffer *fb; @@ -860,7 +861,7 @@ IMPLEMENT_ABSTRACT_CLASS(Win32GLFrameBuffer) // //========================================================================== -Win32GLFrameBuffer::Win32GLFrameBuffer(void *hMonitor, int width, int height, int bits, int refreshHz, bool fullscreen) : BaseWinFB(width, height) +Win32GLFrameBuffer::Win32GLFrameBuffer(void *hMonitor, int width, int height, int bits, int refreshHz, bool fullscreen) : BaseWinFB(width, height, false) { m_Width = width; m_Height = height; diff --git a/src/win32/win32gliface.h b/src/win32/win32gliface.h index 6320e2903..87eb10de6 100644 --- 
a/src/win32/win32gliface.h +++ b/src/win32/win32gliface.h @@ -38,7 +38,7 @@ public: void StartModeIterator (int bits, bool fs); bool NextMode (int *width, int *height, bool *letterbox); bool GoFullscreen(bool yes); - DFrameBuffer *CreateFrameBuffer (int width, int height, bool fs, DFrameBuffer *old); + DFrameBuffer *CreateFrameBuffer (int width, int height, bool bgra, bool fs, DFrameBuffer *old); virtual bool SetResolution (int width, int height, int bits); void DumpAdapters(); bool InitHardware (HWND Window, int multisample); diff --git a/src/win32/win32iface.h b/src/win32/win32iface.h index 9b2754eae..d30475eb3 100644 --- a/src/win32/win32iface.h +++ b/src/win32/win32iface.h @@ -70,7 +70,7 @@ class Win32Video : public IVideo EDisplayType GetDisplayType () { return DISPLAY_Both; } void SetWindowedScale (float scale); - DFrameBuffer *CreateFrameBuffer (int width, int height, bool fs, DFrameBuffer *old); + DFrameBuffer *CreateFrameBuffer (int width, int height, bool bgra, bool fs, DFrameBuffer *old); void StartModeIterator (int bits, bool fs); bool NextMode (int *width, int *height, bool *letterbox); @@ -121,7 +121,7 @@ class BaseWinFB : public DFrameBuffer { DECLARE_ABSTRACT_CLASS(BaseWinFB, DFrameBuffer) public: - BaseWinFB (int width, int height) : DFrameBuffer (width, height), Windowed (true) {} + BaseWinFB (int width, int height, bool bgra) : DFrameBuffer (width, height, bgra), Windowed (true) {} bool IsFullscreen () { return !Windowed; } virtual void Blank () = 0; @@ -228,7 +228,7 @@ class D3DFB : public BaseWinFB { DECLARE_CLASS(D3DFB, BaseWinFB) public: - D3DFB (UINT adapter, int width, int height, bool fullscreen); + D3DFB (UINT adapter, int width, int height, bool bgra, bool fullscreen); ~D3DFB (); bool IsValid (); @@ -422,6 +422,7 @@ private: bool NeedPalUpdate; bool NeedGammaUpdate; int FBWidth, FBHeight; + D3DFORMAT FBFormat; bool VSync; RECT BlendingRect; int In2D; diff --git a/src/win32/win32video.cpp b/src/win32/win32video.cpp index 
8eb2349ec..0d1fe0cff 100644 --- a/src/win32/win32video.cpp +++ b/src/win32/win32video.cpp @@ -629,7 +629,7 @@ bool Win32Video::NextMode (int *width, int *height, bool *letterbox) return false; } -DFrameBuffer *Win32Video::CreateFrameBuffer (int width, int height, bool fullscreen, DFrameBuffer *old) +DFrameBuffer *Win32Video::CreateFrameBuffer (int width, int height, bool bgra, bool fullscreen, DFrameBuffer *old) { static int retry = 0; static int owidth, oheight; @@ -645,7 +645,8 @@ DFrameBuffer *Win32Video::CreateFrameBuffer (int width, int height, bool fullscr BaseWinFB *fb = static_cast<BaseWinFB *> (old); if (fb->Width == width && fb->Height == height && - fb->Windowed == !fullscreen) + fb->Windowed == !fullscreen && + fb->Bgra == bgra) { return old; } @@ -662,12 +663,13 @@ DFrameBuffer *Win32Video::CreateFrameBuffer (int width, int height, bool fullscr if (D3D != NULL) { - fb = new D3DFB (m_Adapter, width, height, fullscreen); + fb = new D3DFB (m_Adapter, width, height, bgra, fullscreen); } else { fb = new DDrawFB (width, height, fullscreen); } + LOG1 ("New fb created @ %p\n", fb); // If we could not create the framebuffer, try again with slightly @@ -726,7 +728,7 @@ DFrameBuffer *Win32Video::CreateFrameBuffer (int width, int height, bool fullscr } ++retry; - fb = static_cast<BaseWinFB *>(CreateFrameBuffer (width, height, fullscreen, NULL)); + fb = static_cast<BaseWinFB *>(CreateFrameBuffer (width, height, bgra, fullscreen, NULL)); } retry = 0; diff --git a/src/win32/zdoom.rc b/src/win32/zdoom.rc index 5e2226c6d..1f760d8ea 100644 --- a/src/win32/zdoom.rc +++ b/src/win32/zdoom.rc @@ -72,13 +72,13 @@ BEGIN " BEGIN\r\n" " VALUE ""Comments"", ""Thanks to id Software for creating DOOM and then releasing the source code. Thanks also to TeamTNT for creating BOOM, which ZDoom is partially based on. 
Includes code based on the Cajun Bot 0.97 by Martin Collberg.""\r\n" " VALUE ""CompanyName"", "" ""\r\n" - " VALUE ""FileDescription"", ""GZDoom""\r\n" + " VALUE ""FileDescription"", ""QZDoom""\r\n" " VALUE ""FileVersion"", RC_FILEVERSION2\r\n" - " VALUE ""InternalName"", ""GZDoom""\r\n" + " VALUE ""InternalName"", ""QZDoom""\r\n" " VALUE ""LegalCopyright"", ""Copyright \\u00A9 1993-1996 id Software, 1998-2010 Randy Heit, 2002-2010 Christoph Oelckers, et al.""\r\n" " VALUE ""LegalTrademarks"", ""DoomR is a Registered Trademark of id Software, Inc.""\r\n" - " VALUE ""OriginalFilename"", ""gzdoom.exe""\r\n" - " VALUE ""ProductName"", ""GZDoom""\r\n" + " VALUE ""OriginalFilename"", ""qzdoom.exe""\r\n" + " VALUE ""ProductName"", ""QZDoom""\r\n" " VALUE ""ProductVersion"", RC_PRODUCTVERSION2\r\n" " END\r\n" " END\r\n" @@ -492,13 +492,13 @@ BEGIN BEGIN VALUE "Comments", "Thanks to id Software for creating DOOM and then releasing the source code. Thanks also to TeamTNT for creating BOOM, which ZDoom is partially based on. Includes code based on the Cajun Bot 0.97 by Martin Collberg." VALUE "CompanyName", " " - VALUE "FileDescription", "GZDoom" + VALUE "FileDescription", "QZDoom" VALUE "FileVersion", RC_FILEVERSION2 - VALUE "InternalName", "GZDoom" + VALUE "InternalName", "QZDoom" VALUE "LegalCopyright", "Copyright \u00A9 1993-1996 id Software, 1998-2010 Randy Heit, 2002-2010 Christoph Oelckers, et al." VALUE "LegalTrademarks", "DoomR is a Registered Trademark of id Software, Inc." 
- VALUE "OriginalFilename", "gzdoom.exe" - VALUE "ProductName", "GZDoom" + VALUE "OriginalFilename", "qzdoom.exe" + VALUE "ProductName", "QZDoom" VALUE "ProductVersion", RC_PRODUCTVERSION2 END END diff --git a/wadsrc/CMakeLists.txt b/wadsrc/CMakeLists.txt index 80189a328..5a85840e0 100644 --- a/wadsrc/CMakeLists.txt +++ b/wadsrc/CMakeLists.txt @@ -1,3 +1,3 @@ cmake_minimum_required( VERSION 2.8.7 ) -add_pk3(gzdoom.pk3 ${CMAKE_CURRENT_SOURCE_DIR}/static) +add_pk3(qzdoom.pk3 ${CMAKE_CURRENT_SOURCE_DIR}/static) diff --git a/wadsrc/static/language.enu b/wadsrc/static/language.enu index a668cadf1..54b43f1a5 100644 --- a/wadsrc/static/language.enu +++ b/wadsrc/static/language.enu @@ -1780,6 +1780,10 @@ DSPLYMNU_BRIGHTNESS = "Brightness"; DSPLYMNU_VSYNC = "Vertical Sync"; DSPLYMNU_CAPFPS = "Rendering Interpolation"; DSPLYMNU_COLUMNMETHOD = "Column render mode"; +DSPLYMNU_TRUECOLOR = "True color output"; +DSPLYMNU_MINFILTER = "Linear filter when downscaling"; +DSPLYMNU_MAGFILTER = "Linear filter when upscaling"; +DSPLYMNU_MIPMAP = "Use mipmapped textures"; DSPLYMNU_WIPETYPE = "Screen wipe style"; DSPLYMNU_SHOWENDOOM = "Show ENDOOM screen"; DSPLYMNU_PALLETEHACK = "DirectDraw palette hack"; // Not used diff --git a/wadsrc/static/menudef.txt b/wadsrc/static/menudef.txt index adcc2d29c..589407262 100644 --- a/wadsrc/static/menudef.txt +++ b/wadsrc/static/menudef.txt @@ -674,6 +674,10 @@ OptionMenu "VideoOptions" Option "$DSPLYMNU_VSYNC", "vid_vsync", "OnOff" Option "$DSPLYMNU_CAPFPS", "cl_capfps", "OffOn" + Option "$DSPLYMNU_TRUECOLOR", "swtruecolor", "OnOff" + Option "$DSPLYMNU_MINFILTER", "r_minfilter", "OnOff" + Option "$DSPLYMNU_MAGFILTER", "r_magfilter", "OnOff" + Option "$DSPLYMNU_MIPMAP", "r_mipmap", "OnOff" StaticText " " Option "$DSPLYMNU_WIPETYPE", "wipetype", "Wipes"