diff --git a/src/r_draw.h b/src/r_draw.h index 55ad8a0cad..d09d0ab892 100644 --- a/src/r_draw.h +++ b/src/r_draw.h @@ -581,4 +581,22 @@ public: static void WaitForWorkers(); }; +class ApplySpecialColormapRGBACommand : public DrawerCommand +{ + BYTE *buffer; + int pitch; + int width; + int height; + int start_red; + int start_green; + int start_blue; + int end_red; + int end_green; + int end_blue; + +public: + ApplySpecialColormapRGBACommand(FSpecialColormap *colormap, DFrameBuffer *screen); + void Execute(DrawerThread *thread) override; +}; + #endif diff --git a/src/r_draw_rgba.cpp b/src/r_draw_rgba.cpp index a5d924dfa1..d5c275d0ee 100644 --- a/src/r_draw_rgba.cpp +++ b/src/r_draw_rgba.cpp @@ -3488,6 +3488,158 @@ public: } }; +ApplySpecialColormapRGBACommand::ApplySpecialColormapRGBACommand(FSpecialColormap *colormap, DFrameBuffer *screen) +{ + buffer = screen->GetBuffer(); + pitch = screen->GetPitch(); + width = screen->GetWidth(); + height = screen->GetHeight(); + + start_red = (int)(colormap->ColorizeStart[0] * 255); + start_green = (int)(colormap->ColorizeStart[1] * 255); + start_blue = (int)(colormap->ColorizeStart[2] * 255); + end_red = (int)(colormap->ColorizeEnd[0] * 255); + end_green = (int)(colormap->ColorizeEnd[1] * 255); + end_blue = (int)(colormap->ColorizeEnd[2] * 255); +} + +#ifdef NO_SSE +void ApplySpecialColormapRGBACommand::Execute(DrawerThread *thread) +{ + int y = thread->skipped_by_thread(0); + int count = thread->count_for_thread(0, height); + while (count > 0) + { + BYTE *pixels = buffer + y * pitch * 4; + for (int x = 0; x < width; x++) + { + int fg_red = pixels[2]; + int fg_green = pixels[1]; + int fg_blue = pixels[0]; + + int gray = (fg_red * 77 + fg_green * 143 + fg_blue * 37) >> 8; + gray += (gray >> 7); // gray*=256/255 + int inv_gray = 256 - gray; + + int red = clamp((start_red * inv_gray + end_red * gray) >> 8, 0, 255); + int green = clamp((start_green * inv_gray + end_green * gray) >> 8, 0, 255); + int blue = clamp((start_blue * inv_gray + end_blue * gray) >> 8, 0, 255); + + pixels[0] = (BYTE)blue; + pixels[1] = (BYTE)green; + pixels[2] = (BYTE)red; + pixels[3] = 0xff; + + pixels += 4; + } + y += thread->num_cores; + count--; + } +} +#else +void ApplySpecialColormapRGBACommand::Execute(DrawerThread *thread) +{ + int y = thread->skipped_by_thread(0); + int count = thread->count_for_thread(0, height); + __m128i gray_weight = _mm_set_epi16(256, 77, 143, 37, 256, 77, 143, 37); + __m128i start_end = _mm_set_epi16(255, start_red, start_green, start_blue, 255, end_red, end_green, end_blue); + while (count > 0) + { + BYTE *pixels = buffer + y * pitch * 4; + int sse_length = width / 4; + for (int x = 0; x < sse_length; x++) + { + // Unpack to integers: + __m128i p = _mm_loadu_si128((const __m128i*)pixels); + + __m128i p16_0 = _mm_unpacklo_epi8(p, _mm_setzero_si128()); + __m128i p16_1 = _mm_unpackhi_epi8(p, _mm_setzero_si128()); + + // Add gray weighting to colors + __m128i mullo0 = _mm_mullo_epi16(p16_0, gray_weight); + __m128i mullo1 = _mm_mullo_epi16(p16_1, gray_weight); + __m128i p32_0 = _mm_unpacklo_epi16(mullo0, _mm_setzero_si128()); + __m128i p32_1 = _mm_unpackhi_epi16(mullo0, _mm_setzero_si128()); + __m128i p32_2 = _mm_unpacklo_epi16(mullo1, _mm_setzero_si128()); + __m128i p32_3 = _mm_unpackhi_epi16(mullo1, _mm_setzero_si128()); + + // Transpose to get color components in individual vectors: + __m128 tmpx = _mm_castsi128_ps(p32_0); + __m128 tmpy = _mm_castsi128_ps(p32_1); + __m128 tmpz = _mm_castsi128_ps(p32_2); + __m128 tmpw = _mm_castsi128_ps(p32_3); + _MM_TRANSPOSE4_PS(tmpx, tmpy, tmpz, tmpw); + __m128i blue = _mm_castps_si128(tmpx); + __m128i green = _mm_castps_si128(tmpy); + __m128i red = _mm_castps_si128(tmpz); + __m128i alpha = _mm_castps_si128(tmpw); + + // Calculate gray and 256-gray values: + __m128i gray = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(red, green), blue), 8); + __m128i inv_gray = _mm_sub_epi32(_mm_set1_epi32(256), gray); + + // p32 = start * inv_gray + end * gray: + __m128i gray0 = _mm_shuffle_epi32(gray, _MM_SHUFFLE(0, 0, 0, 0)); + __m128i gray1 = _mm_shuffle_epi32(gray, _MM_SHUFFLE(1, 1, 1, 1)); + __m128i gray2 = _mm_shuffle_epi32(gray, _MM_SHUFFLE(2, 2, 2, 2)); + __m128i gray3 = _mm_shuffle_epi32(gray, _MM_SHUFFLE(3, 3, 3, 3)); + __m128i inv_gray0 = _mm_shuffle_epi32(inv_gray, _MM_SHUFFLE(0, 0, 0, 0)); + __m128i inv_gray1 = _mm_shuffle_epi32(inv_gray, _MM_SHUFFLE(1, 1, 1, 1)); + __m128i inv_gray2 = _mm_shuffle_epi32(inv_gray, _MM_SHUFFLE(2, 2, 2, 2)); + __m128i inv_gray3 = _mm_shuffle_epi32(inv_gray, _MM_SHUFFLE(3, 3, 3, 3)); + __m128i gray16_0 = _mm_packs_epi32(gray0, inv_gray0); + __m128i gray16_1 = _mm_packs_epi32(gray1, inv_gray1); + __m128i gray16_2 = _mm_packs_epi32(gray2, inv_gray2); + __m128i gray16_3 = _mm_packs_epi32(gray3, inv_gray3); + __m128i gray16_0_mullo = _mm_mullo_epi16(gray16_0, start_end); + __m128i gray16_1_mullo = _mm_mullo_epi16(gray16_1, start_end); + __m128i gray16_2_mullo = _mm_mullo_epi16(gray16_2, start_end); + __m128i gray16_3_mullo = _mm_mullo_epi16(gray16_3, start_end); + __m128i gray16_0_mulhi = _mm_mulhi_epi16(gray16_0, start_end); + __m128i gray16_1_mulhi = _mm_mulhi_epi16(gray16_1, start_end); + __m128i gray16_2_mulhi = _mm_mulhi_epi16(gray16_2, start_end); + __m128i gray16_3_mulhi = _mm_mulhi_epi16(gray16_3, start_end); + p32_0 = _mm_srli_epi32(_mm_add_epi32(_mm_unpacklo_epi16(gray16_0_mullo, gray16_0_mulhi), _mm_unpackhi_epi16(gray16_0_mullo, gray16_0_mulhi)), 8); + p32_1 = _mm_srli_epi32(_mm_add_epi32(_mm_unpacklo_epi16(gray16_1_mullo, gray16_1_mulhi), _mm_unpackhi_epi16(gray16_1_mullo, gray16_1_mulhi)), 8); + p32_2 = _mm_srli_epi32(_mm_add_epi32(_mm_unpacklo_epi16(gray16_2_mullo, gray16_2_mulhi), _mm_unpackhi_epi16(gray16_2_mullo, gray16_2_mulhi)), 8); + p32_3 = _mm_srli_epi32(_mm_add_epi32(_mm_unpacklo_epi16(gray16_3_mullo, gray16_3_mulhi), _mm_unpackhi_epi16(gray16_3_mullo, gray16_3_mulhi)), 8); + + p16_0 = _mm_packs_epi32(p32_0, p32_1); + p16_1 = _mm_packs_epi32(p32_2, p32_3); + p = _mm_packus_epi16(p16_0, p16_1); + + _mm_storeu_si128((__m128i*)pixels, p); + pixels += 16; + } + + for (int x = sse_length * 4; x < width; x++) + { + int fg_red = pixels[2]; + int fg_green = pixels[1]; + int fg_blue = pixels[0]; + + int gray = (fg_red * 77 + fg_green * 143 + fg_blue * 37) >> 8; + gray += (gray >> 7); // gray*=256/255 + int inv_gray = 256 - gray; + + int red = clamp((start_red * inv_gray + end_red * gray) >> 8, 0, 255); + int green = clamp((start_green * inv_gray + end_green * gray) >> 8, 0, 255); + int blue = clamp((start_blue * inv_gray + end_blue * gray) >> 8, 0, 255); + + pixels[0] = (BYTE)blue; + pixels[1] = (BYTE)green; + pixels[2] = (BYTE)red; + pixels[3] = 0xff; + + pixels += 4; + } + + y += thread->num_cores; + count--; + } +} +#endif + ///////////////////////////////////////////////////////////////////////////// void R_BeginDrawerCommands() diff --git a/src/r_main.cpp b/src/r_main.cpp index c1b78303b3..2eb0ce1419 100644 --- a/src/r_main.cpp +++ b/src/r_main.cpp @@ -459,7 +459,7 @@ void R_SetupColormap(player_t *player) if (player->fixedcolormap >= 0 && player->fixedcolormap < (int)SpecialColormaps.Size()) { realfixedcolormap = &SpecialColormaps[player->fixedcolormap]; - if (RenderTarget == screen && (DFrameBuffer *)screen->Accel2D && r_shadercolormaps) + if (RenderTarget == screen && (r_swtruecolor || ((DFrameBuffer *)screen->Accel2D && r_shadercolormaps))) { // Render everything fullbright. The copy to video memory will // apply the special colormap, so it won't be restricted to the @@ -935,7 +935,7 @@ void R_RenderActorView (AActor *actor, bool dontmaplines) // If we don't want shadered colormaps, NULL it now so that the // copy to the screen does not use a special colormap shader. - if (!r_shadercolormaps) + if (!r_shadercolormaps && !r_swtruecolor) { realfixedcolormap = NULL; } diff --git a/src/r_swrenderer.cpp b/src/r_swrenderer.cpp index 11f879c38c..c4558bf7c8 100644 --- a/src/r_swrenderer.cpp +++ b/src/r_swrenderer.cpp @@ -43,6 +43,7 @@ #include "textures/textures.h" #include "r_data/voxels.h" +EXTERN_CVAR(Bool, r_shadercolormaps) class FArchive; void R_SWRSetWindow(int windowSize, int fullWidth, int fullHeight, int stHeight, int trueratio); @@ -166,6 +167,13 @@ void FSoftwareRenderer::RenderView(player_t *player) R_RenderActorView (player->mo); // [RH] Let cameras draw onto textures that were visible this frame. FCanvasTextureInfo::UpdateAll (); + + // Apply special colormap if the target cannot do it + if (realfixedcolormap && r_swtruecolor && !(r_shadercolormaps && screen->Accel2D)) + { + DrawerCommandQueue::QueueCommand(realfixedcolormap, screen); + } + R_EndDrawerCommands(); } diff --git a/src/r_things.cpp b/src/r_things.cpp index 0c5e17b7cd..2dc0bdb6cd 100644 --- a/src/r_things.cpp +++ b/src/r_things.cpp @@ -1423,7 +1423,7 @@ void R_DrawPSprite (pspdef_t* psp, int pspnum, AActor *owner, double sx, double } } - if (realfixedcolormap != NULL) + if (realfixedcolormap != NULL && (!r_swtruecolor || (r_shadercolormaps && screen->Accel2D))) { // fixed color vis->Style.BaseColormap = realfixedcolormap; vis->Style.ColormapNum = 0;