// Emacs style mode select -*- C++ -*- //----------------------------------------------------------------------------- // // $Id:$ // // Copyright (C) 1993-1996 by id Software, Inc. // // This source is available for distribution and/or modification // only under the terms of the DOOM Source Code License as // published by id Software. All rights reserved. // // The source is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // FITNESS FOR A PARTICULAR PURPOSE. See the DOOM Source Code License // for more details. // // $Log:$ // // DESCRIPTION: // True color span/column drawing functions. // //----------------------------------------------------------------------------- #include #include "templates.h" #include "doomdef.h" #include "i_system.h" #include "w_wad.h" #include "r_local.h" #include "v_video.h" #include "doomstat.h" #include "st_stuff.h" #include "g_game.h" #include "g_level.h" #include "r_data/r_translate.h" #include "v_palette.h" #include "r_data/colormaps.h" #include "r_plane.h" #include "gi.h" #include "stats.h" #include "x86.h" #ifndef NO_SSE #include #endif #include extern int vlinebits; extern int mvlinebits; extern int tmvlinebits; extern "C" short spanend[MAXHEIGHT]; extern float rw_light; extern float rw_lightstep; extern int wallshade; ///////////////////////////////////////////////////////////////////////////// DrawerCommandQueue *DrawerCommandQueue::Instance() { static DrawerCommandQueue queue; return &queue; } DrawerCommandQueue::~DrawerCommandQueue() { StopThreads(); } void* DrawerCommandQueue::AllocMemory(size_t size) { // Make sure allocations remain 16-byte aligned size = (size + 15) / 16 * 16; auto queue = Instance(); if (queue->memorypool_pos + size > memorypool_size) return nullptr; void *data = queue->memorypool + queue->memorypool_pos; queue->memorypool_pos += size; return data; } void DrawerCommandQueue::Begin() { auto queue = Instance(); queue->Finish(); 
queue->threaded_render++; } void DrawerCommandQueue::End() { auto queue = Instance(); queue->Finish(); if (queue->threaded_render > 0) queue->threaded_render--; } void DrawerCommandQueue::Finish() { auto queue = Instance(); if (queue->commands.empty()) return; // Give worker threads something to do: std::unique_lock start_lock(queue->start_mutex); queue->active_commands.swap(queue->commands); queue->run_id++; start_lock.unlock(); queue->StartThreads(); queue->start_condition.notify_all(); // Do one thread ourselves: DrawerThread thread; thread.core = 0; thread.num_cores = queue->threads.size() + 1; for (int pass = 0; pass < queue->num_passes; pass++) { thread.pass_start_y = pass * queue->rows_in_pass; thread.pass_end_y = (pass + 1) * queue->rows_in_pass; if (pass + 1 == queue->num_passes) thread.pass_end_y = MAX(thread.pass_end_y, MAXHEIGHT); size_t size = queue->active_commands.size(); for (size_t i = 0; i < size; i++) { auto &command = queue->active_commands[i]; command->Execute(&thread); } } // Wait for everyone to finish: std::unique_lock end_lock(queue->end_mutex); queue->end_condition.wait(end_lock, [&]() { return queue->finished_threads == queue->threads.size(); }); // Clean up batch: for (auto &command : queue->active_commands) command->~DrawerCommand(); queue->active_commands.clear(); queue->memorypool_pos = 0; queue->finished_threads = 0; } void DrawerCommandQueue::StartThreads() { if (!threads.empty()) return; int num_threads = std::thread::hardware_concurrency(); if (num_threads == 0) num_threads = 4; threads.resize(num_threads - 1); for (int i = 0; i < num_threads - 1; i++) { DrawerCommandQueue *queue = this; DrawerThread *thread = &threads[i]; thread->core = i + 1; thread->num_cores = num_threads; thread->thread = std::thread([=]() { int run_id = 0; while (true) { // Wait until we are signalled to run: std::unique_lock start_lock(queue->start_mutex); queue->start_condition.wait(start_lock, [&]() { return queue->run_id != run_id || 
queue->shutdown_flag; }); if (queue->shutdown_flag) break; run_id = queue->run_id; start_lock.unlock(); // Do the work: for (int pass = 0; pass < queue->num_passes; pass++) { thread->pass_start_y = pass * queue->rows_in_pass; thread->pass_end_y = (pass + 1) * queue->rows_in_pass; if (pass + 1 == queue->num_passes) thread->pass_end_y = MAX(thread->pass_end_y, MAXHEIGHT); size_t size = queue->active_commands.size(); for (size_t i = 0; i < size; i++) { auto &command = queue->active_commands[i]; command->Execute(thread); } } // Notify main thread that we finished: std::unique_lock end_lock(queue->end_mutex); queue->finished_threads++; end_lock.unlock(); queue->end_condition.notify_all(); } }); } } void DrawerCommandQueue::StopThreads() { std::unique_lock lock(start_mutex); shutdown_flag = true; lock.unlock(); start_condition.notify_all(); for (auto &thread : threads) thread.thread.join(); threads.clear(); lock.lock(); shutdown_flag = false; } ///////////////////////////////////////////////////////////////////////////// class DrawColumnRGBACommand : public DrawerCommand { int dc_count; BYTE *dc_dest; DWORD dc_texturefrac; DWORD dc_iscale; fixed_t dc_light; const BYTE *dc_source; int dc_pitch; ShadeConstants dc_shade_constants; public: DrawColumnRGBACommand() { dc_count = ::dc_count; dc_dest = ::dc_dest; dc_texturefrac = ::dc_texturefrac; dc_iscale = ::dc_iscale; dc_light = ::dc_light; dc_source = ::dc_source; dc_pitch = ::dc_pitch; dc_shade_constants = ::dc_shade_constants; } void Execute(DrawerThread *thread) override { int count; uint32_t* dest; fixed_t frac; fixed_t fracstep; count = thread->count_for_thread(dc_dest_y, dc_count); // Zero length, column does not exceed a pixel. if (count <= 0) return; // Framebuffer destination address. 
dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; // Determine scaling, // which is the only mapping to be done. fracstep = dc_iscale * thread->num_cores; frac = dc_texturefrac + dc_iscale * thread->skipped_by_thread(dc_dest_y); { // [RH] Get local copies of these variables so that the compiler // has a better chance of optimizing this well. const BYTE *source = dc_source; int pitch = dc_pitch * thread->num_cores; // Inner loop that does the actual texture mapping, // e.g. a DDA-lile scaling. // This is as fast as it gets. do { *dest = shade_pal_index(source[frac >> FRACBITS], light, shade_constants); dest += pitch; frac += fracstep; } while (--count); } } }; class FillColumnRGBACommand : public DrawerCommand { int dc_count; BYTE *dc_dest; fixed_t dc_light; int dc_pitch; int dc_color; public: FillColumnRGBACommand() { dc_count = ::dc_count; dc_dest = ::dc_dest; dc_light = ::dc_light; dc_pitch = ::dc_pitch; dc_color = ::dc_color; } void Execute(DrawerThread *thread) override { int count; uint32_t* dest; count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 0) return; dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); uint32_t light = calc_light_multiplier(dc_light); { int pitch = dc_pitch * thread->num_cores; BYTE color = dc_color; do { *dest = shade_pal_index_simple(color, light); dest += pitch; } while (--count); } } }; class FillAddColumnRGBACommand : public DrawerCommand { int dc_count; BYTE *dc_dest; int dc_pitch; fixed_t dc_light; int dc_color; public: FillAddColumnRGBACommand() { dc_count = ::dc_count; dc_dest = ::dc_dest; dc_pitch = ::dc_pitch; dc_light = ::dc_light; dc_color = ::dc_color; } void Execute(DrawerThread *thread) override { int count; uint32_t *dest; count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 0) return; dest = thread->dest_for_thread(dc_dest_y, dc_pitch, 
(uint32_t*)dc_dest); int pitch = dc_pitch * thread->num_cores; uint32_t fg = shade_pal_index_simple(dc_color, calc_light_multiplier(dc_light)); uint32_t fg_red = (fg >> 24) & 0xff; uint32_t fg_green = (fg >> 16) & 0xff; uint32_t fg_blue = fg & 0xff; do { uint32_t bg_red = (*dest >> 16) & 0xff; uint32_t bg_green = (*dest >> 8) & 0xff; uint32_t bg_blue = (*dest) & 0xff; uint32_t red = (fg_red + bg_red + 1) / 2; uint32_t green = (fg_green + bg_green + 1) / 2; uint32_t blue = (fg_blue + bg_blue + 1) / 2; *dest = 0xff000000 | (red << 16) | (green << 8) | blue; dest += pitch; } while (--count); } }; class FillAddClampColumnRGBACommand : public DrawerCommand { int dc_count; BYTE *dc_dest; int dc_pitch; fixed_t dc_light; int dc_color; public: FillAddClampColumnRGBACommand() { dc_count = ::dc_count; dc_dest = ::dc_dest; dc_pitch = ::dc_pitch; dc_light = ::dc_light; dc_color = ::dc_color; } void Execute(DrawerThread *thread) override { int count; uint32_t *dest; count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 0) return; dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); int pitch = dc_pitch * thread->num_cores; uint32_t fg = shade_pal_index_simple(dc_color, calc_light_multiplier(dc_light)); uint32_t fg_red = (fg >> 24) & 0xff; uint32_t fg_green = (fg >> 16) & 0xff; uint32_t fg_blue = fg & 0xff; do { uint32_t bg_red = (*dest >> 16) & 0xff; uint32_t bg_green = (*dest >> 8) & 0xff; uint32_t bg_blue = (*dest) & 0xff; uint32_t red = clamp(fg_red + bg_red, 0, 255); uint32_t green = clamp(fg_green + bg_green, 0, 255); uint32_t blue = clamp(fg_blue + bg_blue, 0, 255); *dest = 0xff000000 | (red << 16) | (green << 8) | blue; dest += pitch; } while (--count); } }; class FillSubClampColumnRGBACommand : public DrawerCommand { int dc_count; BYTE *dc_dest; int dc_pitch; int dc_color; fixed_t dc_light; public: FillSubClampColumnRGBACommand() { dc_count = ::dc_count; dc_dest = ::dc_dest; dc_pitch = ::dc_pitch; dc_color = ::dc_color; dc_light = 
::dc_light; } void Execute(DrawerThread *thread) override { int count; uint32_t *dest; count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 0) return; dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); int pitch = dc_pitch * thread->num_cores; uint32_t fg = shade_pal_index_simple(dc_color, calc_light_multiplier(dc_light)); uint32_t fg_red = (fg >> 24) & 0xff; uint32_t fg_green = (fg >> 16) & 0xff; uint32_t fg_blue = fg & 0xff; do { uint32_t bg_red = (*dest >> 16) & 0xff; uint32_t bg_green = (*dest >> 8) & 0xff; uint32_t bg_blue = (*dest) & 0xff; uint32_t red = clamp(256 - fg_red + bg_red, 256, 256 + 255) - 255; uint32_t green = clamp(256 - fg_green + bg_green, 256, 256 + 255) - 255; uint32_t blue = clamp(256 - fg_blue + bg_blue, 256, 256 + 255) - 255; *dest = 0xff000000 | (red << 16) | (green << 8) | blue; dest += pitch; } while (--count); } }; class FillRevSubClampColumnRGBACommand : public DrawerCommand { int dc_count; BYTE *dc_dest; int dc_pitch; int dc_color; fixed_t dc_light; public: FillRevSubClampColumnRGBACommand() { dc_count = ::dc_count; dc_dest = ::dc_dest; dc_pitch = ::dc_pitch; dc_color = ::dc_color; dc_light = ::dc_light; } void Execute(DrawerThread *thread) override { int count; uint32_t *dest; count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 0) return; dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); int pitch = dc_pitch * thread->num_cores; uint32_t fg = shade_pal_index_simple(dc_color, calc_light_multiplier(dc_light)); uint32_t fg_red = (fg >> 24) & 0xff; uint32_t fg_green = (fg >> 16) & 0xff; uint32_t fg_blue = fg & 0xff; do { uint32_t bg_red = (*dest >> 16) & 0xff; uint32_t bg_green = (*dest >> 8) & 0xff; uint32_t bg_blue = (*dest) & 0xff; uint32_t red = clamp(256 + fg_red - bg_red, 256, 256 + 255) - 255; uint32_t green = clamp(256 + fg_green - bg_green, 256, 256 + 255) - 255; uint32_t blue = clamp(256 + fg_blue - bg_blue, 256, 256 + 255) - 255; *dest = 0xff000000 | 
(red << 16) | (green << 8) | blue; dest += pitch; } while (--count); } }; class DrawFuzzColumnRGBACommand : public DrawerCommand { int dc_x; int dc_yl; int dc_yh; BYTE *dc_destorg; int dc_pitch; int fuzzpos; int fuzzviewheight; public: DrawFuzzColumnRGBACommand() { dc_x = ::dc_x; dc_yl = ::dc_yl; dc_yh = ::dc_yh; dc_destorg = ::dc_destorg; dc_pitch = ::dc_pitch; fuzzpos = ::fuzzpos; fuzzviewheight = ::fuzzviewheight; } void Execute(DrawerThread *thread) override { int count; uint32_t *dest; // Adjust borders. Low... if (dc_yl == 0) dc_yl = 1; // .. and high. if (dc_yh > fuzzviewheight) dc_yh = fuzzviewheight; count = thread->count_for_thread(dc_yl, dc_yh - dc_yl + 1); // Zero length. if (count <= 0) return; dest = thread->dest_for_thread(dc_yl, dc_pitch, ylookup[dc_yl] + dc_x + (uint32_t*)dc_destorg); // Note: this implementation assumes this function is only used for the pinky shadow effect (i.e. no other fancy colormap than black) // I'm not sure if this is really always the case or not. { // [RH] Make local copies of global vars to try and improve // the optimizations made by the compiler. int pitch = dc_pitch * thread->num_cores; int fuzz = fuzzpos; int cnt; // [RH] Split this into three separate loops to minimize // the number of times fuzzpos needs to be clamped. 
if (fuzz) { cnt = MIN(FUZZTABLE - fuzz, count); count -= cnt; do { uint32_t bg = dest[fuzzoffset[fuzz++]]; uint32_t bg_red = (bg >> 16) & 0xff; uint32_t bg_green = (bg >> 8) & 0xff; uint32_t bg_blue = (bg) & 0xff; uint32_t red = bg_red * 3 / 4; uint32_t green = bg_green * 3 / 4; uint32_t blue = bg_blue * 3 / 4; *dest = 0xff000000 | (red << 16) | (green << 8) | blue; dest += pitch; } while (--cnt); } if (fuzz == FUZZTABLE || count > 0) { while (count >= FUZZTABLE) { fuzz = 0; cnt = FUZZTABLE; count -= FUZZTABLE; do { uint32_t bg = dest[fuzzoffset[fuzz++]]; uint32_t bg_red = (bg >> 16) & 0xff; uint32_t bg_green = (bg >> 8) & 0xff; uint32_t bg_blue = (bg) & 0xff; uint32_t red = bg_red * 3 / 4; uint32_t green = bg_green * 3 / 4; uint32_t blue = bg_blue * 3 / 4; *dest = 0xff000000 | (red << 16) | (green << 8) | blue; dest += pitch; } while (--cnt); } fuzz = 0; if (count > 0) { do { uint32_t bg = dest[fuzzoffset[fuzz++]]; uint32_t bg_red = (bg >> 16) & 0xff; uint32_t bg_green = (bg >> 8) & 0xff; uint32_t bg_blue = (bg) & 0xff; uint32_t red = bg_red * 3 / 4; uint32_t green = bg_green * 3 / 4; uint32_t blue = bg_blue * 3 / 4; *dest = 0xff000000 | (red << 16) | (green << 8) | blue; dest += pitch; } while (--count); } } fuzzpos = fuzz; } } }; class DrawAddColumnRGBACommand : public DrawerCommand { int dc_count; BYTE *dc_dest; DWORD dc_iscale; DWORD dc_texturefrac; const BYTE *dc_source; int dc_pitch; fixed_t dc_light; ShadeConstants dc_shade_constants; fixed_t dc_srcalpha; fixed_t dc_destalpha; public: DrawAddColumnRGBACommand() { dc_count = ::dc_count; dc_dest = ::dc_dest; dc_iscale = ::dc_iscale; dc_texturefrac = ::dc_texturefrac; dc_source = ::dc_source; dc_pitch = ::dc_pitch; dc_light = ::dc_light; dc_shade_constants = ::dc_shade_constants; dc_srcalpha = ::dc_srcalpha; dc_destalpha = ::dc_destalpha; } void Execute(DrawerThread *thread) override { int count; uint32_t *dest; fixed_t frac; fixed_t fracstep; count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 
0) return; dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); fracstep = dc_iscale * thread->num_cores; frac = dc_texturefrac + dc_iscale * thread->skipped_by_thread(dc_dest_y); { const BYTE *source = dc_source; int pitch = dc_pitch * thread->num_cores; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; uint32_t fg_alpha = dc_srcalpha >> (FRACBITS - 8); uint32_t bg_alpha = dc_destalpha >> (FRACBITS - 8); do { uint32_t fg = shade_pal_index(source[frac >> FRACBITS], light, shade_constants); uint32_t fg_red = (fg >> 16) & 0xff; uint32_t fg_green = (fg >> 8) & 0xff; uint32_t fg_blue = fg & 0xff; uint32_t bg_red = (*dest >> 16) & 0xff; uint32_t bg_green = (*dest >> 8) & 0xff; uint32_t bg_blue = (*dest) & 0xff; uint32_t red = clamp((fg_red * fg_alpha + bg_red * bg_alpha) / 256, 0, 255); uint32_t green = clamp((fg_green * fg_alpha + bg_green * bg_alpha) / 256, 0, 255); uint32_t blue = clamp((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255); *dest = 0xff000000 | (red << 16) | (green << 8) | blue; dest += pitch; frac += fracstep; } while (--count); } } }; class DrawTranslatedColumnRGBACommand : public DrawerCommand { int dc_count; fixed_t dc_light; ShadeConstants dc_shade_constants; BYTE *dc_dest; DWORD dc_iscale; DWORD dc_texturefrac; BYTE *dc_translation; const BYTE *dc_source; int dc_pitch; public: DrawTranslatedColumnRGBACommand() { dc_count = ::dc_count; dc_light = ::dc_light; dc_shade_constants = ::dc_shade_constants; dc_dest = ::dc_dest; dc_iscale = ::dc_iscale; dc_texturefrac = ::dc_texturefrac; dc_translation = ::dc_translation; dc_source = ::dc_source; dc_pitch = ::dc_pitch; } void Execute(DrawerThread *thread) override { int count; uint32_t* dest; fixed_t frac; fixed_t fracstep; count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 0) return; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; dest = 
thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); fracstep = dc_iscale * thread->num_cores; frac = dc_texturefrac + dc_iscale * thread->skipped_by_thread(dc_dest_y); { // [RH] Local copies of global vars to improve compiler optimizations BYTE *translation = dc_translation; const BYTE *source = dc_source; int pitch = dc_pitch * thread->num_cores; do { *dest = shade_pal_index(translation[source[frac >> FRACBITS]], light, shade_constants); dest += pitch; frac += fracstep; } while (--count); } } }; class DrawTlatedAddColumnRGBACommand : public DrawerCommand { int dc_count; fixed_t dc_light; ShadeConstants dc_shade_constants; BYTE *dc_dest; DWORD dc_iscale; DWORD dc_texturefrac; BYTE *dc_translation; const BYTE *dc_source; int dc_pitch; public: DrawTlatedAddColumnRGBACommand() { dc_count = ::dc_count; dc_light = ::dc_light; dc_shade_constants = ::dc_shade_constants; dc_dest = ::dc_dest; dc_iscale = ::dc_iscale; dc_texturefrac = ::dc_texturefrac; dc_translation = ::dc_translation; dc_source = ::dc_source; dc_pitch = ::dc_pitch; } void Execute(DrawerThread *thread) override { int count; uint32_t *dest; fixed_t frac; fixed_t fracstep; count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 0) return; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); fracstep = dc_iscale * thread->num_cores; frac = dc_texturefrac + dc_iscale * thread->skipped_by_thread(dc_dest_y); { BYTE *translation = dc_translation; const BYTE *source = dc_source; int pitch = dc_pitch * thread->num_cores; uint32_t fg_alpha = dc_srcalpha >> (FRACBITS - 8); uint32_t bg_alpha = dc_destalpha >> (FRACBITS - 8); do { uint32_t fg = shade_pal_index(translation[source[frac >> FRACBITS]], light, shade_constants); uint32_t fg_red = (fg >> 16) & 0xff; uint32_t fg_green = (fg >> 8) & 0xff; uint32_t fg_blue = fg & 0xff; uint32_t bg_red = (*dest >> 16) & 0xff; uint32_t 
bg_green = (*dest >> 8) & 0xff; uint32_t bg_blue = (*dest) & 0xff; uint32_t red = clamp((fg_red * fg_alpha + bg_red * bg_alpha) / 256, 0, 255); uint32_t green = clamp((fg_green * fg_alpha + bg_green * bg_alpha) / 256, 0, 255); uint32_t blue = clamp((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255); *dest = 0xff000000 | (red << 16) | (green << 8) | blue; dest += pitch; frac += fracstep; } while (--count); } } }; class DrawShadedColumnRGBACommand : public DrawerCommand { private: int dc_count; BYTE *dc_dest; DWORD dc_iscale; DWORD dc_texturefrac; fixed_t dc_light; const BYTE *dc_source; lighttable_t *dc_colormap; int dc_color; int dc_pitch; public: DrawShadedColumnRGBACommand() { dc_count = ::dc_count; dc_dest = ::dc_dest; dc_iscale = ::dc_iscale; dc_texturefrac = ::dc_texturefrac; dc_light = ::dc_light; dc_source = ::dc_source; dc_colormap = ::dc_colormap; dc_color = ::dc_color; dc_pitch = ::dc_pitch; } void Execute(DrawerThread *thread) override { int count; uint32_t *dest; fixed_t frac, fracstep; count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 0) return; dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); fracstep = dc_iscale * thread->num_cores; frac = dc_texturefrac + dc_iscale * thread->skipped_by_thread(dc_dest_y); uint32_t fg = shade_pal_index_simple(dc_color, calc_light_multiplier(dc_light)); uint32_t fg_red = (fg >> 16) & 0xff; uint32_t fg_green = (fg >> 8) & 0xff; uint32_t fg_blue = fg & 0xff; { const BYTE *source = dc_source; BYTE *colormap = dc_colormap; int pitch = dc_pitch * thread->num_cores; do { DWORD alpha = clamp(colormap[source[frac >> FRACBITS]], 0, 64); DWORD inv_alpha = 64 - alpha; uint32_t bg_red = (*dest >> 16) & 0xff; uint32_t bg_green = (*dest >> 8) & 0xff; uint32_t bg_blue = (*dest) & 0xff; uint32_t red = (fg_red * alpha + bg_red * inv_alpha) / 64; uint32_t green = (fg_green * alpha + bg_green * inv_alpha) / 64; uint32_t blue = (fg_blue * alpha + bg_blue * inv_alpha) / 64; *dest = 0xff000000 
| (red << 16) | (green << 8) | blue; dest += pitch; frac += fracstep; } while (--count); } } }; class DrawAddClampColumnRGBACommand : public DrawerCommand { int dc_count; BYTE *dc_dest; DWORD dc_iscale; DWORD dc_texturefrac; const BYTE *dc_source; int dc_pitch; fixed_t dc_light; ShadeConstants dc_shade_constants; fixed_t dc_srcalpha; fixed_t dc_destalpha; public: DrawAddClampColumnRGBACommand() { dc_count = ::dc_count; dc_dest = ::dc_dest; dc_iscale = ::dc_iscale; dc_texturefrac = ::dc_texturefrac; dc_source = ::dc_source; dc_pitch = ::dc_pitch; dc_light = ::dc_light; dc_shade_constants = ::dc_shade_constants; dc_srcalpha = ::dc_srcalpha; dc_destalpha = ::dc_destalpha; } void Execute(DrawerThread *thread) override { int count; uint32_t *dest; fixed_t frac; fixed_t fracstep; count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 0) return; dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); fracstep = dc_iscale * thread->num_cores; frac = dc_texturefrac + dc_iscale * thread->skipped_by_thread(dc_dest_y); { const BYTE *source = dc_source; int pitch = dc_pitch * thread->num_cores; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; uint32_t fg_alpha = dc_srcalpha >> (FRACBITS - 8); uint32_t bg_alpha = dc_destalpha >> (FRACBITS - 8); do { uint32_t fg = shade_pal_index(source[frac >> FRACBITS], light, shade_constants); uint32_t fg_red = (fg >> 16) & 0xff; uint32_t fg_green = (fg >> 8) & 0xff; uint32_t fg_blue = fg & 0xff; uint32_t bg_red = (*dest >> 16) & 0xff; uint32_t bg_green = (*dest >> 8) & 0xff; uint32_t bg_blue = (*dest) & 0xff; uint32_t red = clamp((fg_red * fg_alpha + bg_red * bg_alpha) / 256, 0, 255); uint32_t green = clamp((fg_green * fg_alpha + bg_green * bg_alpha) / 256, 0, 255); uint32_t blue = clamp((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255); *dest = 0xff000000 | (red << 16) | (green << 8) | blue; dest += pitch; frac += fracstep; } while (--count); } } }; 
class DrawAddClampTranslatedColumnRGBACommand : public DrawerCommand { int dc_count; BYTE *dc_dest; DWORD dc_iscale; DWORD dc_texturefrac; BYTE *dc_translation; const BYTE *dc_source; int dc_pitch; fixed_t dc_light; ShadeConstants dc_shade_constants; fixed_t dc_srcalpha; fixed_t dc_destalpha; public: DrawAddClampTranslatedColumnRGBACommand() { dc_count = ::dc_count; dc_dest = ::dc_dest; dc_iscale = ::dc_iscale; dc_texturefrac = ::dc_texturefrac; dc_translation = ::dc_translation; dc_source = ::dc_source; dc_pitch = ::dc_pitch; dc_light = ::dc_light; dc_shade_constants = ::dc_shade_constants; dc_srcalpha = ::dc_srcalpha; dc_destalpha = ::dc_destalpha; } void Execute(DrawerThread *thread) override { int count; uint32_t *dest; fixed_t frac; fixed_t fracstep; count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 0) return; dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); fracstep = dc_iscale * thread->num_cores; frac = dc_texturefrac + dc_iscale * thread->skipped_by_thread(dc_dest_y); { BYTE *translation = dc_translation; const BYTE *source = dc_source; int pitch = dc_pitch * thread->num_cores; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; uint32_t fg_alpha = dc_srcalpha >> (FRACBITS - 8); uint32_t bg_alpha = dc_destalpha >> (FRACBITS - 8); do { uint32_t fg = shade_pal_index(translation[source[frac >> FRACBITS]], light, shade_constants); uint32_t fg_red = (fg >> 16) & 0xff; uint32_t fg_green = (fg >> 8) & 0xff; uint32_t fg_blue = fg & 0xff; uint32_t bg_red = (*dest >> 16) & 0xff; uint32_t bg_green = (*dest >> 8) & 0xff; uint32_t bg_blue = (*dest) & 0xff; uint32_t red = clamp((fg_red * fg_alpha + bg_red * bg_alpha) / 256, 0, 255); uint32_t green = clamp((fg_green * fg_alpha + bg_green * bg_alpha) / 256, 0, 255); uint32_t blue = clamp((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255); *dest = 0xff000000 | (red << 16) | (green << 8) | blue; dest += pitch; frac += 
fracstep; } while (--count); } } }; class DrawSubClampColumnRGBACommand : public DrawerCommand { int dc_count; BYTE *dc_dest; DWORD dc_iscale; DWORD dc_texturefrac; const BYTE *dc_source; int dc_pitch; fixed_t dc_light; ShadeConstants dc_shade_constants; fixed_t dc_srcalpha; fixed_t dc_destalpha; public: DrawSubClampColumnRGBACommand() { dc_count = ::dc_count; dc_dest = ::dc_dest; dc_iscale = ::dc_iscale; dc_texturefrac = ::dc_texturefrac; dc_source = ::dc_source; dc_pitch = ::dc_pitch; dc_light = ::dc_light; dc_shade_constants = ::dc_shade_constants; dc_srcalpha = ::dc_srcalpha; dc_destalpha = ::dc_destalpha; } void Execute(DrawerThread *thread) override { int count; uint32_t *dest; fixed_t frac; fixed_t fracstep; count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 0) return; dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); fracstep = dc_iscale * thread->num_cores; frac = dc_texturefrac + dc_iscale * thread->skipped_by_thread(dc_dest_y); { const BYTE *source = dc_source; int pitch = dc_pitch * thread->num_cores; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; uint32_t fg_alpha = dc_srcalpha >> (FRACBITS - 8); uint32_t bg_alpha = dc_destalpha >> (FRACBITS - 8); do { uint32_t fg = shade_pal_index(source[frac >> FRACBITS], light, shade_constants); uint32_t fg_red = (fg >> 16) & 0xff; uint32_t fg_green = (fg >> 8) & 0xff; uint32_t fg_blue = fg & 0xff; uint32_t bg_red = (*dest >> 16) & 0xff; uint32_t bg_green = (*dest >> 8) & 0xff; uint32_t bg_blue = (*dest) & 0xff; uint32_t red = clamp((0x10000 - fg_red * fg_alpha + bg_red * bg_alpha) / 256, 256, 256 + 255) - 256; uint32_t green = clamp((0x10000 - fg_green * fg_alpha + bg_green * bg_alpha) / 256, 256, 256 + 255) - 256; uint32_t blue = clamp((0x10000 - fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256; *dest = 0xff000000 | (red << 16) | (green << 8) | blue; dest += pitch; frac += fracstep; } while 
(--count); } } }; class DrawSubClampTranslatedColumnRGBACommand : public DrawerCommand { int dc_count; BYTE *dc_dest; DWORD dc_iscale; DWORD dc_texturefrac; const BYTE *dc_source; int dc_pitch; fixed_t dc_light; ShadeConstants dc_shade_constants; fixed_t dc_srcalpha; fixed_t dc_destalpha; BYTE *dc_translation; public: DrawSubClampTranslatedColumnRGBACommand() { dc_count = ::dc_count; dc_dest = ::dc_dest; dc_iscale = ::dc_iscale; dc_texturefrac = ::dc_texturefrac; dc_source = ::dc_source; dc_pitch = ::dc_pitch; dc_light = ::dc_light; dc_shade_constants = ::dc_shade_constants; dc_srcalpha = ::dc_srcalpha; dc_destalpha = ::dc_destalpha; dc_translation = ::dc_translation; } void Execute(DrawerThread *thread) override { int count; uint32_t *dest; fixed_t frac; fixed_t fracstep; count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 0) return; dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); fracstep = dc_iscale * thread->num_cores; frac = dc_texturefrac + dc_iscale * thread->skipped_by_thread(dc_dest_y); { BYTE *translation = dc_translation; const BYTE *source = dc_source; int pitch = dc_pitch * thread->num_cores; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; uint32_t fg_alpha = dc_srcalpha >> (FRACBITS - 8); uint32_t bg_alpha = dc_destalpha >> (FRACBITS - 8); do { uint32_t fg = shade_pal_index(translation[source[frac >> FRACBITS]], light, shade_constants); uint32_t fg_red = (fg >> 16) & 0xff; uint32_t fg_green = (fg >> 8) & 0xff; uint32_t fg_blue = fg & 0xff; uint32_t bg_red = (*dest >> 16) & 0xff; uint32_t bg_green = (*dest >> 8) & 0xff; uint32_t bg_blue = (*dest) & 0xff; uint32_t red = clamp((0x10000 - fg_red * fg_alpha + bg_red * bg_alpha) / 256, 256, 256 + 255) - 256; uint32_t green = clamp((0x10000 - fg_green * fg_alpha + bg_green * bg_alpha) / 256, 256, 256 + 255) - 256; uint32_t blue = clamp((0x10000 - fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 256, 256 + 255) - 
256; *dest = 0xff000000 | (red << 16) | (green << 8) | blue; dest += pitch; frac += fracstep; } while (--count); } } }; class DrawRevSubClampColumnRGBACommand : public DrawerCommand { int dc_count; BYTE *dc_dest; DWORD dc_iscale; DWORD dc_texturefrac; const BYTE *dc_source; int dc_pitch; fixed_t dc_light; ShadeConstants dc_shade_constants; fixed_t dc_srcalpha; fixed_t dc_destalpha; public: DrawRevSubClampColumnRGBACommand() { dc_count = ::dc_count; dc_dest = ::dc_dest; dc_iscale = ::dc_iscale; dc_texturefrac = ::dc_texturefrac; dc_source = ::dc_source; dc_pitch = ::dc_pitch; dc_light = ::dc_light; dc_shade_constants = ::dc_shade_constants; dc_srcalpha = ::dc_srcalpha; dc_destalpha = ::dc_destalpha; } void Execute(DrawerThread *thread) override { int count; uint32_t *dest; fixed_t frac; fixed_t fracstep; count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 0) return; dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); fracstep = dc_iscale * thread->num_cores; frac = dc_texturefrac + dc_iscale * thread->skipped_by_thread(dc_dest_y); { const BYTE *source = dc_source; int pitch = dc_pitch * thread->num_cores; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; uint32_t fg_alpha = dc_srcalpha >> (FRACBITS - 8); uint32_t bg_alpha = dc_destalpha >> (FRACBITS - 8); do { uint32_t fg = shade_pal_index(source[frac >> FRACBITS], light, shade_constants); uint32_t fg_red = (fg >> 16) & 0xff; uint32_t fg_green = (fg >> 8) & 0xff; uint32_t fg_blue = fg & 0xff; uint32_t bg_red = (*dest >> 16) & 0xff; uint32_t bg_green = (*dest >> 8) & 0xff; uint32_t bg_blue = (*dest) & 0xff; uint32_t red = clamp((0x10000 + fg_red * fg_alpha - bg_red * bg_alpha) / 256, 256, 256 + 255) - 256; uint32_t green = clamp((0x10000 + fg_green * fg_alpha - bg_green * bg_alpha) / 256, 256, 256 + 255) - 256; uint32_t blue = clamp((0x10000 + fg_blue * fg_alpha - bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256; *dest = 
0xff000000 | (red << 16) | (green << 8) | blue; dest += pitch; frac += fracstep; } while (--count); } } }; class DrawRevSubClampTranslatedColumnRGBACommand : public DrawerCommand { int dc_count; BYTE *dc_dest; DWORD dc_iscale; DWORD dc_texturefrac; const BYTE *dc_source; int dc_pitch; fixed_t dc_light; ShadeConstants dc_shade_constants; fixed_t dc_srcalpha; fixed_t dc_destalpha; BYTE *dc_translation; public: DrawRevSubClampTranslatedColumnRGBACommand() { dc_count = ::dc_count; dc_dest = ::dc_dest; dc_iscale = ::dc_iscale; dc_texturefrac = ::dc_texturefrac; dc_source = ::dc_source; dc_pitch = ::dc_pitch; dc_light = ::dc_light; dc_shade_constants = ::dc_shade_constants; dc_srcalpha = ::dc_srcalpha; dc_destalpha = ::dc_destalpha; dc_translation = ::dc_translation; } void Execute(DrawerThread *thread) override { int count; uint32_t *dest; fixed_t frac; fixed_t fracstep; count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 0) return; dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); fracstep = dc_iscale * thread->num_cores; frac = dc_texturefrac + dc_iscale * thread->skipped_by_thread(dc_dest_y); { BYTE *translation = dc_translation; const BYTE *source = dc_source; int pitch = dc_pitch * thread->num_cores; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; uint32_t fg_alpha = dc_srcalpha >> (FRACBITS - 8); uint32_t bg_alpha = dc_destalpha >> (FRACBITS - 8); do { uint32_t fg = shade_pal_index(translation[source[frac >> FRACBITS]], light, shade_constants); uint32_t fg_red = (fg >> 16) & 0xff; uint32_t fg_green = (fg >> 8) & 0xff; uint32_t fg_blue = fg & 0xff; uint32_t bg_red = (*dest >> 16) & 0xff; uint32_t bg_green = (*dest >> 8) & 0xff; uint32_t bg_blue = (*dest) & 0xff; uint32_t red = clamp((0x10000 + fg_red * fg_alpha - bg_red * bg_alpha) / 256, 256, 256 + 255) - 256; uint32_t green = clamp((0x10000 + fg_green * fg_alpha - bg_green * bg_alpha) / 256, 256, 256 + 255) - 256; 
// Deferred drawer for one horizontal span of a flat made of 32-bit texels.
// The constructor copies the global ds_* span state; Execute() later writes
// pixels ds_x1..ds_x2 of row ds_y, shading each texel with shade_bgra (or the
// SSE_SHADE* macros when SSE is enabled).
class DrawSpanRGBACommand : public DrawerCommand
{
	const uint32_t *ds_source;
	fixed_t ds_xfrac;
	fixed_t ds_yfrac;
	fixed_t ds_xstep;
	fixed_t ds_ystep;
	int ds_x1;
	int ds_x2;
	int ds_y;
	int ds_xbits;
	int ds_ybits;
	BYTE *dc_destorg;
	fixed_t ds_light;
	ShadeConstants ds_shade_constants;

public:
	// Snapshots the global span state at the moment the command is created.
	DrawSpanRGBACommand()
	{
		ds_source = (const uint32_t*)::ds_source;
		ds_xfrac = ::ds_xfrac;
		ds_yfrac = ::ds_yfrac;
		ds_xstep = ::ds_xstep;
		ds_ystep = ::ds_ystep;
		ds_x1 = ::ds_x1;
		ds_x2 = ::ds_x2;
		ds_y = ::ds_y;
		ds_xbits = ::ds_xbits;
		ds_ybits = ::ds_ybits;
		dc_destorg = ::dc_destorg;
		ds_light = ::ds_light;
		ds_shade_constants = ::ds_shade_constants;
	}

#ifdef NO_SSE
	// Scalar implementation: one texel per loop iteration.
	void Execute(DrawerThread *thread) override
	{
		// A span is a single row; threads that skip this row do nothing.
		if (thread->line_skipped_by_thread(ds_y))
			return;

		dsfixed_t xfrac;
		dsfixed_t yfrac;
		dsfixed_t xstep;
		dsfixed_t ystep;
		uint32_t* dest;
		const uint32_t* source = ds_source;
		int count;
		int spot;

		xfrac = ds_xfrac;
		yfrac = ds_yfrac;

		dest = ylookup[ds_y] + ds_x1 + (uint32_t*)dc_destorg;

		// Inclusive pixel range; the do/while loops below assume count >= 1.
		count = ds_x2 - ds_x1 + 1;

		xstep = ds_xstep;
		ystep = ds_ystep;

		uint32_t light = calc_light_multiplier(ds_light);
		ShadeConstants shade_constants = ds_shade_constants;

		if (ds_xbits == 6 && ds_ybits == 6)
		{
			// 64x64 is the most common case by far, so special case it.
			do
			{
				// Texel index: top 6 bits of xfrac select the 64-texel group,
				// top 6 bits of yfrac select the texel within it.
				spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));

				// Look up the texel and write the shaded pixel.
				*dest++ = shade_bgra(source[spot], light, shade_constants);

				// Advance the texture coordinates.
				xfrac += xstep;
				yfrac += ystep;
			} while (--count);
		}
		else
		{
			// General case: same indexing with shifts/mask derived from the bit counts.
			BYTE yshift = 32 - ds_ybits;
			BYTE xshift = yshift - ds_xbits;
			int xmask = ((1 << ds_xbits) - 1) << ds_ybits;
			do
			{
				// Current texture index in u,v.
				spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);

				// Look up the texel and write the shaded pixel.
				*dest++ = shade_bgra(source[spot], light, shade_constants);

				// Advance the texture coordinates.
				xfrac += xstep;
				yfrac += ystep;
			} while (--count);
		}
	}
#else
	// SSE implementation: shades four texels per iteration, then finishes the
	// remaining 0-3 pixels with the scalar shade_bgra path.
	// NOTE(review): SSE_SHADE_SIMPLE_INIT / SSE_SHADE_INIT / SSE_SHADE_SIMPLE /
	// SSE_SHADE are macros defined outside this chunk; their exact effect is
	// not visible here.
	void Execute(DrawerThread *thread) override
	{
		// A span is a single row; threads that skip this row do nothing.
		if (thread->line_skipped_by_thread(ds_y))
			return;

		dsfixed_t xfrac;
		dsfixed_t yfrac;
		dsfixed_t xstep;
		dsfixed_t ystep;
		uint32_t* dest;
		const uint32_t* source = ds_source;
		int count;
		int spot;

		xfrac = ds_xfrac;
		yfrac = ds_yfrac;

		dest = ylookup[ds_y] + ds_x1 + (uint32_t*)dc_destorg;

		count = ds_x2 - ds_x1 + 1;

		xstep = ds_xstep;
		ystep = ds_ystep;

		uint32_t light = calc_light_multiplier(ds_light);
		ShadeConstants shade_constants = ds_shade_constants;

		if (ds_xbits == 6 && ds_ybits == 6)
		{
			// 64x64 is the most common case by far, so special case it.

			// Split the span into groups of four pixels plus a scalar remainder.
			int sse_count = count / 4;
			count -= sse_count * 4;

			if (shade_constants.simple_shade)
			{
				SSE_SHADE_SIMPLE_INIT(light);

				while (sse_count--)
				{
					// Gather four consecutive texels, stepping u,v after each.
					spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
					uint32_t p0 = source[spot];
					xfrac += xstep;
					yfrac += ystep;

					spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
					uint32_t p1 = source[spot];
					xfrac += xstep;
					yfrac += ystep;

					spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
					uint32_t p2 = source[spot];
					xfrac += xstep;
					yfrac += ystep;

					spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
					uint32_t p3 = source[spot];
					xfrac += xstep;
					yfrac += ystep;

					// Pack with p0 in the lowest lane so it lands in dest[0],
					// shade all four, and store them unaligned.
					__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
					SSE_SHADE_SIMPLE(fg);
					_mm_storeu_si128((__m128i*)dest, fg);

					// Move on to the next four destination pixels.
					dest += 4;
				}
			}
			else
			{
				SSE_SHADE_INIT(light, shade_constants);

				while (sse_count--)
				{
					// Gather four consecutive texels, stepping u,v after each.
					spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
					uint32_t p0 = source[spot];
					xfrac += xstep;
					yfrac += ystep;

					spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
					uint32_t p1 = source[spot];
					xfrac += xstep;
					yfrac += ystep;

					spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
					uint32_t p2 = source[spot];
					xfrac += xstep;
					yfrac += ystep;

					spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));
					uint32_t p3 = source[spot];
					xfrac += xstep;
					yfrac += ystep;

					// Pack with p0 in the lowest lane so it lands in dest[0],
					// shade all four, and store them unaligned.
					__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
					SSE_SHADE(fg, shade_constants);
					_mm_storeu_si128((__m128i*)dest, fg);

					// Move on to the next four destination pixels.
					dest += 4;
				}
			}

			// Nothing left after the groups of four.
			if (count == 0)
				return;

			// Scalar tail for the remaining 1-3 pixels.
			do
			{
				// Current texture index in u,v.
				spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6));

				// Look up the texel and write the shaded pixel.
				*dest++ = shade_bgra(source[spot], light, shade_constants);

				// Advance the texture coordinates.
				xfrac += xstep;
				yfrac += ystep;
			} while (--count);
		}
		else
		{
			// General case: shifts/mask derived from the texture bit counts.
			BYTE yshift = 32 - ds_ybits;
			BYTE xshift = yshift - ds_xbits;
			int xmask = ((1 << ds_xbits) - 1) << ds_ybits;

			// Split the span into groups of four pixels plus a scalar remainder.
			int sse_count = count / 4;
			count -= sse_count * 4;

			if (shade_constants.simple_shade)
			{
				SSE_SHADE_SIMPLE_INIT(light);

				while (sse_count--)
				{
					// Gather four consecutive texels, stepping u,v after each.
					spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
					uint32_t p0 = source[spot];
					xfrac += xstep;
					yfrac += ystep;

					spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
					uint32_t p1 = source[spot];
					xfrac += xstep;
					yfrac += ystep;

					spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
					uint32_t p2 = source[spot];
					xfrac += xstep;
					yfrac += ystep;

					spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
					uint32_t p3 = source[spot];
					xfrac += xstep;
					yfrac += ystep;

					// Pack (p0 in the lowest lane), shade and store four pixels.
					__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
					SSE_SHADE_SIMPLE(fg);
					_mm_storeu_si128((__m128i*)dest, fg);
					dest += 4;
				}
			}
			else
			{
				SSE_SHADE_INIT(light, shade_constants);

				while (sse_count--)
				{
					// Gather four consecutive texels, stepping u,v after each.
					spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
					uint32_t p0 = source[spot];
					xfrac += xstep;
					yfrac += ystep;

					spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
					uint32_t p1 = source[spot];
					xfrac += xstep;
					yfrac += ystep;

					spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
					uint32_t p2 = source[spot];
					xfrac += xstep;
					yfrac += ystep;

					spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);
					uint32_t p3 = source[spot];
					xfrac += xstep;
					yfrac += ystep;

					// Pack (p0 in the lowest lane), shade and store four pixels.
					__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
					SSE_SHADE(fg, shade_constants);
					_mm_storeu_si128((__m128i*)dest, fg);
					dest += 4;
				}
			}

			// Nothing left after the groups of four.
			if (count == 0)
				return;

			// Scalar tail for the remaining 1-3 pixels.
			do
			{
				// Current texture index in u,v.
				spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift);

				// Look up the texel and write the shaded pixel.
				*dest++ = shade_bgra(source[spot], light, shade_constants);

				// Advance the texture coordinates.
				xfrac += xstep;
				yfrac += ystep;
			} while (--count);
		}
	}
#endif
};
xshift) & xmask) + (yfrac >> yshift); uint32_t p0 = source[spot]; xfrac += xstep; yfrac += ystep; spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); uint32_t p1 = source[spot]; xfrac += xstep; yfrac += ystep; spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); uint32_t p2 = source[spot]; xfrac += xstep; yfrac += ystep; spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); uint32_t p3 = source[spot]; xfrac += xstep; yfrac += ystep; // Lookup pixel from flat texture tile __m128i fg = _mm_set_epi32(p3, p2, p1, p0); SSE_SHADE(fg, shade_constants); _mm_storeu_si128((__m128i*)dest, fg); dest += 4; } } if (count == 0) return; do { // Current texture index in u,v. spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); // Lookup pixel from flat texture tile *dest++ = shade_bgra(source[spot], light, shade_constants); // Next step in u,v. xfrac += xstep; yfrac += ystep; } while (--count); } } #endif }; class DrawSpanMaskedRGBACommand : public DrawerCommand { const uint32_t *ds_source; fixed_t ds_light; ShadeConstants ds_shade_constants; fixed_t ds_xfrac; fixed_t ds_yfrac; BYTE *dc_destorg; int ds_x1; int ds_x2; int ds_y1; int ds_y; fixed_t ds_xstep; fixed_t ds_ystep; int ds_xbits; int ds_ybits; public: DrawSpanMaskedRGBACommand() { ds_source = (const uint32_t*)::ds_source; ds_light = ::ds_light; ds_shade_constants = ::ds_shade_constants; ds_xfrac = ::ds_xfrac; ds_yfrac = ::ds_yfrac; dc_destorg = ::dc_destorg; ds_x1 = ::ds_x1; ds_x2 = ::ds_x2; ds_y = ::ds_y; ds_xstep = ::ds_xstep; ds_ystep = ::ds_ystep; ds_xbits = ::ds_xbits; ds_ybits = ::ds_ybits; } void Execute(DrawerThread *thread) override { if (thread->line_skipped_by_thread(ds_y)) return; dsfixed_t xfrac; dsfixed_t yfrac; dsfixed_t xstep; dsfixed_t ystep; uint32_t* dest; const uint32_t* source = ds_source; int count; int spot; uint32_t light = calc_light_multiplier(ds_light); ShadeConstants shade_constants = ds_shade_constants; xfrac = ds_xfrac; yfrac = ds_yfrac; dest = ylookup[ds_y] + ds_x1 + 
(uint32_t*)dc_destorg; count = ds_x2 - ds_x1 + 1; xstep = ds_xstep; ystep = ds_ystep; if (ds_xbits == 6 && ds_ybits == 6) { // 64x64 is the most common case by far, so special case it. do { uint32_t texdata; spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); texdata = source[spot]; if (texdata != 0) { *dest = shade_bgra(texdata, light, shade_constants); } dest++; xfrac += xstep; yfrac += ystep; } while (--count); } else { BYTE yshift = 32 - ds_ybits; BYTE xshift = yshift - ds_xbits; int xmask = ((1 << ds_xbits) - 1) << ds_ybits; do { uint32_t texdata; spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); texdata = source[spot]; if (texdata != 0) { *dest = shade_bgra(texdata, light, shade_constants); } dest++; xfrac += xstep; yfrac += ystep; } while (--count); } } }; class DrawSpanTranslucentRGBACommand : public DrawerCommand { const uint32_t *ds_source; fixed_t ds_light; ShadeConstants ds_shade_constants; fixed_t ds_xfrac; fixed_t ds_yfrac; BYTE *dc_destorg; int ds_x1; int ds_x2; int ds_y1; int ds_y; fixed_t ds_xstep; fixed_t ds_ystep; int ds_xbits; int ds_ybits; public: DrawSpanTranslucentRGBACommand() { ds_source = (const uint32_t *)::ds_source; ds_light = ::ds_light; ds_shade_constants = ::ds_shade_constants; ds_xfrac = ::ds_xfrac; ds_yfrac = ::ds_yfrac; dc_destorg = ::dc_destorg; ds_x1 = ::ds_x1; ds_x2 = ::ds_x2; ds_y = ::ds_y; ds_xstep = ::ds_xstep; ds_ystep = ::ds_ystep; ds_xbits = ::ds_xbits; ds_ybits = ::ds_ybits; } void Execute(DrawerThread *thread) override { if (thread->line_skipped_by_thread(ds_y)) return; dsfixed_t xfrac; dsfixed_t yfrac; dsfixed_t xstep; dsfixed_t ystep; uint32_t* dest; const uint32_t* source = ds_source; int count; int spot; xfrac = ds_xfrac; yfrac = ds_yfrac; dest = ylookup[ds_y] + ds_x1 + (uint32_t*)dc_destorg; count = ds_x2 - ds_x1 + 1; xstep = ds_xstep; ystep = ds_ystep; uint32_t light = calc_light_multiplier(ds_light); ShadeConstants shade_constants = ds_shade_constants; uint32_t fg_alpha = dc_srcalpha >> 
(FRACBITS - 8); uint32_t bg_alpha = dc_destalpha >> (FRACBITS - 8); if (ds_xbits == 6 && ds_ybits == 6) { // 64x64 is the most common case by far, so special case it. do { spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); uint32_t fg = shade_bgra(source[spot], light, shade_constants); uint32_t fg_red = (fg >> 16) & 0xff; uint32_t fg_green = (fg >> 8) & 0xff; uint32_t fg_blue = (fg) & 0xff; uint32_t bg_red = (*dest >> 16) & 0xff; uint32_t bg_green = (*dest >> 8) & 0xff; uint32_t bg_blue = (*dest) & 0xff; uint32_t red = (fg_red * fg_alpha + bg_red * bg_alpha) / 256; uint32_t green = (fg_green * fg_alpha + bg_green * bg_alpha) / 256; uint32_t blue = (fg_blue * fg_alpha + bg_blue * bg_alpha) / 256; *dest++ = 0xff000000 | (red << 16) | (green << 8) | blue; xfrac += xstep; yfrac += ystep; } while (--count); } else { BYTE yshift = 32 - ds_ybits; BYTE xshift = yshift - ds_xbits; int xmask = ((1 << ds_xbits) - 1) << ds_ybits; do { spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); uint32_t fg = shade_bgra(source[spot], light, shade_constants); uint32_t fg_red = (fg >> 16) & 0xff; uint32_t fg_green = (fg >> 8) & 0xff; uint32_t fg_blue = (fg) & 0xff; uint32_t bg_red = (*dest >> 16) & 0xff; uint32_t bg_green = (*dest >> 8) & 0xff; uint32_t bg_blue = (*dest) & 0xff; uint32_t red = (fg_red * fg_alpha + bg_red * bg_alpha) / 256; uint32_t green = (fg_green * fg_alpha + bg_green * bg_alpha) / 256; uint32_t blue = (fg_blue * fg_alpha + bg_blue * bg_alpha) / 256; *dest++ = 0xff000000 | (red << 16) | (green << 8) | blue; xfrac += xstep; yfrac += ystep; } while (--count); } } }; class DrawSpanMaskedTranslucentRGBACommand : public DrawerCommand { const uint32_t *ds_source; fixed_t ds_light; ShadeConstants ds_shade_constants; fixed_t ds_xfrac; fixed_t ds_yfrac; BYTE *dc_destorg; int ds_x1; int ds_x2; int ds_y1; int ds_y; fixed_t ds_xstep; fixed_t ds_ystep; int ds_xbits; int ds_ybits; public: DrawSpanMaskedTranslucentRGBACommand() { ds_source = (const 
uint32_t*)::ds_source; ds_light = ::ds_light; ds_shade_constants = ::ds_shade_constants; ds_xfrac = ::ds_xfrac; ds_yfrac = ::ds_yfrac; dc_destorg = ::dc_destorg; ds_x1 = ::ds_x1; ds_x2 = ::ds_x2; ds_y = ::ds_y; ds_xstep = ::ds_xstep; ds_ystep = ::ds_ystep; ds_xbits = ::ds_xbits; ds_ybits = ::ds_ybits; } void Execute(DrawerThread *thread) override { if (thread->line_skipped_by_thread(ds_y)) return; dsfixed_t xfrac; dsfixed_t yfrac; dsfixed_t xstep; dsfixed_t ystep; uint32_t* dest; const uint32_t* source = ds_source; int count; int spot; uint32_t light = calc_light_multiplier(ds_light); ShadeConstants shade_constants = ds_shade_constants; uint32_t fg_alpha = dc_srcalpha >> (FRACBITS - 8); uint32_t bg_alpha = dc_destalpha >> (FRACBITS - 8); xfrac = ds_xfrac; yfrac = ds_yfrac; dest = ylookup[ds_y] + ds_x1 + (uint32_t*)dc_destorg; count = ds_x2 - ds_x1 + 1; xstep = ds_xstep; ystep = ds_ystep; if (ds_xbits == 6 && ds_ybits == 6) { // 64x64 is the most common case by far, so special case it. do { uint32_t texdata; spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); texdata = source[spot]; if (texdata != 0) { uint32_t fg = shade_bgra(texdata, light, shade_constants); uint32_t fg_red = (fg >> 16) & 0xff; uint32_t fg_green = (fg >> 8) & 0xff; uint32_t fg_blue = (fg) & 0xff; uint32_t bg_red = (*dest >> 16) & 0xff; uint32_t bg_green = (*dest >> 8) & 0xff; uint32_t bg_blue = (*dest) & 0xff; uint32_t red = (fg_red * fg_alpha + bg_red * bg_alpha) / 256; uint32_t green = (fg_green * fg_alpha + bg_green * bg_alpha) / 256; uint32_t blue = (fg_blue * fg_alpha + bg_blue * bg_alpha) / 256; *dest = 0xff000000 | (red << 16) | (green << 8) | blue; } dest++; xfrac += xstep; yfrac += ystep; } while (--count); } else { BYTE yshift = 32 - ds_ybits; BYTE xshift = yshift - ds_xbits; int xmask = ((1 << ds_xbits) - 1) << ds_ybits; do { uint32_t texdata; spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); texdata = source[spot]; if (texdata != 0) { uint32_t fg = 
shade_bgra(texdata, light, shade_constants); uint32_t fg_red = (fg >> 16) & 0xff; uint32_t fg_green = (fg >> 8) & 0xff; uint32_t fg_blue = (fg) & 0xff; uint32_t bg_red = (*dest >> 16) & 0xff; uint32_t bg_green = (*dest >> 8) & 0xff; uint32_t bg_blue = (*dest) & 0xff; uint32_t red = (fg_red * fg_alpha + bg_red * bg_alpha) / 256; uint32_t green = (fg_green * fg_alpha + bg_green * bg_alpha) / 256; uint32_t blue = (fg_blue * fg_alpha + bg_blue * bg_alpha) / 256; *dest = 0xff000000 | (red << 16) | (green << 8) | blue; } dest++; xfrac += xstep; yfrac += ystep; } while (--count); } } }; class DrawSpanAddClampRGBACommand : public DrawerCommand { const uint32_t *ds_source; fixed_t ds_light; ShadeConstants ds_shade_constants; fixed_t ds_xfrac; fixed_t ds_yfrac; BYTE *dc_destorg; int ds_x1; int ds_x2; int ds_y1; int ds_y; fixed_t ds_xstep; fixed_t ds_ystep; int ds_xbits; int ds_ybits; public: DrawSpanAddClampRGBACommand() { ds_source = (const uint32_t*)::ds_source; ds_light = ::ds_light; ds_shade_constants = ::ds_shade_constants; ds_xfrac = ::ds_xfrac; ds_yfrac = ::ds_yfrac; dc_destorg = ::dc_destorg; ds_x1 = ::ds_x1; ds_x2 = ::ds_x2; ds_y = ::ds_y; ds_xstep = ::ds_xstep; ds_ystep = ::ds_ystep; ds_xbits = ::ds_xbits; ds_ybits = ::ds_ybits; } void Execute(DrawerThread *thread) override { if (thread->line_skipped_by_thread(ds_y)) return; dsfixed_t xfrac; dsfixed_t yfrac; dsfixed_t xstep; dsfixed_t ystep; uint32_t* dest; const uint32_t* source = ds_source; int count; int spot; uint32_t light = calc_light_multiplier(ds_light); ShadeConstants shade_constants = ds_shade_constants; uint32_t fg_alpha = dc_srcalpha >> (FRACBITS - 8); uint32_t bg_alpha = dc_destalpha >> (FRACBITS - 8); xfrac = ds_xfrac; yfrac = ds_yfrac; dest = ylookup[ds_y] + ds_x1 + (uint32_t*)dc_destorg; count = ds_x2 - ds_x1 + 1; xstep = ds_xstep; ystep = ds_ystep; if (ds_xbits == 6 && ds_ybits == 6) { // 64x64 is the most common case by far, so special case it. 
do { spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); uint32_t fg = shade_bgra(source[spot], light, shade_constants); uint32_t fg_red = (fg >> 16) & 0xff; uint32_t fg_green = (fg >> 8) & 0xff; uint32_t fg_blue = (fg) & 0xff; uint32_t bg_red = (*dest >> 16) & 0xff; uint32_t bg_green = (*dest >> 8) & 0xff; uint32_t bg_blue = (*dest) & 0xff; uint32_t red = clamp((fg_red * fg_alpha + bg_red * bg_alpha) / 256, 0, 255); uint32_t green = clamp((fg_green * fg_alpha + bg_green * bg_alpha) / 256, 0, 255); uint32_t blue = clamp((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255); *dest++ = 0xff000000 | (red << 16) | (green << 8) | blue; xfrac += xstep; yfrac += ystep; } while (--count); } else { BYTE yshift = 32 - ds_ybits; BYTE xshift = yshift - ds_xbits; int xmask = ((1 << ds_xbits) - 1) << ds_ybits; do { spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); uint32_t fg = shade_bgra(source[spot], light, shade_constants); uint32_t fg_red = (fg >> 16) & 0xff; uint32_t fg_green = (fg >> 8) & 0xff; uint32_t fg_blue = (fg) & 0xff; uint32_t bg_red = (*dest >> 16) & 0xff; uint32_t bg_green = (*dest >> 8) & 0xff; uint32_t bg_blue = (*dest) & 0xff; uint32_t red = clamp((fg_red * fg_alpha + bg_red * bg_alpha) / 256, 0, 255); uint32_t green = clamp((fg_green * fg_alpha + bg_green * bg_alpha) / 256, 0, 255); uint32_t blue = clamp((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255); *dest++ = 0xff000000 | (red << 16) | (green << 8) | blue; xfrac += xstep; yfrac += ystep; } while (--count); } } }; class DrawSpanMaskedAddClampRGBACommand : public DrawerCommand { const uint32_t *ds_source; fixed_t ds_light; ShadeConstants ds_shade_constants; fixed_t ds_xfrac; fixed_t ds_yfrac; BYTE *dc_destorg; int ds_x1; int ds_x2; int ds_y1; int ds_y; fixed_t ds_xstep; fixed_t ds_ystep; int ds_xbits; int ds_ybits; public: DrawSpanMaskedAddClampRGBACommand() { ds_source = (const uint32_t*)::ds_source; ds_light = ::ds_light; ds_shade_constants = ::ds_shade_constants; ds_xfrac = 
::ds_xfrac; ds_yfrac = ::ds_yfrac; dc_destorg = ::dc_destorg; ds_x1 = ::ds_x1; ds_x2 = ::ds_x2; ds_y = ::ds_y; ds_xstep = ::ds_xstep; ds_ystep = ::ds_ystep; ds_xbits = ::ds_xbits; ds_ybits = ::ds_ybits; } void Execute(DrawerThread *thread) override { if (thread->line_skipped_by_thread(ds_y)) return; dsfixed_t xfrac; dsfixed_t yfrac; dsfixed_t xstep; dsfixed_t ystep; uint32_t* dest; const uint32_t* source = ds_source; int count; int spot; uint32_t light = calc_light_multiplier(ds_light); ShadeConstants shade_constants = ds_shade_constants; uint32_t fg_alpha = dc_srcalpha >> (FRACBITS - 8); uint32_t bg_alpha = dc_destalpha >> (FRACBITS - 8); xfrac = ds_xfrac; yfrac = ds_yfrac; dest = ylookup[ds_y] + ds_x1 + (uint32_t*)dc_destorg; count = ds_x2 - ds_x1 + 1; xstep = ds_xstep; ystep = ds_ystep; if (ds_xbits == 6 && ds_ybits == 6) { // 64x64 is the most common case by far, so special case it. do { uint32_t texdata; spot = ((xfrac >> (32 - 6 - 6))&(63 * 64)) + (yfrac >> (32 - 6)); texdata = source[spot]; if (texdata != 0) { uint32_t fg = shade_bgra(texdata, light, shade_constants); uint32_t fg_red = (fg >> 16) & 0xff; uint32_t fg_green = (fg >> 8) & 0xff; uint32_t fg_blue = (fg) & 0xff; uint32_t bg_red = (*dest >> 16) & 0xff; uint32_t bg_green = (*dest >> 8) & 0xff; uint32_t bg_blue = (*dest) & 0xff; uint32_t red = (fg_red * fg_alpha + bg_red * bg_alpha) / 256; uint32_t green = (fg_green * fg_alpha + bg_green * bg_alpha) / 256; uint32_t blue = (fg_blue * fg_alpha + bg_blue * bg_alpha) / 256; *dest = 0xff000000 | (red << 16) | (green << 8) | blue; } dest++; xfrac += xstep; yfrac += ystep; } while (--count); } else { BYTE yshift = 32 - ds_ybits; BYTE xshift = yshift - ds_xbits; int xmask = ((1 << ds_xbits) - 1) << ds_ybits; do { uint32_t texdata; spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); texdata = source[spot]; if (texdata != 0) { uint32_t fg = shade_bgra(texdata, light, shade_constants); uint32_t fg_red = (fg >> 16) & 0xff; uint32_t fg_green = (fg >> 8) & 
0xff; uint32_t fg_blue = (fg) & 0xff; uint32_t bg_red = (*dest >> 16) & 0xff; uint32_t bg_green = (*dest >> 8) & 0xff; uint32_t bg_blue = (*dest) & 0xff; uint32_t red = (fg_red * fg_alpha + bg_red * bg_alpha) / 256; uint32_t green = (fg_green * fg_alpha + bg_green * bg_alpha) / 256; uint32_t blue = (fg_blue * fg_alpha + bg_blue * bg_alpha) / 256; *dest = 0xff000000 | (red << 16) | (green << 8) | blue; } dest++; xfrac += xstep; yfrac += ystep; } while (--count); } } }; class FillSpanRGBACommand : public DrawerCommand { int ds_x1; int ds_x2; int ds_y; BYTE *dc_destorg; fixed_t ds_light; int ds_color; public: FillSpanRGBACommand() { ds_x1 = ::ds_x1; ds_x2 = ::ds_x2; ds_y = ::ds_y; dc_destorg = ::dc_destorg; ds_light = ::ds_light; ds_color = ::ds_color; } void Execute(DrawerThread *thread) override { if (thread->line_skipped_by_thread(ds_y)) return; uint32_t *dest = ylookup[ds_y] + ds_x1 + (uint32_t*)dc_destorg; int count = (ds_x2 - ds_x1 + 1); uint32_t light = calc_light_multiplier(ds_light); uint32_t color = shade_pal_index_simple(ds_color, light); for (int i = 0; i < count; i++) dest[i] = color; } }; class Vlinec1RGBACommand : public DrawerCommand { DWORD dc_iscale; DWORD dc_texturefrac; int dc_count; const BYTE *dc_source; BYTE *dc_dest; int vlinebits; int dc_pitch; fixed_t dc_light; ShadeConstants dc_shade_constants; public: Vlinec1RGBACommand() { dc_iscale = ::dc_iscale; dc_texturefrac = ::dc_texturefrac; dc_count = ::dc_count; dc_source = ::dc_source; dc_dest = ::dc_dest; vlinebits = ::vlinebits; dc_pitch = ::dc_pitch; dc_light = ::dc_light; dc_shade_constants = ::dc_shade_constants; } void Execute(DrawerThread *thread) override { int count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 0) return; DWORD fracstep = dc_iscale * thread->num_cores; DWORD frac = dc_texturefrac + dc_iscale * thread->skipped_by_thread(dc_dest_y); const uint32 *source = (const uint32 *)dc_source; uint32_t *dest = thread->dest_for_thread(dc_dest_y, dc_pitch, 
// Deferred drawer for four horizontally adjacent wall columns at once
// (dest[0]..dest[3] on every row). Each column has its own texture pointer
// (bufplce), texture position (vplce), step (vince) and light level
// (palookuplight).
// NOTE(review): dc_dest_y is not declared in this class; presumably it is
// supplied by DrawerCommand - confirm.
class Vlinec4RGBACommand : public DrawerCommand
{
	BYTE *dc_dest;
	int dc_count;
	int dc_pitch;
	ShadeConstants dc_shade_constants;
	int vlinebits;
	fixed_t palookuplight[4];
	DWORD vplce[4];
	DWORD vince[4];
	const uint32 *bufplce[4];

public:
	// Snapshots the global column state, including the four per-column arrays.
	Vlinec4RGBACommand()
	{
		dc_dest = ::dc_dest;
		dc_count = ::dc_count;
		dc_pitch = ::dc_pitch;
		dc_shade_constants = ::dc_shade_constants;
		vlinebits = ::vlinebits;
		for (int i = 0; i < 4; i++)
		{
			palookuplight[i] = ::palookuplight[i];
			vplce[i] = ::vplce[i];
			vince[i] = ::vince[i];
			bufplce[i] = (const uint32 *)::bufplce[i];
		}
	}

#ifdef NO_SSE
	// Scalar implementation: shades the four columns one after another per row.
	void Execute(DrawerThread *thread) override
	{
		int count = thread->count_for_thread(dc_dest_y, dc_count);
		if (count <= 0)
			return;

		uint32_t *dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest);
		int pitch = dc_pitch * thread->num_cores;
		int bits = vlinebits;
		DWORD place;

		uint32_t light0 = calc_light_multiplier(palookuplight[0]);
		uint32_t light1 = calc_light_multiplier(palookuplight[1]);
		uint32_t light2 = calc_light_multiplier(palookuplight[2]);
		uint32_t light3 = calc_light_multiplier(palookuplight[3]);

		ShadeConstants shade_constants = dc_shade_constants;

		// Work on local copies so the captured state is left unmodified.
		DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
		DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
		int skipped = thread->skipped_by_thread(dc_dest_y);
		for (int i = 0; i < 4; i++)
		{
			// Advance past the rows this thread skips, then step num_cores rows at a time.
			local_vplce[i] += local_vince[i] * skipped;
			local_vince[i] *= thread->num_cores;
		}

		do
		{
			// For each column: sample at the current position, then advance it.
			dest[0] = shade_bgra(bufplce[0][(place = local_vplce[0]) >> bits], light0, shade_constants); local_vplce[0] = place + local_vince[0];
			dest[1] = shade_bgra(bufplce[1][(place = local_vplce[1]) >> bits], light1, shade_constants); local_vplce[1] = place + local_vince[1];
			dest[2] = shade_bgra(bufplce[2][(place = local_vplce[2]) >> bits], light2, shade_constants); local_vplce[2] = place + local_vince[2];
			dest[3] = shade_bgra(bufplce[3][(place = local_vplce[3]) >> bits], light3, shade_constants); local_vplce[3] = place + local_vince[3];
			dest += pitch;
		} while (--count);
	}
#else
	// SSE implementation: gathers one texel from each column, shades all four
	// together and stores them with a single unaligned 128-bit write.
	// NOTE(review): SSE_SHADE_SIMPLE_INIT4 / SSE_SHADE_INIT4 / SSE_SHADE_SIMPLE /
	// SSE_SHADE are macros defined outside this chunk; their exact effect is
	// not visible here.
	void Execute(DrawerThread *thread) override
	{
		int count = thread->count_for_thread(dc_dest_y, dc_count);
		if (count <= 0)
			return;

		uint32_t *dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest);
		int bits = vlinebits;
		int pitch = dc_pitch * thread->num_cores;

		uint32_t light0 = calc_light_multiplier(palookuplight[0]);
		uint32_t light1 = calc_light_multiplier(palookuplight[1]);
		uint32_t light2 = calc_light_multiplier(palookuplight[2]);
		uint32_t light3 = calc_light_multiplier(palookuplight[3]);

		ShadeConstants shade_constants = dc_shade_constants;

		// Work on local copies so the captured state is left unmodified.
		DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
		DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
		int skipped = thread->skipped_by_thread(dc_dest_y);
		for (int i = 0; i < 4; i++)
		{
			// Advance past the rows this thread skips, then step num_cores rows at a time.
			local_vplce[i] += local_vince[i] * skipped;
			local_vince[i] *= thread->num_cores;
		}

		if (shade_constants.simple_shade)
		{
			SSE_SHADE_SIMPLE_INIT4(light3, light2, light1, light0);
			do
			{
				DWORD place0 = local_vplce[0];
				DWORD place1 = local_vplce[1];
				DWORD place2 = local_vplce[2];
				DWORD place3 = local_vplce[3];

				uint32_t p0 = bufplce[0][place0 >> bits];
				uint32_t p1 = bufplce[1][place1 >> bits];
				uint32_t p2 = bufplce[2][place2 >> bits];
				uint32_t p3 = bufplce[3][place3 >> bits];

				local_vplce[0] = place0 + local_vince[0];
				local_vplce[1] = place1 + local_vince[1];
				local_vplce[2] = place2 + local_vince[2];
				local_vplce[3] = place3 + local_vince[3];

				// p0 goes in the lowest lane so it is stored to dest[0].
				__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
				SSE_SHADE_SIMPLE(fg);
				_mm_storeu_si128((__m128i*)dest, fg);
				dest += pitch;
			} while (--count);
		}
		else
		{
			SSE_SHADE_INIT4(light3, light2, light1, light0, shade_constants);
			do
			{
				DWORD place0 = local_vplce[0];
				DWORD place1 = local_vplce[1];
				DWORD place2 = local_vplce[2];
				DWORD place3 = local_vplce[3];

				uint32_t p0 = bufplce[0][place0 >> bits];
				uint32_t p1 = bufplce[1][place1 >> bits];
				uint32_t p2 = bufplce[2][place2 >> bits];
				uint32_t p3 = bufplce[3][place3 >> bits];

				local_vplce[0] = place0 + local_vince[0];
				local_vplce[1] = place1 + local_vince[1];
				local_vplce[2] = place2 + local_vince[2];
				local_vplce[3] = place3 + local_vince[3];

				// p0 goes in the lowest lane so it is stored to dest[0].
				__m128i fg = _mm_set_epi32(p3, p2, p1, p0);
				SSE_SHADE(fg, shade_constants);
				_mm_storeu_si128((__m128i*)dest, fg);
				dest += pitch;
			} while (--count);
		}
	}
#endif
};
// Deferred drawer for a single masked wall column of 32-bit texels: texels
// equal to 0 leave the destination untouched, all others are shaded with
// shade_bgra and written.
// NOTE(review): dc_dest_y is not declared in this class; presumably it is
// supplied by DrawerCommand - confirm.
class Mvlinec1RGBACommand : public DrawerCommand
{
	DWORD dc_iscale;
	DWORD dc_texturefrac;
	int dc_count;
	const BYTE *dc_source;
	BYTE *dc_dest;
	int mvlinebits;
	int dc_pitch;
	fixed_t dc_light;
	ShadeConstants dc_shade_constants;

public:
	// Snapshots the global column state at the moment the command is created.
	Mvlinec1RGBACommand()
	{
		dc_iscale = ::dc_iscale;
		dc_texturefrac = ::dc_texturefrac;
		dc_count = ::dc_count;
		dc_source = ::dc_source;
		dc_dest = ::dc_dest;
		mvlinebits = ::mvlinebits;
		dc_pitch = ::dc_pitch;
		dc_light = ::dc_light;
		dc_shade_constants = ::dc_shade_constants;
	}

	// Draws the part of the column that `thread` reports as its own.
	void Execute(DrawerThread *thread) override
	{
		int count = thread->count_for_thread(dc_dest_y, dc_count);
		if (count <= 0)
			return;

		// Step and start position are scaled by num_cores / skipped rows.
		DWORD fracstep = dc_iscale * thread->num_cores;
		DWORD frac = dc_texturefrac + dc_iscale * thread->skipped_by_thread(dc_dest_y);
		const uint32 *source = (const uint32 *)dc_source;
		uint32_t *dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest);
		int bits = mvlinebits;
		int pitch = dc_pitch * thread->num_cores;

		uint32_t light = calc_light_multiplier(dc_light);
		ShadeConstants shade_constants = dc_shade_constants;

		do
		{
			uint32_t pix = source[frac >> bits];
			// A texel value of 0 means "leave the destination as it is".
			if (pix != 0)
			{
				*dest = shade_bgra(pix, light, shade_constants);
			}
			frac += fracstep;
			dest += pitch;
		} while (--count);
	}
};

// Deferred drawer for four horizontally adjacent masked wall columns at once
// (dest[0]..dest[3] on every row). Texels equal to 0 are not written.
// NOTE(review): dc_dest_y is not declared in this class; presumably it is
// supplied by DrawerCommand - confirm.
class Mvlinec4RGBACommand : public DrawerCommand
{
	BYTE *dc_dest;
	int dc_count;
	int dc_pitch;
	ShadeConstants dc_shade_constants;
	int mvlinebits;
	fixed_t palookuplight[4];
	DWORD vplce[4];
	DWORD vince[4];
	const uint32 *bufplce[4];

public:
	// Snapshots the global column state, including the four per-column arrays.
	Mvlinec4RGBACommand()
	{
		dc_dest = ::dc_dest;
		dc_count = ::dc_count;
		dc_pitch = ::dc_pitch;
		dc_shade_constants = ::dc_shade_constants;
		mvlinebits = ::mvlinebits;
		for (int i = 0; i < 4; i++)
		{
			palookuplight[i] = ::palookuplight[i];
			vplce[i] = ::vplce[i];
			vince[i] = ::vince[i];
			bufplce[i] = (const uint32 *)::bufplce[i];
		}
	}

#ifdef NO_SSE
	// Scalar implementation: tests and shades the four columns one by one per row.
	void Execute(DrawerThread *thread) override
	{
		int count = thread->count_for_thread(dc_dest_y, dc_count);
		if (count <= 0)
			return;

		uint32_t *dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest);
		int pitch = dc_pitch * thread->num_cores;
		int bits = mvlinebits;
		DWORD place;

		uint32_t light0 = calc_light_multiplier(palookuplight[0]);
		uint32_t light1 = calc_light_multiplier(palookuplight[1]);
		uint32_t light2 = calc_light_multiplier(palookuplight[2]);
		uint32_t light3 = calc_light_multiplier(palookuplight[3]);

		ShadeConstants shade_constants = dc_shade_constants;

		// Work on local copies so the captured state is left unmodified.
		DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
		DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
		int skipped = thread->skipped_by_thread(dc_dest_y);
		for (int i = 0; i < 4; i++)
		{
			// Advance past the rows this thread skips, then step num_cores rows at a time.
			local_vplce[i] += local_vince[i] * skipped;
			local_vince[i] *= thread->num_cores;
		}

		do
		{
			// For each column: sample, write only if the texel is non-zero, advance.
			uint32_t pix;
			pix = bufplce[0][(place = local_vplce[0]) >> bits]; if (pix) dest[0] = shade_bgra(pix, light0, shade_constants); local_vplce[0] = place + local_vince[0];
			pix = bufplce[1][(place = local_vplce[1]) >> bits]; if (pix) dest[1] = shade_bgra(pix, light1, shade_constants); local_vplce[1] = place + local_vince[1];
			pix = bufplce[2][(place = local_vplce[2]) >> bits]; if (pix) dest[2] = shade_bgra(pix, light2, shade_constants); local_vplce[2] = place + local_vince[2];
			pix = bufplce[3][(place = local_vplce[3]) >> bits]; if (pix) dest[3] = shade_bgra(pix, light3, shade_constants); local_vplce[3] = place + local_vince[3];
			dest += pitch;
		} while (--count);
	}
#else
	// SSE implementation: gathers one texel from each column, shades all four
	// together and uses a masked store so that zero texels are not written.
	// NOTE(review): SSE_SHADE_SIMPLE_INIT4 / SSE_SHADE_INIT4 / SSE_SHADE_SIMPLE /
	// SSE_SHADE are macros defined outside this chunk; their exact effect is
	// not visible here.
	void Execute(DrawerThread *thread) override
	{
		int count = thread->count_for_thread(dc_dest_y, dc_count);
		if (count <= 0)
			return;

		uint32_t *dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest);
		int pitch = dc_pitch * thread->num_cores;
		int bits = mvlinebits;

		uint32_t light0 = calc_light_multiplier(palookuplight[0]);
		uint32_t light1 = calc_light_multiplier(palookuplight[1]);
		uint32_t light2 = calc_light_multiplier(palookuplight[2]);
		uint32_t light3 = calc_light_multiplier(palookuplight[3]);

		ShadeConstants shade_constants = dc_shade_constants;

		// Work on local copies so the captured state is left unmodified.
		DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] };
		DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] };
		int skipped = thread->skipped_by_thread(dc_dest_y);
		for (int i = 0; i < 4; i++)
		{
			// Advance past the rows this thread skips, then step num_cores rows at a time.
			local_vplce[i] += local_vince[i] * skipped;
			local_vince[i] *= thread->num_cores;
		}

		if (shade_constants.simple_shade)
		{
			SSE_SHADE_SIMPLE_INIT4(light3, light2, light1, light0);
			do
			{
				DWORD place0 = local_vplce[0];
				DWORD place1 = local_vplce[1];
				DWORD place2 = local_vplce[2];
				DWORD place3 = local_vplce[3];

				uint32_t pix0 = bufplce[0][place0 >> bits];
				uint32_t pix1 = bufplce[1][place1 >> bits];
				uint32_t pix2 = bufplce[2][place2 >> bits];
				uint32_t pix3 = bufplce[3][place3 >> bits];

				// movemask = !(pix == 0)
				// cmpeq against zero sets lanes whose texel is 0; XOR with an
				// all-ones vector (cmpeq(zero, zero)) inverts that, leaving
				// all-ones exactly in the lanes that must be written.
				__m128i movemask = _mm_xor_si128(_mm_cmpeq_epi32(_mm_set_epi32(pix3, pix2, pix1, pix0), _mm_setzero_si128()), _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));

				local_vplce[0] = place0 + local_vince[0];
				local_vplce[1] = place1 + local_vince[1];
				local_vplce[2] = place2 + local_vince[2];
				local_vplce[3] = place3 + local_vince[3];

				// pix0 goes in the lowest lane so it corresponds to dest[0].
				__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
				SSE_SHADE_SIMPLE(fg);
				// Store only the lanes selected by movemask.
				_mm_maskmoveu_si128(fg, movemask, (char*)dest);
				dest += pitch;
			} while (--count);
		}
		else
		{
			SSE_SHADE_INIT4(light3, light2, light1, light0, shade_constants);
			do
			{
				DWORD place0 = local_vplce[0];
				DWORD place1 = local_vplce[1];
				DWORD place2 = local_vplce[2];
				DWORD place3 = local_vplce[3];

				uint32_t pix0 = bufplce[0][place0 >> bits];
				uint32_t pix1 = bufplce[1][place1 >> bits];
				uint32_t pix2 = bufplce[2][place2 >> bits];
				uint32_t pix3 = bufplce[3][place3 >> bits];

				// movemask = !(pix == 0)
				// cmpeq against zero sets lanes whose texel is 0; XOR with an
				// all-ones vector (cmpeq(zero, zero)) inverts that, leaving
				// all-ones exactly in the lanes that must be written.
				__m128i movemask = _mm_xor_si128(_mm_cmpeq_epi32(_mm_set_epi32(pix3, pix2, pix1, pix0), _mm_setzero_si128()), _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));

				local_vplce[0] = place0 + local_vince[0];
				local_vplce[1] = place1 + local_vince[1];
				local_vplce[2] = place2 + local_vince[2];
				local_vplce[3] = place3 + local_vince[3];

				// pix0 goes in the lowest lane so it corresponds to dest[0].
				__m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0);
				SSE_SHADE(fg, shade_constants);
				// Store only the lanes selected by movemask.
				_mm_maskmoveu_si128(fg, movemask, (char*)dest);
				dest += pitch;
			} while (--count);
		}
	}
#endif
};
bufplce[0][place0 >> bits]; uint32_t pix1 = bufplce[1][place1 >> bits]; uint32_t pix2 = bufplce[2][place2 >> bits]; uint32_t pix3 = bufplce[3][place3 >> bits]; // movemask = !(pix == 0) __m128i movemask = _mm_xor_si128(_mm_cmpeq_epi32(_mm_set_epi32(pix3, pix2, pix1, pix0), _mm_setzero_si128()), _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); local_vplce[0] = place0 + local_vince[0]; local_vplce[1] = place1 + local_vince[1]; local_vplce[2] = place2 + local_vince[2]; local_vplce[3] = place3 + local_vince[3]; __m128i fg = _mm_set_epi32(pix3, pix2, pix1, pix0); SSE_SHADE(fg, shade_constants); _mm_maskmoveu_si128(fg, movemask, (char*)dest); dest += pitch; } while (--count); } } #endif }; class Tmvline1AddRGBACommand : public DrawerCommand { DWORD dc_iscale; DWORD dc_texturefrac; int dc_count; const BYTE *dc_source; BYTE *dc_dest; int tmvlinebits; int dc_pitch; fixed_t dc_light; ShadeConstants dc_shade_constants; fixed_t dc_srcalpha; fixed_t dc_destalpha; public: Tmvline1AddRGBACommand() { dc_iscale = ::dc_iscale; dc_texturefrac = ::dc_texturefrac; dc_count = ::dc_count; dc_source = ::dc_source; dc_dest = ::dc_dest; tmvlinebits = ::tmvlinebits; dc_pitch = ::dc_pitch; dc_light = ::dc_light; dc_shade_constants = ::dc_shade_constants; dc_srcalpha = ::dc_srcalpha; dc_destalpha = ::dc_destalpha; } void Execute(DrawerThread *thread) override { int count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 0) return; DWORD fracstep = dc_iscale * thread->num_cores; DWORD frac = dc_texturefrac + dc_iscale * thread->skipped_by_thread(dc_dest_y); const uint32 *source = (const uint32 *)dc_source; uint32_t *dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); int bits = tmvlinebits; int pitch = dc_pitch * thread->num_cores; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; uint32_t fg_alpha = dc_srcalpha >> (FRACBITS - 8); uint32_t bg_alpha = dc_destalpha >> (FRACBITS - 8); do { uint32_t pix 
= source[frac >> bits]; if (pix != 0) { uint32_t fg = shade_bgra(pix, light, shade_constants); uint32_t fg_red = (fg >> 16) & 0xff; uint32_t fg_green = (fg >> 8) & 0xff; uint32_t fg_blue = fg & 0xff; uint32_t bg_red = (*dest >> 16) & 0xff; uint32_t bg_green = (*dest >> 8) & 0xff; uint32_t bg_blue = (*dest) & 0xff; uint32_t red = clamp((fg_red * fg_alpha + bg_red * bg_alpha) / 256, 0, 255); uint32_t green = clamp((fg_green * fg_alpha + bg_green * bg_alpha) / 256, 0, 255); uint32_t blue = clamp((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255); *dest = 0xff000000 | (red << 16) | (green << 8) | blue; } frac += fracstep; dest += pitch; } while (--count); } }; class Tmvline4AddRGBACommand : public DrawerCommand { BYTE *dc_dest; int dc_count; int dc_pitch; ShadeConstants dc_shade_constants; fixed_t dc_srcalpha; fixed_t dc_destalpha; int tmvlinebits; fixed_t palookuplight[4]; DWORD vplce[4]; DWORD vince[4]; const uint32 *bufplce[4]; public: Tmvline4AddRGBACommand() { dc_dest = ::dc_dest; dc_count = ::dc_count; dc_pitch = ::dc_pitch; dc_shade_constants = ::dc_shade_constants; dc_srcalpha = ::dc_srcalpha; dc_destalpha = ::dc_destalpha; tmvlinebits = ::tmvlinebits; for (int i = 0; i < 4; i++) { palookuplight[i] = ::palookuplight[i]; vplce[i] = ::vplce[i]; vince[i] = ::vince[i]; bufplce[i] = (const uint32 *)::bufplce[i]; } } void Execute(DrawerThread *thread) override { int count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 0) return; uint32_t *dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); int pitch = dc_pitch * thread->num_cores; int bits = tmvlinebits; uint32_t light[4]; light[0] = calc_light_multiplier(palookuplight[0]); light[1] = calc_light_multiplier(palookuplight[1]); light[2] = calc_light_multiplier(palookuplight[2]); light[3] = calc_light_multiplier(palookuplight[3]); ShadeConstants shade_constants = dc_shade_constants; uint32_t fg_alpha = dc_srcalpha >> (FRACBITS - 8); uint32_t bg_alpha = dc_destalpha >> (FRACBITS - 
8); DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] }; DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] }; int skipped = thread->skipped_by_thread(dc_dest_y); for (int i = 0; i < 4; i++) { local_vplce[i] += local_vince[i] * skipped; local_vince[i] *= thread->num_cores; } do { for (int i = 0; i < 4; ++i) { uint32_t pix = bufplce[i][local_vplce[i] >> bits]; if (pix != 0) { uint32_t fg = shade_bgra(pix, light[i], shade_constants); uint32_t fg_red = (fg >> 16) & 0xff; uint32_t fg_green = (fg >> 8) & 0xff; uint32_t fg_blue = fg & 0xff; uint32_t bg_red = (*dest >> 16) & 0xff; uint32_t bg_green = (*dest >> 8) & 0xff; uint32_t bg_blue = (*dest) & 0xff; uint32_t red = clamp((fg_red * fg_alpha + bg_red * bg_alpha) / 256, 0, 255); uint32_t green = clamp((fg_green * fg_alpha + bg_green * bg_alpha) / 256, 0, 255); uint32_t blue = clamp((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255); dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue; } local_vplce[i] += local_vince[i]; } dest += pitch; } while (--count); } }; class Tmvline1AddClampRGBACommand : public DrawerCommand { DWORD dc_iscale; DWORD dc_texturefrac; int dc_count; const BYTE *dc_source; BYTE *dc_dest; int tmvlinebits; int dc_pitch; fixed_t dc_light; ShadeConstants dc_shade_constants; fixed_t dc_srcalpha; fixed_t dc_destalpha; public: Tmvline1AddClampRGBACommand() { dc_iscale = ::dc_iscale; dc_texturefrac = ::dc_texturefrac; dc_count = ::dc_count; dc_source = ::dc_source; dc_dest = ::dc_dest; tmvlinebits = ::tmvlinebits; dc_pitch = ::dc_pitch; dc_light = ::dc_light; dc_shade_constants = ::dc_shade_constants; dc_srcalpha = ::dc_srcalpha; dc_destalpha = ::dc_destalpha; } void Execute(DrawerThread *thread) override { int count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 0) return; DWORD fracstep = dc_iscale * thread->num_cores; DWORD frac = dc_texturefrac + dc_iscale * thread->skipped_by_thread(dc_dest_y); const uint32 *source = (const uint32 *)dc_source; 
uint32_t *dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); int bits = tmvlinebits; int pitch = dc_pitch * thread->num_cores; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; uint32_t fg_alpha = dc_srcalpha >> (FRACBITS - 8); uint32_t bg_alpha = dc_destalpha >> (FRACBITS - 8); do { uint32_t pix = source[frac >> bits]; if (pix != 0) { uint32_t fg = shade_bgra(pix, light, shade_constants); uint32_t fg_red = (fg >> 16) & 0xff; uint32_t fg_green = (fg >> 8) & 0xff; uint32_t fg_blue = fg & 0xff; uint32_t bg_red = (*dest >> 16) & 0xff; uint32_t bg_green = (*dest >> 8) & 0xff; uint32_t bg_blue = (*dest) & 0xff; uint32_t red = clamp((fg_red * fg_alpha + bg_red * bg_alpha) / 256, 0, 255); uint32_t green = clamp((fg_green * fg_alpha + bg_green * bg_alpha) / 256, 0, 255); uint32_t blue = clamp((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255); *dest = 0xff000000 | (red << 16) | (green << 8) | blue; } frac += fracstep; dest += pitch; } while (--count); } }; class Tmvline4AddClampRGBACommand : public DrawerCommand { BYTE *dc_dest; int dc_count; int dc_pitch; ShadeConstants dc_shade_constants; fixed_t dc_srcalpha; fixed_t dc_destalpha; int tmvlinebits; fixed_t palookuplight[4]; DWORD vplce[4]; DWORD vince[4]; const uint32 *bufplce[4]; public: Tmvline4AddClampRGBACommand() { dc_dest = ::dc_dest; dc_count = ::dc_count; dc_pitch = ::dc_pitch; dc_shade_constants = ::dc_shade_constants; dc_srcalpha = ::dc_srcalpha; dc_destalpha = ::dc_destalpha; tmvlinebits = ::tmvlinebits; for (int i = 0; i < 4; i++) { palookuplight[i] = ::palookuplight[i]; vplce[i] = ::vplce[i]; vince[i] = ::vince[i]; bufplce[i] = (const uint32 *)::bufplce[i]; } } void Execute(DrawerThread *thread) override { int count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 0) return; uint32_t *dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); int pitch = dc_pitch * thread->num_cores; int bits = 
tmvlinebits; uint32_t light[4]; light[0] = calc_light_multiplier(palookuplight[0]); light[1] = calc_light_multiplier(palookuplight[1]); light[2] = calc_light_multiplier(palookuplight[2]); light[3] = calc_light_multiplier(palookuplight[3]); ShadeConstants shade_constants = dc_shade_constants; uint32_t fg_alpha = dc_srcalpha >> (FRACBITS - 8); uint32_t bg_alpha = dc_destalpha >> (FRACBITS - 8); DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] }; DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] }; int skipped = thread->skipped_by_thread(dc_dest_y); for (int i = 0; i < 4; i++) { local_vplce[i] += local_vince[i] * skipped; local_vince[i] *= thread->num_cores; } do { for (int i = 0; i < 4; ++i) { uint32_t pix = bufplce[i][local_vplce[i] >> bits]; if (pix != 0) { uint32_t fg = shade_bgra(pix, light[i], shade_constants); uint32_t fg_red = (fg >> 16) & 0xff; uint32_t fg_green = (fg >> 8) & 0xff; uint32_t fg_blue = fg & 0xff; uint32_t bg_red = (dest[i] >> 16) & 0xff; uint32_t bg_green = (dest[i] >> 8) & 0xff; uint32_t bg_blue = (dest[i]) & 0xff; uint32_t red = clamp((fg_red * fg_alpha + bg_red * bg_alpha) / 256, 0, 255); uint32_t green = clamp((fg_green * fg_alpha + bg_green * bg_alpha) / 256, 0, 255); uint32_t blue = clamp((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255); dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue; } local_vplce[i] += local_vince[i]; } dest += pitch; } while (--count); } }; class Tmvline1SubClampRGBACommand : public DrawerCommand { DWORD dc_iscale; DWORD dc_texturefrac; int dc_count; const BYTE *dc_source; BYTE *dc_dest; int tmvlinebits; int dc_pitch; fixed_t dc_light; ShadeConstants dc_shade_constants; fixed_t dc_srcalpha; fixed_t dc_destalpha; public: Tmvline1SubClampRGBACommand() { dc_iscale = ::dc_iscale; dc_texturefrac = ::dc_texturefrac; dc_count = ::dc_count; dc_source = ::dc_source; dc_dest = ::dc_dest; tmvlinebits = ::tmvlinebits; dc_pitch = ::dc_pitch; dc_light = ::dc_light; dc_shade_constants = 
::dc_shade_constants; dc_srcalpha = ::dc_srcalpha; dc_destalpha = ::dc_destalpha; } void Execute(DrawerThread *thread) override { int count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 0) return; DWORD fracstep = dc_iscale * thread->num_cores; DWORD frac = dc_texturefrac + dc_iscale * thread->skipped_by_thread(dc_dest_y); const uint32 *source = (const uint32 *)dc_source; uint32_t *dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); int bits = tmvlinebits; int pitch = dc_pitch * thread->num_cores; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; uint32_t fg_alpha = dc_srcalpha >> (FRACBITS - 8); uint32_t bg_alpha = dc_destalpha >> (FRACBITS - 8); do { uint32_t pix = source[frac >> bits]; if (pix != 0) { uint32_t fg = shade_bgra(pix, light, shade_constants); uint32_t fg_red = (fg >> 16) & 0xff; uint32_t fg_green = (fg >> 8) & 0xff; uint32_t fg_blue = fg & 0xff; uint32_t bg_red = (*dest >> 16) & 0xff; uint32_t bg_green = (*dest >> 8) & 0xff; uint32_t bg_blue = (*dest) & 0xff; uint32_t red = clamp((0x10000 - fg_red * fg_alpha + bg_red * bg_alpha) / 256, 256, 256 + 255) - 256; uint32_t green = clamp((0x10000 - fg_green * fg_alpha + bg_green * bg_alpha) / 256, 256, 256 + 255) - 256; uint32_t blue = clamp((0x10000 - fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256; *dest = 0xff000000 | (red << 16) | (green << 8) | blue; } frac += fracstep; dest += pitch; } while (--count); } }; class Tmvline4SubClampRGBACommand : public DrawerCommand { BYTE *dc_dest; int dc_count; int dc_pitch; ShadeConstants dc_shade_constants; fixed_t dc_srcalpha; fixed_t dc_destalpha; int tmvlinebits; fixed_t palookuplight[4]; DWORD vplce[4]; DWORD vince[4]; const uint32 *bufplce[4]; public: Tmvline4SubClampRGBACommand() { dc_dest = ::dc_dest; dc_count = ::dc_count; dc_pitch = ::dc_pitch; dc_shade_constants = ::dc_shade_constants; dc_srcalpha = ::dc_srcalpha; dc_destalpha = ::dc_destalpha; 
tmvlinebits = ::tmvlinebits; for (int i = 0; i < 4; i++) { palookuplight[i] = ::palookuplight[i]; vplce[i] = ::vplce[i]; vince[i] = ::vince[i]; bufplce[i] = (const uint32 *)::bufplce[i]; } } void Execute(DrawerThread *thread) override { int count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 0) return; uint32_t *dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); int pitch = dc_pitch * thread->num_cores; int bits = tmvlinebits; uint32_t light[4]; light[0] = calc_light_multiplier(palookuplight[0]); light[1] = calc_light_multiplier(palookuplight[1]); light[2] = calc_light_multiplier(palookuplight[2]); light[3] = calc_light_multiplier(palookuplight[3]); ShadeConstants shade_constants = dc_shade_constants; uint32_t fg_alpha = dc_srcalpha >> (FRACBITS - 8); uint32_t bg_alpha = dc_destalpha >> (FRACBITS - 8); DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] }; DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] }; int skipped = thread->skipped_by_thread(dc_dest_y); for (int i = 0; i < 4; i++) { local_vplce[i] += local_vince[i] * skipped; local_vince[i] *= thread->num_cores; } do { for (int i = 0; i < 4; ++i) { uint32_t pix = bufplce[i][local_vplce[i] >> bits]; if (pix != 0) { uint32_t fg = shade_bgra(pix, light[i], shade_constants); uint32_t fg_red = (fg >> 16) & 0xff; uint32_t fg_green = (fg >> 8) & 0xff; uint32_t fg_blue = fg & 0xff; uint32_t bg_red = (dest[i] >> 16) & 0xff; uint32_t bg_green = (dest[i] >> 8) & 0xff; uint32_t bg_blue = (dest[i]) & 0xff; uint32_t red = clamp((0x10000 - fg_red * fg_alpha + bg_red * bg_alpha) / 256, 256, 256 + 255) - 256; uint32_t green = clamp((0x10000 - fg_green * fg_alpha + bg_green * bg_alpha) / 256, 256, 256 + 255) - 256; uint32_t blue = clamp((0x10000 - fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256; dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue; } local_vplce[i] += local_vince[i]; } dest += pitch; } while (--count); } }; class 
Tmvline1RevSubClampRGBACommand : public DrawerCommand { DWORD dc_iscale; DWORD dc_texturefrac; int dc_count; const BYTE *dc_source; BYTE *dc_dest; int tmvlinebits; int dc_pitch; fixed_t dc_light; ShadeConstants dc_shade_constants; fixed_t dc_srcalpha; fixed_t dc_destalpha; public: Tmvline1RevSubClampRGBACommand() { dc_iscale = ::dc_iscale; dc_texturefrac = ::dc_texturefrac; dc_count = ::dc_count; dc_source = ::dc_source; dc_dest = ::dc_dest; tmvlinebits = ::tmvlinebits; dc_pitch = ::dc_pitch; dc_light = ::dc_light; dc_shade_constants = ::dc_shade_constants; dc_srcalpha = ::dc_srcalpha; dc_destalpha = ::dc_destalpha; } void Execute(DrawerThread *thread) override { int count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 0) return; DWORD fracstep = dc_iscale * thread->num_cores; DWORD frac = dc_texturefrac + dc_iscale * thread->skipped_by_thread(dc_dest_y); const uint32 *source = (const uint32 *)dc_source; uint32_t *dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); int bits = tmvlinebits; int pitch = dc_pitch * thread->num_cores; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; uint32_t fg_alpha = dc_srcalpha >> (FRACBITS - 8); uint32_t bg_alpha = dc_destalpha >> (FRACBITS - 8); do { uint32_t pix = source[frac >> bits]; if (pix != 0) { uint32_t fg = shade_bgra(pix, light, shade_constants); uint32_t fg_red = (fg >> 16) & 0xff; uint32_t fg_green = (fg >> 8) & 0xff; uint32_t fg_blue = fg & 0xff; uint32_t bg_red = (*dest >> 16) & 0xff; uint32_t bg_green = (*dest >> 8) & 0xff; uint32_t bg_blue = (*dest) & 0xff; uint32_t red = clamp((0x10000 + fg_red * fg_alpha - bg_red * bg_alpha) / 256, 256, 256 + 255) - 256; uint32_t green = clamp((0x10000 + fg_green * fg_alpha - bg_green * bg_alpha) / 256, 256, 256 + 255) - 256; uint32_t blue = clamp((0x10000 + fg_blue * fg_alpha - bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256; *dest = 0xff000000 | (red << 16) | (green << 8) | blue; } frac += 
fracstep; dest += pitch; } while (--count); } }; class Tmvline4RevSubClampRGBACommand : public DrawerCommand { BYTE *dc_dest; int dc_count; int dc_pitch; ShadeConstants dc_shade_constants; fixed_t dc_srcalpha; fixed_t dc_destalpha; int tmvlinebits; fixed_t palookuplight[4]; DWORD vplce[4]; DWORD vince[4]; const uint32 *bufplce[4]; public: Tmvline4RevSubClampRGBACommand() { dc_dest = ::dc_dest; dc_count = ::dc_count; dc_pitch = ::dc_pitch; dc_shade_constants = ::dc_shade_constants; dc_srcalpha = ::dc_srcalpha; dc_destalpha = ::dc_destalpha; tmvlinebits = ::tmvlinebits; for (int i = 0; i < 4; i++) { palookuplight[i] = ::palookuplight[i]; vplce[i] = ::vplce[i]; vince[i] = ::vince[i]; bufplce[i] = (const uint32 *)::bufplce[i]; } } void Execute(DrawerThread *thread) override { int count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 0) return; uint32_t *dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); int pitch = dc_pitch * thread->num_cores; int bits = tmvlinebits; uint32_t light[4]; light[0] = calc_light_multiplier(palookuplight[0]); light[1] = calc_light_multiplier(palookuplight[1]); light[2] = calc_light_multiplier(palookuplight[2]); light[3] = calc_light_multiplier(palookuplight[3]); ShadeConstants shade_constants = dc_shade_constants; uint32_t fg_alpha = dc_srcalpha >> (FRACBITS - 8); uint32_t bg_alpha = dc_destalpha >> (FRACBITS - 8); DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] }; DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] }; int skipped = thread->skipped_by_thread(dc_dest_y); for (int i = 0; i < 4; i++) { local_vplce[i] += local_vince[i] * skipped; local_vince[i] *= thread->num_cores; } do { for (int i = 0; i < 4; ++i) { uint32_t pix = bufplce[i][local_vplce[i] >> bits]; if (pix != 0) { uint32_t fg = shade_bgra(pix, light[i], shade_constants); uint32_t fg_red = (fg >> 16) & 0xff; uint32_t fg_green = (fg >> 8) & 0xff; uint32_t fg_blue = fg & 0xff; uint32_t bg_red = (dest[i] >> 16) & 
0xff; uint32_t bg_green = (dest[i] >> 8) & 0xff; uint32_t bg_blue = (dest[i]) & 0xff; uint32_t red = clamp((0x10000 + fg_red * fg_alpha - bg_red * bg_alpha) / 256, 256, 256 + 255) - 256; uint32_t green = clamp((0x10000 + fg_green * fg_alpha - bg_green * bg_alpha) / 256, 256, 256 + 255) - 256; uint32_t blue = clamp((0x10000 + fg_blue * fg_alpha - bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256; dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue; } local_vplce[i] += local_vince[i]; } dest += pitch; } while (--count); } }; class DrawFogBoundaryLineRGBACommand : public DrawerCommand { int _y; int _x; int _x2; BYTE *dc_destorg; fixed_t dc_light; ShadeConstants dc_shade_constants; public: DrawFogBoundaryLineRGBACommand(int y, int x, int x2) { _y = y; _x = x; _x2 = x2; dc_destorg = ::dc_destorg; dc_light = ::dc_light; dc_shade_constants = ::dc_shade_constants; } void Execute(DrawerThread *thread) override { if (thread->line_skipped_by_thread(_y)) return; int y = _y; int x = _x; int x2 = _x2; uint32_t *dest = ylookup[y] + (uint32_t*)dc_destorg; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants constants = dc_shade_constants; do { uint32_t red = (dest[x] >> 16) & 0xff; uint32_t green = (dest[x] >> 8) & 0xff; uint32_t blue = dest[x] & 0xff; if (constants.simple_shade) { red = red * light / 256; green = green * light / 256; blue = blue * light / 256; } else { uint32_t inv_light = 256 - light; uint32_t inv_desaturate = 256 - constants.desaturate; uint32_t intensity = ((red * 77 + green * 143 + blue * 37) >> 8) * constants.desaturate; red = (red * inv_desaturate + intensity) / 256; green = (green * inv_desaturate + intensity) / 256; blue = (blue * inv_desaturate + intensity) / 256; red = (constants.fade_red * inv_light + red * light) / 256; green = (constants.fade_green * inv_light + green * light) / 256; blue = (constants.fade_blue * inv_light + blue * light) / 256; red = (red * constants.light_red) / 256; green = (green * constants.light_green) / 256; 
blue = (blue * constants.light_blue) / 256; } dest[x] = 0xff000000 | (red << 16) | (green << 8) | blue; } while (++x <= x2); } }; ///////////////////////////////////////////////////////////////////////////// void R_BeginDrawerCommands() { DrawerCommandQueue::Begin(); } void R_EndDrawerCommands() { DrawerCommandQueue::End(); } void R_DrawColumnP_RGBA_C() { DrawerCommandQueue::QueueCommand(); } void R_FillColumnP_RGBA() { DrawerCommandQueue::QueueCommand(); } void R_FillAddColumn_RGBA_C() { DrawerCommandQueue::QueueCommand(); } void R_FillAddClampColumn_RGBA() { DrawerCommandQueue::QueueCommand(); } void R_FillSubClampColumn_RGBA() { DrawerCommandQueue::QueueCommand(); } void R_FillRevSubClampColumn_RGBA() { DrawerCommandQueue::QueueCommand(); } void R_DrawFuzzColumnP_RGBA_C() { DrawerCommandQueue::QueueCommand(); fuzzpos = (fuzzpos + dc_yh - dc_yl) % FUZZTABLE; } void R_DrawAddColumnP_RGBA_C() { DrawerCommandQueue::QueueCommand(); } void R_DrawTranslatedColumnP_RGBA_C() { DrawerCommandQueue::QueueCommand(); } void R_DrawTlatedAddColumnP_RGBA_C() { DrawerCommandQueue::QueueCommand(); } void R_DrawShadedColumnP_RGBA_C() { DrawerCommandQueue::QueueCommand(); } void R_DrawAddClampColumnP_RGBA_C() { DrawerCommandQueue::QueueCommand(); } void R_DrawAddClampTranslatedColumnP_RGBA_C() { DrawerCommandQueue::QueueCommand(); } void R_DrawSubClampColumnP_RGBA_C() { DrawerCommandQueue::QueueCommand(); } void R_DrawSubClampTranslatedColumnP_RGBA_C() { DrawerCommandQueue::QueueCommand(); } void R_DrawRevSubClampColumnP_RGBA_C() { DrawerCommandQueue::QueueCommand(); } void R_DrawRevSubClampTranslatedColumnP_RGBA_C() { DrawerCommandQueue::QueueCommand(); } void R_DrawSpanP_RGBA_C() { DrawerCommandQueue::QueueCommand(); } void R_DrawSpanMaskedP_RGBA_C() { DrawerCommandQueue::QueueCommand(); } void R_DrawSpanTranslucentP_RGBA_C() { DrawerCommandQueue::QueueCommand(); } void R_DrawSpanMaskedTranslucentP_RGBA_C() { DrawerCommandQueue::QueueCommand(); } void R_DrawSpanAddClampP_RGBA_C() 
{ DrawerCommandQueue::QueueCommand(); } void R_DrawSpanMaskedAddClampP_RGBA_C() { DrawerCommandQueue::QueueCommand(); } void R_FillSpan_RGBA() { DrawerCommandQueue::QueueCommand(); } extern FTexture *rw_pic; // For the asserts below DWORD vlinec1_RGBA() { DWORD fracstep = dc_iscale; DWORD frac = dc_texturefrac; DWORD height = rw_pic->GetHeight(); assert((frac >> vlinebits) < height); frac += dc_count * fracstep; assert((frac >> vlinebits) <= height); DrawerCommandQueue::QueueCommand(); return dc_texturefrac + dc_count * dc_iscale; } void vlinec4_RGBA() { DrawerCommandQueue::QueueCommand(); for (int i = 0; i < 4; i++) vplce[i] += vince[i] * dc_count; } DWORD mvlinec1_RGBA() { DrawerCommandQueue::QueueCommand(); return dc_texturefrac + dc_count * dc_iscale; } void mvlinec4_RGBA() { DrawerCommandQueue::QueueCommand(); for (int i = 0; i < 4; i++) vplce[i] += vince[i] * dc_count; } fixed_t tmvline1_add_RGBA() { DrawerCommandQueue::QueueCommand(); return dc_texturefrac + dc_count * dc_iscale; } void tmvline4_add_RGBA() { DrawerCommandQueue::QueueCommand(); for (int i = 0; i < 4; i++) vplce[i] += vince[i] * dc_count; } fixed_t tmvline1_addclamp_RGBA() { DrawerCommandQueue::QueueCommand(); return dc_texturefrac + dc_count * dc_iscale; } void tmvline4_addclamp_RGBA() { DrawerCommandQueue::QueueCommand(); for (int i = 0; i < 4; i++) vplce[i] += vince[i] * dc_count; } fixed_t tmvline1_subclamp_RGBA() { DrawerCommandQueue::QueueCommand(); return dc_texturefrac + dc_count * dc_iscale; } void tmvline4_subclamp_RGBA() { DrawerCommandQueue::QueueCommand(); for (int i = 0; i < 4; i++) vplce[i] += vince[i] * dc_count; } fixed_t tmvline1_revsubclamp_RGBA() { DrawerCommandQueue::QueueCommand(); return dc_texturefrac + dc_count * dc_iscale; } void tmvline4_revsubclamp_RGBA() { DrawerCommandQueue::QueueCommand(); for (int i = 0; i < 4; i++) vplce[i] += vince[i] * dc_count; } void R_DrawFogBoundarySection_RGBA(int y, int y2, int x1) { for (; y < y2; ++y) { int x2 = spanend[y]; 
DrawerCommandQueue::QueueCommand(y, x1, x2);
	}
}

// Queues fog boundary line commands for columns x1 .. x2-1.
//
// uclip[x] and dclip[x] give the top row and the (exclusive) bottom row for
// column x. The columns are walked from right to left. spanend[row] records
// the column at which the currently open horizontal run on that row began,
// which, given the walk direction, is the run's right end. Runs are flushed
// as line commands when a row leaves the clip range or when the colormap
// index returned by GETPALOOKUP changes between neighbouring columns.
//
// NOTE(review): clearbufshort(p, n, v) presumably fills n shorts at p with v,
// and R_SetColorMapLight presumably updates the global light and shade state
// that the queued commands copy - confirm both.
void R_DrawFogBoundary_RGBA(int x1, int x2, short *uclip, short *dclip)
{
	// To do: we do not need to create new spans when using rgba output - instead we should calculate light on a per pixel basis

	// This is essentially the same as R_MapVisPlane but with an extra step
	// to create new horizontal spans whenever the light changes enough that
	// we need to use a new colormap.

	// lightstep is not referenced again in this function.
	double lightstep = rw_lightstep;
	// Light level at the rightmost column, x2 - 1.
	double light = rw_light + rw_lightstep*(x2 - x1 - 1);
	int x = x2 - 1;
	// t2/b2: top and bottom of the column to the right of the one being
	// examined (initially the rightmost column itself).
	int t2 = uclip[x];
	int b2 = dclip[x];
	// rcolormap: colormap index of the run currently open to the right.
	int rcolormap = GETPALOOKUP(light, wallshade);
	int lcolormap;
	BYTE *basecolormapdata = basecolormap->Maps;

	// Open a run at the rightmost column for every row it covers.
	if (b2 > t2)
	{
		clearbufshort(spanend + t2, b2 - t2, x);
	}

	R_SetColorMapLight(basecolormap, (float)light, wallshade);

	// Only compared against basecolormapdata below, i.e. used to tell
	// whether the current colormap index is 0.
	BYTE *fake_dc_colormap = basecolormap->Maps + (GETPALOOKUP(light, wallshade) << COLORMAPSHIFT);

	for (--x; x >= x1; --x)
	{
		int t1 = uclip[x];
		int b1 = dclip[x];
		const int xr = x + 1;
		int stop;

		light -= rw_lightstep;
		lcolormap = GETPALOOKUP(light, wallshade);
		if (lcolormap != rcolormap)
		{
			// The colormap changed: flush the rows still open to the right,
			// starting at column xr.
			if (t2 < b2 && rcolormap != 0)
			{
				// Colormap 0 is always the identity map, so rendering it is
				// just a waste of time.
				R_DrawFogBoundarySection_RGBA(t2, b2, xr);
			}
			// Restart runs at column x over the union of both row ranges.
			if (t1 < t2) t2 = t1;
			if (b1 > b2) b2 = b1;
			if (t2 < b2)
			{
				clearbufshort(spanend + t2, b2 - t2, x);
			}
			rcolormap = lcolormap;
			R_SetColorMapLight(basecolormap, (float)light, wallshade);
			fake_dc_colormap = basecolormap->Maps + (GETPALOOKUP(light, wallshade) << COLORMAPSHIFT);
		}
		else
		{
			if (fake_dc_colormap != basecolormapdata)
			{
				// Rows covered to the right but not by this column end
				// here: queue them from xr to their recorded right end.
				stop = MIN(t1, b2);
				while (t2 < stop)
				{
					int y = t2++;
					DrawerCommandQueue::QueueCommand(y, xr, spanend[y]);
				}
				stop = MAX(b1, t2);
				while (b2 > stop)
				{
					int y = --b2;
					DrawerCommandQueue::QueueCommand(y, xr, spanend[y]);
				}
			}
			else
			{
				// Same range adjustment as above, without queuing anything.
				t2 = MAX(t2, MIN(t1, b2));
				b2 = MIN(b2, MAX(b1, t2));
			}

			// Rows covered by this column but not to the right start a new
			// run at column x.
			stop = MIN(t2, b1);
			while (t1 < stop)
			{
				spanend[t1++] = x;
			}
			stop = MAX(b2, t2);
			while (b1 > stop)
			{
				spanend[--b1] = x;
			}
		}

		// This column becomes the right-hand neighbour for the next step.
		t2 = uclip[x];
		b2 = dclip[x];
	}

	// Flush whatever is still open at the leftmost column.
	if (t2 < b2 && rcolormap != 0)
	{
		R_DrawFogBoundarySection_RGBA(t2, b2, x1);
	}
}