From c452d0257380799e6b2d89e6177380ae2a948235 Mon Sep 17 00:00:00 2001 From: Magnus Norddahl Date: Tue, 7 Jun 2016 15:25:11 +0200 Subject: [PATCH] Added multicore rendering to true color drawers --- src/r_draw.h | 60 ++++- src/r_draw_rgba.cpp | 535 ++++++++++++++++++++++++++++++------------- src/r_drawt_rgba.cpp | 308 +++++++++++++------------ 3 files changed, 603 insertions(+), 300 deletions(-) diff --git a/src/r_draw.h b/src/r_draw.h index 98be57c51..bf73c9dfb 100644 --- a/src/r_draw.h +++ b/src/r_draw.h @@ -25,6 +25,9 @@ #include "r_defs.h" #include +#include +#include +#include // Spectre/Invisibility. #define FUZZTABLE 50 @@ -74,7 +77,6 @@ extern "C" unsigned int dc_tspans[4][MAXHEIGHT]; extern "C" unsigned int *dc_ctspan[4]; extern "C" unsigned int horizspans[4]; - // [RH] Pointers to the different column and span drawers... // The span blitting interface. @@ -443,19 +445,58 @@ void R_SetTranslationMap(lighttable_t *translation); // Wait until all drawers finished executing void R_FinishDrawerCommands(); +class DrawerCommandQueue; + class DrawerThread { public: + std::thread thread; + + // Thread line index of this thread int core = 0; + + // Number of active threads int num_cores = 1; uint32_t dc_temp_rgbabuff_rgba[MAXHEIGHT * 4]; uint32_t *dc_temp_rgba; + + // Checks if a line is rendered by this thread + bool line_skipped_by_thread(int line) + { + return line % num_cores != core; + } + + // The number of lines to skip to reach the first line to be rendered by this thread + int skipped_by_thread(int first_line) + { + return (num_cores - (first_line - core) % num_cores) % num_cores; + } + + // The number of lines to be rendered by this thread + int count_for_thread(int first_line, int count) + { + return (count - skipped_by_thread(first_line) + num_cores - 1) / num_cores; + } + + // Calculate the dest address for the first line to be rendered by this thread + uint32_t *dest_for_thread(int first_line, int pitch, uint32_t *dest) + { + return dest + skipped_by_thread(first_line) * pitch; + } }; class DrawerCommand { +protected: + int dc_dest_y; + public: + DrawerCommand() + { + dc_dest_y = static_cast((dc_dest - dc_destorg) / (dc_pitch * 4)); + } + virtual void Execute(DrawerThread *thread) = 0; }; @@ -467,8 +508,25 @@ class DrawerCommandQueue std::vector commands; + std::vector threads; + + std::mutex start_mutex; + std::condition_variable start_condition; + std::vector active_commands; + bool shutdown_flag = false; + int run_id = 0; + + std::mutex end_mutex; + std::condition_variable end_condition; + int finished_threads = 0; + + void StartThreads(); + void StopThreads(); + static DrawerCommandQueue *Instance(); + ~DrawerCommandQueue(); + public: // Allocate memory valid for the duration of a command execution static void* AllocMemory(size_t size); diff --git a/src/r_draw_rgba.cpp b/src/r_draw_rgba.cpp index 9e61bb427..489716e1f 100644 --- a/src/r_draw_rgba.cpp +++ b/src/r_draw_rgba.cpp @@ -63,6 +63,11 @@ DrawerCommandQueue *DrawerCommandQueue::Instance() return &queue; } +DrawerCommandQueue::~DrawerCommandQueue() +{ + StopThreads(); +} + void* DrawerCommandQueue::AllocMemory(size_t size) { // Make sure allocations remain 16-byte aligned @@ -81,19 +86,102 @@ void DrawerCommandQueue::Finish() { auto queue = Instance(); - DrawerThread thread; + // Give worker threads something to do: - size_t size = queue->commands.size(); + std::unique_lock start_lock(queue->start_mutex); + queue->active_commands.swap(queue->commands); + queue->run_id++; + start_lock.unlock(); + + queue->StartThreads(); + queue->start_condition.notify_all(); + + // Do one thread ourselves: + + DrawerThread thread; + thread.core = 0; + thread.num_cores = queue->threads.size() + 1; + + size_t size = queue->active_commands.size(); for (size_t i = 0; i < size; i++) { - auto &command = queue->commands[i]; + auto &command = queue->active_commands[i]; command->Execute(&thread); } - for (auto &command : queue->commands) + // Wait for everyone to finish: + + std::unique_lock end_lock(queue->end_mutex); + queue->end_condition.wait(end_lock, [&]() { return queue->finished_threads == queue->threads.size(); }); + + // Clean up batch: + + for (auto &command : queue->active_commands) command->~DrawerCommand(); - queue->commands.clear(); + queue->active_commands.clear(); queue->memorypool_pos = 0; + queue->finished_threads = 0; +} + +void DrawerCommandQueue::StartThreads() +{ + if (!threads.empty()) + return; + + int num_threads = std::thread::hardware_concurrency(); + if (num_threads == 0) + num_threads = 4; + + threads.resize(num_threads - 1); + + for (int i = 0; i < num_threads - 1; i++) + { + DrawerCommandQueue *queue = this; + DrawerThread *thread = &threads[i]; + thread->core = i + 1; + thread->num_cores = num_threads; + thread->thread = std::thread([=]() + { + int run_id = 0; + while (true) + { + // Wait until we are signalled to run: + std::unique_lock start_lock(queue->start_mutex); + queue->start_condition.wait(start_lock, [&]() { return queue->run_id != run_id || queue->shutdown_flag; }); + if (queue->shutdown_flag) + break; + run_id = queue->run_id; + start_lock.unlock(); + + // Do the work: + size_t size = queue->active_commands.size(); + for (size_t i = 0; i < size; i++) + { + auto &command = queue->active_commands[i]; + command->Execute(thread); + } + + // Notify main thread that we finished: + std::unique_lock end_lock(queue->end_mutex); + queue->finished_threads++; + end_lock.unlock(); + queue->end_condition.notify_all(); + } + }); + } +} + +void DrawerCommandQueue::StopThreads() +{ + std::unique_lock lock(start_mutex); + shutdown_flag = true; + lock.unlock(); + start_condition.notify_all(); + for (auto &thread : threads) + thread.thread.join(); + threads.clear(); + lock.lock(); + shutdown_flag = false; } ///////////////////////////////////////////////////////////////////////////// @@ -129,28 +217,28 @@ public: fixed_t frac; fixed_t fracstep; - count = dc_count; + count = thread->count_for_thread(dc_dest_y, dc_count); // Zero length, column does not exceed a pixel. if (count <= 0) return; // Framebuffer destination address. - dest = (uint32_t*)dc_dest; + dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; // Determine scaling, // which is the only mapping to be done. - fracstep = dc_iscale; - frac = dc_texturefrac; + fracstep = dc_iscale * thread->num_cores; + frac = dc_texturefrac + dc_iscale * thread->skipped_by_thread(dc_dest_y); { // [RH] Get local copies of these variables so that the compiler // has a better chance of optimizing this well. const BYTE *source = dc_source; - int pitch = dc_pitch; + int pitch = dc_pitch * thread->num_cores; // Inner loop that does the actual texture mapping, // e.g. a DDA-lile scaling. @@ -190,17 +278,17 @@ public: int count; uint32_t* dest; - count = dc_count; + count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 0) return; - dest = (uint32_t*)dc_dest; + dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); uint32_t light = calc_light_multiplier(dc_light); { - int pitch = dc_pitch; + int pitch = dc_pitch * thread->num_cores; BYTE color = dc_color; do @@ -235,12 +323,12 @@ public: int count; uint32_t *dest; - count = dc_count; + count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 0) return; - dest = (uint32_t*)dc_dest; - int pitch = dc_pitch; + dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); + int pitch = dc_pitch * thread->num_cores; uint32_t fg = shade_pal_index_simple(dc_color, calc_light_multiplier(dc_light)); uint32_t fg_red = (fg >> 24) & 0xff; @@ -286,12 +374,12 @@ public: int count; uint32_t *dest; - count = dc_count; + count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 0) return; - dest = (uint32_t*)dc_dest; - int pitch = dc_pitch; + dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); + int pitch = dc_pitch * thread->num_cores; uint32_t fg = shade_pal_index_simple(dc_color, calc_light_multiplier(dc_light)); uint32_t fg_red = (fg >> 24) & 0xff; @@ -337,12 +425,12 @@ public: int count; uint32_t *dest; - count = dc_count; + count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 0) return; - dest = (uint32_t*)dc_dest; - int pitch = dc_pitch; + dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); + int pitch = dc_pitch * thread->num_cores; uint32_t fg = shade_pal_index_simple(dc_color, calc_light_multiplier(dc_light)); uint32_t fg_red = (fg >> 24) & 0xff; @@ -388,12 +476,12 @@ public: int count; uint32_t *dest; - count = dc_count; + count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 0) return; - dest = (uint32_t*)dc_dest; - int pitch = dc_pitch; + dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); + int pitch = dc_pitch * thread->num_cores; uint32_t fg = shade_pal_index_simple(dc_color, calc_light_multiplier(dc_light)); uint32_t fg_red = (fg >> 24) & 0xff; @@ -451,15 +539,13 @@ public: if (dc_yh > fuzzviewheight) dc_yh = fuzzviewheight; - count = dc_yh - dc_yl; + count = thread->count_for_thread(dc_yl, dc_yh - dc_yl + 1); // Zero length. - if (count < 0) + if (count <= 0) return; - count++; - - dest = ylookup[dc_yl] + dc_x + (uint32_t*)dc_destorg; + dest = thread->dest_for_thread(dc_yl, dc_pitch, ylookup[dc_yl] + dc_x + (uint32_t*)dc_destorg); // Note: this implementation assumes this function is only used for the pinky shadow effect (i.e. no other fancy colormap than black) // I'm not sure if this is really always the case or not. @@ -467,7 +553,7 @@ public: { // [RH] Make local copies of global vars to try and improve // the optimizations made by the compiler. - int pitch = dc_pitch; + int pitch = dc_pitch * thread->num_cores; int fuzz = fuzzpos; int cnt; @@ -573,18 +659,18 @@ public: fixed_t frac; fixed_t fracstep; - count = dc_count; + count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 0) return; - dest = (uint32_t*)dc_dest; + dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); - fracstep = dc_iscale; - frac = dc_texturefrac; + fracstep = dc_iscale * thread->num_cores; + frac = dc_texturefrac + dc_iscale * thread->skipped_by_thread(dc_dest_y); { const BYTE *source = dc_source; - int pitch = dc_pitch; + int pitch = dc_pitch * thread->num_cores; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; @@ -649,23 +735,23 @@ public: fixed_t frac; fixed_t fracstep; - count = dc_count; + count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 0) return; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; - dest = (uint32_t*)dc_dest; + dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); - fracstep = dc_iscale; - frac = dc_texturefrac; + fracstep = dc_iscale * thread->num_cores; + frac = dc_texturefrac + dc_iscale * thread->skipped_by_thread(dc_dest_y); { // [RH] Local copies of global vars to improve compiler optimizations BYTE *translation = dc_translation; const BYTE *source = dc_source; - int pitch = dc_pitch; + int pitch = dc_pitch * thread->num_cores; do { @@ -710,22 +796,22 @@ public: fixed_t frac; fixed_t fracstep; - count = dc_count; + count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 0) return; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; - dest = (uint32_t*)dc_dest; + dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); - fracstep = dc_iscale; - frac = dc_texturefrac; + fracstep = dc_iscale * thread->num_cores; + frac = dc_texturefrac + dc_iscale * thread->skipped_by_thread(dc_dest_y); { BYTE *translation = dc_translation; const BYTE *source = dc_source; - int pitch = dc_pitch; + int pitch = dc_pitch * thread->num_cores; uint32_t fg_alpha = dc_srcalpha >> (FRACBITS - 8); uint32_t bg_alpha = dc_destalpha >> (FRACBITS - 8); @@ -787,15 +873,15 @@ public: uint32_t *dest; fixed_t frac, fracstep; - count = dc_count; + count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 0) return; - dest = (uint32_t*)dc_dest; + dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); - fracstep = dc_iscale; - frac = dc_texturefrac; + fracstep = dc_iscale * thread->num_cores; + frac = dc_texturefrac + dc_iscale * thread->skipped_by_thread(dc_dest_y); uint32_t fg = shade_pal_index_simple(dc_color, calc_light_multiplier(dc_light)); uint32_t fg_red = (fg >> 16) & 0xff; @@ -805,7 +891,7 @@ public: { const BYTE *source = dc_source; BYTE *colormap = dc_colormap; - int pitch = dc_pitch; + int pitch = dc_pitch * thread->num_cores; do { @@ -863,18 +949,18 @@ public: fixed_t frac; fixed_t fracstep; - count = dc_count; + count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 0) return; - dest = (uint32_t*)dc_dest; + dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); - fracstep = dc_iscale; - frac = dc_texturefrac; + fracstep = dc_iscale * thread->num_cores; + frac = dc_texturefrac + dc_iscale * thread->skipped_by_thread(dc_dest_y); { const BYTE *source = dc_source; - int pitch = dc_pitch; + int pitch = dc_pitch * thread->num_cores; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; @@ -941,19 +1027,19 @@ public: fixed_t frac; fixed_t fracstep; - count = dc_count; + count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 0) return; - dest = (uint32_t*)dc_dest; + dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); - fracstep = dc_iscale; - frac = dc_texturefrac; + fracstep = dc_iscale * thread->num_cores; + frac = dc_texturefrac + dc_iscale * thread->skipped_by_thread(dc_dest_y); { BYTE *translation = dc_translation; const BYTE *source = dc_source; - int pitch = dc_pitch; + int pitch = dc_pitch * thread->num_cores; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; @@ -1018,18 +1104,18 @@ public: fixed_t frac; fixed_t fracstep; - count = dc_count; + count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 0) return; - dest = (uint32_t*)dc_dest; + dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); - fracstep = dc_iscale; - frac = dc_texturefrac; + fracstep = dc_iscale * thread->num_cores; + frac = dc_texturefrac + dc_iscale * thread->skipped_by_thread(dc_dest_y); { const BYTE *source = dc_source; - int pitch = dc_pitch; + int pitch = dc_pitch * thread->num_cores; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; @@ -1096,19 +1182,19 @@ public: fixed_t frac; fixed_t fracstep; - count = dc_count; + count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 0) return; - dest = (uint32_t*)dc_dest; + dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); - fracstep = dc_iscale; - frac = dc_texturefrac; + fracstep = dc_iscale * thread->num_cores; + frac = dc_texturefrac + dc_iscale * thread->skipped_by_thread(dc_dest_y); { BYTE *translation = dc_translation; const BYTE *source = dc_source; - int pitch = dc_pitch; + int pitch = dc_pitch * thread->num_cores; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; @@ -1173,18 +1259,18 @@ public: fixed_t frac; fixed_t fracstep; - count = dc_count; + count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 0) return; - dest = (uint32_t*)dc_dest; + dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); - fracstep = dc_iscale; - frac = dc_texturefrac; + fracstep = dc_iscale * thread->num_cores; + frac = dc_texturefrac + dc_iscale * thread->skipped_by_thread(dc_dest_y); { const BYTE *source = dc_source; - int pitch = dc_pitch; + int pitch = dc_pitch * thread->num_cores; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; uint32_t fg_alpha = dc_srcalpha >> (FRACBITS - 8); @@ -1250,19 +1336,19 @@ public: fixed_t frac; fixed_t fracstep; - count = dc_count; + count = thread->count_for_thread(dc_dest_y, dc_count); if (count <= 0) return; - dest = (uint32_t*)dc_dest; + dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); - fracstep = dc_iscale; - frac = dc_texturefrac; + fracstep = dc_iscale * thread->num_cores; + frac = dc_texturefrac + dc_iscale * thread->skipped_by_thread(dc_dest_y); { BYTE *translation = dc_translation; const BYTE *source = dc_source; - int pitch = dc_pitch; + int pitch = dc_pitch * thread->num_cores; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; @@ -1329,6 +1415,9 @@ public: #ifdef NO_SSE void Execute(DrawerThread *thread) override { + if (thread->line_skipped_by_thread(ds_y)) + return; + dsfixed_t xfrac; dsfixed_t yfrac; dsfixed_t xstep; @@ -1391,6 +1480,9 @@ public: #else void Execute(DrawerThread *thread) override { + if (thread->line_skipped_by_thread(ds_y)) + return; + dsfixed_t xfrac; dsfixed_t yfrac; dsfixed_t xstep; @@ -1572,6 +1664,9 @@ public: void Execute(DrawerThread *thread) override { + if (thread->line_skipped_by_thread(ds_y)) + return; + dsfixed_t xfrac; dsfixed_t yfrac; dsfixed_t xstep; @@ -1671,6 +1766,9 @@ public: void Execute(DrawerThread *thread) override { + if (thread->line_skipped_by_thread(ds_y)) + return; + dsfixed_t xfrac; dsfixed_t yfrac; dsfixed_t xstep; @@ -1789,6 +1887,9 @@ public: void Execute(DrawerThread *thread) override { + if (thread->line_skipped_by_thread(ds_y)) + return; + dsfixed_t xfrac; dsfixed_t yfrac; dsfixed_t xstep; @@ -1917,6 +2018,9 @@ public: void Execute(DrawerThread *thread) override { + if (thread->line_skipped_by_thread(ds_y)) + return; + dsfixed_t xfrac; dsfixed_t yfrac; dsfixed_t xstep; @@ -2035,6 +2139,9 @@ public: void Execute(DrawerThread *thread) override { + if (thread->line_skipped_by_thread(ds_y)) + return; + dsfixed_t xfrac; dsfixed_t yfrac; dsfixed_t xstep; @@ -2149,6 +2256,9 @@ public: void Execute(DrawerThread *thread) override { + if (thread->line_skipped_by_thread(ds_y)) + return; + uint32_t *dest = ylookup[ds_y] + ds_x1 + (uint32_t*)dc_destorg; int count = (ds_x2 - ds_x1 + 1); uint32_t light = calc_light_multiplier(ds_light); @@ -2186,13 +2296,16 @@ public: void Execute(DrawerThread *thread) override { - DWORD fracstep = dc_iscale; - DWORD frac = dc_texturefrac; - int count = dc_count; + int count = thread->count_for_thread(dc_dest_y, dc_count); + if (count <= 0) + return; + + DWORD fracstep = dc_iscale * thread->num_cores; + DWORD frac = dc_texturefrac + dc_iscale * thread->skipped_by_thread(dc_dest_y); const BYTE *source = dc_source; - uint32_t *dest = (uint32_t*)dc_dest; + uint32_t *dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); int bits = vlinebits; - int pitch = dc_pitch; + int pitch = dc_pitch * thread->num_cores; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; @@ -2238,8 +2351,12 @@ public: #ifdef NO_SSE void Execute(DrawerThread *thread) override { - uint32_t *dest = (uint32_t*)dc_dest; - int count = dc_count; + int count = thread->count_for_thread(dc_dest_y, dc_count); + if (count <= 0) + return; + + uint32_t *dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); + int pitch = dc_pitch * thread->num_cores; int bits = vlinebits; DWORD place; @@ -2250,21 +2367,34 @@ public: ShadeConstants shade_constants = dc_shade_constants; + DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] }; + DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] }; + int skipped = thread->skipped_by_thread(dc_dest_y); + for (int i = 0; i < 4; i++) + { + local_vplce[i] += local_vince[i] * skipped; + local_vince[i] *= thread->num_cores; + } + do { - dest[0] = shade_pal_index(bufplce[0][(place = vplce[0]) >> bits], light0, shade_constants); vplce[0] = place + vince[0]; - dest[1] = shade_pal_index(bufplce[1][(place = vplce[1]) >> bits], light1, shade_constants); vplce[1] = place + vince[1]; - dest[2] = shade_pal_index(bufplce[2][(place = vplce[2]) >> bits], light2, shade_constants); vplce[2] = place + vince[2]; - dest[3] = shade_pal_index(bufplce[3][(place = vplce[3]) >> bits], light3, shade_constants); vplce[3] = place + vince[3]; - dest += dc_pitch; + dest[0] = shade_pal_index(bufplce[0][(place = local_vplce[0]) >> bits], light0, shade_constants); local_vplce[0] = place + local_vince[0]; + dest[1] = shade_pal_index(bufplce[1][(place = local_vplce[1]) >> bits], light1, shade_constants); local_vplce[1] = place + local_vince[1]; + dest[2] = shade_pal_index(bufplce[2][(place = local_vplce[2]) >> bits], light2, shade_constants); local_vplce[2] = place + local_vince[2]; + dest[3] = shade_pal_index(bufplce[3][(place = local_vplce[3]) >> bits], light3, shade_constants); local_vplce[3] = place + local_vince[3]; + dest += pitch; } while (--count); } #else void Execute(DrawerThread *thread) override { - uint32_t *dest = (uint32_t*)dc_dest; - int count = dc_count; + int count = thread->count_for_thread(dc_dest_y, dc_count); + if (count <= 0) + return; + + uint32_t *dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); int bits = vlinebits; + int pitch = dc_pitch * thread->num_cores; uint32_t light0 = calc_light_multiplier(palookuplight[0]); uint32_t light1 = calc_light_multiplier(palookuplight[1]); @@ -2276,6 +2406,12 @@ public: uint32_t *palette = (uint32_t*)GPalette.BaseColors; DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] }; DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] }; + int skipped = thread->skipped_by_thread(dc_dest_y); + for (int i = 0; i < 4; i++) + { + local_vplce[i] += local_vince[i] * skipped; + local_vince[i] *= thread->num_cores; + } if (shade_constants.simple_shade) { @@ -2300,7 +2436,7 @@ public: __m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]); SSE_SHADE_SIMPLE(fg); _mm_storeu_si128((__m128i*)dest, fg); - dest += dc_pitch; + dest += pitch; } while (--count); } else @@ -2326,7 +2462,7 @@ public: __m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]); SSE_SHADE(fg, shade_constants); _mm_storeu_si128((__m128i*)dest, fg); - dest += dc_pitch; + dest += pitch; } while (--count); } } @@ -2361,13 +2497,16 @@ public: void Execute(DrawerThread *thread) override { - DWORD fracstep = dc_iscale; - DWORD frac = dc_texturefrac; - int count = dc_count; + int count = thread->count_for_thread(dc_dest_y, dc_count); + if (count <= 0) + return; + + DWORD fracstep = dc_iscale * thread->num_cores; + DWORD frac = dc_texturefrac + dc_iscale * thread->skipped_by_thread(dc_dest_y); const BYTE *source = dc_source; - uint32_t *dest = (uint32_t*)dc_dest; + uint32_t *dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); int bits = mvlinebits; - int pitch = dc_pitch; + int pitch = dc_pitch * thread->num_cores; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; @@ -2417,8 +2556,12 @@ public: #ifdef NO_SSE void Execute(DrawerThread *thread) override { - uint32_t *dest = (uint32_t*)dc_dest; - int count = dc_count; + int count = thread->count_for_thread(dc_dest_y, dc_count); + if (count <= 0) + return; + + uint32_t *dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); + int pitch = dc_pitch * thread->num_cores; int bits = mvlinebits; DWORD place; @@ -2429,21 +2572,34 @@ public: ShadeConstants shade_constants = dc_shade_constants; + DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] }; + DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] }; + int skipped = thread->skipped_by_thread(dc_dest_y); + for (int i = 0; i < 4; i++) + { + local_vplce[i] += local_vince[i] * skipped; + local_vince[i] *= thread->num_cores; + } + do { BYTE pix; - pix = bufplce[0][(place = vplce[0]) >> bits]; if (pix) dest[0] = shade_pal_index(pix, light0, shade_constants); vplce[0] = place + vince[0]; - pix = bufplce[1][(place = vplce[1]) >> bits]; if (pix) dest[1] = shade_pal_index(pix, light1, shade_constants); vplce[1] = place + vince[1]; - pix = bufplce[2][(place = vplce[2]) >> bits]; if (pix) dest[2] = shade_pal_index(pix, light2, shade_constants); vplce[2] = place + vince[2]; - pix = bufplce[3][(place = vplce[3]) >> bits]; if (pix) dest[3] = shade_pal_index(pix, light3, shade_constants); vplce[3] = place + vince[3]; - dest += dc_pitch; + pix = bufplce[0][(place = local_vplce[0]) >> bits]; if (pix) dest[0] = shade_pal_index(pix, light0, shade_constants); local_vplce[0] = place + local_vince[0]; + pix = bufplce[1][(place = local_vplce[1]) >> bits]; if (pix) dest[1] = shade_pal_index(pix, light1, shade_constants); local_vplce[1] = place + local_vince[1]; + pix = bufplce[2][(place = local_vplce[2]) >> bits]; if (pix) dest[2] = shade_pal_index(pix, light2, shade_constants); local_vplce[2] = place + local_vince[2]; + pix = bufplce[3][(place = local_vplce[3]) >> bits]; if (pix) dest[3] = shade_pal_index(pix, light3, shade_constants); local_vplce[3] = place + local_vince[3]; + dest += pitch; } while (--count); } #else void Execute(DrawerThread *thread) override { - uint32_t *dest = (uint32_t*)dc_dest; - int count = dc_count; + int count = thread->count_for_thread(dc_dest_y, dc_count); + if (count <= 0) + return; + + uint32_t *dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); + int pitch = dc_pitch * thread->num_cores; int bits = mvlinebits; uint32_t light0 = calc_light_multiplier(palookuplight[0]); @@ -2456,6 +2612,12 @@ public: uint32_t *palette = (uint32_t*)GPalette.BaseColors; DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] }; DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] }; + int skipped = thread->skipped_by_thread(dc_dest_y); + for (int i = 0; i < 4; i++) + { + local_vplce[i] += local_vince[i] * skipped; + local_vince[i] *= thread->num_cores; + } if (shade_constants.simple_shade) { @@ -2483,7 +2645,7 @@ public: __m128i fg = _mm_set_epi32(palette[pix3], palette[pix2], palette[pix1], palette[pix0]); SSE_SHADE_SIMPLE(fg); _mm_maskmoveu_si128(fg, movemask, (char*)dest); - dest += dc_pitch; + dest += pitch; } while (--count); } else @@ -2512,7 +2674,7 @@ public: __m128i fg = _mm_set_epi32(palette[pix3], palette[pix2], palette[pix1], palette[pix0]); SSE_SHADE(fg, shade_constants); _mm_maskmoveu_si128(fg, movemask, (char*)dest); - dest += dc_pitch; + dest += pitch; } while (--count); } } @@ -2551,13 +2713,16 @@ public: void Execute(DrawerThread *thread) override { - DWORD fracstep = dc_iscale; - DWORD frac = dc_texturefrac; - int count = dc_count; + int count = thread->count_for_thread(dc_dest_y, dc_count); + if (count <= 0) + return; + + DWORD fracstep = dc_iscale * thread->num_cores; + DWORD frac = dc_texturefrac + dc_iscale * thread->skipped_by_thread(dc_dest_y); const BYTE *source = dc_source; - uint32_t *dest = (uint32_t*)dc_dest; + uint32_t *dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); int bits = tmvlinebits; - int pitch = dc_pitch; + int pitch = dc_pitch * thread->num_cores; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; @@ -2626,8 +2791,12 @@ public: void Execute(DrawerThread *thread) override { - uint32_t *dest = (uint32_t*)dc_dest; - int count = dc_count; + int count = thread->count_for_thread(dc_dest_y, dc_count); + if (count <= 0) + return; + + uint32_t *dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); + int pitch = dc_pitch * thread->num_cores; int bits = tmvlinebits; uint32_t light[4]; @@ -2641,11 +2810,20 @@ public: uint32_t fg_alpha = dc_srcalpha >> (FRACBITS - 8); uint32_t bg_alpha = dc_destalpha >> (FRACBITS - 8); + DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] }; + DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] }; + int skipped = thread->skipped_by_thread(dc_dest_y); + for (int i = 0; i < 4; i++) + { + local_vplce[i] += local_vince[i] * skipped; + local_vince[i] *= thread->num_cores; + } + do { for (int i = 0; i < 4; ++i) { - BYTE pix = bufplce[i][vplce[i] >> bits]; + BYTE pix = bufplce[i][local_vplce[i] >> bits]; if (pix != 0) { uint32_t fg = shade_pal_index(pix, light[i], shade_constants); @@ -2663,9 +2841,9 @@ public: dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue; } - vplce[i] += vince[i]; + local_vplce[i] += local_vince[i]; } - dest += dc_pitch; + dest += pitch; } while (--count); } }; @@ -2702,13 +2880,16 @@ public: void Execute(DrawerThread *thread) override { - DWORD fracstep = dc_iscale; - DWORD frac = dc_texturefrac; - int count = dc_count; + int count = thread->count_for_thread(dc_dest_y, dc_count); + if (count <= 0) + return; + + DWORD fracstep = dc_iscale * thread->num_cores; + DWORD frac = dc_texturefrac + dc_iscale * thread->skipped_by_thread(dc_dest_y); const BYTE *source = dc_source; - uint32_t *dest = (uint32_t*)dc_dest; + uint32_t *dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); int bits = tmvlinebits; - int pitch = dc_pitch; + int pitch = dc_pitch * thread->num_cores; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; @@ -2777,8 +2958,12 @@ public: void Execute(DrawerThread *thread) override { - uint32_t *dest = (uint32_t*)dc_dest; - int count = dc_count; + int count = thread->count_for_thread(dc_dest_y, dc_count); + if (count <= 0) + return; + + uint32_t *dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); + int pitch = dc_pitch * thread->num_cores; int bits = tmvlinebits; uint32_t light[4]; @@ -2792,11 +2977,20 @@ public: uint32_t fg_alpha = dc_srcalpha >> (FRACBITS - 8); uint32_t bg_alpha = dc_destalpha >> (FRACBITS - 8); + DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] }; + DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] }; + int skipped = thread->skipped_by_thread(dc_dest_y); + for (int i = 0; i < 4; i++) + { + local_vplce[i] += local_vince[i] * skipped; + local_vince[i] *= thread->num_cores; + } + do { for (int i = 0; i < 4; ++i) { - BYTE pix = bufplce[i][vplce[i] >> bits]; + BYTE pix = bufplce[i][local_vplce[i] >> bits]; if (pix != 0) { uint32_t fg = shade_pal_index(pix, light[i], shade_constants); @@ -2814,9 +3008,9 @@ public: dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue; } - vplce[i] += vince[i]; + local_vplce[i] += local_vince[i]; } - dest += dc_pitch; + dest += pitch; } while (--count); } }; @@ -2853,13 +3047,16 @@ public: void Execute(DrawerThread *thread) override { - DWORD fracstep = dc_iscale; - DWORD frac = dc_texturefrac; - int count = dc_count; + int count = thread->count_for_thread(dc_dest_y, dc_count); + if (count <= 0) + return; + + DWORD fracstep = dc_iscale * thread->num_cores; + DWORD frac = dc_texturefrac + dc_iscale * thread->skipped_by_thread(dc_dest_y); const BYTE *source = dc_source; - uint32_t *dest = (uint32_t*)dc_dest; + uint32_t *dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); int bits = tmvlinebits; - int pitch = dc_pitch; + int pitch = dc_pitch * thread->num_cores; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; @@ -2928,8 +3125,12 @@ public: void Execute(DrawerThread *thread) override { - uint32_t *dest = (uint32_t*)dc_dest; - int count = dc_count; + int count = thread->count_for_thread(dc_dest_y, dc_count); + if (count <= 0) + return; + + uint32_t *dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); + int pitch = dc_pitch * thread->num_cores; int bits = tmvlinebits; uint32_t light[4]; @@ -2943,11 +3144,20 @@ public: uint32_t fg_alpha = dc_srcalpha >> (FRACBITS - 8); uint32_t bg_alpha = dc_destalpha >> (FRACBITS - 8); + DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] }; + DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] }; + int skipped = thread->skipped_by_thread(dc_dest_y); + for (int i = 0; i < 4; i++) + { + local_vplce[i] += local_vince[i] * skipped; + local_vince[i] *= thread->num_cores; + } + do { for (int i = 0; i < 4; ++i) { - BYTE pix = bufplce[i][vplce[i] >> bits]; + BYTE pix = bufplce[i][local_vplce[i] >> bits]; if (pix != 0) { uint32_t fg = shade_pal_index(pix, light[i], shade_constants); @@ -2965,9 +3175,9 @@ public: dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue; } - vplce[i] += vince[i]; + local_vplce[i] += local_vince[i]; } - dest += dc_pitch; + dest += pitch; } while (--count); } }; @@ -3004,13 +3214,16 @@ public: void Execute(DrawerThread *thread) override { - DWORD fracstep = dc_iscale; - DWORD frac = dc_texturefrac; - int count = dc_count; + int count = thread->count_for_thread(dc_dest_y, dc_count); + if (count <= 0) + return; + + DWORD fracstep = dc_iscale * thread->num_cores; + DWORD frac = dc_texturefrac + dc_iscale * thread->skipped_by_thread(dc_dest_y); const BYTE *source = dc_source; - uint32_t *dest = (uint32_t*)dc_dest; + uint32_t *dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); int bits = tmvlinebits; - int pitch = dc_pitch; + int pitch = dc_pitch * thread->num_cores; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; @@ -3079,8 +3292,12 @@ public: void Execute(DrawerThread *thread) override { - uint32_t *dest = (uint32_t*)dc_dest; - int count = dc_count; + int count = thread->count_for_thread(dc_dest_y, dc_count); + if (count <= 0) + return; + + uint32_t *dest = thread->dest_for_thread(dc_dest_y, dc_pitch, (uint32_t*)dc_dest); + int pitch = dc_pitch * thread->num_cores; int bits = tmvlinebits; uint32_t light[4]; @@ -3094,11 +3311,20 @@ public: uint32_t fg_alpha = dc_srcalpha >> (FRACBITS - 8); uint32_t bg_alpha = dc_destalpha >> (FRACBITS - 8); + DWORD local_vplce[4] = { vplce[0], vplce[1], vplce[2], vplce[3] }; + DWORD local_vince[4] = { vince[0], vince[1], vince[2], vince[3] }; + int skipped = thread->skipped_by_thread(dc_dest_y); + for (int i = 0; i < 4; i++) + { + local_vplce[i] += local_vince[i] * skipped; + local_vince[i] *= thread->num_cores; + } + do { for (int i = 0; i < 4; ++i) { - BYTE pix = bufplce[i][vplce[i] >> bits]; + BYTE pix = bufplce[i][local_vplce[i] >> bits]; if (pix != 0) { uint32_t fg = shade_pal_index(pix, light[i], shade_constants); @@ -3116,9 +3342,9 @@ public: dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue; } - vplce[i] += vince[i]; + local_vplce[i] += local_vince[i]; } - dest += dc_pitch; + dest += pitch; } while (--count); } }; @@ -3146,6 +3372,9 @@ public: void Execute(DrawerThread *thread) override { + if (thread->line_skipped_by_thread(_y)) + return; + int y = _y; int x = _x; int x2 = _x2; diff --git a/src/r_drawt_rgba.cpp b/src/r_drawt_rgba.cpp index bbf68a795..c2caec0c2 100644 --- a/src/r_drawt_rgba.cpp +++ b/src/r_drawt_rgba.cpp @@ -78,26 +78,26 @@ public: uint32_t *source; uint32_t *dest; int count; - int pitch; + int pitch, sincr; - count = yh - yl; - if (count < 0) + count = thread->count_for_thread(yl, (yh - yl + 1)); + if (count <= 0) return; - count++; - dest = ylookup[yl] + sx + (uint32_t*)dc_destorg; - source = &thread->dc_temp_rgba[yl * 4 + hx]; - pitch = dc_pitch; + dest = thread->dest_for_thread(yl, dc_pitch, ylookup[yl] + sx + (uint32_t*)dc_destorg); + source = &thread->dc_temp_rgba[yl * 4 + hx] + thread->skipped_by_thread(yl) * 4; + pitch = dc_pitch * thread->num_cores; + sincr = thread->num_cores * 4; if (count & 1) { *dest = *source; - source += 4; + source += sincr; dest += pitch; } if (count & 2) { dest[0] = source[0]; - dest[pitch] = source[4]; - source += 8; + dest[pitch] = source[sincr]; + source += sincr * 2; dest += pitch * 2; } if (!(count >>= 2)) @@ -105,10 +105,10 @@ public: do { dest[0] = source[0]; - dest[pitch] = source[4]; - dest[pitch * 2] = source[8]; - dest[pitch * 3] = source[12]; - source += 16; + dest[pitch] = source[sincr]; + dest[pitch * 2] = source[sincr * 2]; + dest[pitch * 3] = source[sincr * 3]; + source += sincr * 4; dest += pitch * 4; } while (--count); } @@ -145,22 +145,23 @@ public: uint32_t *dest; int count; int pitch; + int sincr; - count = yh - yl; - if (count < 0) + count = thread->count_for_thread(yl, yh - yl + 1); + if (count <= 0) return; - count++; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; - dest = ylookup[yl] + sx + (uint32_t*)dc_destorg; - source = &thread->dc_temp_rgba[yl * 4 + hx]; - pitch = dc_pitch; + dest = thread->dest_for_thread(yl, dc_pitch, ylookup[yl] + sx + (uint32_t*)dc_destorg); + source = &thread->dc_temp_rgba[yl * 4 + hx] + thread->skipped_by_thread(yl) * 4; + pitch = dc_pitch * thread->num_cores; + sincr = thread->num_cores * 4; if (count & 1) { *dest = shade_pal_index(*source, light, shade_constants); - source += 4; + source += sincr; dest += pitch; } if (!(count >>= 1)) @@ -168,8 +169,8 @@ public: do { dest[0] = shade_pal_index(source[0], light, shade_constants); - dest[pitch] = shade_pal_index(source[4], light, shade_constants); - source += 8; + dest[pitch] = shade_pal_index(source[sincr], light, shade_constants); + source += sincr * 2; dest += pitch * 2; } while (--count); } @@ -205,25 +206,26 @@ public: uint32_t *dest; int count; int pitch; + int sincr; - count = yh - yl; - if (count < 0) + count = thread->count_for_thread(yl, yh - yl + 1); + if (count <= 0) return; - count++; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; - dest = ylookup[yl] + sx + (uint32_t*)dc_destorg; - source = &thread->dc_temp_rgba[yl * 4]; - pitch = dc_pitch; + dest = thread->dest_for_thread(yl, dc_pitch, ylookup[yl] + sx + (uint32_t*)dc_destorg); + source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4; + pitch = dc_pitch * thread->num_cores; + sincr = thread->num_cores * 4; if (count & 1) { dest[0] = shade_pal_index(source[0], light, shade_constants); dest[1] = shade_pal_index(source[1], light, shade_constants); dest[2] = shade_pal_index(source[2], light, shade_constants); dest[3] = shade_pal_index(source[3], light, shade_constants); - source += 4; + source += sincr; dest += pitch; } if (!(count >>= 1)) @@ -234,11 +236,11 @@ public: dest[1] = shade_pal_index(source[1], light, shade_constants); dest[2] = shade_pal_index(source[2], light, shade_constants); dest[3] = shade_pal_index(source[3], light, shade_constants); - dest[pitch] = shade_pal_index(source[4], light, shade_constants); - dest[pitch + 1] = shade_pal_index(source[5], light, shade_constants); - dest[pitch + 2] = shade_pal_index(source[6], light, shade_constants); - dest[pitch + 3] = shade_pal_index(source[7], light, shade_constants); - source += 8; + dest[pitch] = shade_pal_index(source[sincr], light, shade_constants); + dest[pitch + 1] = shade_pal_index(source[sincr + 1], light, shade_constants); + dest[pitch + 2] = shade_pal_index(source[sincr + 2], light, shade_constants); + dest[pitch + 3] = shade_pal_index(source[sincr + 3], light, shade_constants); + source += sincr * 2; dest += pitch * 2; } while (--count); } @@ -249,19 +251,20 @@ public: uint32_t *dest; int count; int pitch; + int sincr; - count = yh - yl; - if (count < 0) + count = thread->count_for_thread(yl, yh - yl + 1); + if (count <= 0) return; - count++; ShadeConstants shade_constants = dc_shade_constants; uint32_t light = calc_light_multiplier(dc_light); uint32_t *palette = (uint32_t*)GPalette.BaseColors; - dest = ylookup[yl] + sx + (uint32_t*)dc_destorg; - source = &thread->dc_temp_rgba[yl * 4]; - pitch = dc_pitch; + dest = thread->dest_for_thread(yl, dc_pitch, ylookup[yl] + sx + (uint32_t*)dc_destorg); + source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4; + pitch = dc_pitch * thread->num_cores; + sincr = thread->num_cores * 4; if (shade_constants.simple_shade) { @@ -278,7 +281,7 @@ public: SSE_SHADE_SIMPLE(fg); _mm_storeu_si128((__m128i*)dest, fg); - source += 4; + source += sincr; dest += pitch; } if (!(count >>= 1)) @@ -299,17 +302,17 @@ public: // shade_pal_index 4-7 (pitch) { - uint32_t p0 = source[4]; - uint32_t p1 = source[5]; - uint32_t p2 = source[6]; - uint32_t p3 = source[7]; + uint32_t p0 = source[sincr]; + uint32_t p1 = source[sincr + 1]; + uint32_t p2 = source[sincr + 2]; + uint32_t p3 = source[sincr + 3]; __m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]); SSE_SHADE_SIMPLE(fg); _mm_storeu_si128((__m128i*)(dest + pitch), fg); } - source += 8; + source += sincr * 2; dest += pitch * 2; } while (--count); } @@ -328,7 +331,7 @@ public: SSE_SHADE(fg, shade_constants); _mm_storeu_si128((__m128i*)dest, fg); - source += 4; + source += sincr; dest += pitch; } if (!(count >>= 1)) @@ -349,17 +352,17 @@ public: // shade_pal_index 4-7 (pitch) { - uint32_t p0 = source[4]; - uint32_t p1 = source[5]; - uint32_t p2 = source[6]; - uint32_t p3 = source[7]; + uint32_t p0 = source[sincr]; + uint32_t p1 = source[sincr + 1]; + uint32_t p2 = source[sincr + 2]; + uint32_t p3 = source[sincr + 3]; __m128i fg = _mm_set_epi32(palette[p3], palette[p2], palette[p1], palette[p0]); SSE_SHADE(fg, shade_constants); _mm_storeu_si128((__m128i*)(dest + pitch), fg); } - source += 8; + source += sincr * 2; dest += pitch * 2; } while (--count); } @@ -522,15 +525,16 @@ public: uint32_t *dest; int count; int pitch; + int sincr; - count = yh - yl; - if (count < 0) + count = thread->count_for_thread(yl, yh - yl + 1); + if (count <= 0) return; - count++; - dest = ylookup[yl] + sx + (uint32_t*)dc_destorg; - source = &thread->dc_temp_rgba[yl * 4 + hx]; - pitch = dc_pitch; + dest = thread->dest_for_thread(yl, dc_pitch, ylookup[yl] + sx + (uint32_t*)dc_destorg); + source = &thread->dc_temp_rgba[yl * 4 + hx] + thread->skipped_by_thread(yl) * 4; + pitch = dc_pitch * thread->num_cores; + sincr = 4 * thread->num_cores; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; @@ -554,7 +558,7 @@ public: *dest = 0xff000000 | (red << 16) | (green << 8) | blue; - source += 4; + source += sincr; dest += pitch; } while (--count); } @@ -590,15 +594,16 @@ public: uint32_t *dest; int count; int pitch; + int sincr; - count = yh - yl; - if (count < 0) + count = thread->count_for_thread(yl, yh - yl + 1); + if (count <= 0) return; - count++; - dest = ylookup[yl] + sx + (uint32_t*)dc_destorg; - source = &thread->dc_temp_rgba[yl * 4]; - pitch = dc_pitch; + dest = thread->dest_for_thread(yl, dc_pitch, ylookup[yl] + sx + (uint32_t*)dc_destorg); + source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4; + pitch = dc_pitch * thread->num_cores; + sincr = 4 * thread->num_cores; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; @@ -625,7 +630,7 @@ public: dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue; } - source += 4; + source += sincr; dest += pitch; } while (--count); } @@ -636,15 +641,16 @@ public: uint32_t *dest; int count; int pitch; + int sincr; - count = yh - yl; - if (count < 0) + count = thread->count_for_thread(yl, yh - yl + 1); + if (count <= 0) return; - count++; - dest = ylookup[yl] + sx + (uint32_t*)dc_destorg; - source = &thread->dc_temp_rgba[yl * 4]; - pitch = dc_pitch; + dest = thread->dest_for_thread(yl, dc_pitch, ylookup[yl] + sx + (uint32_t*)dc_destorg); + source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4; + pitch = dc_pitch * thread->num_cores; + sincr = 4 * thread->num_cores; uint32_t light = calc_light_multiplier(dc_light); uint32_t *palette = (uint32_t*)GPalette.BaseColors; @@ -686,7 +692,7 @@ public: __m128i color = _mm_packus_epi16(color_lo, color_hi); _mm_storeu_si128((__m128i*)dest, color); - source += 4; + source += sincr; dest += pitch; } while (--count); } @@ -722,7 +728,7 @@ public: __m128i color = _mm_packus_epi16(color_lo, color_hi); _mm_storeu_si128((__m128i*)dest, color); - source += 4; + source += sincr; dest += pitch; } while (--count); } @@ -764,16 +770,17 @@ public: uint32_t *dest; int count; int pitch; + int sincr; - count = yh - yl; - if (count < 0) + count = thread->count_for_thread(yl, yh - yl + 1); + if (count <= 0) return; - count++; colormap = dc_colormap; - dest = ylookup[yl] + sx + (uint32_t*)dc_destorg; - source = &thread->dc_temp_rgba[yl * 4 + hx]; - pitch = dc_pitch; + dest = thread->dest_for_thread(yl, dc_pitch, ylookup[yl] + sx + (uint32_t*)dc_destorg); + source = &thread->dc_temp_rgba[yl * 4 + hx] + thread->skipped_by_thread(yl) * 4; + pitch = dc_pitch * thread->num_cores; + sincr = 4 * thread->num_cores; uint32_t fg = shade_pal_index_simple(dc_color, calc_light_multiplier(dc_light)); uint32_t fg_red = (fg >> 16) & 0xff; @@ -793,7 +800,7 @@ public: uint32_t blue = (fg_blue * alpha + bg_blue * inv_alpha) / 64; *dest = 0xff000000 | (red << 16) | (green << 8) | blue; - source += 4; + source += sincr; dest += pitch; } while (--count); } @@ -832,16 +839,17 @@ public: uint32_t *dest; int count; int pitch; + int sincr; - count = yh - yl; - if (count < 0) + count = thread->count_for_thread(yl, yh - yl + 1); + if (count <= 0) return; - count++; colormap = dc_colormap; - dest = ylookup[yl] + sx + (uint32_t*)dc_destorg; - source = &thread->dc_temp_rgba[yl * 4]; - pitch = dc_pitch; + dest = thread->dest_for_thread(yl, dc_pitch, ylookup[yl] + sx + (uint32_t*)dc_destorg); + source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4; + pitch = dc_pitch * thread->num_cores; + sincr = 4 * thread->num_cores; uint32_t fg = shade_pal_index_simple(dc_color, calc_light_multiplier(dc_light)); uint32_t fg_red = (fg >> 16) & 0xff; @@ -864,7 +872,7 @@ public: dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue; } - source += 4; + source += sincr; dest += pitch; } while (--count); } @@ -876,16 +884,17 @@ public: uint32_t *dest; int count; int pitch; + int sincr; - count = yh - yl; - if (count < 0) + count = thread->count_for_thread(yl, yh - yl + 1); + if (count <= 0) return; - count++; colormap = dc_colormap; - dest = ylookup[yl] + sx + (uint32_t*)dc_destorg; - source = &thread->dc_temp_rgba[yl * 4]; - pitch = dc_pitch; + dest = thread->dest_for_thread(yl, dc_pitch, ylookup[yl] + sx + (uint32_t*)dc_destorg); + source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4; + pitch = dc_pitch * thread->num_cores; + sincr = 4 * thread->num_cores; __m128i fg = _mm_unpackhi_epi8(_mm_set1_epi32(shade_pal_index_simple(dc_color, calc_light_multiplier(dc_light))), _mm_setzero_si128()); __m128i alpha_one = _mm_set1_epi16(64); @@ -913,7 +922,7 @@ public: __m128i color = _mm_packus_epi16(color_lo, color_hi); _mm_storeu_si128((__m128i*)dest, color); - source += 4; + source += sincr; dest += pitch; } while (--count); } @@ -955,15 +964,16 @@ public: uint32_t *dest; int count; int pitch; + int sincr; - count = yh - yl; - if (count < 0) + count = thread->count_for_thread(yl, yh - yl + 1); + if (count <= 0) return; - count++; - dest = ylookup[yl] + sx + (uint32_t*)dc_destorg; - source = &thread->dc_temp_rgba[yl * 4 + hx]; - pitch = dc_pitch; + dest = thread->dest_for_thread(yl, dc_pitch, ylookup[yl] + sx + (uint32_t*)dc_destorg); + source = &thread->dc_temp_rgba[yl * 4 + hx] + thread->skipped_by_thread(yl) * 4; + pitch = dc_pitch * thread->num_cores; + sincr = 4 * thread->num_cores; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; @@ -986,7 +996,7 @@ public: uint32_t blue = clamp((fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 0, 255); *dest = 0xff000000 | (red << 16) | (green << 8) | blue; - source += 4; + source += sincr; dest += pitch; } while (--count); } @@ -1026,15 +1036,16 @@ public: uint32_t *dest; int count; int pitch; + int sincr; - count = yh - yl; - if (count < 0) + count = thread->count_for_thread(yl, yh - yl + 1); + if (count <= 0) return; - count++; - dest = ylookup[yl] + sx + (uint32_t*)dc_destorg; - source = &thread->dc_temp_rgba[yl * 4]; - pitch = dc_pitch; + dest = thread->dest_for_thread(yl, dc_pitch, ylookup[yl] + sx + (uint32_t*)dc_destorg); + source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4; + pitch = dc_pitch * thread->num_cores; + sincr = 4 * thread->num_cores; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; @@ -1060,7 +1071,7 @@ public: dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue; } - source += 4; + source += sincr; dest += pitch; } while (--count); } @@ -1071,15 +1082,16 @@ public: uint32_t *dest; int count; int pitch; + int sincr; - count = yh - yl; - if (count < 0) + count = thread->count_for_thread(yl, yh - yl + 1); + if (count <= 0) return; - count++; - dest = ylookup[yl] + sx + (uint32_t*)dc_destorg; - source = &thread->dc_temp_rgba[yl * 4]; - pitch = dc_pitch; + dest = thread->dest_for_thread(yl, dc_pitch, ylookup[yl] + sx + (uint32_t*)dc_destorg); + source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4; + pitch = dc_pitch * thread->num_cores; + sincr = 4 * thread->num_cores; uint32_t light = calc_light_multiplier(dc_light); uint32_t *palette = (uint32_t*)GPalette.BaseColors; @@ -1121,7 +1133,7 @@ public: __m128i color = _mm_packus_epi16(color_lo, color_hi); _mm_storeu_si128((__m128i*)dest, color); - source += 4; + source += sincr; dest += pitch; } while (--count); } @@ -1157,7 +1169,7 @@ public: __m128i color = _mm_packus_epi16(color_lo, color_hi); _mm_storeu_si128((__m128i*)dest, color); - source += 4; + source += sincr; dest += pitch; } while (--count); } @@ -1200,15 +1212,16 @@ public: uint32_t *dest; int count; int pitch; + int sincr; - count = yh - yl; - if (count < 0) + count = thread->count_for_thread(yl, yh - yl + 1); + if (count <= 0) return; - count++; - dest = ylookup[yl] + sx + (uint32_t*)dc_destorg; - source = &thread->dc_temp_rgba[yl * 4 + hx]; - pitch = dc_pitch; + dest = thread->dest_for_thread(yl, dc_pitch, ylookup[yl] + sx + (uint32_t*)dc_destorg); + source = &thread->dc_temp_rgba[yl * 4 + hx] + thread->skipped_by_thread(yl) * 4; + pitch = dc_pitch * thread->num_cores; + sincr = 4 * thread->num_cores; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; @@ -1231,7 +1244,7 @@ public: uint32_t blue = clamp((0x10000 - fg_blue * fg_alpha + bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256; *dest = 0xff000000 | (red << 16) | (green << 8) | blue; - source += 4; + source += sincr; dest += pitch; } while (--count); } @@ -1270,15 +1283,16 @@ public: uint32_t *dest; int count; int pitch; + int sincr; - count = yh - yl; - if (count < 0) + count = thread->count_for_thread(yl, yh - yl + 1); + if (count <= 0) return; - count++; - dest = ylookup[yl] + sx + (uint32_t*)dc_destorg; - source = &thread->dc_temp_rgba[yl * 4]; - pitch = dc_pitch; + dest = thread->dest_for_thread(yl, dc_pitch, ylookup[yl] + sx + (uint32_t*)dc_destorg); + source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4; + pitch = dc_pitch * thread->num_cores; + sincr = 4 * thread->num_cores; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; @@ -1305,7 +1319,7 @@ public: dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue; } - source += 4; + source += sincr; dest += pitch; } while (--count); } @@ -1346,15 +1360,16 @@ public: uint32_t *dest; int count; int pitch; + int sincr; - count = yh - yl; - if (count < 0) + count = thread->count_for_thread(yl, yh - yl + 1); + if (count <= 0) return; - count++; - dest = ylookup[yl] + sx + (uint32_t*)dc_destorg; - source = &thread->dc_temp_rgba[yl * 4 + hx]; - pitch = dc_pitch; + dest = thread->dest_for_thread(yl, dc_pitch, ylookup[yl] + sx + (uint32_t*)dc_destorg); + source = &thread->dc_temp_rgba[yl * 4 + hx] + thread->skipped_by_thread(yl) * 4; + pitch = dc_pitch * thread->num_cores; + sincr = 4 * thread->num_cores; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; @@ -1377,7 +1392,7 @@ public: uint32_t blue = clamp((0x10000 + fg_blue * fg_alpha - bg_blue * bg_alpha) / 256, 256, 256 + 255) - 256; *dest = 0xff000000 | (red << 16) | (green << 8) | blue; - source += 4; + source += sincr; dest += pitch; } while (--count); } @@ -1416,15 +1431,16 @@ public: uint32_t *dest; int count; int pitch; + int sincr; - count = yh - yl; - if (count < 0) + count = thread->count_for_thread(yl, yh - yl + 1); + if (count <= 0) return; - count++; - dest = ylookup[yl] + sx + (uint32_t*)dc_destorg; - source = &thread->dc_temp_rgba[yl * 4]; - pitch = dc_pitch; + dest = thread->dest_for_thread(yl, dc_pitch, ylookup[yl] + sx + (uint32_t*)dc_destorg); + source = &thread->dc_temp_rgba[yl * 4] + thread->skipped_by_thread(yl) * 4; + pitch = dc_pitch * thread->num_cores; + sincr = 4 * thread->num_cores; uint32_t light = calc_light_multiplier(dc_light); ShadeConstants shade_constants = dc_shade_constants; @@ -1451,7 +1467,7 @@ public: dest[i] = 0xff000000 | (red << 16) | (green << 8) | blue; } - source += 4; + source += sincr; dest += pitch; } while (--count); }