diff --git a/src/r_draw.h b/src/r_draw.h index 3f97a7a65f..409b7c01be 100644 --- a/src/r_draw.h +++ b/src/r_draw.h @@ -458,25 +458,34 @@ public: // Number of active threads int num_cores = 1; + // Range of rows processed this pass + int pass_start_y = 0; + int pass_end_y = 300; + uint32_t dc_temp_rgbabuff_rgba[MAXHEIGHT * 4]; uint32_t *dc_temp_rgba; // Checks if a line is rendered by this thread bool line_skipped_by_thread(int line) { - return line % num_cores != core; + return line < pass_start_y || line >= pass_end_y || line % num_cores != core; } // The number of lines to skip to reach the first line to be rendered by this thread int skipped_by_thread(int first_line) { - return (num_cores - (first_line - core) % num_cores) % num_cores; + int pass_skip = MAX(pass_start_y - first_line, 0); + int core_skip = (num_cores - (first_line + pass_skip - core) % num_cores) % num_cores; + return pass_skip + core_skip; } // The number of lines to be rendered by this thread int count_for_thread(int first_line, int count) { - return (count - skipped_by_thread(first_line) + num_cores - 1) / num_cores; + int lines_until_pass_end = MAX(pass_end_y - first_line, 0); + count = MIN(count, lines_until_pass_end); + int c = (count - skipped_by_thread(first_line) + num_cores - 1) / num_cores; + return MAX(c, 0); } // Calculate the dest address for the first line to be rendered by this thread @@ -522,6 +531,8 @@ class DrawerCommandQueue bool no_threading = false; DrawerThread single_core_thread; + int num_passes = 2; + int rows_in_pass = 540; void StartThreads(); void StopThreads(); diff --git a/src/r_draw_rgba.cpp b/src/r_draw_rgba.cpp index fa632cb5d3..b81ee4cca4 100644 --- a/src/r_draw_rgba.cpp +++ b/src/r_draw_rgba.cpp @@ -104,11 +104,19 @@ void DrawerCommandQueue::Finish() thread.core = 0; thread.num_cores = queue->threads.size() + 1; - size_t size = queue->active_commands.size(); - for (size_t i = 0; i < size; i++) + for (int pass = 0; pass < queue->num_passes; pass++) { - auto &command = queue->active_commands[i]; - command->Execute(&thread); + thread.pass_start_y = pass * queue->rows_in_pass; + thread.pass_end_y = (pass + 1) * queue->rows_in_pass; + if (pass + 1 == queue->num_passes) + thread.pass_end_y = MAX(thread.pass_end_y, MAXHEIGHT); + + size_t size = queue->active_commands.size(); + for (size_t i = 0; i < size; i++) + { + auto &command = queue->active_commands[i]; + command->Execute(&thread); + } } // Wait for everyone to finish: @@ -156,11 +164,19 @@ void DrawerCommandQueue::StartThreads() start_lock.unlock(); // Do the work: - size_t size = queue->active_commands.size(); - for (size_t i = 0; i < size; i++) + for (int pass = 0; pass < queue->num_passes; pass++) { - auto &command = queue->active_commands[i]; - command->Execute(thread); + thread->pass_start_y = pass * queue->rows_in_pass; + thread->pass_end_y = (pass + 1) * queue->rows_in_pass; + if (pass + 1 == queue->num_passes) + thread->pass_end_y = MAX(thread->pass_end_y, MAXHEIGHT); + + size_t size = queue->active_commands.size(); + for (size_t i = 0; i < size; i++) + { + auto &command = queue->active_commands[i]; + command->Execute(thread); + } } // Notify main thread that we finished: @@ -1611,6 +1627,79 @@ public: BYTE xshift = yshift - ds_xbits; int xmask = ((1 << ds_xbits) - 1) << ds_ybits; + int sse_count = count / 4; + count -= sse_count * 4; + + if (shade_constants.simple_shade) + { + SSE_SHADE_SIMPLE_INIT(light); + + while (sse_count--) + { + spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + uint32_t p0 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + uint32_t p1 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + uint32_t p2 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + uint32_t p3 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + // Lookup pixel from flat texture tile + __m128i fg = _mm_set_epi32(p3, p2, p1, p0); + SSE_SHADE_SIMPLE(fg); + _mm_storeu_si128((__m128i*)dest, fg); + dest += 4; + } + } + else + { + SSE_SHADE_INIT(light, shade_constants); + + while (sse_count--) + { + spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + uint32_t p0 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + uint32_t p1 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + uint32_t p2 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + spot = ((xfrac >> xshift) & xmask) + (yfrac >> yshift); + uint32_t p3 = source[spot]; + xfrac += xstep; + yfrac += ystep; + + // Lookup pixel from flat texture tile + __m128i fg = _mm_set_epi32(p3, p2, p1, p0); + SSE_SHADE(fg, shade_constants); + _mm_storeu_si128((__m128i*)dest, fg); + dest += 4; + } + } + + if (count == 0) + return; + do { // Current texture index in u,v. diff --git a/src/r_drawt_rgba.cpp b/src/r_drawt_rgba.cpp index c2caec0c2c..28c86d3f57 100644 --- a/src/r_drawt_rgba.cpp +++ b/src/r_drawt_rgba.cpp @@ -1528,40 +1528,38 @@ public: fracstep = dc_iscale; frac = dc_texturefrac; - { - const BYTE *source = dc_source; + const BYTE *source = dc_source; - if (count & 1) { - *dest = source[frac >> FRACBITS]; dest += 4; frac += fracstep; - } - if (count & 2) { - dest[0] = source[frac >> FRACBITS]; frac += fracstep; - dest[4] = source[frac >> FRACBITS]; frac += fracstep; - dest += 8; - } - if (count & 4) { - dest[0] = source[frac >> FRACBITS]; frac += fracstep; - dest[4] = source[frac >> FRACBITS]; frac += fracstep; - dest[8] = source[frac >> FRACBITS]; frac += fracstep; - dest[12] = source[frac >> FRACBITS]; frac += fracstep; - dest += 16; - } - count >>= 3; - if (!count) return; - - do - { - dest[0] = source[frac >> FRACBITS]; frac += fracstep; - dest[4] = source[frac >> FRACBITS]; frac += fracstep; - dest[8] = source[frac >> FRACBITS]; frac += fracstep; - dest[12] = source[frac >> FRACBITS]; frac += fracstep; - dest[16] = source[frac >> FRACBITS]; frac += fracstep; - dest[20] = source[frac >> FRACBITS]; frac += fracstep; - dest[24] = source[frac >> FRACBITS]; frac += fracstep; - dest[28] = source[frac >> FRACBITS]; frac += fracstep; - dest += 32; - } while (--count); + if (count & 1) { + *dest = source[frac >> FRACBITS]; dest += 4; frac += fracstep; } + if (count & 2) { + dest[0] = source[frac >> FRACBITS]; frac += fracstep; + dest[4] = source[frac >> FRACBITS]; frac += fracstep; + dest += 8; + } + if (count & 4) { + dest[0] = source[frac >> FRACBITS]; frac += fracstep; + dest[4] = source[frac >> FRACBITS]; frac += fracstep; + dest[8] = source[frac >> FRACBITS]; frac += fracstep; + dest[12] = source[frac >> FRACBITS]; frac += fracstep; + dest += 16; + } + count >>= 3; + if (!count) return; + + do + { + dest[0] = source[frac >> FRACBITS]; frac += fracstep; + dest[4] = source[frac >> FRACBITS]; frac += fracstep; + dest[8] = source[frac >> FRACBITS]; frac += fracstep; + dest[12] = source[frac >> FRACBITS]; frac += fracstep; + dest[16] = source[frac >> FRACBITS]; frac += fracstep; + dest[20] = source[frac >> FRACBITS]; frac += fracstep; + dest[24] = source[frac >> FRACBITS]; frac += fracstep; + dest[28] = source[frac >> FRACBITS]; frac += fracstep; + dest += 32; + } while (--count); } };