Make the drawer threads NUMA-node aware.

What the hunks below do:
  * DrawerThread and PolyTriangleThreadData gain numa_node / num_numa_nodes
    and a [numa_start_y, numa_end_y) scanline range.  The range is computed
    as node * height / nodes .. (node + 1) * height / nodes.
  * Inside that range lines are still interleaved by core index:
    line_skipped_by_thread(), skipped_by_thread() and count_for_thread()
    clip to the range first and then apply the "% num_cores" test.
  * ClearStencil, ScreenTriangle::Draw, DrawRect8 and DrawRectOpt32 are
    clipped to the same range.
  * DrawerThreads::StartThreads sums I_GetNumaNodeThreadCount() over all
    nodes, and assigns per-node core indices when the final thread count
    still equals that sum; otherwise every thread is put on node 0.
  * win32 implements I_GetNumaNodeCount / I_GetNumaNodeThreadCount /
    I_SetThreadNumaNode with GetProcessAffinityMask, GetNumaProcessorNode
    and SetThreadAffinityMask.  posix provides single-node inline stubs.

NOTE(review) - this text was rebuilt from a copy of the patch whose newlines
had been collapsed and whose <...> spans had been stripped:
  * Leading tabs and blank lines are reconstructed from the code structure
    and from the line counts in each hunk header; confirm against the tree.
  * Restored from usage: std::make_shared<PolyTriangleThreadData>,
    TArray<NumaNode>, std::max<int>, and the added includes <algorithm>,
    <thread> and <map>.
  * Assumed, please verify: std::map<int, NumaNode> (key type not visible)
    and std::unique_lock<std::mutex> (type of threads_mutex not visible).
  * The context "#include" lines in the win32/i_system.cpp hunk at
    -50,6 +50,7 lost their header names and are left bare; regenerate that
    hunk's context before applying.

NOTE(review) - behaviour worth a second look, not changed here:
  * r_thread.cpp derives numa_start_y / numa_end_y from viewheight only
    when the threads are started; nothing visible in this patch refreshes
    them if viewheight changes afterwards.
  * SetupNumaNodes compares "int i" against "sizeof(DWORD_PTR) * 8"
    (signed/unsigned comparison).

diff --git a/src/polyrenderer/drawers/poly_triangle.cpp b/src/polyrenderer/drawers/poly_triangle.cpp
index 53face08c..f75b5e43d 100644
--- a/src/polyrenderer/drawers/poly_triangle.cpp
+++ b/src/polyrenderer/drawers/poly_triangle.cpp
@@ -127,8 +127,13 @@ void PolyTriangleThreadData::ClearStencil(uint8_t value)
 	int height = buffer->Height();
 	uint8_t *data = buffer->Values();
 
-	data += core * width;
-	for (int y = core; y < height; y += num_cores)
+	int start_y = numa_node * height / num_numa_nodes;
+	int end_y = (numa_node + 1) * height / num_numa_nodes;
+	int core_skip = (num_cores - (start_y - core) % num_cores) % num_cores;
+	start_y += core_skip;
+
+	data += start_y * width;
+	for (int y = start_y; y < end_y; y += num_cores)
 	{
 		memset(data, value, width);
 		data += num_cores * width;
@@ -146,6 +151,8 @@ void PolyTriangleThreadData::SetViewport(int x, int y, int width, int height, ui
 	dest_height = new_dest_height;
 	dest_pitch = new_dest_pitch;
 	dest_bgra = new_dest_bgra;
+	numa_start_y = numa_node * dest_height / num_numa_nodes;
+	numa_end_y = (numa_node + 1) * dest_height / num_numa_nodes;
 	ccw = true;
 	weaponScene = false;
 }
@@ -642,7 +649,7 @@ int PolyTriangleThreadData::ClipEdge(const ShadedTriVertex *verts, ShadedTriVert
 PolyTriangleThreadData *PolyTriangleThreadData::Get(DrawerThread *thread)
 {
 	if (!thread->poly)
-		thread->poly = std::make_shared<PolyTriangleThreadData>(thread->core, thread->num_cores);
+		thread->poly = std::make_shared<PolyTriangleThreadData>(thread->core, thread->num_cores, thread->numa_node, thread->num_numa_nodes);
 	return thread->poly.get();
 }
 
diff --git a/src/polyrenderer/drawers/poly_triangle.h b/src/polyrenderer/drawers/poly_triangle.h
index d2a49ec97..6ed9ee8a4 100644
--- a/src/polyrenderer/drawers/poly_triangle.h
+++ b/src/polyrenderer/drawers/poly_triangle.h
@@ -48,7 +48,7 @@ public:
 class PolyTriangleThreadData
 {
 public:
-	PolyTriangleThreadData(int32_t core, int32_t num_cores) : core(core), num_cores(num_cores) { }
+	PolyTriangleThreadData(int32_t core, int32_t num_cores, int32_t numa_node, int32_t num_numa_nodes) : core(core), num_cores(num_cores), numa_node(numa_node), num_numa_nodes(num_numa_nodes) { }
 
 	void ClearStencil(uint8_t value);
 	void SetViewport(int x, int y, int width, int height, uint8_t *dest, int dest_width, int dest_height, int dest_pitch, bool dest_bgra);
@@ -63,12 +63,18 @@ public:
 
 	int32_t core;
 	int32_t num_cores;
+	int32_t numa_node;
+	int32_t num_numa_nodes;
+
+	int numa_start_y;
+	int numa_end_y;
 
 	// The number of lines to skip to reach the first line to be rendered by this thread
 	int skipped_by_thread(int first_line)
 	{
-		int core_skip = (num_cores - (first_line - core) % num_cores) % num_cores;
-		return core_skip;
+		int clip_first_line = MAX(first_line, numa_start_y);
+		int core_skip = (num_cores - (clip_first_line - core) % num_cores) % num_cores;
+		return clip_first_line + core_skip - first_line;
 	}
 
 	static PolyTriangleThreadData *Get(DrawerThread *thread);
diff --git a/src/polyrenderer/drawers/screen_triangle.cpp b/src/polyrenderer/drawers/screen_triangle.cpp
index f8b1a51f4..30f023166 100644
--- a/src/polyrenderer/drawers/screen_triangle.cpp
+++ b/src/polyrenderer/drawers/screen_triangle.cpp
@@ -59,9 +59,9 @@ void ScreenTriangle::Draw(const TriDrawTriangleArgs *args, PolyTriangleThreadDat
 	SortVertices(args, sortedVertices);
 
 	int clipright = args->clipright;
-	int clipbottom = args->clipbottom;
+	int cliptop = thread->numa_start_y;
+	int clipbottom = MIN(args->clipbottom, thread->numa_end_y);
 
-	// Ranges that different triangles edges are active
 	int topY = (int)(sortedVertices[0]->y + 0.5f);
 	int midY = (int)(sortedVertices[1]->y + 0.5f);
 	int bottomY = (int)(sortedVertices[2]->y + 0.5f);
@@ -1567,6 +1567,7 @@ void DrawRect8(const void *destOrg, int destWidth, int destHeight, int destPitch
 	uint32_t stepV = (int32_t)(fstepV * 0x1000000);
 	uint32_t posV = startV;
 
+	y1 = MIN(y1, thread->numa_end_y);
 	int num_cores = thread->num_cores;
 	int skip = thread->skipped_by_thread(y0);
 	posV += skip * stepV;
@@ -1817,6 +1818,7 @@ void DrawRectOpt32(const void *destOrg, int destWidth, int destHeight, int destP
 	uint32_t stepV = (int32_t)(fstepV * 0x1000000);
 	uint32_t posV = startV;
 
+	y1 = MIN(y1, thread->numa_end_y);
 	int num_cores = thread->num_cores;
 	int skip = thread->skipped_by_thread(y0);
 	posV += skip * stepV;
diff --git a/src/posix/i_system.h b/src/posix/i_system.h
index 7f468f143..1ff06a79e 100644
--- a/src/posix/i_system.h
+++ b/src/posix/i_system.h
@@ -35,6 +35,8 @@
 #endif
 
 #include "doomtype.h"
+#include <algorithm>
+#include <thread>
 
 struct ticcmd_t;
 struct WadStuff;
@@ -170,4 +172,8 @@ static inline char *strlwr(char *str)
 	return str;
 }
 
+inline int I_GetNumaNodeCount() { return 1; }
+inline int I_GetNumaNodeThreadCount(int numaNode) { return std::max<int>(std::thread::hardware_concurrency(), 1); }
+inline void I_SetThreadNumaNode(std::thread &thread, int numaNode) { }
+
 #endif
diff --git a/src/swrenderer/drawers/r_thread.cpp b/src/swrenderer/drawers/r_thread.cpp
index 0edf0d7f7..74956f14e 100644
--- a/src/swrenderer/drawers/r_thread.cpp
+++ b/src/swrenderer/drawers/r_thread.cpp
@@ -174,7 +174,11 @@ void DrawerThreads::StartThreads()
 {
 	std::unique_lock<std::mutex> lock(threads_mutex);
 
-	int num_threads = std::thread::hardware_concurrency();
+	int num_numathreads = 0;
+	for (int i = 0; i < I_GetNumaNodeCount(); i++)
+		num_numathreads += I_GetNumaNodeThreadCount(i);
+
+	int num_threads = num_numathreads;
 	if (num_threads == 0)
 		num_threads = 4;
 
@@ -189,13 +193,41 @@ void DrawerThreads::StartThreads()
 
 		threads.resize(num_threads);
 
-		for (int i = 0; i < num_threads; i++)
+		if (num_threads == num_numathreads)
 		{
-			DrawerThreads *queue = this;
-			DrawerThread *thread = &threads[i];
-			thread->core = i;
-			thread->num_cores = num_threads;
-			thread->thread = std::thread([=]() { queue->WorkerMain(thread); });
+			int curThread = 0;
+			for (int numaNode = 0; numaNode < I_GetNumaNodeCount(); numaNode++)
+			{
+				for (int i = 0; i < I_GetNumaNodeThreadCount(numaNode); i++)
+				{
+					DrawerThreads *queue = this;
+					DrawerThread *thread = &threads[curThread++];
+					thread->core = i;
+					thread->num_cores = I_GetNumaNodeThreadCount(numaNode);
+					thread->numa_node = numaNode;
+					thread->num_numa_nodes = I_GetNumaNodeCount();
+					thread->numa_start_y = numaNode * viewheight / I_GetNumaNodeCount();
+					thread->numa_end_y = (numaNode + 1) * viewheight / I_GetNumaNodeCount();
+					thread->thread = std::thread([=]() { queue->WorkerMain(thread); });
+					I_SetThreadNumaNode(thread->thread, numaNode);
+				}
+			}
+		}
+		else
+		{
+			for (int i = 0; i < num_threads; i++)
+			{
+				DrawerThreads *queue = this;
+				DrawerThread *thread = &threads[i];
+				thread->core = i;
+				thread->num_cores = num_threads;
+				thread->numa_node = 0;
+				thread->num_numa_nodes = 1;
+				thread->numa_start_y = 0;
+				thread->numa_end_y = viewheight;
+				thread->thread = std::thread([=]() { queue->WorkerMain(thread); });
+				I_SetThreadNumaNode(thread->thread, 0);
+			}
 		}
 	}
 }
diff --git a/src/swrenderer/drawers/r_thread.h b/src/swrenderer/drawers/r_thread.h
index c2e8b9c80..f2d1d4d0e 100644
--- a/src/swrenderer/drawers/r_thread.h
+++ b/src/swrenderer/drawers/r_thread.h
@@ -47,6 +47,16 @@ public:
 	// Number of active threads
 	int num_cores = 1;
 
+	// NUMA node this thread belongs to
+	int numa_node = 0;
+
+	// Number of active NUMA nodes
+	int num_numa_nodes = 1;
+
+	// Active range for the numa block the cores are part of
+	int numa_start_y = 0;
+	int numa_end_y = 0;
+
 	// Working buffer used by the tilted (sloped) span drawer
 	const uint8_t *tiltlighting[MAXWIDTH];
 
@@ -57,19 +67,21 @@ public:
 	// Checks if a line is rendered by this thread
 	bool line_skipped_by_thread(int line)
 	{
-		return line % num_cores != core;
+		return line < numa_start_y || line >= numa_end_y || line % num_cores != core;
 	}
 
 	// The number of lines to skip to reach the first line to be rendered by this thread
 	int skipped_by_thread(int first_line)
 	{
-		int core_skip = (num_cores - (first_line - core) % num_cores) % num_cores;
-		return core_skip;
+		int clip_first_line = MAX(first_line, numa_start_y);
+		int core_skip = (num_cores - (clip_first_line - core) % num_cores) % num_cores;
+		return clip_first_line + core_skip - first_line;
 	}
 
 	// The number of lines to be rendered by this thread
 	int count_for_thread(int first_line, int count)
 	{
+		count = MIN(count, numa_end_y - first_line);
 		int c = (count - skipped_by_thread(first_line) + num_cores - 1) / num_cores;
 		return MAX(c, 0);
 	}
diff --git a/src/win32/i_system.cpp b/src/win32/i_system.cpp
index 44d2d4395..1e5c78640 100644
--- a/src/win32/i_system.cpp
+++ b/src/win32/i_system.cpp
@@ -50,6 +50,7 @@
 #include
 #include
 #include
+#include <map>
 
 #include
 
@@ -1470,3 +1471,76 @@ int _stat64i32(const char *path, struct _stat64i32 *buffer)
 	return 0;
 }
 #endif
+
+struct NumaNode
+{
+	uint64_t affinityMask = 0;
+	int threadCount = 0;
+};
+static TArray<NumaNode> numaNodes;
+
+static void SetupNumaNodes()
+{
+	if (numaNodes.Size() == 0)
+	{
+		// Query processors in the system
+		DWORD_PTR processMask = 0, systemMask = 0;
+		BOOL result = GetProcessAffinityMask(GetCurrentProcess(), &processMask, &systemMask);
+		if (result)
+		{
+			// Find the numa node each processor belongs to
+			std::map<int, NumaNode> nodes;
+			for (int i = 0; i < sizeof(DWORD_PTR) * 8; i++)
+			{
+				DWORD_PTR processorMask = (((DWORD_PTR)1) << i);
+				if (processMask & processorMask)
+				{
+					UCHAR nodeNumber = 0;
+					result = GetNumaProcessorNode(i, &nodeNumber);
+					if (nodeNumber != 0xff)
+					{
+						nodes[nodeNumber].affinityMask |= (uint64_t)processorMask;
+						nodes[nodeNumber].threadCount++;
+					}
+				}
+			}
+
+			// Convert map to a list
+			for (const auto &it : nodes)
+			{
+				numaNodes.Push(it.second);
+			}
+		}
+
+		// Fall back to a single node if something went wrong
+		if (numaNodes.Size() == 0)
+		{
+			NumaNode node;
+			node.threadCount = std::thread::hardware_concurrency();
+			if (node.threadCount == 0)
+				node.threadCount = 1;
+			numaNodes.Push(node);
+		}
+	}
+}
+
+int I_GetNumaNodeCount()
+{
+	SetupNumaNodes();
+	return numaNodes.Size();
+}
+
+int I_GetNumaNodeThreadCount(int numaNode)
+{
+	SetupNumaNodes();
+	return numaNodes[numaNode].threadCount;
+}
+
+void I_SetThreadNumaNode(std::thread &thread, int numaNode)
+{
+	if (numaNodes.Size() > 1)
+	{
+		HANDLE handle = (HANDLE)thread.native_handle();
+		SetThreadAffinityMask(handle, (DWORD_PTR)numaNodes[numaNode].affinityMask);
+	}
+}
diff --git a/src/win32/i_system.h b/src/win32/i_system.h
index a5f7b5d0b..903203daf 100644
--- a/src/win32/i_system.h
+++ b/src/win32/i_system.h
@@ -29,6 +29,7 @@
 #define __I_SYSTEM__
 
 #include "doomtype.h"
+#include <thread>
 
 struct ticcmd_t;
 struct WadStuff;
@@ -186,4 +187,8 @@ inline int I_FindAttr(findstate_t *fileinfo)
 #define FA_DIREC		0x00000010
 #define FA_ARCH			0x00000020
 
+int I_GetNumaNodeCount();
+int I_GetNumaNodeThreadCount(int numaNode);
+void I_SetThreadNumaNode(std::thread &thread, int numaNode);
+
 #endif