mirror of
https://github.com/ZDoom/gzdoom-gles.git
synced 2024-11-28 23:11:58 +00:00
- add NUMA awareness to drawer threads
This commit is contained in:
parent
4859c3d301
commit
3e9f531b5f
8 changed files with 162 additions and 18 deletions
|
@ -127,8 +127,13 @@ void PolyTriangleThreadData::ClearStencil(uint8_t value)
|
||||||
int height = buffer->Height();
|
int height = buffer->Height();
|
||||||
uint8_t *data = buffer->Values();
|
uint8_t *data = buffer->Values();
|
||||||
|
|
||||||
data += core * width;
|
int start_y = numa_node * height / num_numa_nodes;
|
||||||
for (int y = core; y < height; y += num_cores)
|
int end_y = (numa_node + 1) * height / num_numa_nodes;
|
||||||
|
int core_skip = (num_cores - (start_y - core) % num_cores) % num_cores;
|
||||||
|
start_y += core_skip;
|
||||||
|
|
||||||
|
data += start_y * width;
|
||||||
|
for (int y = start_y; y < end_y; y += num_cores)
|
||||||
{
|
{
|
||||||
memset(data, value, width);
|
memset(data, value, width);
|
||||||
data += num_cores * width;
|
data += num_cores * width;
|
||||||
|
@ -146,6 +151,8 @@ void PolyTriangleThreadData::SetViewport(int x, int y, int width, int height, ui
|
||||||
dest_height = new_dest_height;
|
dest_height = new_dest_height;
|
||||||
dest_pitch = new_dest_pitch;
|
dest_pitch = new_dest_pitch;
|
||||||
dest_bgra = new_dest_bgra;
|
dest_bgra = new_dest_bgra;
|
||||||
|
numa_start_y = numa_node * dest_height / num_numa_nodes;
|
||||||
|
numa_end_y = (numa_node + 1) * dest_height / num_numa_nodes;
|
||||||
ccw = true;
|
ccw = true;
|
||||||
weaponScene = false;
|
weaponScene = false;
|
||||||
}
|
}
|
||||||
|
@ -642,7 +649,7 @@ int PolyTriangleThreadData::ClipEdge(const ShadedTriVertex *verts, ShadedTriVert
|
||||||
PolyTriangleThreadData *PolyTriangleThreadData::Get(DrawerThread *thread)
|
PolyTriangleThreadData *PolyTriangleThreadData::Get(DrawerThread *thread)
|
||||||
{
|
{
|
||||||
if (!thread->poly)
|
if (!thread->poly)
|
||||||
thread->poly = std::make_shared<PolyTriangleThreadData>(thread->core, thread->num_cores);
|
thread->poly = std::make_shared<PolyTriangleThreadData>(thread->core, thread->num_cores, thread->numa_node, thread->num_numa_nodes);
|
||||||
return thread->poly.get();
|
return thread->poly.get();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -48,7 +48,7 @@ public:
|
||||||
class PolyTriangleThreadData
|
class PolyTriangleThreadData
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
PolyTriangleThreadData(int32_t core, int32_t num_cores) : core(core), num_cores(num_cores) { }
|
PolyTriangleThreadData(int32_t core, int32_t num_cores, int32_t numa_node, int32_t num_numa_nodes) : core(core), num_cores(num_cores), numa_node(numa_node), num_numa_nodes(num_numa_nodes) { }
|
||||||
|
|
||||||
void ClearStencil(uint8_t value);
|
void ClearStencil(uint8_t value);
|
||||||
void SetViewport(int x, int y, int width, int height, uint8_t *dest, int dest_width, int dest_height, int dest_pitch, bool dest_bgra);
|
void SetViewport(int x, int y, int width, int height, uint8_t *dest, int dest_width, int dest_height, int dest_pitch, bool dest_bgra);
|
||||||
|
@ -63,12 +63,18 @@ public:
|
||||||
|
|
||||||
int32_t core;
|
int32_t core;
|
||||||
int32_t num_cores;
|
int32_t num_cores;
|
||||||
|
int32_t numa_node;
|
||||||
|
int32_t num_numa_nodes;
|
||||||
|
|
||||||
|
int numa_start_y;
|
||||||
|
int numa_end_y;
|
||||||
|
|
||||||
// The number of lines to skip to reach the first line to be rendered by this thread
|
// The number of lines to skip to reach the first line to be rendered by this thread
|
||||||
int skipped_by_thread(int first_line)
|
int skipped_by_thread(int first_line)
|
||||||
{
|
{
|
||||||
int core_skip = (num_cores - (first_line - core) % num_cores) % num_cores;
|
int clip_first_line = MAX(first_line, numa_start_y);
|
||||||
return core_skip;
|
int core_skip = (num_cores - (clip_first_line - core) % num_cores) % num_cores;
|
||||||
|
return clip_first_line + core_skip - first_line;
|
||||||
}
|
}
|
||||||
|
|
||||||
static PolyTriangleThreadData *Get(DrawerThread *thread);
|
static PolyTriangleThreadData *Get(DrawerThread *thread);
|
||||||
|
|
|
@ -59,9 +59,9 @@ void ScreenTriangle::Draw(const TriDrawTriangleArgs *args, PolyTriangleThreadDat
|
||||||
SortVertices(args, sortedVertices);
|
SortVertices(args, sortedVertices);
|
||||||
|
|
||||||
int clipright = args->clipright;
|
int clipright = args->clipright;
|
||||||
int clipbottom = args->clipbottom;
|
int cliptop = thread->numa_start_y;
|
||||||
|
int clipbottom = MIN(args->clipbottom, thread->numa_end_y);
|
||||||
|
|
||||||
// Ranges that different triangles edges are active
|
|
||||||
int topY = (int)(sortedVertices[0]->y + 0.5f);
|
int topY = (int)(sortedVertices[0]->y + 0.5f);
|
||||||
int midY = (int)(sortedVertices[1]->y + 0.5f);
|
int midY = (int)(sortedVertices[1]->y + 0.5f);
|
||||||
int bottomY = (int)(sortedVertices[2]->y + 0.5f);
|
int bottomY = (int)(sortedVertices[2]->y + 0.5f);
|
||||||
|
@ -1567,6 +1567,7 @@ void DrawRect8(const void *destOrg, int destWidth, int destHeight, int destPitch
|
||||||
uint32_t stepV = (int32_t)(fstepV * 0x1000000);
|
uint32_t stepV = (int32_t)(fstepV * 0x1000000);
|
||||||
|
|
||||||
uint32_t posV = startV;
|
uint32_t posV = startV;
|
||||||
|
y1 = MIN(y1, thread->numa_end_y);
|
||||||
int num_cores = thread->num_cores;
|
int num_cores = thread->num_cores;
|
||||||
int skip = thread->skipped_by_thread(y0);
|
int skip = thread->skipped_by_thread(y0);
|
||||||
posV += skip * stepV;
|
posV += skip * stepV;
|
||||||
|
@ -1817,6 +1818,7 @@ void DrawRectOpt32(const void *destOrg, int destWidth, int destHeight, int destP
|
||||||
uint32_t stepV = (int32_t)(fstepV * 0x1000000);
|
uint32_t stepV = (int32_t)(fstepV * 0x1000000);
|
||||||
|
|
||||||
uint32_t posV = startV;
|
uint32_t posV = startV;
|
||||||
|
y1 = MIN(y1, thread->numa_end_y);
|
||||||
int num_cores = thread->num_cores;
|
int num_cores = thread->num_cores;
|
||||||
int skip = thread->skipped_by_thread(y0);
|
int skip = thread->skipped_by_thread(y0);
|
||||||
posV += skip * stepV;
|
posV += skip * stepV;
|
||||||
|
|
|
@ -35,6 +35,8 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include "doomtype.h"
|
#include "doomtype.h"
|
||||||
|
#include <thread>
|
||||||
|
#include <algorithm>
|
||||||
|
|
||||||
struct ticcmd_t;
|
struct ticcmd_t;
|
||||||
struct WadStuff;
|
struct WadStuff;
|
||||||
|
@ -170,4 +172,8 @@ static inline char *strlwr(char *str)
|
||||||
return str;
|
return str;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Number of NUMA nodes. This implementation does not query the system
// topology and always reports exactly one node.
inline int I_GetNumaNodeCount()
{
	return 1;
}
|
||||||
|
// Number of hardware threads on the given NUMA node. The node index is
// ignored here: the value is std::thread::hardware_concurrency(), raised to 1
// when the runtime reports a smaller value (it returns 0 if it cannot tell).
inline int I_GetNumaNodeThreadCount(int numaNode)
{
	const int detected = static_cast<int>(std::thread::hardware_concurrency());
	if (detected < 1)
		return 1;
	return detected;
}
|
||||||
|
// Binds a thread to a NUMA node. Intentionally empty in this implementation:
// neither the thread nor the node index is touched.
inline void I_SetThreadNumaNode(std::thread &thread, int numaNode)
{
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -174,7 +174,11 @@ void DrawerThreads::StartThreads()
|
||||||
{
|
{
|
||||||
std::unique_lock<std::mutex> lock(threads_mutex);
|
std::unique_lock<std::mutex> lock(threads_mutex);
|
||||||
|
|
||||||
int num_threads = std::thread::hardware_concurrency();
|
int num_numathreads = 0;
|
||||||
|
for (int i = 0; i < I_GetNumaNodeCount(); i++)
|
||||||
|
num_numathreads += I_GetNumaNodeThreadCount(i);
|
||||||
|
|
||||||
|
int num_threads = num_numathreads;
|
||||||
if (num_threads == 0)
|
if (num_threads == 0)
|
||||||
num_threads = 4;
|
num_threads = 4;
|
||||||
|
|
||||||
|
@ -189,13 +193,41 @@ void DrawerThreads::StartThreads()
|
||||||
|
|
||||||
threads.resize(num_threads);
|
threads.resize(num_threads);
|
||||||
|
|
||||||
for (int i = 0; i < num_threads; i++)
|
if (num_threads == num_numathreads)
|
||||||
{
|
{
|
||||||
DrawerThreads *queue = this;
|
int curThread = 0;
|
||||||
DrawerThread *thread = &threads[i];
|
for (int numaNode = 0; numaNode < I_GetNumaNodeCount(); numaNode++)
|
||||||
thread->core = i;
|
{
|
||||||
thread->num_cores = num_threads;
|
for (int i = 0; i < I_GetNumaNodeThreadCount(numaNode); i++)
|
||||||
thread->thread = std::thread([=]() { queue->WorkerMain(thread); });
|
{
|
||||||
|
DrawerThreads *queue = this;
|
||||||
|
DrawerThread *thread = &threads[curThread++];
|
||||||
|
thread->core = i;
|
||||||
|
thread->num_cores = I_GetNumaNodeThreadCount(numaNode);
|
||||||
|
thread->numa_node = numaNode;
|
||||||
|
thread->num_numa_nodes = I_GetNumaNodeCount();
|
||||||
|
thread->numa_start_y = numaNode * viewheight / I_GetNumaNodeCount();
|
||||||
|
thread->numa_end_y = (numaNode + 1) * viewheight / I_GetNumaNodeCount();
|
||||||
|
thread->thread = std::thread([=]() { queue->WorkerMain(thread); });
|
||||||
|
I_SetThreadNumaNode(thread->thread, numaNode);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
for (int i = 0; i < num_threads; i++)
|
||||||
|
{
|
||||||
|
DrawerThreads *queue = this;
|
||||||
|
DrawerThread *thread = &threads[i];
|
||||||
|
thread->core = i;
|
||||||
|
thread->num_cores = num_threads;
|
||||||
|
thread->numa_node = 0;
|
||||||
|
thread->num_numa_nodes = 1;
|
||||||
|
thread->numa_start_y = 0;
|
||||||
|
thread->numa_end_y = viewheight;
|
||||||
|
thread->thread = std::thread([=]() { queue->WorkerMain(thread); });
|
||||||
|
I_SetThreadNumaNode(thread->thread, 0);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -47,6 +47,16 @@ public:
|
||||||
// Number of active threads
|
// Number of active threads
|
||||||
int num_cores = 1;
|
int num_cores = 1;
|
||||||
|
|
||||||
|
// NUMA node this thread belongs to
|
||||||
|
int numa_node = 0;
|
||||||
|
|
||||||
|
// Number of active NUMA nodes
|
||||||
|
int num_numa_nodes = 1;
|
||||||
|
|
||||||
|
// Active range for the numa block the cores are part of
|
||||||
|
int numa_start_y = 0;
|
||||||
|
int numa_end_y = 0;
|
||||||
|
|
||||||
// Working buffer used by the tilted (sloped) span drawer
|
// Working buffer used by the tilted (sloped) span drawer
|
||||||
const uint8_t *tiltlighting[MAXWIDTH];
|
const uint8_t *tiltlighting[MAXWIDTH];
|
||||||
|
|
||||||
|
@ -57,19 +67,21 @@ public:
|
||||||
// Checks if a line is rendered by this thread
|
// Checks if a line is rendered by this thread
|
||||||
bool line_skipped_by_thread(int line)
|
bool line_skipped_by_thread(int line)
|
||||||
{
|
{
|
||||||
return line % num_cores != core;
|
return line < numa_start_y || line >= numa_end_y || line % num_cores != core;
|
||||||
}
|
}
|
||||||
|
|
||||||
// The number of lines to skip to reach the first line to be rendered by this thread
|
// The number of lines to skip to reach the first line to be rendered by this thread
|
||||||
int skipped_by_thread(int first_line)
|
int skipped_by_thread(int first_line)
|
||||||
{
|
{
|
||||||
int core_skip = (num_cores - (first_line - core) % num_cores) % num_cores;
|
int clip_first_line = MAX(first_line, numa_start_y);
|
||||||
return core_skip;
|
int core_skip = (num_cores - (clip_first_line - core) % num_cores) % num_cores;
|
||||||
|
return clip_first_line + core_skip - first_line;
|
||||||
}
|
}
|
||||||
|
|
||||||
// The number of lines to be rendered by this thread
|
// The number of lines to be rendered by this thread
|
||||||
int count_for_thread(int first_line, int count)
|
int count_for_thread(int first_line, int count)
|
||||||
{
|
{
|
||||||
|
count = MIN(count, numa_end_y - first_line);
|
||||||
int c = (count - skipped_by_thread(first_line) + num_cores - 1) / num_cores;
|
int c = (count - skipped_by_thread(first_line) + num_cores - 1) / num_cores;
|
||||||
return MAX(c, 0);
|
return MAX(c, 0);
|
||||||
}
|
}
|
||||||
|
|
|
@ -50,6 +50,7 @@
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <process.h>
|
#include <process.h>
|
||||||
#include <time.h>
|
#include <time.h>
|
||||||
|
#include <map>
|
||||||
|
|
||||||
#include <stdarg.h>
|
#include <stdarg.h>
|
||||||
|
|
||||||
|
@ -1470,3 +1471,76 @@ int _stat64i32(const char *path, struct _stat64i32 *buffer)
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// Description of one NUMA node as gathered by SetupNumaNodes().
struct NumaNode
{
	// Bit i is set when logical processor i was attributed to this node.
	// Stays 0 for the single fallback node.
	uint64_t affinityMask{0};

	// Number of processors counted for this node.
	int threadCount{0};
};
|
||||||
|
// Table of detected NUMA nodes. SetupNumaNodes() fills it the first time it
// runs and treats a size of zero as "not built yet".
static TArray<NumaNode> numaNodes;
|
||||||
|
|
||||||
|
static void SetupNumaNodes()
|
||||||
|
{
|
||||||
|
if (numaNodes.Size() == 0)
|
||||||
|
{
|
||||||
|
// Query processors in the system
|
||||||
|
DWORD_PTR processMask = 0, systemMask = 0;
|
||||||
|
BOOL result = GetProcessAffinityMask(GetCurrentProcess(), &processMask, &systemMask);
|
||||||
|
if (result)
|
||||||
|
{
|
||||||
|
// Find the numa node each processor belongs to
|
||||||
|
std::map<int, NumaNode> nodes;
|
||||||
|
for (int i = 0; i < sizeof(DWORD_PTR) * 8; i++)
|
||||||
|
{
|
||||||
|
DWORD_PTR processorMask = (((DWORD_PTR)1) << i);
|
||||||
|
if (processMask & processorMask)
|
||||||
|
{
|
||||||
|
UCHAR nodeNumber = 0;
|
||||||
|
result = GetNumaProcessorNode(i, &nodeNumber);
|
||||||
|
if (nodeNumber != 0xff)
|
||||||
|
{
|
||||||
|
nodes[nodeNumber].affinityMask |= (uint64_t)processorMask;
|
||||||
|
nodes[nodeNumber].threadCount++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convert map to a list
|
||||||
|
for (const auto &it : nodes)
|
||||||
|
{
|
||||||
|
numaNodes.Push(it.second);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fall back to a single node if something went wrong
|
||||||
|
if (numaNodes.Size() == 0)
|
||||||
|
{
|
||||||
|
NumaNode node;
|
||||||
|
node.threadCount = std::thread::hardware_concurrency();
|
||||||
|
if (node.threadCount == 0)
|
||||||
|
node.threadCount = 1;
|
||||||
|
numaNodes.Push(node);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int I_GetNumaNodeCount()
|
||||||
|
{
|
||||||
|
SetupNumaNodes();
|
||||||
|
return numaNodes.Size();
|
||||||
|
}
|
||||||
|
|
||||||
|
int I_GetNumaNodeThreadCount(int numaNode)
|
||||||
|
{
|
||||||
|
SetupNumaNodes();
|
||||||
|
return numaNodes[numaNode].threadCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
void I_SetThreadNumaNode(std::thread &thread, int numaNode)
|
||||||
|
{
|
||||||
|
if (numaNodes.Size() > 1)
|
||||||
|
{
|
||||||
|
HANDLE handle = (HANDLE)thread.native_handle();
|
||||||
|
SetThreadAffinityMask(handle, (DWORD_PTR)numaNodes[numaNode].affinityMask);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -29,6 +29,7 @@
|
||||||
#define __I_SYSTEM__
|
#define __I_SYSTEM__
|
||||||
|
|
||||||
#include "doomtype.h"
|
#include "doomtype.h"
|
||||||
|
#include <thread>
|
||||||
|
|
||||||
struct ticcmd_t;
|
struct ticcmd_t;
|
||||||
struct WadStuff;
|
struct WadStuff;
|
||||||
|
@ -186,4 +187,8 @@ inline int I_FindAttr(findstate_t *fileinfo)
|
||||||
#define FA_DIREC 0x00000010
|
#define FA_DIREC 0x00000010
|
||||||
#define FA_ARCH 0x00000020
|
#define FA_ARCH 0x00000020
|
||||||
|
|
||||||
|
// Number of NUMA nodes known to the system layer.
int I_GetNumaNodeCount();
|
||||||
|
// Number of hardware threads on node `numaNode` (0-based node index).
int I_GetNumaNodeThreadCount(int numaNode);
|
||||||
|
// Asks the system layer to run `thread` on the processors of node `numaNode`.
void I_SetThreadNumaNode(std::thread &thread, int numaNode);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
Loading…
Reference in a new issue