Copied CullingThreadPool to renderer for making an id Tech 5 version

This commit is contained in:
Robert Beckebans 2024-09-02 21:00:01 +02:00
parent abff15168b
commit 1e2b1be338
8 changed files with 839 additions and 6 deletions

View file

@ -97,31 +97,43 @@ public:
// Submit the jobs in this list.
void Submit( idParallelJobList* waitForJobList = NULL, int parallelism = JOBLIST_PARALLELISM_DEFAULT );
// Wait for the jobs in this list to finish. Will spin in place if any jobs are not done.
void Wait();
// Try to wait for the jobs in this list to finish but either way return immediately. Returns true if all jobs are done.
bool TryWait();
// returns true if the job list has been submitted.
bool IsSubmitted() const;
// Get the number of jobs executed in this job list.
unsigned int GetNumExecutedJobs() const;
// Get the number of sync points.
unsigned int GetNumSyncs() const;
// Time at which the job list was submitted.
uint64 GetSubmitTimeMicroSec() const;
// Time at which execution of this job list started.
uint64 GetStartTimeMicroSec() const;
// Time at which all jobs in the list were executed.
uint64 GetFinishTimeMicroSec() const;
// Time the host thread waited for this job list to finish.
uint64 GetWaitTimeMicroSec() const;
// Get the total time all units spent processing this job list.
uint64 GetTotalProcessingTimeMicroSec() const;
// Get the total time all units wasted while processing this job list.
uint64 GetTotalWastedTimeMicroSec() const;
// Time the given unit spent processing this job list.
uint64 GetUnitProcessingTimeMicroSec( int unit ) const;
// Time the given unit wasted while processing this job list.
uint64 GetUnitWastedTimeMicroSec( int unit ) const;

View file

@ -0,0 +1,503 @@
////////////////////////////////////////////////////////////////////////////////
// Copyright 2017 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
////////////////////////////////////////////////////////////////////////////////
#include <assert.h>
#include <stddef.h>
#include "CullingThreadpool.h"
// Deletes the object X points to and resets X to nullptr. Deleting a null pointer is a
// no-op, so no explicit check is needed. The do/while(0) wrapper makes the macro behave
// like a single statement, so "if( c ) SAFE_DELETE( p ); else ..." compiles as expected.
#define SAFE_DELETE(X) do { delete (X); (X) = nullptr; } while( 0 )
// Array counterpart of SAFE_DELETE: releases memory obtained with new[] and nulls X.
#define SAFE_DELETE_ARRAY(X) do { delete[] (X); (X) = nullptr; } while( 0 )
// Builds a ring buffer holding maxJobs state entries. The cursor starts at ~0 so that
// the pre-increment in the first AddData() call wraps it around to slot 0.
template<class T> CullingThreadpool::StateData<T>::StateData( unsigned int maxJobs ) :
	mMaxJobs( maxJobs ),
	mCurrentIdx( ~0 ),
	mData( new T[maxJobs] )
{
}
// Releases the ring buffer storage and clears the pointer.
template<class T> CullingThreadpool::StateData<T>::~StateData()
{
	if( mData != nullptr )
	{
		delete[] mData;
	}
	mData = nullptr;
}
// Advances the cursor and copies data into the slot it now designates. Slots are reused
// modulo mMaxJobs, so the oldest entry is overwritten once the buffer has wrapped.
template<class T> void CullingThreadpool::StateData<T>::AddData( const T& data )
{
	const unsigned int slot = ++mCurrentIdx % mMaxJobs;
	mData[slot] = data;
}
// Returns a pointer to the entry at the current cursor position (the most recently added one).
template<class T> const T* CullingThreadpool::StateData<T>::GetData() const
{
	const unsigned int slot = mCurrentIdx % mMaxJobs;
	return mData + slot;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Helper class: Mostly lockless queue for render jobs
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Allocates the job ring buffer for nBins screen bins and maxJobs in-flight jobs.
// Every job owns one TriList per bin; the triangle storage for all of them is carved out
// of a single float array sized for the worst case. All size / offset arithmetic is done
// in size_t so that large bin or job counts cannot wrap around 32 bits and silently
// under-allocate the buffer.
CullingThreadpool::RenderJobQueue::RenderJobQueue( unsigned int nBins, unsigned int maxJobs ) :
	mNumBins( nBins ),
	mMaxJobs( maxJobs )
{
	mRenderPtrs = new std::atomic_uint[mNumBins];
	mBinMutexes = new std::atomic_uint[mNumBins];
	for( unsigned int i = 0; i < mNumBins; ++i )
	{
		// 0 == bin is unlocked
		mBinMutexes[i] = 0;
	}

	mJobs = new Job[mMaxJobs];
	for( unsigned int i = 0; i < mMaxJobs; ++i )
	{
		mJobs[i].mRenderJobs = new TriList[mNumBins];
	}

	// Compute worst case job size (we allocate memory for the worst case)
	const size_t TriSize = 3 * 3;
	const unsigned int MaxTrisPerJob = TRIS_PER_JOB * 6;
	const size_t MaxJobSize = static_cast<size_t>( MaxTrisPerJob ) * TriSize;
	mTrilistData = new float[MaxJobSize * mMaxJobs * mNumBins];

	// Setup trilist objects used for binning: each (job, bin) pair gets its own slice
	for( unsigned int i = 0; i < mMaxJobs; ++i )
	{
		for( unsigned int j = 0; j < mNumBins; ++j )
		{
			const size_t idx = static_cast<size_t>( i ) * mNumBins + j;
			TriList& tList = mJobs[i].mRenderJobs[j];
			tList.mNumTriangles = MaxTrisPerJob;
			tList.mTriIdx = 0;
			tList.mPtr = mTrilistData + idx * MaxJobSize;
		}
	}

	// Clear render queue (also initializes the write / binning / render pointers)
	Reset();
}
// Frees every array allocated by the constructor.
CullingThreadpool::RenderJobQueue::~RenderJobQueue()
{
	SAFE_DELETE_ARRAY( mRenderPtrs );
	SAFE_DELETE_ARRAY( mBinMutexes );

	// The per-job trilist arrays have to go before the job array that stores their pointers
	for( unsigned int jobIdx = 0; jobIdx < mMaxJobs; ++jobIdx )
	{
		Job& job = mJobs[jobIdx];
		SAFE_DELETE_ARRAY( job.mRenderJobs );
	}

	SAFE_DELETE_ARRAY( mJobs );
	SAFE_DELETE_ARRAY( mTrilistData );
}
inline unsigned int CullingThreadpool::RenderJobQueue::GetMinRenderPtr() const
{
unsigned int minRenderPtr = mRenderPtrs[0];
for( unsigned int i = 1; i < mNumBins; ++i )
{
unsigned int renderPtr = mRenderPtrs[i];
minRenderPtr = renderPtr < minRenderPtr ? renderPtr : minRenderPtr;
}
return minRenderPtr;
}
// Moves the bin's render pointer on to the next job and then clears the bin lock that
// GetRenderJob() set.
inline void CullingThreadpool::RenderJobQueue::AdvanceRenderJob( int binIdx )
{
	mRenderPtrs[binIdx].fetch_add( 1 );
	mBinMutexes[binIdx].store( 0 );
}
// Picks the unlocked bin whose render pointer is furthest behind the write pointer.
// Returns ~0 when no bin qualifies (every bin is either locked or has caught up).
inline unsigned int CullingThreadpool::RenderJobQueue::GetBestGlobalQueue() const
{
	unsigned int candidateBin = ~0;
	unsigned int candidatePtr = mWritePtr;
	for( unsigned int bin = 0; bin < mNumBins; ++bin )
	{
		// Skip bins that are not further behind than the current candidate
		if( !( mRenderPtrs[bin] < candidatePtr ) )
		{
			continue;
		}
		// Skip bins another thread is currently working on
		if( mBinMutexes[bin] != 0 )
		{
			continue;
		}
		candidateBin = bin;
		candidatePtr = mRenderPtrs[bin];
	}
	return candidateBin;
}
// True when every bin's render pointer has reached the write pointer.
inline bool CullingThreadpool::RenderJobQueue::IsPipelineEmpty() const
{
	const unsigned int slowestBin = GetMinRenderPtr();
	return slowestBin == mWritePtr;
}
// True when fewer than mMaxJobs jobs lie between the slowest bin and the write pointer,
// i.e. the ring buffer still has a free slot for a new job.
inline bool CullingThreadpool::RenderJobQueue::CanWrite() const
{
	const unsigned int jobsInFlight = mWritePtr - GetMinRenderPtr();
	return jobsInFlight < mMaxJobs;
}
// True when a written job is still waiting to be binned and the binning pointer is less
// than mMaxJobs ahead of the slowest bin.
inline bool CullingThreadpool::RenderJobQueue::CanBin() const
{
	if( !( mBinningPtr < mWritePtr ) )
	{
		return false;
	}
	return mBinningPtr - GetMinRenderPtr() < mMaxJobs;
}
// Returns the ring buffer slot the write pointer currently designates.
inline CullingThreadpool::RenderJobQueue::Job* CullingThreadpool::RenderJobQueue::GetWriteJob()
{
	const unsigned int slot = mWritePtr % mMaxJobs;
	return mJobs + slot;
}
// Publishes the job at the write pointer by moving the write pointer one step forward.
inline void CullingThreadpool::RenderJobQueue::AdvanceWriteJob()
{
	mWritePtr = mWritePtr + 1;
}
// Tries to claim the next job waiting to be binned. Returns nullptr when no job is
// available or when another thread won the compare-exchange on the binning pointer.
inline CullingThreadpool::RenderJobQueue::Job* CullingThreadpool::RenderJobQueue::GetBinningJob()
{
	unsigned int claimedIdx = mBinningPtr;

	const bool jobAvailable = claimedIdx < mWritePtr && claimedIdx - GetMinRenderPtr() < mMaxJobs;
	if( !jobAvailable )
	{
		return nullptr;
	}

	// Only the thread that moves the binning pointer from claimedIdx to claimedIdx + 1 owns the job
	if( !mBinningPtr.compare_exchange_strong( claimedIdx, claimedIdx + 1 ) )
	{
		return nullptr;
	}

	Job* claimedJob = &mJobs[claimedIdx % mMaxJobs];
	claimedJob->mBinningJobStartedIdx = claimedIdx;
	return claimedJob;
}
// Marks the job as binned by copying its started index into its completed index;
// GetRenderJob() compares the completed index against a bin's render pointer.
inline void CullingThreadpool::RenderJobQueue::FinishedBinningJob( Job* job )
{
	const unsigned int startedIdx = job->mBinningJobStartedIdx;
	job->mBinningJobCompletedIdx = startedIdx;
}
// Tries to lock the given bin and fetch the job its render pointer designates.
// Returns nullptr if the bin is already locked, or if that job has not finished binning
// (in which case the lock is released again). On success the bin stays locked until
// AdvanceRenderJob() is called for it.
inline CullingThreadpool::RenderJobQueue::Job* CullingThreadpool::RenderJobQueue::GetRenderJob( int binIdx )
{
	std::atomic_uint& binLock = mBinMutexes[binIdx];

	// Attempt to take the bin lock (0 -> 1)
	unsigned int unlocked = 0;
	if( binLock.compare_exchange_strong( unlocked, 1 ) )
	{
		// The job is ready once its completed index matches this bin's render pointer
		if( mRenderPtrs[binIdx] == mJobs[mRenderPtrs[binIdx] % mMaxJobs].mBinningJobCompletedIdx )
		{
			return &mJobs[mRenderPtrs[binIdx] % mMaxJobs];
		}

		// Nothing to render yet: give the lock back
		binLock = 0;
	}
	return nullptr;
}
void CullingThreadpool::RenderJobQueue::Reset()
{
mWritePtr = 0;
mBinningPtr = 0;
for( unsigned int i = 0; i < mNumBins; ++i )
{
mRenderPtrs[i] = 0;
}
for( unsigned int i = 0; i < mMaxJobs; ++i )
{
mJobs[i].mBinningJobCompletedIdx = -1;
mJobs[i].mBinningJobStartedIdx = -1;
}
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Culling threadpool private helper functions
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
void CullingThreadpool::SetupScissors()
{
unsigned int width, height;
mMOC->GetResolution( width, height );
unsigned int binWidth;
unsigned int binHeight;
mMOC->ComputeBinWidthHeight( mBinsW, mBinsH, binWidth, binHeight );
for( unsigned int ty = 0; ty < mBinsH; ++ty )
{
for( unsigned int tx = 0; tx < mBinsW; ++tx )
{
unsigned int threadIdx = tx + ty * mBinsW;
// Adjust rects on final row / col to match resolution
mRects[threadIdx].mMinX = tx * binWidth;
mRects[threadIdx].mMaxX = tx + 1 == mBinsW ? width : ( tx + 1 ) * binWidth;
mRects[threadIdx].mMinY = ty * binHeight;
mRects[threadIdx].mMaxY = ty + 1 == mBinsH ? height : ( ty + 1 ) * binHeight;
}
}
}
// Static std::thread entry point: forwards to the pool's ThreadMain() for this worker.
void CullingThreadpool::ThreadRun( CullingThreadpool* threadPool, unsigned int threadId )
{
	CullingThreadpool& pool = *threadPool;
	pool.ThreadMain( threadId );
}
void CullingThreadpool::ThreadMain( unsigned int threadIdx )
{
while( true )
{
bool threadIsIdle = true;
unsigned int threadBinIdx = threadIdx;
// Wait for threads to be woken up (low CPU load sleep)
std::unique_lock<std::mutex> lock( mSuspendedMutex );
mNumSuspendedThreads++;
mSuspendedCV.wait( lock, [&] {return !mSuspendThreads; } );
mNumSuspendedThreads--;
lock.unlock();
// Loop until suspended again
while( !mSuspendThreads || !threadIsIdle )
{
if( mKillThreads )
{
return;
}
threadIsIdle = false;
// Prio 1: Process any render jobs local to this thread
unsigned int binIdx = threadBinIdx;
threadBinIdx = threadBinIdx + mNumThreads < mNumBins ? threadBinIdx + mNumThreads : threadIdx;
RenderJobQueue::Job* job = mRenderQueue->GetRenderJob( binIdx );
if( job != nullptr )
{
if( job->mRenderJobs[binIdx].mTriIdx > 0 )
{
mMOC->RenderTrilist( job->mRenderJobs[binIdx], &mRects[binIdx] );
}
mRenderQueue->AdvanceRenderJob( binIdx );
continue;
}
// Prio 2: Process any outstanding setup/binning jobs
if( mRenderQueue->CanBin() )
{
// If no more rasterization jobs, get next binning job
RenderJobQueue::Job* job = mRenderQueue->GetBinningJob();
if( job != nullptr )
{
RenderJobQueue::BinningJob& sjob = job->mBinningJob;
for( unsigned int i = 0; i < mNumBins; ++i )
{
job->mRenderJobs[i].mTriIdx = 0;
}
mMOC->BinTriangles( sjob.mVerts, sjob.mTris, sjob.nTris, job->mRenderJobs, mBinsW, mBinsH, sjob.mMatrix, sjob.mBfWinding, sjob.mClipPlanes, *sjob.mVtxLayout );
mRenderQueue->FinishedBinningJob( job );
}
continue;
}
// Prio 3: No work is available, work steal from another thread's queue
if( mNumBins > mNumThreads )
{
binIdx = mRenderQueue->GetBestGlobalQueue();
if( binIdx < mRenderQueue->mNumBins )
{
RenderJobQueue::Job* job = mRenderQueue->GetRenderJob( binIdx );
if( job != nullptr )
{
if( job->mRenderJobs[binIdx].mTriIdx > 0 )
{
mMOC->RenderTrilist( job->mRenderJobs[binIdx], &mRects[binIdx] );
}
mRenderQueue->AdvanceRenderJob( binIdx );
}
continue;
}
}
// No work available: Yield this thread
std::this_thread::yield();
threadIsIdle = true;
}
}
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Culling threadpool public API, similar to the MaskedOcclusionCulling class
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Creates the pool: allocates the scissor rects and the job queue for binsW x binsH bins,
// installs the default vertex layout and starts numThreads worker threads. The workers
// begin in the suspended state (mSuspendThreads is true) until WakeThreads() is called.
CullingThreadpool::CullingThreadpool( unsigned int numThreads, unsigned int binsW, unsigned int binsH, unsigned int maxJobs ) :
	mNumThreads( numThreads ),
	mMaxJobs( maxJobs ),
	mBinsW( binsW ),
	mBinsH( binsH ),
	mKillThreads( false ),
	mSuspendThreads( true ),
	mNumSuspendedThreads( 0 ),
	mModelToClipMatrices( maxJobs ),
	mVertexLayouts( maxJobs ),
	mMOC( nullptr )
{
	mNumBins = mBinsW * mBinsH;
	assert( mNumBins >= mNumThreads ); // Having less bins than threads is a bad idea!

	mRects = new ScissorRect[mNumBins];
	mRenderQueue = new RenderJobQueue( mNumBins, mMaxJobs );

	// Add default vertex layout and matrix (nullptr matrix == no transform set)
	mVertexLayouts.AddData( VertexLayout( 16, 4, 12 ) );
	mCurrentMatrix = nullptr;

	// Start the workers last, once everything they touch has been set up
	mThreads = new std::thread[mNumThreads];
	for( unsigned int workerId = 0; workerId < mNumThreads; ++workerId )
	{
		mThreads[workerId] = std::thread( ThreadRun, this, workerId );
	}
}
// Terminates and joins all worker threads, then frees the pool's memory.
//
// The shutdown does not go through WakeThreads(): that function spins until every worker
// is suspended, which never happens if the pool is destroyed while the workers are awake.
// Instead the kill flag is raised first and the suspend flag is then cleared under the
// mutex. A parked worker wakes up, and an awake (or not yet parked) worker passes the wait
// predicate immediately; either way it sees mKillThreads at the top of its work loop and
// returns.
CullingThreadpool::~CullingThreadpool()
{
	if( mThreads != nullptr )
	{
		mKillThreads = true;

		// Release any worker parked on the condition variable
		std::unique_lock<std::mutex> lock( mSuspendedMutex );
		mSuspendThreads = false;
		lock.unlock();
		mSuspendedCV.notify_all();

		// Wait for threads to terminate
		for( unsigned int i = 0; i < mNumThreads; ++i )
		{
			if( mThreads[i].joinable() )
			{
				mThreads[i].join();
			}
		}
	}

	// Free memory
	SAFE_DELETE( mRenderQueue );
	SAFE_DELETE_ARRAY( mRects );
	SAFE_DELETE_ARRAY( mThreads );
}
// Wakes the worker threads from their suspended sleep. Does nothing if the workers are
// already awake.
void CullingThreadpool::WakeThreads()
{
	// Already awake: the wait below would spin forever, because awake workers never
	// count themselves as suspended
	if( !mSuspendThreads )
	{
		return;
	}

	// Wait for all threads to be in suspended mode
	while( mNumSuspendedThreads < mNumThreads )
	{
		std::this_thread::yield();
	}

	// Send wake up event; the flag is changed under the mutex the workers wait on
	std::unique_lock<std::mutex> lock( mSuspendedMutex );
	mSuspendThreads = false;
	lock.unlock();
	mSuspendedCV.notify_all();
}
// Requests suspension. Each worker keeps looping until it finds no more work and then
// parks itself on the condition variable.
void CullingThreadpool::SuspendThreads()
{
	mSuspendThreads = true;
}
// Blocks (yielding) until the workers have drained the render queue, then rewinds the
// queue counters.
void CullingThreadpool::Flush()
{
	// Wait for pipeline to be empty (i.e. all work is finished)
	for( ;; )
	{
		if( mRenderQueue->IsPipelineEmpty() )
		{
			break;
		}
		std::this_thread::yield();
	}

	// Reset queue counters
	mRenderQueue->Reset();
}
// Switches to a different occlusion buffer. Outstanding work is flushed first, and the
// bin scissor rects are rebuilt for the new buffer afterwards.
void CullingThreadpool::SetBuffer( MaskedOcclusionCulling* moc )
{
	// Finish all rendering into the previous buffer before replacing it
	Flush();

	mMOC = moc;
	SetupScissors();
}
// Resizes the occlusion buffer. Outstanding work is flushed first, and the bin scissor
// rects are rebuilt for the new resolution afterwards.
void CullingThreadpool::SetResolution( unsigned int width, unsigned int height )
{
	// No worker may be rendering while the buffer is resized
	Flush();

	mMOC->SetResolution( width, height );
	SetupScissors();
}
// Flushes outstanding work and forwards the new near clip distance to the occlusion buffer.
void CullingThreadpool::SetNearClipPlane( float nearDist )
{
	// Queued jobs are completed before the near plane changes
	Flush();

	mMOC->SetNearClipPlane( nearDist );
}
void CullingThreadpool::SetMatrix( const float* modelToClipMatrix )
{
// Treat nullptr matrix as a special case, otherwise copy the contents of the pointer and add to state
if( modelToClipMatrix == nullptr )
{
mCurrentMatrix = nullptr;
}
else
{
mModelToClipMatrices.AddData( Matrix4x4( modelToClipMatrix ) );
mCurrentMatrix = mModelToClipMatrices.GetData()->mValues;
}
}
// Copies the vertex layout into the state ring buffer; subsequent RenderTriangles() /
// TestTriangles() calls use this copy.
void CullingThreadpool::SetVertexLayout( const VertexLayout& vtxLayout )
{
	mVertexLayouts.AddData( vtxLayout );
}
// Flushes outstanding work and then clears the occlusion buffer.
void CullingThreadpool::ClearBuffer()
{
	// Pending occluders must be rendered before the buffer is wiped
	Flush();

	mMOC->ClearBuffer();
}
void CullingThreadpool::RenderTriangles( const float* inVtx, const unsigned int* inTris, int nTris, BackfaceWinding bfWinding, ClipPlanes clipPlaneMask )
{
#if MOC_RECORDER_ENABLE != 0
mMOC->RecordRenderTriangles( inVtx, inTris, nTris, mCurrentMatrix, clipPlaneMask, bfWinding, *mVertexLayouts.GetData( ) );
#endif
for( int i = 0; i < nTris; i += TRIS_PER_JOB )
{
// Yield if work queue is full
while( !mRenderQueue->CanWrite() )
{
std::this_thread::yield();
}
// Create new renderjob
RenderJobQueue::Job* job = mRenderQueue->GetWriteJob();
job->mBinningJob.mVerts = inVtx;
job->mBinningJob.mTris = inTris + i * 3;
job->mBinningJob.nTris = nTris - i < TRIS_PER_JOB ? nTris - i : TRIS_PER_JOB;
job->mBinningJob.mMatrix = mCurrentMatrix;
job->mBinningJob.mClipPlanes = clipPlaneMask;
job->mBinningJob.mBfWinding = bfWinding;
job->mBinningJob.mVtxLayout = mVertexLayouts.GetData();
mRenderQueue->AdvanceWriteJob();
}
}
// Runs a rectangle occlusion query directly on the calling thread. No Flush() is
// performed, so queued occluders may not have been rendered yet.
CullingThreadpool::CullingResult CullingThreadpool::TestRect( float xmin, float ymin, float xmax, float ymax, float wmin )
{
	const CullingResult result = mMOC->TestRect( xmin, ymin, xmax, ymax, wmin );
	return result;
}
// Runs a triangle-mesh occlusion query directly on the calling thread, using the current
// matrix and vertex layout state. No Flush() is performed.
CullingThreadpool::CullingResult CullingThreadpool::TestTriangles( const float* inVtx, const unsigned int* inTris, int nTris, BackfaceWinding bfWinding, ClipPlanes clipPlaneMask )
{
	const VertexLayout& layout = *mVertexLayouts.GetData();
	return mMOC->TestTriangles( inVtx, inTris, nTris, mCurrentMatrix, bfWinding, clipPlaneMask, layout );
}
// Flushes outstanding work and then asks the occlusion buffer to write its per-pixel
// depth values into depthData.
void CullingThreadpool::ComputePixelDepthBuffer( float* depthData, bool flipY )
{
	// All queued occluders have to be rendered before depth is read back
	Flush();

	mMOC->ComputePixelDepthBuffer( depthData, flipY );
}

View file

@ -0,0 +1,311 @@
////////////////////////////////////////////////////////////////////////////////
// Copyright 2017 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
////////////////////////////////////////////////////////////////////////////////
#pragma once
/*!
* \file CullingThreadpool.h
* \brief Worker threadpool example for threaded masked occlusion culling.
*
* This class implements a threadpool for occluder rendering. Calls to CullingThreadpool::RenderTriangles()
* will immediately return, after adding work items to a queue, and occluder rendering is performed
* by worker threads as quickly as possible. Occlusion queries are performed directly on the calling
* thread and can be performed either synchronously, by calling Flush() before executing the query, or
* asynchronously, by performing the query without waiting for the worker threads to finish.
*
* Note that this implementation should be considered an example rather than the best threading
* solution. You may want to integrate threading in your own task system, and it may also be beneficial
* to thread the traversal code. Refer to MaskedOcclusionCulling::BinTriangles() and
* MaskedOcclusionCulling::RenderTrilist() for functions that can be used to make your own
* threaded culling system.
*/
#include <thread>
#include <atomic>
#include <mutex>
#include <condition_variable>
#include "MaskedOcclusionCulling.h"
class CullingThreadpool
{
protected:
	static const int TRIS_PER_JOB = 1024; // Maximum number of triangles per job (bigger drawcalls are split), affects memory requirements

	typedef MaskedOcclusionCulling::CullingResult	CullingResult;
	typedef MaskedOcclusionCulling::ClipPlanes		ClipPlanes;
	typedef MaskedOcclusionCulling::BackfaceWinding	BackfaceWinding;
	typedef MaskedOcclusionCulling::ScissorRect		ScissorRect;
	typedef MaskedOcclusionCulling::VertexLayout	VertexLayout;
	typedef MaskedOcclusionCulling::TriList			TriList;

	// Small utility class for 4x4 matrices
	struct Matrix4x4
	{
		float mValues[16];

		// Leaves the values uninitialized
		Matrix4x4() {}

		// Copies 16 floats from matrix
		Matrix4x4( const float* matrix )
		{
			for( int i = 0; i < 16; ++i )
			{
				mValues[i] = matrix[i];
			}
		}
	};

	// Internal utility class for a (mostly) lockless queue for binning & rendering jobs
	struct RenderJobQueue
	{
		// Arguments of one RenderTriangles() batch, consumed by the binning stage
		struct BinningJob
		{
			const float*		mVerts;
			const unsigned int*	mTris;
			unsigned int		nTris;
			const float*		mMatrix;
			ClipPlanes			mClipPlanes;
			BackfaceWinding		mBfWinding;
			const VertexLayout*	mVtxLayout;
		};

		// One ring buffer slot: the binning arguments plus one output trilist per bin
		struct Job
		{
			volatile unsigned int	mBinningJobStartedIdx;
			volatile unsigned int	mBinningJobCompletedIdx;
			BinningJob				mBinningJob;
			TriList*				mRenderJobs;
		};

		unsigned int			mNumBins;
		unsigned int			mMaxJobs;

		// NOTE(review): volatile does not provide inter-thread ordering guarantees; these
		// look like candidates for std::atomic, but that needs matching changes in the .cpp.
		volatile unsigned int	mWritePtr;
		std::atomic_uint		mBinningPtr;
		std::atomic_uint*		mRenderPtrs;
		std::atomic_uint*		mBinMutexes;

		float*					mTrilistData;
		Job*					mJobs;

		RenderJobQueue( unsigned int nBins, unsigned int maxJobs );
		~RenderJobQueue();

		unsigned int GetMinRenderPtr() const;
		unsigned int GetBestGlobalQueue() const;
		bool IsPipelineEmpty() const;

		bool CanWrite() const;
		bool CanBin() const;

		Job* GetWriteJob();
		void AdvanceWriteJob();

		Job* GetBinningJob();
		void FinishedBinningJob( Job* job );

		Job* GetRenderJob( int binIdx );
		void AdvanceRenderJob( int binIdx );

		void Reset();
	};

	// Internal utility class for state (matrix / vertex layout)
	template<class T> struct StateData
	{
		unsigned int	mMaxJobs;
		unsigned int	mCurrentIdx;
		T*				mData;

		StateData( unsigned int maxJobs );
		~StateData();

		// mData is an owning raw pointer: a copy would free the same array twice
		StateData( const StateData& ) = delete;
		StateData& operator=( const StateData& ) = delete;

		void AddData( const T& data );
		const T* GetData() const;
	};

	// Number of worker threads and bins
	unsigned int			mNumThreads;
	unsigned int			mNumBins;
	unsigned int			mMaxJobs;
	unsigned int			mBinsW;
	unsigned int			mBinsH;

	// Threads and control variables
	std::mutex				mSuspendedMutex;
	std::condition_variable	mSuspendedCV;
	volatile bool			mKillThreads;
	volatile bool			mSuspendThreads;
	volatile unsigned int	mNumSuspendedThreads;
	std::thread*			mThreads;

	// State variables and command queue
	const float*			mCurrentMatrix;
	StateData<Matrix4x4>	mModelToClipMatrices;
	StateData<VertexLayout>	mVertexLayouts;
	RenderJobQueue*			mRenderQueue;

	// Occlusion culling object and related scissor rectangles
	ScissorRect*			mRects;
	MaskedOcclusionCulling*	mMOC;

	void SetupScissors();

	static void ThreadRun( CullingThreadpool* threadPool, unsigned int threadId );
	void ThreadMain( unsigned int threadIdx );

public:
	/*!
	 * \brief Creates a new threadpool for masked occlusion culling. This object has a
	 *        similar API to the MaskedOcclusionCulling class, but performs occluder
	 *        rendering asynchronously on worker threads (similar to how DX/GL works).
	 *
	 * \param numThreads Number of worker threads to perform occluder rendering. Best
	 *        balance may be scene/machine dependent, but it's good practice to leave at
	 *        least one full core (2 threads with hyperthreading) for the main thread.
	 * \param binsW The screen is divided into binsW x binsH rectangular bins for load
	 *        balancing. The number of bins should be at least equal to the number of
	 *        worker threads.
	 * \param binsH See description for the binsW parameter.
	 * \param maxJobs Maximum number of jobs that may be in flight at any given time. If
	 *        the caller thread generates jobs faster than the worker threads can finish
	 *        them, then the job queue will fill up and the caller thread will stall once
	 *        "maxJobs" items have been queued up. For culling systems interleaving occlusion
	 *        queries and rendering, this value should be kept quite low to minimize false
	 *        positives (see TestRect()). We've observed that 32 [default] items typically
	 *        works well for our interleaved queries, while also allowing good load-balancing,
	 *        and this is the recommended setting.
	 */
	CullingThreadpool( unsigned int numThreads, unsigned int binsW, unsigned int binsH, unsigned int maxJobs = 32 );

	/*!
	 * \brief Destroys the threadpool and terminates all worker threads.
	 */
	~CullingThreadpool();

	/*!
	 * \brief Wakes up culling worker threads from suspended sleep, and puts them in a
	 *        ready state (using an idle spinlock with significantly higher CPU overhead).
	 *
	 * It may take on the order of 100us to wake up the threads, so this function should
	 * preferably be called slightly ahead of starting occlusion culling work.
	 */
	void WakeThreads();

	/*!
	 * \brief Suspend all culling worker threads to a low CPU overhead sleep state.
	 *
	 * For performance and latency reasons, the culling work is performed in an active
	 * processing loop (with no thread sleeping) with high CPU overhead. In a system
	 * with more worker threads it's important to put the culling worker threads in a
	 * low overhead sleep state after occlusion culling work has completed.
	 */
	void SuspendThreads();

	/*!
	 * \brief Waits for all outstanding occluder rendering work to complete. Can be used
	 *        to ensure that rendering has completed before performing a TestRect() or
	 *        TestTriangles() call.
	 */
	void Flush();

	/*!
	 * \brief Sets the MaskedOcclusionCulling object (buffer) to be used for rendering and
	 *        testing calls. This method causes a Flush() to ensure that all unfinished
	 *        rendering is completed.
	 */
	void SetBuffer( MaskedOcclusionCulling* moc );

	/*!
	 * \brief Changes the resolution of the occlusion buffer, see MaskedOcclusionCulling::SetResolution().
	 *        This method causes a Flush() to ensure that all unfinished rendering is completed.
	 */
	void SetResolution( unsigned int width, unsigned int height );

	/*!
	 * \brief Sets the near clipping plane, see MaskedOcclusionCulling::SetNearClipPlane(). This
	 *        method causes a Flush() to ensure that all unfinished rendering is completed.
	 */
	void SetNearClipPlane( float nearDist );

	/*!
	 * \brief Sets the model to clipspace transform matrix used for the RenderTriangles() and TestTriangles()
	 *        function calls. The contents of the matrix is copied, and it's safe to modify it without calling
	 *        Flush(). The copy may be costly, which is the reason for passing this parameter as "state".
	 *
	 * \param modelToClipMatrix All vertices will be transformed by the specified model to clipspace matrix.
	 *        Passing nullptr [default] disables the transform (equivalent to using an identity matrix).
	 */
	void SetMatrix( const float* modelToClipMatrix = nullptr );

	/*!
	 * \brief Sets the vertex layout used for the RenderTriangles() and TestTriangles() function calls.
	 *        The vertex layout is copied, and it's safe to modify it without calling Flush(). The copy
	 *        may be costly, which is the reason for passing this parameter as "state".
	 *
	 * \param vtxLayout A struct specifying the vertex layout (see struct for detailed
	 *        description). For best performance, it is advisable to store position data
	 *        as compactly in memory as possible.
	 */
	void SetVertexLayout( const VertexLayout& vtxLayout = VertexLayout( 16, 4, 12 ) );

	/*!
	 * \brief Clears the occlusion buffer, see MaskedOcclusionCulling::ClearBuffer(). This method
	 *        causes a Flush() to ensure that all unfinished rendering is completed.
	 */
	void ClearBuffer();

	/*!
	 * \brief Asynchronously render occluder triangles, see MaskedOcclusionCulling::RenderTriangles().
	 *
	 * This method puts the drawcall into a command queue, and immediately returns. The rendering is
	 * performed by the worker threads at the earliest opportunity.
	 *
	 * <B>Important:</B> As rendering is performed asynchronously, the application is not allowed to
	 * change the contents of the *inVtx or *inTris buffers until after rendering is completed. If
	 * you wish to use dynamic buffers, the application must perform a Flush() to ensure that rendering
	 * is finished, or make sure to rotate between more buffers than the maximum number of outstanding
	 * render jobs (see the CullingThreadpool() constructor).
	 */
	void RenderTriangles( const float* inVtx, const unsigned int* inTris, int nTris, BackfaceWinding bfWinding = MaskedOcclusionCulling::BACKFACE_CW, ClipPlanes clipPlaneMask = MaskedOcclusionCulling::CLIP_PLANE_ALL );

	/*!
	 * \brief Occlusion query for a rectangle with a given depth, see MaskedOcclusionCulling::TestRect().
	 *
	 * <B>Important:</B> This method is performed on the main thread and does not wait for outstanding
	 * occluder rendering to be finished. To ensure that all occluder rendering is completed you must
	 * perform a Flush() prior to calling this function.
	 *
	 * It is conservatively correct to perform occlusion queries without calling Flush() (it may only
	 * lead to objects being incorrectly classified as visible), and it can lead to much better performance
	 * if occlusion queries are used for traversing a BVH or similar data structure. It's possible to
	 * use "asynchronous" queries during traversal, and removing false positives later, when rendering
	 * has completed.
	 */
	CullingResult TestRect( float xmin, float ymin, float xmax, float ymax, float wmin );

	/*!
	 * \brief Occlusion query for a mesh, see MaskedOcclusionCulling::TestTriangles().
	 *
	 * <B>Important:</B> See the TestRect() method for a brief discussion about asynchronous occlusion
	 * queries.
	 */
	CullingResult TestTriangles( const float* inVtx, const unsigned int* inTris, int nTris, BackfaceWinding bfWinding = MaskedOcclusionCulling::BACKFACE_CW, ClipPlanes clipPlaneMask = MaskedOcclusionCulling::CLIP_PLANE_ALL );

	/*!
	 * \brief Creates a per-pixel depth buffer from the hierarchical z buffer representation, see
	 *        MaskedOcclusionCulling::ComputePixelDepthBuffer(). This method causes a Flush() to
	 *        ensure that all unfinished rendering is completed.
	 */
	void ComputePixelDepthBuffer( float* depthData, bool flipY );
};

View file

@ -64,13 +64,15 @@ SURFACES
#include "ModelOverlay.h"
#include "Interaction.h"
#define MOC_MULTITHREADED 0
// RB begin
#define MOC_MULTITHREADED 1
#if MOC_MULTITHREADED
class CullingThreadpool;
#endif
class MaskedOcclusionCulling;
// RB end
class MaskedOcclusionCulling; // RB
class idRenderWorldLocal;
struct viewEntity_t;
struct viewLight_t;

View file

@ -36,7 +36,7 @@ If you have questions concerning this license or the applicable additional terms
#if defined(USE_INTRINSICS_SSE)
#if MOC_MULTITHREADED
#include "../libs/moc/CullingThreadPool.h"
#include "CullingThreadPool.h"
#else
#include "../libs/moc/MaskedOcclusionCulling.h"
#endif

View file

@ -32,7 +32,7 @@ If you have questions concerning this license or the applicable additional terms
#if defined(USE_INTRINSICS_SSE)
#if MOC_MULTITHREADED
#include "../libs/moc/CullingThreadPool.h"
#include "CullingThreadPool.h"
#else
#include "../libs/moc/MaskedOcclusionCulling.h"
#endif

View file

@ -32,7 +32,7 @@ If you have questions concerning this license or the applicable additional terms
#if defined(USE_INTRINSICS_SSE)
#if MOC_MULTITHREADED
#include "../libs/moc/CullingThreadPool.h"
#include "CullingThreadPool.h"
#else
#include "../libs/moc/MaskedOcclusionCulling.h"
#endif

View file

@ -31,7 +31,7 @@ If you have questions concerning this license or the applicable additional terms
#if defined(USE_INTRINSICS_SSE)
#if MOC_MULTITHREADED
#include "../libs/moc/CullingThreadPool.h"
#include "CullingThreadPool.h"
#else
#include "../libs/moc/MaskedOcclusionCulling.h"
#endif
@ -580,6 +580,11 @@ void R_FillMaskedOcclusionBufferWithModels( viewDef_t* viewDef )
}
}
#if MOC_MULTITHREADED
// wait for jobs to be finished
tr.maskedOcclusionThreaded->Flush();
#endif
int endTime = Sys_Microseconds();
tr.pc.mocMicroSec += endTime - startTime;