mirror of
https://github.com/id-Software/DOOM-3-BFG.git
synced 2025-03-14 06:34:10 +00:00
Copied CullingThreadPool to renderer for making an id Tech 5 version
This commit is contained in:
parent
abff15168b
commit
1e2b1be338
8 changed files with 839 additions and 6 deletions
|
@ -97,31 +97,43 @@ public:
|
|||
|
||||
// Submit the jobs in this list.
|
||||
void Submit( idParallelJobList* waitForJobList = NULL, int parallelism = JOBLIST_PARALLELISM_DEFAULT );
|
||||
|
||||
// Wait for the jobs in this list to finish. Will spin in place if any jobs are not done.
|
||||
void Wait();
|
||||
|
||||
// Try to wait for the jobs in this list to finish but either way return immediately. Returns true if all jobs are done.
|
||||
bool TryWait();
|
||||
|
||||
// returns true if the job list has been submitted.
|
||||
bool IsSubmitted() const;
|
||||
|
||||
// Get the number of jobs executed in this job list.
|
||||
unsigned int GetNumExecutedJobs() const;
|
||||
|
||||
// Get the number of sync points.
|
||||
unsigned int GetNumSyncs() const;
|
||||
|
||||
// Time at which the job list was submitted.
|
||||
uint64 GetSubmitTimeMicroSec() const;
|
||||
|
||||
// Time at which execution of this job list started.
|
||||
uint64 GetStartTimeMicroSec() const;
|
||||
|
||||
// Time at which all jobs in the list were executed.
|
||||
uint64 GetFinishTimeMicroSec() const;
|
||||
|
||||
// Time the host thread waited for this job list to finish.
|
||||
uint64 GetWaitTimeMicroSec() const;
|
||||
|
||||
// Get the total time all units spent processing this job list.
|
||||
uint64 GetTotalProcessingTimeMicroSec() const;
|
||||
|
||||
// Get the total time all units wasted while processing this job list.
|
||||
uint64 GetTotalWastedTimeMicroSec() const;
|
||||
|
||||
// Time the given unit spent processing this job list.
|
||||
uint64 GetUnitProcessingTimeMicroSec( int unit ) const;
|
||||
|
||||
// Time the given unit wasted while processing this job list.
|
||||
uint64 GetUnitWastedTimeMicroSec( int unit ) const;
|
||||
|
||||
|
|
503
neo/renderer/CullingThreadpool.cpp
Normal file
503
neo/renderer/CullingThreadpool.cpp
Normal file
|
@ -0,0 +1,503 @@
|
|||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Copyright 2017 Intel Corporation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
#include <assert.h>
|
||||
#include "CullingThreadpool.h"
|
||||
|
||||
// Frees a single heap object and nulls the pointer so it cannot be freed twice.
// The do/while( 0 ) wrapper makes the macro expand to exactly one statement, so it is safe
// inside an un-braced if/else. No null test is needed: deleting a null pointer is a no-op.
#define SAFE_DELETE(X) do { delete (X); (X) = nullptr; } while( 0 )

// Array counterpart of SAFE_DELETE for memory obtained with new[].
#define SAFE_DELETE_ARRAY(X) do { delete[] (X); (X) = nullptr; } while( 0 )
|
||||
|
||||
// Allocates storage for maxJobs state snapshots. The index starts at the largest unsigned
// value so that the first AddData() call wraps it around to slot 0.
template<class T> CullingThreadpool::StateData<T>::StateData( unsigned int maxJobs ) :
    mMaxJobs( maxJobs ),
    mCurrentIdx( ~0u ),
    mData( new T[maxJobs] )
{
}
|
||||
|
||||
// Releases the snapshot storage and clears the pointer.
template<class T> CullingThreadpool::StateData<T>::~StateData()
{
    if( mData != nullptr )
    {
        delete[] mData;
    }
    mData = nullptr;
}
|
||||
|
||||
// Advances to the next slot (wrapping at mMaxJobs) and stores a copy of data there.
template<class T> void CullingThreadpool::StateData<T>::AddData( const T& data )
{
    const unsigned int slot = ++mCurrentIdx % mMaxJobs;
    mData[slot] = data;
}
|
||||
|
||||
// Returns a pointer to the most recently added snapshot.
template<class T> const T* CullingThreadpool::StateData<T>::GetData() const
{
    const unsigned int slot = mCurrentIdx % mMaxJobs;
    return mData + slot;
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Helper class: Mostly lockless queue for render jobs
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Builds a queue of maxJobs job slots, each owning one triangle list per bin, and carves a
// single worst-case sized float buffer into per-(job, bin) triangle storage.
CullingThreadpool::RenderJobQueue::RenderJobQueue( unsigned int nBins, unsigned int maxJobs ) :
    mNumBins( nBins ),
    mMaxJobs( maxJobs )
{
    // Per-bin render cursors and spin-lock words; all locks start released.
    mRenderPtrs = new std::atomic_uint[mNumBins];
    mBinMutexes = new std::atomic_uint[mNumBins];
    for( unsigned int bin = 0; bin != mNumBins; ++bin )
    {
        mBinMutexes[bin] = 0;
    }

    // One triangle list per bin for every job slot.
    mJobs = new Job[mMaxJobs];
    for( unsigned int jobIdx = 0; jobIdx != mMaxJobs; ++jobIdx )
    {
        mJobs[jobIdx].mRenderJobs = new TriList[mNumBins];
    }

    // Memory is reserved for the worst case job size.
    const unsigned int floatsPerTri = 3 * 3;
    const unsigned int trisPerList = TRIS_PER_JOB * 6;
    const unsigned int floatsPerList = trisPerList * floatsPerTri;
    mTrilistData = new float[floatsPerList * mMaxJobs * mNumBins];

    // Point every triangle list at its own slice of the shared buffer.
    for( unsigned int jobIdx = 0; jobIdx != mMaxJobs; ++jobIdx )
    {
        TriList* lists = mJobs[jobIdx].mRenderJobs;
        for( unsigned int bin = 0; bin != mNumBins; ++bin )
        {
            const int listIdx = jobIdx * mNumBins + bin;
            lists[bin].mNumTriangles = trisPerList;
            lists[bin].mTriIdx = 0;
            lists[bin].mPtr = mTrilistData + listIdx * floatsPerList;
        }
    }

    // Start with an empty queue.
    Reset();
}
|
||||
|
||||
// Frees the per-bin counters, each job's triangle-list array, the job array and the shared
// triangle buffer, nulling each pointer after release.
CullingThreadpool::RenderJobQueue::~RenderJobQueue()
{
    delete[] mRenderPtrs;
    mRenderPtrs = nullptr;

    delete[] mBinMutexes;
    mBinMutexes = nullptr;

    for( unsigned int jobIdx = 0; jobIdx < mMaxJobs; ++jobIdx )
    {
        delete[] mJobs[jobIdx].mRenderJobs;
        mJobs[jobIdx].mRenderJobs = nullptr;
    }

    delete[] mJobs;
    mJobs = nullptr;

    delete[] mTrilistData;
    mTrilistData = nullptr;
}
|
||||
|
||||
inline unsigned int CullingThreadpool::RenderJobQueue::GetMinRenderPtr() const
|
||||
{
|
||||
unsigned int minRenderPtr = mRenderPtrs[0];
|
||||
for( unsigned int i = 1; i < mNumBins; ++i )
|
||||
{
|
||||
unsigned int renderPtr = mRenderPtrs[i];
|
||||
minRenderPtr = renderPtr < minRenderPtr ? renderPtr : minRenderPtr;
|
||||
}
|
||||
return minRenderPtr;
|
||||
}
|
||||
|
||||
inline void CullingThreadpool::RenderJobQueue::AdvanceRenderJob( int binIdx )
|
||||
{
|
||||
mRenderPtrs[binIdx]++;
|
||||
mBinMutexes[binIdx] = 0;
|
||||
}
|
||||
|
||||
inline unsigned int CullingThreadpool::RenderJobQueue::GetBestGlobalQueue() const
|
||||
{
|
||||
// Find least advanced queue
|
||||
unsigned int bestBin = ~0, bestPtr = mWritePtr;
|
||||
for( unsigned int i = 0; i < mNumBins; ++i )
|
||||
{
|
||||
if( mRenderPtrs[i] < bestPtr && mBinMutexes[i] == 0 )
|
||||
{
|
||||
bestBin = i;
|
||||
bestPtr = mRenderPtrs[i];
|
||||
}
|
||||
}
|
||||
return bestBin;
|
||||
}
|
||||
|
||||
inline bool CullingThreadpool::RenderJobQueue::IsPipelineEmpty() const
|
||||
{
|
||||
return GetMinRenderPtr() == mWritePtr;
|
||||
}
|
||||
|
||||
inline bool CullingThreadpool::RenderJobQueue::CanWrite() const
|
||||
{
|
||||
return mWritePtr - GetMinRenderPtr() < mMaxJobs;
|
||||
}
|
||||
|
||||
inline bool CullingThreadpool::RenderJobQueue::CanBin() const
|
||||
{
|
||||
return mBinningPtr < mWritePtr && mBinningPtr - GetMinRenderPtr() < mMaxJobs;
|
||||
}
|
||||
|
||||
// Returns the job slot the write cursor currently maps to (cursor modulo mMaxJobs).
inline CullingThreadpool::RenderJobQueue::Job* CullingThreadpool::RenderJobQueue::GetWriteJob()
{
    const unsigned int slot = mWritePtr % mMaxJobs;
    return mJobs + slot;
}
|
||||
|
||||
inline void CullingThreadpool::RenderJobQueue::AdvanceWriteJob()
|
||||
{
|
||||
mWritePtr++;
|
||||
}
|
||||
|
||||
// Tries to claim the next job for binning. Returns the claimed job, or nullptr when nothing
// is ready to bin or another thread won the compare-exchange on the binning cursor.
inline CullingThreadpool::RenderJobQueue::Job* CullingThreadpool::RenderJobQueue::GetBinningJob()
{
    unsigned int claimIdx = mBinningPtr;

    // Nothing written yet, or the binning cursor is already mMaxJobs ahead of the slowest bin.
    if( !( claimIdx < mWritePtr ) || claimIdx - GetMinRenderPtr() >= mMaxJobs )
    {
        return nullptr;
    }

    // Only one thread may advance the cursor from claimIdx; losers back off.
    if( !mBinningPtr.compare_exchange_strong( claimIdx, claimIdx + 1 ) )
    {
        return nullptr;
    }

    Job* claimed = &mJobs[claimIdx % mMaxJobs];
    claimed->mBinningJobStartedIdx = claimIdx;
    return claimed;
}
|
||||
|
||||
// Marks the job as binned by copying its started index into its completed index.
inline void CullingThreadpool::RenderJobQueue::FinishedBinningJob( Job* job )
{
    const unsigned int finishedIdx = job->mBinningJobStartedIdx;
    job->mBinningJobCompletedIdx = finishedIdx;
}
|
||||
|
||||
// Tries to lock the given bin and fetch the job its render cursor points at.
// Returns nullptr (leaving the bin unlocked) if the bin is already locked or the job at the
// cursor has not finished binning. On success the bin stays locked; AdvanceRenderJob() is
// what releases it.
inline CullingThreadpool::RenderJobQueue::Job* CullingThreadpool::RenderJobQueue::GetRenderJob( int binIdx )
{
    std::atomic_uint& binLock = mBinMutexes[binIdx];

    unsigned int unlocked = 0;
    if( binLock.compare_exchange_strong( unlocked, 1 ) )
    {
        // The job is ready only when its completed index equals this bin's cursor.
        if( mRenderPtrs[binIdx] == mJobs[mRenderPtrs[binIdx] % mMaxJobs].mBinningJobCompletedIdx )
        {
            return &mJobs[mRenderPtrs[binIdx] % mMaxJobs];
        }

        // Queue empty for this bin: give the lock back.
        binLock = 0;
    }

    return nullptr;
}
|
||||
|
||||
void CullingThreadpool::RenderJobQueue::Reset()
|
||||
{
|
||||
mWritePtr = 0;
|
||||
mBinningPtr = 0;
|
||||
|
||||
for( unsigned int i = 0; i < mNumBins; ++i )
|
||||
{
|
||||
mRenderPtrs[i] = 0;
|
||||
}
|
||||
|
||||
for( unsigned int i = 0; i < mMaxJobs; ++i )
|
||||
{
|
||||
mJobs[i].mBinningJobCompletedIdx = -1;
|
||||
mJobs[i].mBinningJobStartedIdx = -1;
|
||||
}
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Culling threadpool private helper functions
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
void CullingThreadpool::SetupScissors()
|
||||
{
|
||||
unsigned int width, height;
|
||||
mMOC->GetResolution( width, height );
|
||||
|
||||
unsigned int binWidth;
|
||||
unsigned int binHeight;
|
||||
mMOC->ComputeBinWidthHeight( mBinsW, mBinsH, binWidth, binHeight );
|
||||
|
||||
for( unsigned int ty = 0; ty < mBinsH; ++ty )
|
||||
{
|
||||
for( unsigned int tx = 0; tx < mBinsW; ++tx )
|
||||
{
|
||||
unsigned int threadIdx = tx + ty * mBinsW;
|
||||
|
||||
// Adjust rects on final row / col to match resolution
|
||||
mRects[threadIdx].mMinX = tx * binWidth;
|
||||
mRects[threadIdx].mMaxX = tx + 1 == mBinsW ? width : ( tx + 1 ) * binWidth;
|
||||
mRects[threadIdx].mMinY = ty * binHeight;
|
||||
mRects[threadIdx].mMaxY = ty + 1 == mBinsH ? height : ( ty + 1 ) * binHeight;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Static thread entry point: forwards to the pool's ThreadMain() with the worker's id.
void CullingThreadpool::ThreadRun( CullingThreadpool* threadPool, unsigned int threadId )
{
    CullingThreadpool& pool = *threadPool;
    pool.ThreadMain( threadId );
}
|
||||
|
||||
void CullingThreadpool::ThreadMain( unsigned int threadIdx )
|
||||
{
|
||||
while( true )
|
||||
{
|
||||
bool threadIsIdle = true;
|
||||
unsigned int threadBinIdx = threadIdx;
|
||||
|
||||
// Wait for threads to be woken up (low CPU load sleep)
|
||||
std::unique_lock<std::mutex> lock( mSuspendedMutex );
|
||||
mNumSuspendedThreads++;
|
||||
mSuspendedCV.wait( lock, [&] {return !mSuspendThreads; } );
|
||||
mNumSuspendedThreads--;
|
||||
lock.unlock();
|
||||
|
||||
// Loop until suspended again
|
||||
while( !mSuspendThreads || !threadIsIdle )
|
||||
{
|
||||
if( mKillThreads )
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
threadIsIdle = false;
|
||||
|
||||
// Prio 1: Process any render jobs local to this thread
|
||||
unsigned int binIdx = threadBinIdx;
|
||||
threadBinIdx = threadBinIdx + mNumThreads < mNumBins ? threadBinIdx + mNumThreads : threadIdx;
|
||||
RenderJobQueue::Job* job = mRenderQueue->GetRenderJob( binIdx );
|
||||
if( job != nullptr )
|
||||
{
|
||||
if( job->mRenderJobs[binIdx].mTriIdx > 0 )
|
||||
{
|
||||
mMOC->RenderTrilist( job->mRenderJobs[binIdx], &mRects[binIdx] );
|
||||
}
|
||||
|
||||
mRenderQueue->AdvanceRenderJob( binIdx );
|
||||
continue;
|
||||
}
|
||||
|
||||
// Prio 2: Process any outstanding setup/binning jobs
|
||||
if( mRenderQueue->CanBin() )
|
||||
{
|
||||
// If no more rasterization jobs, get next binning job
|
||||
RenderJobQueue::Job* job = mRenderQueue->GetBinningJob();
|
||||
if( job != nullptr )
|
||||
{
|
||||
RenderJobQueue::BinningJob& sjob = job->mBinningJob;
|
||||
for( unsigned int i = 0; i < mNumBins; ++i )
|
||||
{
|
||||
job->mRenderJobs[i].mTriIdx = 0;
|
||||
}
|
||||
mMOC->BinTriangles( sjob.mVerts, sjob.mTris, sjob.nTris, job->mRenderJobs, mBinsW, mBinsH, sjob.mMatrix, sjob.mBfWinding, sjob.mClipPlanes, *sjob.mVtxLayout );
|
||||
mRenderQueue->FinishedBinningJob( job );
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Prio 3: No work is available, work steal from another thread's queue
|
||||
if( mNumBins > mNumThreads )
|
||||
{
|
||||
binIdx = mRenderQueue->GetBestGlobalQueue();
|
||||
if( binIdx < mRenderQueue->mNumBins )
|
||||
{
|
||||
RenderJobQueue::Job* job = mRenderQueue->GetRenderJob( binIdx );
|
||||
if( job != nullptr )
|
||||
{
|
||||
if( job->mRenderJobs[binIdx].mTriIdx > 0 )
|
||||
{
|
||||
mMOC->RenderTrilist( job->mRenderJobs[binIdx], &mRects[binIdx] );
|
||||
}
|
||||
|
||||
mRenderQueue->AdvanceRenderJob( binIdx );
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// No work available: Yield this thread
|
||||
std::this_thread::yield();
|
||||
threadIsIdle = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Culling threadpool public API, similar to the MaskedOcclusionCulling class
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Creates the scissor rectangles, the render job queue and the default vertex layout, then
// launches numThreads worker threads. mSuspendThreads starts out true, so the workers park
// themselves until WakeThreads() is called.
CullingThreadpool::CullingThreadpool( unsigned int numThreads, unsigned int binsW, unsigned int binsH, unsigned int maxJobs ) :
    mNumThreads( numThreads ),
    mNumBins( binsW * binsH ),
    mMaxJobs( maxJobs ),
    mBinsW( binsW ),
    mBinsH( binsH ),
    mKillThreads( false ),
    mSuspendThreads( true ),
    mNumSuspendedThreads( 0 ),
    mCurrentMatrix( nullptr ),
    mModelToClipMatrices( maxJobs ),
    mVertexLayouts( maxJobs ),
    mMOC( nullptr )
{
    assert( mNumBins >= mNumThreads ); // Having less bins than threads is a bad idea!

    mRects = new ScissorRect[mNumBins];
    mRenderQueue = new RenderJobQueue( mNumBins, mMaxJobs );

    // Default vertex layout; the current matrix stays null until SetMatrix() is called.
    mVertexLayouts.AddData( VertexLayout( 16, 4, 12 ) );

    // Workers are started last, once everything they read has been set up.
    mThreads = new std::thread[mNumThreads];
    for( unsigned int workerIdx = 0; workerIdx < mNumThreads; ++workerIdx )
    {
        mThreads[workerIdx] = std::thread( ThreadRun, this, workerIdx );
    }
}
|
||||
|
||||
// Stops and joins all worker threads, then frees the queue, scissor rects and thread array.
CullingThreadpool::~CullingThreadpool()
{
    // Both conditions must hold: there are threads to join AND they have not been killed yet.
    // (With '||' a null mThreads would still enter the block and be dereferenced.)
    if( mThreads != nullptr && !mKillThreads )
    {
        // WakeThreads() blocks until every worker is parked, which only happens while
        // suspension is requested. If the workers are already awake they will see the kill
        // flag on their own, so waking them again would just spin forever.
        if( mSuspendThreads )
        {
            WakeThreads();
        }

        mKillThreads = true;
        for( unsigned int i = 0; i < mNumThreads; ++i )
        {
            mThreads[i].join();
        }
    }

    // Free memory
    SAFE_DELETE( mRenderQueue );
    SAFE_DELETE_ARRAY( mRects );
    SAFE_DELETE_ARRAY( mThreads );
}
|
||||
|
||||
void CullingThreadpool::WakeThreads()
|
||||
{
|
||||
// Wait for all threads to be in suspended mode
|
||||
while( mNumSuspendedThreads < mNumThreads )
|
||||
{
|
||||
std::this_thread::yield();
|
||||
}
|
||||
|
||||
// Send wake up event
|
||||
std::unique_lock<std::mutex> lock( mSuspendedMutex );
|
||||
mSuspendThreads = false;
|
||||
lock.unlock();
|
||||
mSuspendedCV.notify_all();
|
||||
}
|
||||
|
||||
void CullingThreadpool::SuspendThreads()
|
||||
{
|
||||
// Signal threads to go into suspended mode (after finishing all outstanding work)
|
||||
mSuspendThreads = true;
|
||||
}
|
||||
|
||||
void CullingThreadpool::Flush()
|
||||
{
|
||||
// Wait for pipeline to be empty (i.e. all work is finished)
|
||||
while( !mRenderQueue->IsPipelineEmpty() )
|
||||
{
|
||||
std::this_thread::yield();
|
||||
}
|
||||
|
||||
// Reset queue counters
|
||||
mRenderQueue->Reset();
|
||||
}
|
||||
|
||||
// Switches to a new occlusion culling object: drains pending work first, stores the pointer,
// then rebuilds the scissor rectangles for it.
void CullingThreadpool::SetBuffer( MaskedOcclusionCulling* moc )
{
    Flush();

    mMOC = moc;

    SetupScissors();
}
|
||||
|
||||
void CullingThreadpool::SetResolution( unsigned int width, unsigned int height )
|
||||
{
|
||||
Flush();
|
||||
mMOC->SetResolution( width, height );
|
||||
SetupScissors();
|
||||
}
|
||||
|
||||
void CullingThreadpool::SetNearClipPlane( float nearDist )
|
||||
{
|
||||
Flush();
|
||||
mMOC->SetNearClipPlane( nearDist );
|
||||
}
|
||||
|
||||
void CullingThreadpool::SetMatrix( const float* modelToClipMatrix )
|
||||
{
|
||||
// Treat nullptr matrix as a special case, otherwise copy the contents of the pointer and add to state
|
||||
if( modelToClipMatrix == nullptr )
|
||||
{
|
||||
mCurrentMatrix = nullptr;
|
||||
}
|
||||
else
|
||||
{
|
||||
mModelToClipMatrices.AddData( Matrix4x4( modelToClipMatrix ) );
|
||||
mCurrentMatrix = mModelToClipMatrices.GetData()->mValues;
|
||||
}
|
||||
}
|
||||
|
||||
void CullingThreadpool::SetVertexLayout( const VertexLayout& vtxLayout )
|
||||
{
|
||||
mVertexLayouts.AddData( vtxLayout );
|
||||
}
|
||||
|
||||
void CullingThreadpool::ClearBuffer()
|
||||
{
|
||||
Flush();
|
||||
mMOC->ClearBuffer();
|
||||
}
|
||||
|
||||
void CullingThreadpool::RenderTriangles( const float* inVtx, const unsigned int* inTris, int nTris, BackfaceWinding bfWinding, ClipPlanes clipPlaneMask )
|
||||
{
|
||||
#if MOC_RECORDER_ENABLE != 0
|
||||
mMOC->RecordRenderTriangles( inVtx, inTris, nTris, mCurrentMatrix, clipPlaneMask, bfWinding, *mVertexLayouts.GetData( ) );
|
||||
#endif
|
||||
|
||||
for( int i = 0; i < nTris; i += TRIS_PER_JOB )
|
||||
{
|
||||
// Yield if work queue is full
|
||||
while( !mRenderQueue->CanWrite() )
|
||||
{
|
||||
std::this_thread::yield();
|
||||
}
|
||||
|
||||
// Create new renderjob
|
||||
RenderJobQueue::Job* job = mRenderQueue->GetWriteJob();
|
||||
job->mBinningJob.mVerts = inVtx;
|
||||
job->mBinningJob.mTris = inTris + i * 3;
|
||||
job->mBinningJob.nTris = nTris - i < TRIS_PER_JOB ? nTris - i : TRIS_PER_JOB;
|
||||
job->mBinningJob.mMatrix = mCurrentMatrix;
|
||||
job->mBinningJob.mClipPlanes = clipPlaneMask;
|
||||
job->mBinningJob.mBfWinding = bfWinding;
|
||||
job->mBinningJob.mVtxLayout = mVertexLayouts.GetData();
|
||||
mRenderQueue->AdvanceWriteJob();
|
||||
}
|
||||
}
|
||||
|
||||
// Forwards the rectangle query to the occlusion culling object on the calling thread; no
// Flush() is performed here.
CullingThreadpool::CullingResult CullingThreadpool::TestRect( float xmin, float ymin, float xmax, float ymax, float wmin )
{
    const CullingResult result = mMOC->TestRect( xmin, ymin, xmax, ymax, wmin );
    return result;
}
|
||||
|
||||
// Forwards the mesh query to the occlusion culling object using the current matrix and vertex
// layout; no Flush() is performed here.
CullingThreadpool::CullingResult CullingThreadpool::TestTriangles( const float* inVtx, const unsigned int* inTris, int nTris, BackfaceWinding bfWinding, ClipPlanes clipPlaneMask )
{
    const CullingResult result = mMOC->TestTriangles( inVtx, inTris, nTris, mCurrentMatrix, bfWinding, clipPlaneMask, *mVertexLayouts.GetData() );
    return result;
}
|
||||
|
||||
void CullingThreadpool::ComputePixelDepthBuffer( float* depthData, bool flipY )
|
||||
{
|
||||
Flush();
|
||||
mMOC->ComputePixelDepthBuffer( depthData, flipY );
|
||||
}
|
311
neo/renderer/CullingThreadpool.h
Normal file
311
neo/renderer/CullingThreadpool.h
Normal file
|
@ -0,0 +1,311 @@
|
|||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Copyright 2017 Intel Corporation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
#pragma once
|
||||
|
||||
/*!
|
||||
* \file CullingThreadpool.h
|
||||
* \brief Worker threadpool example for threaded masked occlusion culling.
|
||||
*
|
||||
* This class implements a threadpool for occluder rendering. Calls to CullingThreadpool::RenderTriangles()
|
||||
* will immediately return, after adding work items to a queue, and occluder rendering is performed
|
||||
* by worker threads as quickly as possible. Occlusion queries are performed directly on the calling
|
||||
* thread and can be performed either synchronously, by calling Flush() before executing the query, or
|
||||
* asynchronously, by performing the query without waiting for the worker threads to finish.
|
||||
*
|
||||
* Note that this implementation should be considered an example rather than the best threading
|
||||
* solution. You may want to integrate threading in your own task system, and it may also be beneficial
|
||||
* to thread the traversal code. Refer to MaskedOcclusionCulling::BinTriangles() and
|
||||
* MaskedOcclusionCulling::RenderTrilist() for functions that can be used to make your own
|
||||
* threaded culling system.
|
||||
*/
|
||||
|
||||
#include <thread>
|
||||
#include <atomic>
|
||||
#include <mutex>
|
||||
#include <condition_variable>
|
||||
|
||||
#include "MaskedOcclusionCulling.h"
|
||||
|
||||
class CullingThreadpool
|
||||
{
|
||||
protected:
|
||||
static const int TRIS_PER_JOB = 1024; // Maximum number of triangles per job (bigger drawcalls are split), affects memory requirements
|
||||
|
||||
typedef MaskedOcclusionCulling::CullingResult CullingResult;
|
||||
typedef MaskedOcclusionCulling::ClipPlanes ClipPlanes;
|
||||
typedef MaskedOcclusionCulling::BackfaceWinding BackfaceWinding;
|
||||
typedef MaskedOcclusionCulling::ScissorRect ScissorRect;
|
||||
typedef MaskedOcclusionCulling::VertexLayout VertexLayout;
|
||||
typedef MaskedOcclusionCulling::TriList TriList;
|
||||
|
||||
// Small utility class for 4x4 matrices
|
||||
struct Matrix4x4
|
||||
{
|
||||
float mValues[16];
|
||||
Matrix4x4() {}
|
||||
Matrix4x4( const float* matrix )
|
||||
{
|
||||
for( int i = 0; i < 16; ++i )
|
||||
{
|
||||
mValues[i] = matrix[i];
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Internal utility class for a (mostly) lockless queue for binning & rendering jobs
|
||||
struct RenderJobQueue
|
||||
{
|
||||
struct BinningJob
|
||||
{
|
||||
const float* mVerts;
|
||||
const unsigned int* mTris;
|
||||
unsigned int nTris;
|
||||
|
||||
const float* mMatrix;
|
||||
ClipPlanes mClipPlanes;
|
||||
BackfaceWinding mBfWinding;
|
||||
const VertexLayout* mVtxLayout;
|
||||
};
|
||||
|
||||
struct Job
|
||||
{
|
||||
volatile unsigned int mBinningJobStartedIdx;
|
||||
volatile unsigned int mBinningJobCompletedIdx;
|
||||
BinningJob mBinningJob;
|
||||
TriList* mRenderJobs;
|
||||
};
|
||||
|
||||
unsigned int mNumBins;
|
||||
unsigned int mMaxJobs;
|
||||
|
||||
volatile unsigned int mWritePtr;
|
||||
std::atomic_uint mBinningPtr;
|
||||
std::atomic_uint* mRenderPtrs;
|
||||
std::atomic_uint* mBinMutexes;
|
||||
|
||||
float* mTrilistData;
|
||||
Job* mJobs;
|
||||
|
||||
RenderJobQueue( unsigned int nBins, unsigned int maxJobs );
|
||||
~RenderJobQueue();
|
||||
|
||||
unsigned int GetMinRenderPtr() const;
|
||||
unsigned int GetBestGlobalQueue() const;
|
||||
bool IsPipelineEmpty() const;
|
||||
|
||||
bool CanWrite() const;
|
||||
bool CanBin() const;
|
||||
|
||||
Job* GetWriteJob();
|
||||
void AdvanceWriteJob();
|
||||
|
||||
Job* GetBinningJob();
|
||||
void FinishedBinningJob( Job* job );
|
||||
|
||||
Job* GetRenderJob( int binIdx );
|
||||
void AdvanceRenderJob( int binIdx );
|
||||
|
||||
void Reset();
|
||||
};
|
||||
|
||||
// Internal utility class for state (matrix / vertex layout)
|
||||
template<class T> struct StateData
|
||||
{
|
||||
unsigned int mMaxJobs;
|
||||
unsigned int mCurrentIdx;
|
||||
T* mData;
|
||||
|
||||
StateData( unsigned int maxJobs );
|
||||
~StateData();
|
||||
void AddData( const T& data );
|
||||
const T* GetData() const;
|
||||
};
|
||||
|
||||
// Number of worker threads and bins
|
||||
unsigned int mNumThreads;
|
||||
unsigned int mNumBins;
|
||||
unsigned int mMaxJobs;
|
||||
unsigned int mBinsW;
|
||||
unsigned int mBinsH;
|
||||
|
||||
// Threads and control variables
|
||||
std::mutex mSuspendedMutex;
|
||||
std::condition_variable mSuspendedCV;
|
||||
volatile bool mKillThreads;
|
||||
volatile bool mSuspendThreads;
|
||||
volatile unsigned int mNumSuspendedThreads;
|
||||
std::thread* mThreads;
|
||||
|
||||
// State variables and command queue
|
||||
const float* mCurrentMatrix;
|
||||
StateData<Matrix4x4> mModelToClipMatrices;
|
||||
StateData<VertexLayout> mVertexLayouts;
|
||||
RenderJobQueue* mRenderQueue;
|
||||
|
||||
// Occlusion culling object and related scissor rectangles
|
||||
ScissorRect* mRects;
|
||||
MaskedOcclusionCulling* mMOC;
|
||||
|
||||
void SetupScissors();
|
||||
|
||||
static void ThreadRun( CullingThreadpool* threadPool, unsigned int threadId );
|
||||
void ThreadMain( unsigned int threadIdx );
|
||||
|
||||
public:
|
||||
/*!
|
||||
* \brief Creates a new threadpool for masked occlusion culling. This object has a
|
||||
* similar API to the MaskedOcclusionCulling class, but performs occluder
|
||||
* rendering asynchronously on worker threads (similar to how DX/GL works).
|
||||
*
|
||||
* \param numThreads Number of worker threads to perform occluder rendering. Best
|
||||
* balance may be scene/machine dependent, but it's good practice to leave at
|
||||
* least one full core (2 threads with hyperthreading) for the main thread.
|
||||
* \param binsW The screen is divided into binsW x binsH rectangular bins for load
|
||||
* balancing. The number of bins should be atleast equal to the number of
|
||||
* worker threads.
|
||||
* \param binsH See description for the binsW parameter.
|
||||
* \param maxJobs Maximum number of jobs that may be in flight at any given time. If
|
||||
* the caller thread generates jobs faster than the worker threads can finish
|
||||
* them, then the job queue will fill up and the caller thread will stall once
|
||||
* "maxJobs" items have been queued up. For culling systems interleaving occlusion
|
||||
* queries and rendering, this value should be kept quite low to minimize false
|
||||
* positives (see TestRect()). We've observed that 32 [default] items typically
|
||||
* works well for our interleaved queries, while also allowing good load-balancing,
|
||||
* and this is the recommended setting.
|
||||
*/
|
||||
CullingThreadpool( unsigned int numThreads, unsigned int binsW, unsigned int binsH, unsigned int maxJobs = 32 );
|
||||
|
||||
/*!
|
||||
* \brief Destroys the threadpool and terminates all worker threads.
|
||||
*/
|
||||
~CullingThreadpool();
|
||||
|
||||
/*!
|
||||
* \brief Wakes up culling worker threads from suspended sleep, and puts them in a
|
||||
* ready state (using an idle spinlock with significantly higher CPU overhead).
|
||||
*
|
||||
* It may take on the order of 100us to wake up the threads, so this function should
|
||||
* preferably be called slightly ahead of starting occlusion culling work.
|
||||
*/
|
||||
void WakeThreads();
|
||||
|
||||
/*!
|
||||
* \brief Suspend all culling worker threads to a low CPU overhead sleep state.
|
||||
*
|
||||
* For performance and latency reasons, the culling work is performed in an active
|
||||
* processing loop (with no thread sleeping) with high CPU overhead. In a system
|
||||
* with more worker threads it's important to put the culling worker threads in a
|
||||
* low overhead sleep state after occlusion culling work has completed.
|
||||
*/
|
||||
void SuspendThreads();
|
||||
|
||||
/*!
|
||||
* \brief Waits for all outstanding occluder rendering work to complete. Can be used
|
||||
* to ensure that rendering has completed before performing a TestRect() or
|
||||
* TestTriangles() call.
|
||||
*/
|
||||
void Flush();
|
||||
|
||||
/*
|
||||
* \brief Sets the MaskedOcclusionCulling object (buffer) to be used for rendering and
|
||||
* testing calls. This method causes a Flush() to ensure that all unfinished
|
||||
* rendering is completed.
|
||||
*/
|
||||
void SetBuffer( MaskedOcclusionCulling* moc );
|
||||
|
||||
/*
|
||||
* \brief Changes the resolution of the occlusion buffer, see MaskedOcclusionCulling::SetResolution().
|
||||
* This method causes a Flush() to ensure that all unfinished rendering is completed.
|
||||
*/
|
||||
void SetResolution( unsigned int width, unsigned int height );
|
||||
|
||||
/*
|
||||
* \brief Sets the near clipping plane, see MaskedOcclusionCulling::SetNearClipPlane(). This
|
||||
* method causes a Flush() to ensure that all unfinished rendering is completed.
|
||||
*/
|
||||
void SetNearClipPlane( float nearDist );
|
||||
|
||||
/*
|
||||
* \brief Sets the model to clipspace transform matrix used for the RenderTriangles() and TestTriangles()
|
||||
* function calls. The contents of the matrix is copied, and it's safe to modify it without calling
|
||||
* Flush(). The copy may be costly, which is the reason for passing this parameter as "state".
|
||||
*
|
||||
* \param modelToClipMatrix All vertices will be transformed by the specified model to clipspace matrix.
|
||||
* Passing nullptr [default] disables the transform (equivalent to using an identity matrix).
|
||||
*/
|
||||
void SetMatrix( const float* modelToClipMatrix = nullptr );
|
||||
|
||||
/*
|
||||
* \brief Sets the vertex layout used for the RenderTriangles() and TestTriangles() function calls.
|
||||
* The vertex layout is copied, and it's safe to modify it without calling Flush(). The copy
|
||||
* may be costly, which is the reason for passing this parameter as "state".
|
||||
*
|
||||
* \param vtxLayout A struct specifying the vertex layout (see struct for detailed
|
||||
* description). For best performance, it is advicable to store position data
|
||||
* as compactly in memory as possible.
|
||||
*/
|
||||
void SetVertexLayout( const VertexLayout& vtxLayout = VertexLayout( 16, 4, 12 ) );
|
||||
|
||||
/*
|
||||
* \brief Clears the occlusion buffer, see MaskedOcclusionCulling::ClearBuffer(). This method
|
||||
* causes a Flush() to ensure that all unfinished rendering is completed.
|
||||
*/
|
||||
void ClearBuffer();
|
||||
|
||||
/*
|
||||
* \brief Asynchronously render occluder triangles, see MaskedOcclusionCulling::RenderTriangles().
|
||||
*
|
||||
* This method puts the drawcall into a command queue, and immediately returns. The rendering is
|
||||
* performed by the worker threads at the earliest opportunity.
|
||||
*
|
||||
* <B>Important:</B> As rendering is performed asynchronously, the application is not allowed to
|
||||
* change the contents of the *inVtx or *inTris buffers until after rendering is completed. If
|
||||
* you wish to use dynamic buffers, the application must perform a Flush() to ensure that rendering
|
||||
* is finished, or make sure to rotate between more buffers than the maximum number of outstanding
|
||||
* render jobs (see the CullingThreadpool() constructor).
|
||||
*/
|
||||
void RenderTriangles( const float* inVtx, const unsigned int* inTris, int nTris, BackfaceWinding bfWinding = MaskedOcclusionCulling::BACKFACE_CW, ClipPlanes clipPlaneMask = MaskedOcclusionCulling::CLIP_PLANE_ALL );
|
||||
|
||||
/*
|
||||
* \brief Occlusion query for a rectangle with a given depth, see MaskedOcclusionCulling::TestRect().
|
||||
*
|
||||
* <B>Important:</B> This method is performed on the main thread and does not wait for outstanding
|
||||
* occluder rendering to be finished. To ensure that all occluder rendering is completed you must
|
||||
* perform a Flush() prior to calling this function.
|
||||
*
|
||||
* It is conservatively correct to perform occlusion queries without calling Flush() (it may only
|
||||
* lead to objects being incorrectly classified as visible), and it can lead to much better performance
|
||||
* if occlusion queries are used for traversing a BVH or similar data structure. It's possible to
|
||||
* use "asynchronous" queries during traversal, and removing false positives later, when rendering
|
||||
* has completed.
|
||||
*/
|
||||
CullingResult TestRect( float xmin, float ymin, float xmax, float ymax, float wmin );
|
||||
|
||||
/*
|
||||
* \brief Occlusion query for a mesh, see MaskedOcclusionCulling::TestTriangles().
|
||||
*
|
||||
* <B>Important:</B> See the TestRect() method for a brief discussion about asynchronous occlusion
|
||||
* queries.
|
||||
*/
|
||||
CullingResult TestTriangles( const float* inVtx, const unsigned int* inTris, int nTris, BackfaceWinding bfWinding = MaskedOcclusionCulling::BACKFACE_CW, ClipPlanes clipPlaneMask = MaskedOcclusionCulling::CLIP_PLANE_ALL );
|
||||
|
||||
/*!
|
||||
* \brief Creates a per-pixel depth buffer from the hierarchical z buffer representation, see
|
||||
* MaskedOcclusionCulling::ComputePixelDepthBuffer(). This method causes a Flush() to
|
||||
* ensure that all unfinished rendering is completed.
|
||||
*/
|
||||
void ComputePixelDepthBuffer( float* depthData, bool flipY );
|
||||
};
|
|
@ -64,13 +64,15 @@ SURFACES
|
|||
#include "ModelOverlay.h"
|
||||
#include "Interaction.h"
|
||||
|
||||
#define MOC_MULTITHREADED 0
|
||||
// RB begin
|
||||
#define MOC_MULTITHREADED 1
|
||||
|
||||
#if MOC_MULTITHREADED
|
||||
class CullingThreadpool;
|
||||
#endif
|
||||
class MaskedOcclusionCulling;
|
||||
// RB end
|
||||
|
||||
class MaskedOcclusionCulling; // RB
|
||||
class idRenderWorldLocal;
|
||||
struct viewEntity_t;
|
||||
struct viewLight_t;
|
||||
|
|
|
@ -36,7 +36,7 @@ If you have questions concerning this license or the applicable additional terms
|
|||
|
||||
#if defined(USE_INTRINSICS_SSE)
|
||||
#if MOC_MULTITHREADED
|
||||
#include "../libs/moc/CullingThreadPool.h"
|
||||
#include "CullingThreadPool.h"
|
||||
#else
|
||||
#include "../libs/moc/MaskedOcclusionCulling.h"
|
||||
#endif
|
||||
|
|
|
@ -32,7 +32,7 @@ If you have questions concerning this license or the applicable additional terms
|
|||
|
||||
#if defined(USE_INTRINSICS_SSE)
|
||||
#if MOC_MULTITHREADED
|
||||
#include "../libs/moc/CullingThreadPool.h"
|
||||
#include "CullingThreadPool.h"
|
||||
#else
|
||||
#include "../libs/moc/MaskedOcclusionCulling.h"
|
||||
#endif
|
||||
|
|
|
@ -32,7 +32,7 @@ If you have questions concerning this license or the applicable additional terms
|
|||
|
||||
#if defined(USE_INTRINSICS_SSE)
|
||||
#if MOC_MULTITHREADED
|
||||
#include "../libs/moc/CullingThreadPool.h"
|
||||
#include "CullingThreadPool.h"
|
||||
#else
|
||||
#include "../libs/moc/MaskedOcclusionCulling.h"
|
||||
#endif
|
||||
|
|
|
@ -31,7 +31,7 @@ If you have questions concerning this license or the applicable additional terms
|
|||
|
||||
#if defined(USE_INTRINSICS_SSE)
|
||||
#if MOC_MULTITHREADED
|
||||
#include "../libs/moc/CullingThreadPool.h"
|
||||
#include "CullingThreadPool.h"
|
||||
#else
|
||||
#include "../libs/moc/MaskedOcclusionCulling.h"
|
||||
#endif
|
||||
|
@ -580,6 +580,11 @@ void R_FillMaskedOcclusionBufferWithModels( viewDef_t* viewDef )
|
|||
}
|
||||
}
|
||||
|
||||
#if MOC_MULTITHREADED
|
||||
// wait for jobs to be finished
|
||||
tr.maskedOcclusionThreaded->Flush();
|
||||
#endif
|
||||
|
||||
int endTime = Sys_Microseconds();
|
||||
|
||||
tr.pc.mocMicroSec += endTime - startTime;
|
||||
|
|
Loading…
Reference in a new issue