mirror of
https://github.com/id-Software/DOOM-3-BFG.git
synced 2025-03-14 06:34:10 +00:00
Added Masked Software Occlusion Culling lib by Intel
This commit is contained in:
parent
f6cd2386a7
commit
9e919c8d76
16 changed files with 6021 additions and 59 deletions
|
@ -428,6 +428,8 @@ if(NOT APPLE)
|
|||
add_subdirectory(tools/compilers)
|
||||
endif()
|
||||
|
||||
add_subdirectory(libs/moc)
|
||||
|
||||
file(GLOB NATVIS_SOURCES .natvis)
|
||||
|
||||
file(GLOB AAS_INCLUDES aas/*.h)
|
||||
|
|
85
neo/libs/moc/CMakeLists.txt
Normal file
85
neo/libs/moc/CMakeLists.txt
Normal file
|
@ -0,0 +1,85 @@
|
|||
#
|
||||
# CMake file for the masked occlusion culling library
|
||||
#
|
||||
set(CMAKE_SUPPRESS_REGENERATION true)
|
||||
option(USE_AVX512 "Enable experimental AVX-512 support" OFF)
|
||||
set(CMAKE_CONFIGURATION_TYPES Debug Release)
|
||||
|
||||
#
|
||||
# Lists of all files included in the library
|
||||
#
|
||||
set( MOC_AVX512_FILES MaskedOcclusionCullingAVX512.cpp )
|
||||
set( MOC_AVX2_FILES MaskedOcclusionCullingAVX2.cpp )
|
||||
set( MOC_SSE_FILES MaskedOcclusionCulling.cpp CullingThreadpool.cpp )
|
||||
set( MOC_INCLUDE_FILES MaskedOcclusionCulling.h CullingThreadpool.h CompilerSpecific.inl MaskedOcclusionCullingCommon.inl )
|
||||
set( MOC_FILES ${MOC_AVX512_FILES} ${MOC_AVX2_FILES} ${MOC_SSE_FILES} ${MOC_INCLUDE_FILES} )
|
||||
|
||||
#
|
||||
# Common compiler flags
|
||||
#
|
||||
if(MSVC)
|
||||
if(MSVC_VERSION LESS 1900)
|
||||
set(CMAKE_CXX_FLAGS "-std=c++11")
|
||||
endif()
|
||||
else()
|
||||
set(CMAKE_CXX_FLAGS "-std=c++11 -m64")
|
||||
endif()
|
||||
|
||||
if(MSVC)
|
||||
#
|
||||
# Setup compiler flags for AVX-512 files (MSVC)
|
||||
#
|
||||
|
||||
if (USE_AVX512)
|
||||
SET_SOURCE_FILES_PROPERTIES( ${MOC_AVX512_FILES} PROPERTIES COMPILE_FLAGS "-DUSE_AVX512=1 /arch:AVX2" )
|
||||
else()
|
||||
SET_SOURCE_FILES_PROPERTIES( ${MOC_AVX512_FILES} PROPERTIES COMPILE_FLAGS "/arch:AVX2" )
|
||||
endif()
|
||||
|
||||
#
|
||||
# Setup compiler flags for AVX2 files (MSVC)
|
||||
#
|
||||
SET_SOURCE_FILES_PROPERTIES( ${MOC_AVX2_FILES} PROPERTIES COMPILE_FLAGS "/arch:AVX2" )
|
||||
|
||||
#
|
||||
# Setup compiler flags for SSE4.1 / SSE2 files (MSVC)
|
||||
#
|
||||
if(NOT "${CMAKE_GENERATOR}" MATCHES "(Win64|IA64)")
|
||||
# SSE2 is always enabled on 64-bit architectures, specifying redundant flag produces a compiler warning
|
||||
if(MSVC_VERSION LESS 1900)
|
||||
SET_SOURCE_FILES_PROPERTIES( ${MOC_SSE_FILES} PROPERTIES COMPILE_FLAGS "/arch:SSE2" )
|
||||
endif()
|
||||
endif()
|
||||
|
||||
else()
|
||||
|
||||
#
|
||||
# Setup compiler flags for AVX-512 files
|
||||
#
|
||||
if (USE_AVX512)
|
||||
SET_SOURCE_FILES_PROPERTIES( ${MOC_AVX512_FILES} PROPERTIES COMPILE_FLAGS "-DUSE_AVX512=1 -mavx512f -mavx512bw -mavx512dq -mavx2 -mfma -msse4.1" )
|
||||
else()
|
||||
SET_SOURCE_FILES_PROPERTIES( ${MOC_AVX512_FILES} PROPERTIES COMPILE_FLAGS "-mavx2 -mfma -msse4.1" )
|
||||
endif()
|
||||
|
||||
#
|
||||
# Setup compiler flags for AVX2 files
|
||||
#
|
||||
SET_SOURCE_FILES_PROPERTIES( ${MOC_AVX2_FILES} PROPERTIES COMPILE_FLAGS "-mavx2 -mfma -msse4.1" )
|
||||
|
||||
#
|
||||
# Setup compiler flags for SSE4.1 / SSE2 files
|
||||
#
|
||||
SET_SOURCE_FILES_PROPERTIES( ${MOC_SSE_FILES} PROPERTIES COMPILE_FLAGS "-msse4.1" )
|
||||
|
||||
endif()
|
||||
|
||||
#
|
||||
# Create masked occlusion culling library
|
||||
#
|
||||
add_library( MaskedOcclusionCulling ${MOC_FILES} )
|
||||
|
||||
#
|
||||
# Add folder to include path
|
||||
#
|
||||
target_include_directories(MaskedOcclusionCulling PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
|
98
neo/libs/moc/CompilerSpecific.inl
Normal file
98
neo/libs/moc/CompilerSpecific.inl
Normal file
|
@ -0,0 +1,98 @@
|
|||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Copyright 2017 Intel Corporation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Common shared include file to hide compiler/os specific functions from the rest of the code.
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) && !defined(__clang__)
|
||||
#define __MICROSOFT_COMPILER
|
||||
#endif
|
||||
|
||||
#if defined(_WIN32) && (defined(_MSC_VER) || defined(__INTEL_COMPILER) || defined(__clang__)) // Windows: MSVC / Intel compiler / clang
|
||||
#include <intrin.h>
|
||||
#include <new.h>
|
||||
|
||||
#define FORCE_INLINE __forceinline
|
||||
|
||||
FORCE_INLINE unsigned long find_clear_lsb(unsigned int *mask)
|
||||
{
|
||||
unsigned long idx;
|
||||
_BitScanForward(&idx, *mask);
|
||||
*mask &= *mask - 1;
|
||||
return idx;
|
||||
}
|
||||
|
||||
FORCE_INLINE void *aligned_alloc(size_t alignment, size_t size)
|
||||
{
|
||||
return _aligned_malloc(size, alignment);
|
||||
}
|
||||
|
||||
FORCE_INLINE void aligned_free(void *ptr)
|
||||
{
|
||||
_aligned_free(ptr);
|
||||
}
|
||||
|
||||
#elif defined(__GNUG__) || defined(__clang__) // G++ or clang
|
||||
#include <cpuid.h>
|
||||
#if defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__)
|
||||
#include <malloc/malloc.h> // memalign
|
||||
#else
|
||||
#include <malloc.h> // memalign
|
||||
#endif
|
||||
#include <mm_malloc.h>
|
||||
#include <immintrin.h>
|
||||
#include <new>
|
||||
|
||||
#define FORCE_INLINE inline
|
||||
|
||||
FORCE_INLINE unsigned long find_clear_lsb(unsigned int *mask)
|
||||
{
|
||||
unsigned long idx;
|
||||
idx = __builtin_ctzl(*mask);
|
||||
*mask &= *mask - 1;
|
||||
return idx;
|
||||
}
|
||||
|
||||
FORCE_INLINE void *aligned_alloc(size_t alignment, size_t size)
|
||||
{
|
||||
return memalign(alignment, size);
|
||||
}
|
||||
|
||||
FORCE_INLINE void aligned_free(void *ptr)
|
||||
{
|
||||
free(ptr);
|
||||
}
|
||||
|
||||
FORCE_INLINE void __cpuidex(int* cpuinfo, int function, int subfunction)
|
||||
{
|
||||
__cpuid_count(function, subfunction, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]);
|
||||
}
|
||||
|
||||
FORCE_INLINE unsigned long long _xgetbv(unsigned int index)
|
||||
{
|
||||
unsigned int eax, edx;
|
||||
__asm__ __volatile__(
|
||||
"xgetbv;"
|
||||
: "=a" (eax), "=d"(edx)
|
||||
: "c" (index)
|
||||
);
|
||||
return ((unsigned long long)edx << 32) | eax;
|
||||
}
|
||||
|
||||
#else
|
||||
#error Unsupported compiler
|
||||
#endif
|
503
neo/libs/moc/CullingThreadpool.cpp
Normal file
503
neo/libs/moc/CullingThreadpool.cpp
Normal file
|
@ -0,0 +1,503 @@
|
|||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Copyright 2017 Intel Corporation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
#include <assert.h>
|
||||
#include "CullingThreadpool.h"
|
||||
|
||||
#define SAFE_DELETE(X) {if (X != nullptr) delete X; X = nullptr;}
|
||||
#define SAFE_DELETE_ARRAY(X) {if (X != nullptr) delete[] X; X = nullptr;}
|
||||
|
||||
template<class T> CullingThreadpool::StateData<T>::StateData( unsigned int maxJobs ) :
|
||||
mMaxJobs( maxJobs ),
|
||||
mCurrentIdx( ~0 )
|
||||
{
|
||||
mData = new T[mMaxJobs];
|
||||
}
|
||||
|
||||
template<class T> CullingThreadpool::StateData<T>::~StateData()
|
||||
{
|
||||
SAFE_DELETE_ARRAY( mData );
|
||||
}
|
||||
|
||||
template<class T> void CullingThreadpool::StateData<T>::AddData( const T& data )
|
||||
{
|
||||
mCurrentIdx++;
|
||||
mData[mCurrentIdx % mMaxJobs] = data;
|
||||
}
|
||||
|
||||
template<class T> const T* CullingThreadpool::StateData<T>::GetData() const
|
||||
{
|
||||
return &mData[mCurrentIdx % mMaxJobs];
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Helper class: Mostly lockless queue for render jobs
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
CullingThreadpool::RenderJobQueue::RenderJobQueue( unsigned int nBins, unsigned int maxJobs ) :
|
||||
mNumBins( nBins ),
|
||||
mMaxJobs( maxJobs )
|
||||
{
|
||||
mRenderPtrs = new std::atomic_uint[mNumBins];
|
||||
mBinMutexes = new std::atomic_uint[mNumBins];
|
||||
for( unsigned int i = 0; i < mNumBins; ++i )
|
||||
{
|
||||
mBinMutexes[i] = 0;
|
||||
}
|
||||
|
||||
mJobs = new Job[mMaxJobs];
|
||||
for( unsigned int i = 0; i < mMaxJobs; ++i )
|
||||
{
|
||||
mJobs[i].mRenderJobs = new TriList[mNumBins];
|
||||
}
|
||||
|
||||
// Compute worst case job size (we allocate memory for the worst case)
|
||||
const unsigned int TriSize = 3 * 3;
|
||||
const unsigned int MaxTrisPerJob = TRIS_PER_JOB * 6;
|
||||
const unsigned int MaxJobSize = MaxTrisPerJob * TriSize;
|
||||
mTrilistData = new float[MaxJobSize * mMaxJobs * mNumBins];
|
||||
|
||||
// Setup trilist objects used for binning
|
||||
for( unsigned int i = 0; i < mMaxJobs; ++i )
|
||||
{
|
||||
for( unsigned int j = 0; j < mNumBins; ++j )
|
||||
{
|
||||
int idx = i * mNumBins + j;
|
||||
TriList& tList = mJobs[i].mRenderJobs[j];
|
||||
tList.mNumTriangles = MaxTrisPerJob;
|
||||
tList.mTriIdx = 0;
|
||||
tList.mPtr = mTrilistData + idx * MaxJobSize;
|
||||
}
|
||||
}
|
||||
|
||||
// Clear render queue
|
||||
Reset();
|
||||
}
|
||||
|
||||
CullingThreadpool::RenderJobQueue::~RenderJobQueue()
|
||||
{
|
||||
SAFE_DELETE_ARRAY( mRenderPtrs );
|
||||
SAFE_DELETE_ARRAY( mBinMutexes );
|
||||
for( unsigned int i = 0; i < mMaxJobs; ++i )
|
||||
{
|
||||
SAFE_DELETE_ARRAY( mJobs[i].mRenderJobs );
|
||||
}
|
||||
SAFE_DELETE_ARRAY( mJobs );
|
||||
SAFE_DELETE_ARRAY( mTrilistData );
|
||||
}
|
||||
|
||||
inline unsigned int CullingThreadpool::RenderJobQueue::GetMinRenderPtr() const
|
||||
{
|
||||
unsigned int minRenderPtr = mRenderPtrs[0];
|
||||
for( unsigned int i = 1; i < mNumBins; ++i )
|
||||
{
|
||||
unsigned int renderPtr = mRenderPtrs[i];
|
||||
minRenderPtr = renderPtr < minRenderPtr ? renderPtr : minRenderPtr;
|
||||
}
|
||||
return minRenderPtr;
|
||||
}
|
||||
|
||||
inline void CullingThreadpool::RenderJobQueue::AdvanceRenderJob( int binIdx )
|
||||
{
|
||||
mRenderPtrs[binIdx]++;
|
||||
mBinMutexes[binIdx] = 0;
|
||||
}
|
||||
|
||||
inline unsigned int CullingThreadpool::RenderJobQueue::GetBestGlobalQueue() const
|
||||
{
|
||||
// Find least advanced queue
|
||||
unsigned int bestBin = ~0, bestPtr = mWritePtr;
|
||||
for( unsigned int i = 0; i < mNumBins; ++i )
|
||||
{
|
||||
if( mRenderPtrs[i] < bestPtr && mBinMutexes[i] == 0 )
|
||||
{
|
||||
bestBin = i;
|
||||
bestPtr = mRenderPtrs[i];
|
||||
}
|
||||
}
|
||||
return bestBin;
|
||||
}
|
||||
|
||||
inline bool CullingThreadpool::RenderJobQueue::IsPipelineEmpty() const
|
||||
{
|
||||
return GetMinRenderPtr() == mWritePtr;
|
||||
}
|
||||
|
||||
inline bool CullingThreadpool::RenderJobQueue::CanWrite() const
|
||||
{
|
||||
return mWritePtr - GetMinRenderPtr() < mMaxJobs;
|
||||
}
|
||||
|
||||
inline bool CullingThreadpool::RenderJobQueue::CanBin() const
|
||||
{
|
||||
return mBinningPtr < mWritePtr && mBinningPtr - GetMinRenderPtr() < mMaxJobs;
|
||||
}
|
||||
|
||||
inline CullingThreadpool::RenderJobQueue::Job* CullingThreadpool::RenderJobQueue::GetWriteJob()
|
||||
{
|
||||
return &mJobs[mWritePtr % mMaxJobs];
|
||||
}
|
||||
|
||||
inline void CullingThreadpool::RenderJobQueue::AdvanceWriteJob()
|
||||
{
|
||||
mWritePtr++;
|
||||
}
|
||||
|
||||
inline CullingThreadpool::RenderJobQueue::Job* CullingThreadpool::RenderJobQueue::GetBinningJob()
|
||||
{
|
||||
unsigned int binningPtr = mBinningPtr;
|
||||
if( binningPtr < mWritePtr && binningPtr - GetMinRenderPtr() < mMaxJobs )
|
||||
{
|
||||
if( mBinningPtr.compare_exchange_strong( binningPtr, binningPtr + 1 ) )
|
||||
{
|
||||
mJobs[binningPtr % mMaxJobs].mBinningJobStartedIdx = binningPtr;
|
||||
return &mJobs[binningPtr % mMaxJobs];
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
inline void CullingThreadpool::RenderJobQueue::FinishedBinningJob( Job* job )
|
||||
{
|
||||
job->mBinningJobCompletedIdx = job->mBinningJobStartedIdx;
|
||||
}
|
||||
|
||||
inline CullingThreadpool::RenderJobQueue::Job* CullingThreadpool::RenderJobQueue::GetRenderJob( int binIdx )
|
||||
{
|
||||
// Attempt to lock bin mutex
|
||||
unsigned int expected = 0;
|
||||
if( !mBinMutexes[binIdx].compare_exchange_strong( expected, 1 ) )
|
||||
{
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Check any items in the queue, and bail if empty
|
||||
if( mRenderPtrs[binIdx] != mJobs[mRenderPtrs[binIdx] % mMaxJobs].mBinningJobCompletedIdx )
|
||||
{
|
||||
mBinMutexes[binIdx] = 0;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return &mJobs[mRenderPtrs[binIdx] % mMaxJobs];
|
||||
}
|
||||
|
||||
void CullingThreadpool::RenderJobQueue::Reset()
|
||||
{
|
||||
mWritePtr = 0;
|
||||
mBinningPtr = 0;
|
||||
|
||||
for( unsigned int i = 0; i < mNumBins; ++i )
|
||||
{
|
||||
mRenderPtrs[i] = 0;
|
||||
}
|
||||
|
||||
for( unsigned int i = 0; i < mMaxJobs; ++i )
|
||||
{
|
||||
mJobs[i].mBinningJobCompletedIdx = -1;
|
||||
mJobs[i].mBinningJobStartedIdx = -1;
|
||||
}
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Culling threadpool private helper functions
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
void CullingThreadpool::SetupScissors()
|
||||
{
|
||||
unsigned int width, height;
|
||||
mMOC->GetResolution( width, height );
|
||||
|
||||
unsigned int binWidth;
|
||||
unsigned int binHeight;
|
||||
mMOC->ComputeBinWidthHeight( mBinsW, mBinsH, binWidth, binHeight );
|
||||
|
||||
for( unsigned int ty = 0; ty < mBinsH; ++ty )
|
||||
{
|
||||
for( unsigned int tx = 0; tx < mBinsW; ++tx )
|
||||
{
|
||||
unsigned int threadIdx = tx + ty * mBinsW;
|
||||
|
||||
// Adjust rects on final row / col to match resolution
|
||||
mRects[threadIdx].mMinX = tx * binWidth;
|
||||
mRects[threadIdx].mMaxX = tx + 1 == mBinsW ? width : ( tx + 1 ) * binWidth;
|
||||
mRects[threadIdx].mMinY = ty * binHeight;
|
||||
mRects[threadIdx].mMaxY = ty + 1 == mBinsH ? height : ( ty + 1 ) * binHeight;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void CullingThreadpool::ThreadRun( CullingThreadpool* threadPool, unsigned int threadId )
|
||||
{
|
||||
threadPool->ThreadMain( threadId );
|
||||
}
|
||||
|
||||
void CullingThreadpool::ThreadMain( unsigned int threadIdx )
|
||||
{
|
||||
while( true )
|
||||
{
|
||||
bool threadIsIdle = true;
|
||||
unsigned int threadBinIdx = threadIdx;
|
||||
|
||||
// Wait for threads to be woken up (low CPU load sleep)
|
||||
std::unique_lock<std::mutex> lock( mSuspendedMutex );
|
||||
mNumSuspendedThreads++;
|
||||
mSuspendedCV.wait( lock, [&] {return !mSuspendThreads; } );
|
||||
mNumSuspendedThreads--;
|
||||
lock.unlock();
|
||||
|
||||
// Loop until suspended again
|
||||
while( !mSuspendThreads || !threadIsIdle )
|
||||
{
|
||||
if( mKillThreads )
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
threadIsIdle = false;
|
||||
|
||||
// Prio 1: Process any render jobs local to this thread
|
||||
unsigned int binIdx = threadBinIdx;
|
||||
threadBinIdx = threadBinIdx + mNumThreads < mNumBins ? threadBinIdx + mNumThreads : threadIdx;
|
||||
RenderJobQueue::Job* job = mRenderQueue->GetRenderJob( binIdx );
|
||||
if( job != nullptr )
|
||||
{
|
||||
if( job->mRenderJobs[binIdx].mTriIdx > 0 )
|
||||
{
|
||||
mMOC->RenderTrilist( job->mRenderJobs[binIdx], &mRects[binIdx] );
|
||||
}
|
||||
|
||||
mRenderQueue->AdvanceRenderJob( binIdx );
|
||||
continue;
|
||||
}
|
||||
|
||||
// Prio 2: Process any outstanding setup/binning jobs
|
||||
if( mRenderQueue->CanBin() )
|
||||
{
|
||||
// If no more rasterization jobs, get next binning job
|
||||
RenderJobQueue::Job* job = mRenderQueue->GetBinningJob();
|
||||
if( job != nullptr )
|
||||
{
|
||||
RenderJobQueue::BinningJob& sjob = job->mBinningJob;
|
||||
for( unsigned int i = 0; i < mNumBins; ++i )
|
||||
{
|
||||
job->mRenderJobs[i].mTriIdx = 0;
|
||||
}
|
||||
mMOC->BinTriangles( sjob.mVerts, sjob.mTris, sjob.nTris, job->mRenderJobs, mBinsW, mBinsH, sjob.mMatrix, sjob.mBfWinding, sjob.mClipPlanes, *sjob.mVtxLayout );
|
||||
mRenderQueue->FinishedBinningJob( job );
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Prio 3: No work is available, work steal from another thread's queue
|
||||
if( mNumBins > mNumThreads )
|
||||
{
|
||||
binIdx = mRenderQueue->GetBestGlobalQueue();
|
||||
if( binIdx < mRenderQueue->mNumBins )
|
||||
{
|
||||
RenderJobQueue::Job* job = mRenderQueue->GetRenderJob( binIdx );
|
||||
if( job != nullptr )
|
||||
{
|
||||
if( job->mRenderJobs[binIdx].mTriIdx > 0 )
|
||||
{
|
||||
mMOC->RenderTrilist( job->mRenderJobs[binIdx], &mRects[binIdx] );
|
||||
}
|
||||
|
||||
mRenderQueue->AdvanceRenderJob( binIdx );
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// No work available: Yield this thread
|
||||
std::this_thread::yield();
|
||||
threadIsIdle = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Culling threadpool public API, similar to the MaskedOcclusionCulling class
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
CullingThreadpool::CullingThreadpool( unsigned int numThreads, unsigned int binsW, unsigned int binsH, unsigned int maxJobs ) :
|
||||
mNumThreads( numThreads ),
|
||||
mMaxJobs( maxJobs ),
|
||||
mBinsW( binsW ),
|
||||
mBinsH( binsH ),
|
||||
mKillThreads( false ),
|
||||
mSuspendThreads( true ),
|
||||
mNumSuspendedThreads( 0 ),
|
||||
mModelToClipMatrices( maxJobs ),
|
||||
mVertexLayouts( maxJobs ),
|
||||
mMOC( nullptr )
|
||||
{
|
||||
mNumBins = mBinsW * mBinsH;
|
||||
assert( mNumBins >= mNumThreads ); // Having less bins than threads is a bad idea!
|
||||
|
||||
mRects = new ScissorRect[mNumBins];
|
||||
mRenderQueue = new RenderJobQueue( mNumBins, mMaxJobs );
|
||||
|
||||
// Add default vertex layout and matrix
|
||||
mVertexLayouts.AddData( VertexLayout( 16, 4, 12 ) );
|
||||
mCurrentMatrix = nullptr;
|
||||
|
||||
mThreads = new std::thread[mNumThreads];
|
||||
for( unsigned int i = 0; i < mNumThreads; ++i )
|
||||
{
|
||||
mThreads[i] = std::thread( ThreadRun, this, i );
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
CullingThreadpool::~CullingThreadpool()
|
||||
{
|
||||
// Wait for threads to terminate
|
||||
if( mThreads != nullptr || !mKillThreads )
|
||||
{
|
||||
WakeThreads();
|
||||
mKillThreads = true;
|
||||
for( unsigned int i = 0; i < mNumThreads; ++i )
|
||||
{
|
||||
mThreads[i].join();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Free memory
|
||||
SAFE_DELETE( mRenderQueue );
|
||||
SAFE_DELETE_ARRAY( mRects );
|
||||
SAFE_DELETE_ARRAY( mThreads );
|
||||
}
|
||||
|
||||
void CullingThreadpool::WakeThreads()
|
||||
{
|
||||
// Wait for all threads to be in suspended mode
|
||||
while( mNumSuspendedThreads < mNumThreads )
|
||||
{
|
||||
std::this_thread::yield();
|
||||
}
|
||||
|
||||
// Send wake up event
|
||||
std::unique_lock<std::mutex> lock( mSuspendedMutex );
|
||||
mSuspendThreads = false;
|
||||
lock.unlock();
|
||||
mSuspendedCV.notify_all();
|
||||
}
|
||||
|
||||
void CullingThreadpool::SuspendThreads()
|
||||
{
|
||||
// Signal threads to go into suspended mode (after finishing all outstanding work)
|
||||
mSuspendThreads = true;
|
||||
}
|
||||
|
||||
void CullingThreadpool::Flush()
|
||||
{
|
||||
// Wait for pipeline to be empty (i.e. all work is finished)
|
||||
while( !mRenderQueue->IsPipelineEmpty() )
|
||||
{
|
||||
std::this_thread::yield();
|
||||
}
|
||||
|
||||
// Reset queue counters
|
||||
mRenderQueue->Reset();
|
||||
}
|
||||
|
||||
void CullingThreadpool::SetBuffer( MaskedOcclusionCulling* moc )
|
||||
{
|
||||
Flush();
|
||||
mMOC = moc;
|
||||
SetupScissors();
|
||||
}
|
||||
|
||||
void CullingThreadpool::SetResolution( unsigned int width, unsigned int height )
|
||||
{
|
||||
Flush();
|
||||
mMOC->SetResolution( width, height );
|
||||
SetupScissors();
|
||||
}
|
||||
|
||||
void CullingThreadpool::SetNearClipPlane( float nearDist )
|
||||
{
|
||||
Flush();
|
||||
mMOC->SetNearClipPlane( nearDist );
|
||||
}
|
||||
|
||||
void CullingThreadpool::SetMatrix( const float* modelToClipMatrix )
|
||||
{
|
||||
// Treat nullptr matrix as a special case, otherwise copy the contents of the pointer and add to state
|
||||
if( modelToClipMatrix == nullptr )
|
||||
{
|
||||
mCurrentMatrix = nullptr;
|
||||
}
|
||||
else
|
||||
{
|
||||
mModelToClipMatrices.AddData( Matrix4x4( modelToClipMatrix ) );
|
||||
mCurrentMatrix = mModelToClipMatrices.GetData()->mValues;
|
||||
}
|
||||
}
|
||||
|
||||
void CullingThreadpool::SetVertexLayout( const VertexLayout& vtxLayout )
|
||||
{
|
||||
mVertexLayouts.AddData( vtxLayout );
|
||||
}
|
||||
|
||||
void CullingThreadpool::ClearBuffer()
|
||||
{
|
||||
Flush();
|
||||
mMOC->ClearBuffer();
|
||||
}
|
||||
|
||||
void CullingThreadpool::RenderTriangles( const float* inVtx, const unsigned int* inTris, int nTris, BackfaceWinding bfWinding, ClipPlanes clipPlaneMask )
|
||||
{
|
||||
#if MOC_RECORDER_ENABLE != 0
|
||||
mMOC->RecordRenderTriangles( inVtx, inTris, nTris, mCurrentMatrix, clipPlaneMask, bfWinding, *mVertexLayouts.GetData( ) );
|
||||
#endif
|
||||
|
||||
for( int i = 0; i < nTris; i += TRIS_PER_JOB )
|
||||
{
|
||||
// Yield if work queue is full
|
||||
while( !mRenderQueue->CanWrite() )
|
||||
{
|
||||
std::this_thread::yield();
|
||||
}
|
||||
|
||||
// Create new renderjob
|
||||
RenderJobQueue::Job* job = mRenderQueue->GetWriteJob();
|
||||
job->mBinningJob.mVerts = inVtx;
|
||||
job->mBinningJob.mTris = inTris + i * 3;
|
||||
job->mBinningJob.nTris = nTris - i < TRIS_PER_JOB ? nTris - i : TRIS_PER_JOB;
|
||||
job->mBinningJob.mMatrix = mCurrentMatrix;
|
||||
job->mBinningJob.mClipPlanes = clipPlaneMask;
|
||||
job->mBinningJob.mBfWinding = bfWinding;
|
||||
job->mBinningJob.mVtxLayout = mVertexLayouts.GetData();
|
||||
mRenderQueue->AdvanceWriteJob();
|
||||
}
|
||||
}
|
||||
|
||||
CullingThreadpool::CullingResult CullingThreadpool::TestRect( float xmin, float ymin, float xmax, float ymax, float wmin )
|
||||
{
|
||||
return mMOC->TestRect( xmin, ymin, xmax, ymax, wmin );
|
||||
}
|
||||
|
||||
CullingThreadpool::CullingResult CullingThreadpool::TestTriangles( const float* inVtx, const unsigned int* inTris, int nTris, BackfaceWinding bfWinding, ClipPlanes clipPlaneMask )
|
||||
{
|
||||
return mMOC->TestTriangles( inVtx, inTris, nTris, mCurrentMatrix, bfWinding, clipPlaneMask, *mVertexLayouts.GetData() );
|
||||
}
|
||||
|
||||
void CullingThreadpool::ComputePixelDepthBuffer( float* depthData, bool flipY )
|
||||
{
|
||||
Flush();
|
||||
mMOC->ComputePixelDepthBuffer( depthData, flipY );
|
||||
}
|
311
neo/libs/moc/CullingThreadpool.h
Normal file
311
neo/libs/moc/CullingThreadpool.h
Normal file
|
@ -0,0 +1,311 @@
|
|||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Copyright 2017 Intel Corporation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
#pragma once
|
||||
|
||||
/*!
|
||||
* \file CullingThreadpool.h
|
||||
* \brief Worker threadpool example for threaded masked occlusion culling.
|
||||
*
|
||||
* This class implements a threadpool for occluder rendering. Calls to CullingThreadpool::RenderTriangle()
|
||||
* will immediately return, after adding work items to a queue, and occluder rendering is performed
|
||||
* by worker threads as quickly as possible. Occlusion queries are performed directly on the calling
|
||||
* threadand can be performed either synchronosly, by calling Flush() before executing the query, or
|
||||
* asynchronosly, by performing the query without waiting for the worker threads to finish.
|
||||
*
|
||||
* Note that this implementation should be considered an example rather than the best threading
|
||||
* solution. You may want to integrate threading in your own task system, and it may also be beneficial
|
||||
* to thread the traversal code. Refer to MaskedOcclusionCulling::BinTriangles() and
|
||||
* MaskedOcclusionCulling::RenderTrilist() for functions that can be used to make your own
|
||||
* threaded culling system.
|
||||
*/
|
||||
|
||||
#include <thread>
|
||||
#include <atomic>
|
||||
#include <mutex>
|
||||
#include <condition_variable>
|
||||
|
||||
#include "MaskedOcclusionCulling.h"
|
||||
|
||||
class CullingThreadpool
|
||||
{
|
||||
protected:
|
||||
static const int TRIS_PER_JOB = 1024; // Maximum number of triangles per job (bigger drawcalls are split), affects memory requirements
|
||||
|
||||
typedef MaskedOcclusionCulling::CullingResult CullingResult;
|
||||
typedef MaskedOcclusionCulling::ClipPlanes ClipPlanes;
|
||||
typedef MaskedOcclusionCulling::BackfaceWinding BackfaceWinding;
|
||||
typedef MaskedOcclusionCulling::ScissorRect ScissorRect;
|
||||
typedef MaskedOcclusionCulling::VertexLayout VertexLayout;
|
||||
typedef MaskedOcclusionCulling::TriList TriList;
|
||||
|
||||
// Small utility class for 4x4 matrices
|
||||
struct Matrix4x4
|
||||
{
|
||||
float mValues[16];
|
||||
Matrix4x4() {}
|
||||
Matrix4x4( const float* matrix )
|
||||
{
|
||||
for( int i = 0; i < 16; ++i )
|
||||
{
|
||||
mValues[i] = matrix[i];
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Internal utility class for a (mostly) lockless queue for binning & rendering jobs
|
||||
struct RenderJobQueue
|
||||
{
|
||||
struct BinningJob
|
||||
{
|
||||
const float* mVerts;
|
||||
const unsigned int* mTris;
|
||||
unsigned int nTris;
|
||||
|
||||
const float* mMatrix;
|
||||
ClipPlanes mClipPlanes;
|
||||
BackfaceWinding mBfWinding;
|
||||
const VertexLayout* mVtxLayout;
|
||||
};
|
||||
|
||||
struct Job
|
||||
{
|
||||
volatile unsigned int mBinningJobStartedIdx;
|
||||
volatile unsigned int mBinningJobCompletedIdx;
|
||||
BinningJob mBinningJob;
|
||||
TriList* mRenderJobs;
|
||||
};
|
||||
|
||||
unsigned int mNumBins;
|
||||
unsigned int mMaxJobs;
|
||||
|
||||
volatile unsigned int mWritePtr;
|
||||
std::atomic_uint mBinningPtr;
|
||||
std::atomic_uint* mRenderPtrs;
|
||||
std::atomic_uint* mBinMutexes;
|
||||
|
||||
float* mTrilistData;
|
||||
Job* mJobs;
|
||||
|
||||
RenderJobQueue( unsigned int nBins, unsigned int maxJobs );
|
||||
~RenderJobQueue();
|
||||
|
||||
unsigned int GetMinRenderPtr() const;
|
||||
unsigned int GetBestGlobalQueue() const;
|
||||
bool IsPipelineEmpty() const;
|
||||
|
||||
bool CanWrite() const;
|
||||
bool CanBin() const;
|
||||
|
||||
Job* GetWriteJob();
|
||||
void AdvanceWriteJob();
|
||||
|
||||
Job* GetBinningJob();
|
||||
void FinishedBinningJob( Job* job );
|
||||
|
||||
Job* GetRenderJob( int binIdx );
|
||||
void AdvanceRenderJob( int binIdx );
|
||||
|
||||
void Reset();
|
||||
};
|
||||
|
||||
// Internal utility class for state (matrix / vertex layout)
|
||||
template<class T> struct StateData
|
||||
{
|
||||
unsigned int mMaxJobs;
|
||||
unsigned int mCurrentIdx;
|
||||
T* mData;
|
||||
|
||||
StateData( unsigned int maxJobs );
|
||||
~StateData();
|
||||
void AddData( const T& data );
|
||||
const T* GetData() const;
|
||||
};
|
||||
|
||||
// Number of worker threads and bins
|
||||
unsigned int mNumThreads;
|
||||
unsigned int mNumBins;
|
||||
unsigned int mMaxJobs;
|
||||
unsigned int mBinsW;
|
||||
unsigned int mBinsH;
|
||||
|
||||
// Threads and control variables
|
||||
std::mutex mSuspendedMutex;
|
||||
std::condition_variable mSuspendedCV;
|
||||
volatile bool mKillThreads;
|
||||
volatile bool mSuspendThreads;
|
||||
volatile unsigned int mNumSuspendedThreads;
|
||||
std::thread* mThreads;
|
||||
|
||||
// State variables and command queue
|
||||
const float* mCurrentMatrix;
|
||||
StateData<Matrix4x4> mModelToClipMatrices;
|
||||
StateData<VertexLayout> mVertexLayouts;
|
||||
RenderJobQueue* mRenderQueue;
|
||||
|
||||
// Occlusion culling object and related scissor rectangles
|
||||
ScissorRect* mRects;
|
||||
MaskedOcclusionCulling* mMOC;
|
||||
|
||||
void SetupScissors();
|
||||
|
||||
static void ThreadRun( CullingThreadpool* threadPool, unsigned int threadId );
|
||||
void ThreadMain( unsigned int threadIdx );
|
||||
|
||||
public:
|
||||
/*!
|
||||
* \brief Creates a new threadpool for masked occlusion culling. This object has a
|
||||
* similar API to the MaskedOcclusionCulling class, but performs occluder
|
||||
* rendering asynchronously on worker threads (similar to how DX/GL works).
|
||||
*
|
||||
* \param numThreads Number of worker threads to perform occluder rendering. Best
|
||||
* balance may be scene/machine dependent, but it's good practice to leave at
|
||||
* least one full core (2 threads with hyperthreading) for the main thread.
|
||||
* \param binsW The screen is divided into binsW x binsH rectangular bins for load
|
||||
* balancing. The number of bins should be atleast equal to the number of
|
||||
* worker threads.
|
||||
* \param binsH See description for the binsW parameter.
|
||||
* \param maxJobs Maximum number of jobs that may be in flight at any given time. If
|
||||
* the caller thread generates jobs faster than the worker threads can finish
|
||||
* them, then the job queue will fill up and the caller thread will stall once
|
||||
* "maxJobs" items have been queued up. For culling systems interleaving occlusion
|
||||
* queries and rendering, this value should be kept quite low to minimize false
|
||||
* positives (see TestRect()). We've observed that 32 [default] items typically
|
||||
* works well for our interleaved queries, while also allowing good load-balancing,
|
||||
* and this is the recommended setting.
|
||||
*/
|
||||
CullingThreadpool( unsigned int numThreads, unsigned int binsW, unsigned int binsH, unsigned int maxJobs = 32 );
|
||||
|
||||
/*!
|
||||
* \brief Destroys the threadpool and terminates all worker threads.
|
||||
*/
|
||||
~CullingThreadpool();
|
||||
|
||||
/*!
|
||||
* \brief Wakes up culling worker threads from suspended sleep, and puts them in a
|
||||
* ready state (using an idle spinlock with significantly higher CPU overhead).
|
||||
*
|
||||
* It may take on the order of 100us to wake up the threads, so this function should
|
||||
* preferably be called slightly ahead of starting occlusion culling work.
|
||||
*/
|
||||
void WakeThreads();
|
||||
|
||||
/*!
|
||||
* \brief Suspend all culling worker threads to a low CPU overhead sleep state.
|
||||
*
|
||||
* For performance and latency reasons, the culling work is performed in an active
|
||||
* processing loop (with no thread sleeping) with high CPU overhead. In a system
|
||||
* with more worker threads it's important to put the culling worker threads in a
|
||||
* low overhead sleep state after occlusion culling work has completed.
|
||||
*/
|
||||
void SuspendThreads();
|
||||
|
||||
/*!
|
||||
* \brief Waits for all outstanding occluder rendering work to complete. Can be used
|
||||
* to ensure that rendering has completed before performing a TestRect() or
|
||||
* TestTriangles() call.
|
||||
*/
|
||||
void Flush();
|
||||
|
||||
/*
|
||||
* \brief Sets the MaskedOcclusionCulling object (buffer) to be used for rendering and
|
||||
* testing calls. This method causes a Flush() to ensure that all unfinished
|
||||
* rendering is completed.
|
||||
*/
|
||||
void SetBuffer( MaskedOcclusionCulling* moc );
|
||||
|
||||
/*
|
||||
* \brief Changes the resolution of the occlusion buffer, see MaskedOcclusionCulling::SetResolution().
|
||||
* This method causes a Flush() to ensure that all unfinished rendering is completed.
|
||||
*/
|
||||
void SetResolution( unsigned int width, unsigned int height );
|
||||
|
||||
/*
|
||||
* \brief Sets the near clipping plane, see MaskedOcclusionCulling::SetNearClipPlane(). This
|
||||
* method causes a Flush() to ensure that all unfinished rendering is completed.
|
||||
*/
|
||||
void SetNearClipPlane( float nearDist );
|
||||
|
||||
/*
|
||||
* \brief Sets the model to clipspace transform matrix used for the RenderTriangles() and TestTriangles()
|
||||
* function calls. The contents of the matrix is copied, and it's safe to modify it without calling
|
||||
* Flush(). The copy may be costly, which is the reason for passing this parameter as "state".
|
||||
*
|
||||
* \param modelToClipMatrix All vertices will be transformed by the specified model to clipspace matrix.
|
||||
* Passing nullptr [default] disables the transform (equivalent to using an identity matrix).
|
||||
*/
|
||||
void SetMatrix( const float* modelToClipMatrix = nullptr );
|
||||
|
||||
/*
|
||||
* \brief Sets the vertex layout used for the RenderTriangles() and TestTriangles() function calls.
|
||||
* The vertex layout is copied, and it's safe to modify it without calling Flush(). The copy
|
||||
* may be costly, which is the reason for passing this parameter as "state".
|
||||
*
|
||||
* \param vtxLayout A struct specifying the vertex layout (see struct for detailed
|
||||
* description). For best performance, it is advisable to store position data
|
||||
* as compactly in memory as possible.
|
||||
*/
|
||||
void SetVertexLayout( const VertexLayout& vtxLayout = VertexLayout( 16, 4, 12 ) );
|
||||
|
||||
/*
|
||||
* \brief Clears the occlusion buffer, see MaskedOcclusionCulling::ClearBuffer(). This method
|
||||
* causes a Flush() to ensure that all unfinished rendering is completed.
|
||||
*/
|
||||
void ClearBuffer();
|
||||
|
||||
/*
|
||||
* \brief Asynchronously render occluder triangles, see MaskedOcclusionCulling::RenderTriangles().
|
||||
*
|
||||
* This method puts the drawcall into a command queue, and immediately returns. The rendering is
|
||||
* performed by the worker threads at the earliest opportunity.
|
||||
*
|
||||
* <B>Important:</B> As rendering is performed asynchronously, the application is not allowed to
|
||||
* change the contents of the *inVtx or *inTris buffers until after rendering is completed. If
|
||||
* you wish to use dynamic buffers, the application must perform a Flush() to ensure that rendering
|
||||
* is finished, or make sure to rotate between more buffers than the maximum number of outstanding
|
||||
* render jobs (see the CullingThreadpool() constructor).
|
||||
*/
|
||||
void RenderTriangles( const float* inVtx, const unsigned int* inTris, int nTris, BackfaceWinding bfWinding = MaskedOcclusionCulling::BACKFACE_CW, ClipPlanes clipPlaneMask = MaskedOcclusionCulling::CLIP_PLANE_ALL );
|
||||
|
||||
/*
|
||||
* \brief Occlusion query for a rectangle with a given depth, see MaskedOcclusionCulling::TestRect().
|
||||
*
|
||||
* <B>Important:</B> This method is performed on the main thread and does not wait for outstanding
|
||||
* occluder rendering to be finished. To ensure that all occluder rendering is completed you must
|
||||
* perform a Flush() prior to calling this function.
|
||||
*
|
||||
* It is conservatively correct to perform occlusion queries without calling Flush() (it may only
|
||||
* lead to objects being incorrectly classified as visible), and it can lead to much better performance
|
||||
* if occlusion queries are used for traversing a BVH or similar data structure. It's possible to
|
||||
* use "asynchronous" queries during traversal, and removing false positives later, when rendering
|
||||
* has completed.
|
||||
*/
|
||||
CullingResult TestRect( float xmin, float ymin, float xmax, float ymax, float wmin );
|
||||
|
||||
/*
|
||||
* \brief Occlusion query for a mesh, see MaskedOcclusionCulling::TestTriangles().
|
||||
*
|
||||
* <B>Important:</B> See the TestRect() method for a brief discussion about asynchronous occlusion
|
||||
* queries.
|
||||
*/
|
||||
CullingResult TestTriangles( const float* inVtx, const unsigned int* inTris, int nTris, BackfaceWinding bfWinding = MaskedOcclusionCulling::BACKFACE_CW, ClipPlanes clipPlaneMask = MaskedOcclusionCulling::CLIP_PLANE_ALL );
|
||||
|
||||
/*!
|
||||
* \brief Creates a per-pixel depth buffer from the hierarchical z buffer representation, see
|
||||
* MaskedOcclusionCulling::ComputePixelDepthBuffer(). This method causes a Flush() to
|
||||
* ensure that all unfinished rendering is completed.
|
||||
*/
|
||||
void ComputePixelDepthBuffer( float* depthData, bool flipY );
|
||||
};
|
528
neo/libs/moc/MaskedOcclusionCulling.cpp
Normal file
528
neo/libs/moc/MaskedOcclusionCulling.cpp
Normal file
|
@ -0,0 +1,528 @@
|
|||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Copyright 2017 Intel Corporation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
#include <vector>
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
#include <float.h>
|
||||
#include "MaskedOcclusionCulling.h"
|
||||
#include "CompilerSpecific.inl"
|
||||
|
||||
#if MOC_RECORDER_ENABLE
|
||||
#include "FrameRecorder.h"
|
||||
#endif
|
||||
|
||||
#if defined(__AVX__) || defined(__AVX2__)
|
||||
// For performance reasons, the MaskedOcclusionCullingAVX2/512.cpp files should be compiled with VEX encoding for SSE instructions (to avoid
|
||||
// AVX-SSE transition penalties, see https://software.intel.com/en-us/articles/avoiding-avx-sse-transition-penalties). However, this file
|
||||
// _must_ be compiled without VEX encoding to allow backwards compatibility. Best practice is to use lowest supported target platform
|
||||
// (/arch:SSE2) as project default, and elevate only the MaskedOcclusionCullingAVX2/512.cpp files.
|
||||
#error The MaskedOcclusionCulling.cpp should be compiled with lowest supported target platform, e.g. /arch:SSE2
|
||||
#endif
|
||||
|
||||
/*
 * Queries CPUID to pick the best SIMD implementation (SSE2 / SSE4.1 / AVX2 / AVX-512)
 * supported by the host CPU. Temporary CPUID tables are allocated through the supplied
 * aligned-allocator callbacks. Conservatively returns SSE2 if detection cannot complete.
 *
 * Fixes vs. the original: the feature-test guards were off by one (e.g. "cpuIdCount >= 1"
 * before reading cpuId[1], ">= 7" before cpuId[7]), allocation failure was not handled
 * before writing into the tables, and the helper macros leaked past the function.
 */
static MaskedOcclusionCulling::Implementation DetectCPUFeatures( MaskedOcclusionCulling::pfnAlignedAlloc alignedAlloc, MaskedOcclusionCulling::pfnAlignedFree alignedFree )
{
	struct CpuInfo
	{
		int regs[4];
	};

	// Number of supported standard and extended CPUID leaves
	int regs[4];
	__cpuidex( regs, 0, 0 );
	size_t cpuIdCount = regs[0];
	__cpuidex( regs, 0x80000000, 0 );
	size_t cpuIdExCount = regs[0] - 0x80000000;

	CpuInfo* cpuId = ( CpuInfo* )alignedAlloc( 64, sizeof( CpuInfo ) * cpuIdCount );
	CpuInfo* cpuIdEx = ( CpuInfo* )alignedAlloc( 64, sizeof( CpuInfo ) * cpuIdExCount );
	if( cpuId == nullptr || cpuIdEx == nullptr )
	{
		// Allocation failed: free whatever succeeded and fall back to the baseline
		if( cpuId != nullptr )
		{
			alignedFree( cpuId );
		}
		if( cpuIdEx != nullptr )
		{
			alignedFree( cpuIdEx );
		}
		return MaskedOcclusionCulling::SSE2;
	}

	// Fill standard leaf table
	for( size_t i = 0; i < cpuIdCount; ++i )
	{
		__cpuidex( cpuId[i].regs, ( int )i, 0 );
	}

	// Fill extended leaf table
	for( size_t i = 0; i < cpuIdExCount; ++i )
	{
		__cpuidex( cpuIdEx[i].regs, 0x80000000 + ( int )i, 0 );
	}

	// Accessing cpuId[N] requires cpuIdCount > N, hence the ">= N + 1" guards below
	// (the original guards were off by one).
#define TEST_BITS(A, B)             (((A) & (B)) == (B))
#define TEST_FMA_MOVE_OXSAVE        (cpuIdCount >= 2 && TEST_BITS(cpuId[1].regs[2], (1 << 12) | (1 << 22) | (1 << 27)))
#define TEST_LZCNT                  (cpuIdExCount >= 2 && TEST_BITS(cpuIdEx[1].regs[2], 0x20))
#define TEST_SSE41                  (cpuIdCount >= 2 && TEST_BITS(cpuId[1].regs[2], (1 << 19)))
#define TEST_XMM_YMM                (cpuIdCount >= 1 && TEST_BITS(_xgetbv(0), (1 << 2) | (1 << 1)))
#define TEST_OPMASK_ZMM             (cpuIdCount >= 1 && TEST_BITS(_xgetbv(0), (1 << 7) | (1 << 6) | (1 << 5)))
#define TEST_BMI1_BMI2_AVX2         (cpuIdCount >= 8 && TEST_BITS(cpuId[7].regs[1], (1 << 3) | (1 << 5) | (1 << 8)))
#define TEST_AVX512_F_BW_DQ         (cpuIdCount >= 8 && TEST_BITS(cpuId[7].regs[1], (1 << 16) | (1 << 17) | (1 << 30)))

	MaskedOcclusionCulling::Implementation retVal = MaskedOcclusionCulling::SSE2;
	// NOTE: _xgetbv is only evaluated after TEST_FMA_MOVE_OXSAVE confirmed OSXSAVE (bit 27)
	if( TEST_FMA_MOVE_OXSAVE && TEST_LZCNT && TEST_SSE41 )
	{
		if( TEST_XMM_YMM && TEST_OPMASK_ZMM && TEST_BMI1_BMI2_AVX2 && TEST_AVX512_F_BW_DQ )
		{
			retVal = MaskedOcclusionCulling::AVX512;
		}
		else if( TEST_XMM_YMM && TEST_BMI1_BMI2_AVX2 )
		{
			retVal = MaskedOcclusionCulling::AVX2;
		}
	}
	else if( TEST_SSE41 )
	{
		retVal = MaskedOcclusionCulling::SSE41;
	}

	// Keep the helper macros local to this function
#undef TEST_BITS
#undef TEST_FMA_MOVE_OXSAVE
#undef TEST_LZCNT
#undef TEST_SSE41
#undef TEST_XMM_YMM
#undef TEST_OPMASK_ZMM
#undef TEST_BMI1_BMI2_AVX2
#undef TEST_AVX512_F_BW_DQ

	alignedFree( cpuId );
	alignedFree( cpuIdEx );
	return retVal;
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Utility functions (not directly related to the algorithm/rasterizer)
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
void MaskedOcclusionCulling::TransformVertices( const float* mtx, const float* inVtx, float* xfVtx, unsigned int nVtx, const VertexLayout& vtxLayout )
|
||||
{
|
||||
// This function pretty slow, about 10-20% slower than if the vertices are stored in aligned SOA form.
|
||||
if( nVtx == 0 )
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// Load matrix and swizzle out the z component. For post-multiplication (OGL), the matrix is assumed to be column
|
||||
// major, with one column per SSE register. For pre-multiplication (DX), the matrix is assumed to be row major.
|
||||
__m128 mtxCol0 = _mm_loadu_ps( mtx );
|
||||
__m128 mtxCol1 = _mm_loadu_ps( mtx + 4 );
|
||||
__m128 mtxCol2 = _mm_loadu_ps( mtx + 8 );
|
||||
__m128 mtxCol3 = _mm_loadu_ps( mtx + 12 );
|
||||
|
||||
int stride = vtxLayout.mStride;
|
||||
const char* vPtr = ( const char* )inVtx;
|
||||
float* outPtr = xfVtx;
|
||||
|
||||
// Iterate through all vertices and transform
|
||||
for( unsigned int vtx = 0; vtx < nVtx; ++vtx )
|
||||
{
|
||||
__m128 xVal = _mm_load1_ps( ( float* )( vPtr ) );
|
||||
__m128 yVal = _mm_load1_ps( ( float* )( vPtr + vtxLayout.mOffsetY ) );
|
||||
__m128 zVal = _mm_load1_ps( ( float* )( vPtr + vtxLayout.mOffsetZ ) );
|
||||
|
||||
__m128 xform = _mm_add_ps( _mm_mul_ps( mtxCol0, xVal ), _mm_add_ps( _mm_mul_ps( mtxCol1, yVal ), _mm_add_ps( _mm_mul_ps( mtxCol2, zVal ), mtxCol3 ) ) );
|
||||
_mm_storeu_ps( outPtr, xform );
|
||||
vPtr += stride;
|
||||
outPtr += 4;
|
||||
}
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Typedefs
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Local shorthand for nested MaskedOcclusionCulling types used throughout this file.
typedef MaskedOcclusionCulling::pfnAlignedAlloc pfnAlignedAlloc;
typedef MaskedOcclusionCulling::pfnAlignedFree pfnAlignedFree;
typedef MaskedOcclusionCulling::VertexLayout VertexLayout;
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Common SSE2/SSE4.1 defines
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Four 32-bit lanes per SSE register; a tile is (1 << TILE_HEIGHT_SHIFT) = 4 rows high.
#define SIMD_LANES 4
#define TILE_HEIGHT_SHIFT 2

#define SIMD_LANE_IDX _mm_setr_epi32(0, 1, 2, 3)

// Per-lane sub-tile offsets: lanes walk horizontally across sub-tiles, same row.
#define SIMD_SUB_TILE_COL_OFFSET _mm_setr_epi32(0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3)
#define SIMD_SUB_TILE_ROW_OFFSET _mm_setzero_si128()
#define SIMD_SUB_TILE_COL_OFFSET_F _mm_setr_ps(0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3)
#define SIMD_SUB_TILE_ROW_OFFSET_F _mm_setzero_ps()

#define SIMD_LANE_YCOORD_I _mm_setr_epi32(128, 384, 640, 896)
#define SIMD_LANE_YCOORD_F _mm_setr_ps(128.0f, 384.0f, 640.0f, 896.0f)

// Width-agnostic "wide" vector aliases: on SSE a wide register is one __m128 / __m128i.
typedef __m128 __mw;
typedef __m128i __mwi;

// Float wrappers -- direct mappings onto SSE2 intrinsics, plus a few emulated ops.
#define _mmw_set1_ps _mm_set1_ps
#define _mmw_setzero_ps _mm_setzero_ps
#define _mmw_and_ps _mm_and_ps
#define _mmw_or_ps _mm_or_ps
#define _mmw_xor_ps _mm_xor_ps
#define _mmw_not_ps(a) _mm_xor_ps((a), _mm_castsi128_ps(_mm_set1_epi32(~0)))
#define _mmw_andnot_ps _mm_andnot_ps
#define _mmw_neg_ps(a) _mm_xor_ps((a), _mm_set1_ps(-0.0f))
#define _mmw_abs_ps(a) _mm_and_ps((a), _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)))
#define _mmw_add_ps _mm_add_ps
#define _mmw_sub_ps _mm_sub_ps
#define _mmw_mul_ps _mm_mul_ps
#define _mmw_div_ps _mm_div_ps
#define _mmw_min_ps _mm_min_ps
#define _mmw_max_ps _mm_max_ps
#define _mmw_movemask_ps _mm_movemask_ps
#define _mmw_cmpge_ps(a,b) _mm_cmpge_ps(a, b)
#define _mmw_cmpgt_ps(a,b) _mm_cmpgt_ps(a, b)
#define _mmw_cmpeq_ps(a,b) _mm_cmpeq_ps(a, b)
// No FMA on SSE: emulate with separate multiply + add/sub (not single-rounded)
#define _mmw_fmadd_ps(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c)
#define _mmw_fmsub_ps(a,b,c) _mm_sub_ps(_mm_mul_ps(a,b), c)
#define _mmw_shuffle_ps _mm_shuffle_ps
// A wide register holds exactly one 128-bit lane, so the insert is the identity on b
#define _mmw_insertf32x4_ps(a,b,c) (b)
#define _mmw_cvtepi32_ps _mm_cvtepi32_ps
#define _mmw_blendv_epi32(a,b,c) simd_cast<__mwi>(_mmw_blendv_ps(simd_cast<__mw>(a), simd_cast<__mw>(b), simd_cast<__mw>(c)))

// Integer wrappers
#define _mmw_set1_epi32 _mm_set1_epi32
#define _mmw_setzero_epi32 _mm_setzero_si128
#define _mmw_and_epi32 _mm_and_si128
#define _mmw_or_epi32 _mm_or_si128
#define _mmw_xor_epi32 _mm_xor_si128
#define _mmw_not_epi32(a) _mm_xor_si128((a), _mm_set1_epi32(~0))
#define _mmw_andnot_epi32 _mm_andnot_si128
#define _mmw_neg_epi32(a) _mm_sub_epi32(_mm_set1_epi32(0), (a))
#define _mmw_add_epi32 _mm_add_epi32
#define _mmw_sub_epi32 _mm_sub_epi32
#define _mmw_subs_epu16 _mm_subs_epu16
#define _mmw_cmpeq_epi32 _mm_cmpeq_epi32
#define _mmw_cmpgt_epi32 _mm_cmpgt_epi32
#define _mmw_srai_epi32 _mm_srai_epi32
#define _mmw_srli_epi32 _mm_srli_epi32
#define _mmw_slli_epi32 _mm_slli_epi32
#define _mmw_cvtps_epi32 _mm_cvtps_epi32
#define _mmw_cvttps_epi32 _mm_cvttps_epi32

// "_mmx_" names are resolved per-instruction-set; on SSE they alias the wrappers above
#define _mmx_fmadd_ps _mmw_fmadd_ps
#define _mmx_max_epi32 _mmw_max_epi32
#define _mmx_min_epi32 _mmw_min_epi32
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// SIMD casting functions
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// simd_cast<T>(x): uniform conversion helper between scalars and SSE registers.
// Scalar inputs are broadcast to all lanes; register<->register casts reinterpret
// the bit pattern without any value conversion; same-type casts are the identity.
template<typename T, typename Y> FORCE_INLINE T simd_cast( Y A );
template<> FORCE_INLINE __m128 simd_cast<__m128>( float A )
{
	return _mm_set1_ps( A ); // broadcast scalar to all four float lanes
}
template<> FORCE_INLINE __m128 simd_cast<__m128>( __m128i A )
{
	return _mm_castsi128_ps( A ); // bit reinterpretation, no conversion
}
template<> FORCE_INLINE __m128 simd_cast<__m128>( __m128 A )
{
	return A; // identity
}
template<> FORCE_INLINE __m128i simd_cast<__m128i>( int A )
{
	return _mm_set1_epi32( A ); // broadcast scalar to all four int lanes
}
template<> FORCE_INLINE __m128i simd_cast<__m128i>( __m128 A )
{
	return _mm_castps_si128( A ); // bit reinterpretation, no conversion
}
template<> FORCE_INLINE __m128i simd_cast<__m128i>( __m128i A )
{
	return A; // identity
}
|
||||
|
||||
#define MAKE_ACCESSOR(name, simd_type, base_type, is_const, elements) \
|
||||
FORCE_INLINE is_const base_type * name(is_const simd_type &a) { \
|
||||
union accessor { simd_type m_native; base_type m_array[elements]; }; \
|
||||
is_const accessor *acs = reinterpret_cast<is_const accessor*>(&a); \
|
||||
return acs->m_array; \
|
||||
}
|
||||
|
||||
MAKE_ACCESSOR( simd_f32, __m128, float,, 4 )
|
||||
MAKE_ACCESSOR( simd_f32, __m128, float, const, 4 )
|
||||
MAKE_ACCESSOR( simd_i32, __m128i, int,, 4 )
|
||||
MAKE_ACCESSOR( simd_i32, __m128i, int, const, 4 )
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Specialized SSE input assembly function for general vertex gather
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Gathers (x, y, w) components for up to numLanes triangles (3 indexed vertices each)
// from a strided vertex stream into SoA form: vtxX/vtxY/vtxW each hold three registers,
// one per triangle corner, with one triangle per SIMD lane.
FORCE_INLINE void GatherVertices( __m128* vtxX, __m128* vtxY, __m128* vtxW, const float* inVtx, const unsigned int* inTrisPtr, int numLanes, const VertexLayout& vtxLayout )
{
	for( int lane = 0; lane < numLanes; lane++ )
	{
		for( int corner = 0; corner < 3; corner++ )
		{
			// Base address of this corner's vertex; Y and W live at fixed byte offsets from X
			const char* vtxBase = ( const char* )inVtx + inTrisPtr[lane * 3 + corner] * vtxLayout.mStride;

			simd_f32( vtxX[corner] )[lane] = *( ( const float* )vtxBase );
			simd_f32( vtxY[corner] )[lane] = *( ( const float* )( vtxBase + vtxLayout.mOffsetY ) );
			simd_f32( vtxW[corner] )[lane] = *( ( const float* )( vtxBase + vtxLayout.mOffsetW ) );
		}
	}
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// SSE4.1 version
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
namespace MaskedOcclusionCullingSSE41
{
	// SSE4.1 flavors of the SIMD helpers required by MaskedOcclusionCullingCommon.inl.
	// Where SSE4.1/SSSE3 provide a native instruction we forward to it directly.

	FORCE_INLINE __m128i _mmw_mullo_epi32( const __m128i& a, const __m128i& b )
	{
		return _mm_mullo_epi32( a, b ); // native 32-bit low multiply (SSE4.1)
	}
	FORCE_INLINE __m128i _mmw_min_epi32( const __m128i& a, const __m128i& b )
	{
		return _mm_min_epi32( a, b );
	}
	FORCE_INLINE __m128i _mmw_max_epi32( const __m128i& a, const __m128i& b )
	{
		return _mm_max_epi32( a, b );
	}
	FORCE_INLINE __m128i _mmw_abs_epi32( const __m128i& a )
	{
		return _mm_abs_epi32( a );
	}
	FORCE_INLINE __m128 _mmw_blendv_ps( const __m128& a, const __m128& b, const __m128& c )
	{
		return _mm_blendv_ps( a, b, c ); // selects b where the sign bit of c is set
	}
	FORCE_INLINE int _mmw_testz_epi32( const __m128i& a, const __m128i& b )
	{
		return _mm_testz_si128( a, b ); // 1 iff (a & b) is all zeros
	}
	FORCE_INLINE __m128 _mmx_dp4_ps( const __m128& a, const __m128& b )
	{
		return _mm_dp_ps( a, b, 0xFF ); // 4-component dot product broadcast to all lanes
	}
	FORCE_INLINE __m128 _mmw_floor_ps( const __m128& a )
	{
		return _mm_round_ps( a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC );
	}
	FORCE_INLINE __m128 _mmw_ceil_ps( const __m128& a )
	{
		return _mm_round_ps( a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC );
	}
	FORCE_INLINE __m128i _mmw_transpose_epi8( const __m128i& a )
	{
		// Byte shuffle that transposes the register viewed as a 4x4 byte matrix
		const __m128i shuff = _mm_setr_epi8( 0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF );
		return _mm_shuffle_epi8( a, shuff );
	}
	FORCE_INLINE __m128i _mmw_sllv_ones( const __m128i& ishift )
	{
		// Computes _mm_sllv_epi32(~0, ishift) without AVX2, via per-byte LUT shuffles
		__m128i shift = _mm_min_epi32( ishift, _mm_set1_epi32( 32 ) ); // shifts >= 32 yield 0

		const __m128i byteShiftLUT = _mm_setr_epi8( ( char )0xFF, ( char )0xFE, ( char )0xFC, ( char )0xF8, ( char )0xF0, ( char )0xE0, ( char )0xC0, ( char )0x80, 0, 0, 0, 0, 0, 0, 0, 0 );
		const __m128i byteShiftOffset = _mm_setr_epi8( 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24 );
		const __m128i byteShiftShuffle = _mm_setr_epi8( 0x0, 0x0, 0x0, 0x0, 0x4, 0x4, 0x4, 0x4, 0x8, 0x8, 0x8, 0x8, 0xC, 0xC, 0xC, 0xC );

		__m128i byteShift = _mm_shuffle_epi8( shift, byteShiftShuffle );	// broadcast each lane's shift count to its 4 bytes
		byteShift = _mm_min_epi8( _mm_subs_epu8( byteShift, byteShiftOffset ), _mm_set1_epi8( 8 ) ); // clamp per-byte shift to [0, 8]
		__m128i retMask = _mm_shuffle_epi8( byteShiftLUT, byteShift );		// per-byte mask lookup

		return retMask;
	}

	// Implementation tag picked up by the common include below
	static MaskedOcclusionCulling::Implementation gInstructionSet = MaskedOcclusionCulling::SSE41;

	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	// Include common algorithm implementation (general, SIMD independent code)
	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

	#include "MaskedOcclusionCullingCommon.inl"

	// Allocates (64-byte aligned) and constructs an SSE4.1 rasterizer instance
	// using the supplied allocator callbacks.
	MaskedOcclusionCulling* CreateMaskedOcclusionCulling( pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree )
	{
		MaskedOcclusionCullingPrivate* object = ( MaskedOcclusionCullingPrivate* )alignedAlloc( 64, sizeof( MaskedOcclusionCullingPrivate ) );
		new( object ) MaskedOcclusionCullingPrivate( alignedAlloc, alignedFree );
		return object;
	}
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// SSE2 version
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
namespace MaskedOcclusionCullingSSE2
{
	// SSE2 fallbacks for the SIMD helpers required by MaskedOcclusionCullingCommon.inl.
	// Instructions missing from SSE2 (mullo, min/max, blendv, round, sllv, ...) are emulated.

	FORCE_INLINE __m128i _mmw_mullo_epi32( const __m128i& a, const __m128i& b )
	{
		// Multiply even and odd 32-bit lanes separately via pmuludq, then merge the low halves
		__m128i even = _mm_and_si128( _mm_mul_epu32( a, b ), _mm_setr_epi32( ~0, 0, ~0, 0 ) );
		__m128i odd = _mm_slli_epi64( _mm_mul_epu32( _mm_srli_epi64( a, 32 ), _mm_srli_epi64( b, 32 ) ), 32 );
		return _mm_or_si128( even, odd );
	}
	FORCE_INLINE __m128i _mmw_min_epi32( const __m128i& a, const __m128i& b )
	{
		__m128i cond = _mm_cmpgt_epi32( a, b );
		return _mm_or_si128( _mm_andnot_si128( cond, a ), _mm_and_si128( cond, b ) ); // pick b where a > b
	}
	FORCE_INLINE __m128i _mmw_max_epi32( const __m128i& a, const __m128i& b )
	{
		__m128i cond = _mm_cmpgt_epi32( b, a );
		return _mm_or_si128( _mm_andnot_si128( cond, a ), _mm_and_si128( cond, b ) ); // pick b where b > a
	}
	FORCE_INLINE __m128i _mmw_abs_epi32( const __m128i& a )
	{
		// Two's complement abs: (a ^ mask) + (mask ? 1 : 0), mask = all-ones where a < 0
		__m128i mask = _mm_cmplt_epi32( a, _mm_setzero_si128() );
		return _mm_add_epi32( _mm_xor_si128( a, mask ), _mm_srli_epi32( mask, 31 ) );
	}
	FORCE_INLINE int _mmw_testz_epi32( const __m128i& a, const __m128i& b )
	{
		// Emulate ptest: compare (a & b) against zero byte-wise and check every byte matched
		return _mm_movemask_epi8( _mm_cmpeq_epi8( _mm_and_si128( a, b ), _mm_setzero_si128() ) ) == 0xFFFF;
	}
	FORCE_INLINE __m128 _mmw_blendv_ps( const __m128& a, const __m128& b, const __m128& c )
	{
		// Broadcast c's sign bit to a full-lane mask, then select b where set
		__m128 cond = _mm_castsi128_ps( _mm_srai_epi32( _mm_castps_si128( c ), 31 ) );
		return _mm_or_ps( _mm_andnot_ps( cond, a ), _mm_and_ps( cond, b ) );
	}
	FORCE_INLINE __m128 _mmx_dp4_ps( const __m128& a, const __m128& b )
	{
		// Product and two shuffle/add pairs (similar to hadd_ps)
		__m128 prod = _mm_mul_ps( a, b );
		__m128 dp = _mm_add_ps( prod, _mm_shuffle_ps( prod, prod, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
		dp = _mm_add_ps( dp, _mm_shuffle_ps( dp, dp, _MM_SHUFFLE( 0, 1, 2, 3 ) ) );
		return dp;
	}
	FORCE_INLINE __m128 _mmw_floor_ps( const __m128& a )
	{
		// Emulate round-toward-negative-infinity by temporarily switching MXCSR
		int originalMode = _MM_GET_ROUNDING_MODE();
		_MM_SET_ROUNDING_MODE( _MM_ROUND_DOWN );
		__m128 rounded = _mm_cvtepi32_ps( _mm_cvtps_epi32( a ) );
		_MM_SET_ROUNDING_MODE( originalMode );
		return rounded;
	}
	FORCE_INLINE __m128 _mmw_ceil_ps( const __m128& a )
	{
		// Emulate round-toward-positive-infinity by temporarily switching MXCSR
		int originalMode = _MM_GET_ROUNDING_MODE();
		_MM_SET_ROUNDING_MODE( _MM_ROUND_UP );
		__m128 rounded = _mm_cvtepi32_ps( _mm_cvtps_epi32( a ) );
		_MM_SET_ROUNDING_MODE( originalMode );
		return rounded;
	}
	FORCE_INLINE __m128i _mmw_transpose_epi8( const __m128i& a )
	{
		// 4x4 byte transpose through two 16->8 bit packs and byte shifts
		const __m128i mask = _mm_setr_epi8( ~0, 0, ~0, 0, ~0, 0, ~0, 0, ~0, 0, ~0, 0, ~0, 0, ~0, 0 );
		__m128i res = _mm_packus_epi16( _mm_and_si128( a, mask ), _mm_srli_epi16( a, 8 ) );
		res = _mm_packus_epi16( _mm_and_si128( res, mask ), _mm_srli_epi16( res, 8 ) );
		return res;
	}
	FORCE_INLINE __m128i _mmw_sllv_ones( const __m128i& ishift )
	{
		__m128i shift = _mmw_min_epi32( ishift, _mm_set1_epi32( 32 ) );

		// Scalar LUT emulation of _mm_sllv_epi32(~0, shift); entry 32 is 0
		static const unsigned int maskLUT[33] =
		{
			~0U << 0, ~0U << 1, ~0U << 2, ~0U << 3, ~0U << 4, ~0U << 5, ~0U << 6, ~0U << 7, ~0U << 8, ~0U << 9, ~0U << 10, ~0U << 11, ~0U << 12, ~0U << 13, ~0U << 14, ~0U << 15,
			~0U << 16, ~0U << 17, ~0U << 18, ~0U << 19, ~0U << 20, ~0U << 21, ~0U << 22, ~0U << 23, ~0U << 24, ~0U << 25, ~0U << 26, ~0U << 27, ~0U << 28, ~0U << 29, ~0U << 30, ~0U << 31,
			0U
		};

		__m128i retMask;
		simd_i32( retMask )[0] = ( int )maskLUT[simd_i32( shift )[0]];
		simd_i32( retMask )[1] = ( int )maskLUT[simd_i32( shift )[1]];
		simd_i32( retMask )[2] = ( int )maskLUT[simd_i32( shift )[2]];
		simd_i32( retMask )[3] = ( int )maskLUT[simd_i32( shift )[3]];
		return retMask;
	}

	// Implementation tag picked up by the common include below
	static MaskedOcclusionCulling::Implementation gInstructionSet = MaskedOcclusionCulling::SSE2;

	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	// Include common algorithm implementation (general, SIMD independent code)
	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

	#include "MaskedOcclusionCullingCommon.inl"

	// Allocates (64-byte aligned) and constructs an SSE2 rasterizer instance
	// using the supplied allocator callbacks.
	MaskedOcclusionCulling* CreateMaskedOcclusionCulling( pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree )
	{
		MaskedOcclusionCullingPrivate* object = ( MaskedOcclusionCullingPrivate* )alignedAlloc( 64, sizeof( MaskedOcclusionCullingPrivate ) );
		new( object ) MaskedOcclusionCullingPrivate( alignedAlloc, alignedFree );
		return object;
	}
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Object construction and allocation
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
namespace MaskedOcclusionCullingAVX512
|
||||
{
|
||||
extern MaskedOcclusionCulling* CreateMaskedOcclusionCulling( pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree );
|
||||
}
|
||||
|
||||
namespace MaskedOcclusionCullingAVX2
|
||||
{
|
||||
extern MaskedOcclusionCulling* CreateMaskedOcclusionCulling( pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree );
|
||||
}
|
||||
|
||||
// Convenience factory: forwards to the full overload using the library's
// default aligned allocation callbacks.
MaskedOcclusionCulling* MaskedOcclusionCulling::Create( Implementation RequestedSIMD )
{
	return Create( RequestedSIMD, aligned_alloc, aligned_free );
}
|
||||
|
||||
// Factory: detects CPU features, clamps them to the caller's request, and constructs
// the best available rasterizer variant. Falls through to progressively slower
// implementations if a faster one cannot be created; SSE2 always succeeds as baseline.
MaskedOcclusionCulling* MaskedOcclusionCulling::Create( Implementation RequestedSIMD, pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree )
{
	// Never exceed what the CPU supports, nor what the caller asked for
	Implementation impl = DetectCPUFeatures( alignedAlloc, alignedFree );
	if( RequestedSIMD < impl )
	{
		impl = RequestedSIMD;
	}

	MaskedOcclusionCulling* moc = nullptr;
	if( moc == nullptr && impl >= AVX512 )
	{
		moc = MaskedOcclusionCullingAVX512::CreateMaskedOcclusionCulling( alignedAlloc, alignedFree );	// AVX-512 version
	}
	if( moc == nullptr && impl >= AVX2 )
	{
		moc = MaskedOcclusionCullingAVX2::CreateMaskedOcclusionCulling( alignedAlloc, alignedFree );	// AVX2 version
	}
	if( moc == nullptr && impl >= SSE41 )
	{
		moc = MaskedOcclusionCullingSSE41::CreateMaskedOcclusionCulling( alignedAlloc, alignedFree );	// SSE4.1 version
	}
	if( moc == nullptr )
	{
		moc = MaskedOcclusionCullingSSE2::CreateMaskedOcclusionCulling( alignedAlloc, alignedFree );	// SSE2 (slow) baseline
	}

	return moc;
}
|
||||
|
||||
/*!
 * \brief Destroys an object created by Create() and releases its memory.
 *        Callers must use this instead of the delete operator, because the
 *        storage was obtained through the user-supplied aligned allocator.
 *
 * \param moc The object to destroy. Must not be used afterwards.
 */
void MaskedOcclusionCulling::Destroy( MaskedOcclusionCulling* moc )
{
	// Save the free callback before the destructor runs, since it is a member
	// of the very object being torn down.
	pfnAlignedFree freeFn = moc->mAlignedFreeCallback;

	moc->~MaskedOcclusionCulling();
	freeFn( moc );
}
596
neo/libs/moc/MaskedOcclusionCulling.h
Normal file
596
neo/libs/moc/MaskedOcclusionCulling.h
Normal file
|
@ -0,0 +1,596 @@
|
|||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Copyright 2017 Intel Corporation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
#pragma once
|
||||
|
||||
/*!
|
||||
* \file MaskedOcclusionCulling.h
|
||||
* \brief Masked Occlusion Culling
|
||||
*
|
||||
* General information
|
||||
* - Input to all API functions are (x,y,w) clip-space coordinates (x positive left, y positive up, w positive away from camera).
|
||||
* We entirely skip the z component and instead compute it as 1 / w, see next bullet. For TestRect the input is NDC (x/w, y/w).
|
||||
* - We use a simple z = 1 / w transform, which is a bit faster than OGL/DX depth transforms. Thus, depth is REVERSED and z = 0 at
|
||||
* the far plane and z = inf at w = 0. We also have to use a GREATER depth function, which explains why all the conservative
|
||||
* tests will be reversed compared to what you might be used to (for example zMaxTri >= zMinBuffer is a visibility test)
|
||||
* - We support different layouts for vertex data (basic AoS and SoA), but note that it's beneficial to store the position data
|
||||
* as tightly in memory as possible to reduce cache misses. Big strides are bad, so it's beneficial to keep position as a separate
|
||||
* stream (rather than bundled with attributes) or to keep a copy of the position data for the occlusion culling system.
|
||||
* - The resolution width must be a multiple of 8 and height a multiple of 4.
|
||||
* - The hierarchical Z buffer is stored OpenGL-style with the y axis pointing up. This includes the scissor box.
|
||||
* - This code is only tested with Visual Studio 2015, but should hopefully be easy to port to other compilers.
|
||||
*/
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Defines used to configure the implementation
// Each option may be overridden from the build system before this header is
// included; the values below are only fallback defaults.
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

#ifndef QUICK_MASK
/*!
 * Configure the algorithm used for updating and merging hierarchical z buffer entries. If QUICK_MASK
 * is defined to 1, use the algorithm from the paper "Masked Software Occlusion Culling", which has a good
 * balance between performance and low leakage. If QUICK_MASK is defined to 0, use the algorithm from
 * "Masked Depth Culling for Graphics Hardware", which has less leakage, but also lower performance.
 */
#define QUICK_MASK 1

#endif

#ifndef USE_D3D
/*!
 * Configures the library for use with Direct3D (default) or OpenGL rendering. This changes whether the
 * screen space Y axis points downwards (D3D) or upwards (OGL), and is primarily important in combination
 * with the PRECISE_COVERAGE define, where this is important to ensure correct rounding and tie-breaker
 * behaviour. It also affects the ScissorRect screen space coordinates.
 */
#define USE_D3D 1

#endif

#ifndef PRECISE_COVERAGE
/*!
 * Define PRECISE_COVERAGE to 1 to more closely match GPU rasterization rules. The increased precision comes
 * at a cost of slightly lower performance.
 */
#define PRECISE_COVERAGE 1

#endif

#ifndef USE_AVX512
/*!
 * Define USE_AVX512 to 1 to enable experimental AVX-512 support. It's currently mostly untested and only
 * validated on simple examples using Intel SDE. Older compilers may not support AVX-512 intrinsics.
 */
#define USE_AVX512 0

#endif

#ifndef CLIPPING_PRESERVES_ORDER
/*!
 * Define CLIPPING_PRESERVES_ORDER to 1 to prevent clipping from reordering triangle rasterization
 * order; this comes at a cost (approx 3-4%) but removes one source of temporal frame-to-frame instability.
 */
#define CLIPPING_PRESERVES_ORDER 1

#endif

#ifndef ENABLE_STATS
/*!
 * Define ENABLE_STATS to 1 to gather various statistics during occlusion culling. Can be used for profiling
 * and debugging. Note that enabling this function will reduce performance significantly.
 */
#define ENABLE_STATS 0

#endif

#ifndef MOC_RECORDER_ENABLE
/*!
 * Define MOC_RECORDER_ENABLE to 1 to enable the frame recorder (see FrameRecorder.h/cpp for details).
 */
#define MOC_RECORDER_ENABLE 0

#endif

#if MOC_RECORDER_ENABLE
#ifndef MOC_RECORDER_ENABLE_PLAYBACK
/*!
 * Define MOC_RECORDER_ENABLE_PLAYBACK to 1 to enable compilation of the playback code (not needed
 * for recording).
 */
#define MOC_RECORDER_ENABLE_PLAYBACK 0
#endif
#endif


#if MOC_RECORDER_ENABLE

// std::mutex is only needed to serialize recorder access.
#include <mutex>

class FrameRecorder;

#endif // #if MOC_RECORDER_ENABLE
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Masked occlusion culling class
// Abstract interface; concrete per-SIMD implementations are created through
// the static Create() factory and released through Destroy().
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

class MaskedOcclusionCulling
{
public:

	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	// Memory management callback functions
	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

	// Allocation callback: (alignment, size) -> pointer, matching the shape of C11 aligned_alloc.
	typedef void* ( *pfnAlignedAlloc )( size_t alignment, size_t size );
	// Free callback for memory returned by pfnAlignedAlloc.
	typedef void ( *pfnAlignedFree )( void* ptr );

	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	// Enums
	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

	//! CPU instruction set of an implementation, ordered from least to most capable.
	enum Implementation
	{
		SSE2 = 0,
		SSE41 = 1,
		AVX2 = 2,
		AVX512 = 3
	};

	//! Triangle winding order that is considered back-facing (and therefore culled).
	enum BackfaceWinding
	{
		BACKFACE_NONE = 0,
		BACKFACE_CW = 1,
		BACKFACE_CCW = 2,
	};

	//! Result of an occlusion query.
	enum CullingResult
	{
		VISIBLE = 0x0,
		OCCLUDED = 0x1,
		VIEW_CULLED = 0x3
	};

	//! Bitmask selecting which frustum planes the triangle clipper considers.
	enum ClipPlanes
	{
		CLIP_PLANE_NONE = 0x00,
		CLIP_PLANE_NEAR = 0x01,
		CLIP_PLANE_LEFT = 0x02,
		CLIP_PLANE_RIGHT = 0x04,
		CLIP_PLANE_BOTTOM = 0x08,
		CLIP_PLANE_TOP = 0x10,
		CLIP_PLANE_SIDES = ( CLIP_PLANE_LEFT | CLIP_PLANE_RIGHT | CLIP_PLANE_BOTTOM | CLIP_PLANE_TOP ),
		CLIP_PLANE_ALL = ( CLIP_PLANE_LEFT | CLIP_PLANE_RIGHT | CLIP_PLANE_BOTTOM | CLIP_PLANE_TOP | CLIP_PLANE_NEAR )
	};

	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	// Structs
	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

	/*!
	 * Used to specify custom vertex layout. Memory offsets to y and z coordinates are set through
	 * mOffsetY and mOffsetW, and vertex stride is given by mStride. It's possible to configure both
	 * AoS and SoA layouts. Note that large strides may cause more cache misses and decrease
	 * performance. It is advisable to store position data as compactly in memory as possible.
	 */
	struct VertexLayout
	{
		VertexLayout() {}
		VertexLayout( int stride, int offsetY, int offsetZW ) :
			mStride( stride ), mOffsetY( offsetY ), mOffsetW( offsetZW ) {}

		int mStride;  //!< byte stride between vertices
		int mOffsetY; //!< byte offset from X to Y coordinate
		union
		{
			int mOffsetZ; //!< byte offset from X to Z coordinate
			int mOffsetW; //!< byte offset from X to W coordinate
		};
	};

	/*!
	 * Used to control scissoring during rasterization. Note that we only provide coarse scissor support.
	 * The scissor box x coordinates must be a multiple of 32, and the y coordinates a multiple of 8.
	 * Scissoring is mainly meant as a means of enabling binning (sort middle) rasterizers in case
	 * application developers want to use that approach for multithreading.
	 */
	struct ScissorRect
	{
		ScissorRect() {}
		ScissorRect( int minX, int minY, int maxX, int maxY ) :
			mMinX( minX ), mMinY( minY ), mMaxX( maxX ), mMaxY( maxY ) {}

		int mMinX; //!< Screen space X coordinate for left side of scissor rect, inclusive and must be a multiple of 32
		int mMinY; //!< Screen space Y coordinate for bottom side of scissor rect, inclusive and must be a multiple of 8
		int mMaxX; //!< Screen space X coordinate for right side of scissor rect, <B>non</B> inclusive and must be a multiple of 32
		int mMaxY; //!< Screen space Y coordinate for top side of scissor rect, <B>non</B> inclusive and must be a multiple of 8
	};

	/*!
	 * Used to specify storage area for a binlist, containing triangles. This struct is used for binning
	 * and multithreading. The host application is responsible for allocating memory for the binlists.
	 */
	struct TriList
	{
		unsigned int mNumTriangles; //!< Maximum number of triangles that may be stored in mPtr
		unsigned int mTriIdx;       //!< Index of next triangle to be written, clear before calling BinTriangles to start from the beginning of the list
		float* mPtr;                //!< Scratchpad buffer allocated by the host application
	};

	/*!
	 * Statistics that can be gathered during occluder rendering and visibility testing to aid debugging
	 * and profiling. Must be enabled by changing the ENABLE_STATS define.
	 */
	struct OcclusionCullingStatistics
	{
		struct
		{
			long long mNumProcessedTriangles;  //!< Number of occluder triangles processed in total
			long long mNumRasterizedTriangles; //!< Number of occluder triangles passing view frustum and backface culling
			long long mNumTilesTraversed;      //!< Number of tiles traversed by the rasterizer
			long long mNumTilesUpdated;        //!< Number of tiles where the hierarchical z buffer was updated
			long long mNumTilesMerged;         //!< Number of tiles where hierarchical z buffer contents were merged
		} mOccluders;

		struct
		{
			long long mNumProcessedRectangles; //!< Number of rects processed (TestRect())
			long long mNumProcessedTriangles;  //!< Number of occludee triangles processed (TestTriangles())
			long long mNumRasterizedTriangles; //!< Number of occludee triangles passing view frustum and backface culling
			long long mNumTilesTraversed;      //!< Number of tiles traversed by triangle & rect rasterizers
		} mOccludees;
	};

	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	// Functions
	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

	/*!
	 * \brief Creates a new object with default state, no z buffer attached/allocated.
	 */
	static MaskedOcclusionCulling* Create( Implementation RequestedSIMD = AVX512 );

	/*!
	 * \brief Creates a new object with default state, no z buffer attached/allocated.
	 * \param alignedAlloc Pointer to a callback function used when allocating memory
	 * \param alignedFree  Pointer to a callback function used when freeing memory
	 */
	static MaskedOcclusionCulling* Create( Implementation RequestedSIMD, pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree );

	/*!
	 * \brief Destroys an object and frees the z buffer memory. Note that you cannot
	 *        use the delete operator, and should rather use this function to free up memory.
	 */
	static void Destroy( MaskedOcclusionCulling* moc );

	/*!
	 * \brief Sets the resolution of the hierarchical depth buffer. This function will
	 *        re-allocate the current depth buffer (if present). The contents of the
	 *        buffer is undefined until ClearBuffer() is called.
	 *
	 * \param width  The width of the buffer in pixels, must be a multiple of 8
	 * \param height The height of the buffer in pixels, must be a multiple of 4
	 */
	virtual void SetResolution( unsigned int width, unsigned int height ) = 0;

	/*!
	 * \brief Gets the resolution of the hierarchical depth buffer.
	 *
	 * \param width  Output: The width of the buffer in pixels
	 * \param height Output: The height of the buffer in pixels
	 */
	virtual void GetResolution( unsigned int& width, unsigned int& height ) const = 0;

	/*!
	 * \brief Returns the bin size for the current implementation.
	 *
	 * \param nBinsW       Number of vertical bins, the screen is divided into nBinsW x nBinsH
	 *                     rectangular bins.
	 * \param nBinsH       Number of horizontal bins, the screen is divided into nBinsW x nBinsH
	 *                     rectangular bins.
	 * \param outBinWidth  Output: The width of a single bin in pixels (except for the
	 *                     rightmost bin width, which is extended to resolution width)
	 * \param outBinHeight Output: The height of a single bin in pixels (except for the
	 *                     bottommost bin height, which is extended to resolution height)
	 */
	virtual void ComputeBinWidthHeight( unsigned int nBinsW, unsigned int nBinsH, unsigned int& outBinWidth, unsigned int& outBinHeight ) = 0;

	/*!
	 * \brief Sets the distance for the near clipping plane. Default is nearDist = 0.
	 *
	 * \param nearDist The distance to the near clipping plane, given as clip space w
	 */
	virtual void SetNearClipPlane( float nearDist ) = 0;

	/*!
	 * \brief Gets the distance for the near clipping plane.
	 */
	virtual float GetNearClipPlane() const = 0;

	/*!
	 * \brief Clears the hierarchical depth buffer.
	 */
	virtual void ClearBuffer() = 0;

	/*!
	 * \brief Merge a second hierarchical depth buffer into the main buffer.
	 */
	virtual void MergeBuffer( MaskedOcclusionCulling* BufferB ) = 0;

	/*!
	 * \brief Renders a mesh of occluder triangles and updates the hierarchical z buffer
	 *        with conservative depth values.
	 *
	 * This function is optimized for vertex layouts with stride 16 and y and w
	 * offsets of 4 and 12 bytes, respectively.
	 *
	 * \param inVtx Pointer to an array of input vertices, should point to the x component
	 *        of the first vertex. The input vertices are given as (x,y,w) coordinates
	 *        in clip space. The memory layout can be changed using vtxLayout.
	 * \param inTris Pointer to an array of vertex indices. Each triangle is created
	 *        from three indices consecutively fetched from the array.
	 * \param nTris The number of triangles to render (inTris must contain at least 3*nTris
	 *        entries)
	 * \param modelToClipMatrix all vertices will be transformed by this matrix before
	 *        performing projection. If nullptr is passed the transform step will be skipped
	 * \param bfWinding Sets triangle winding order to consider backfacing, must be one
	 *        of (BACKFACE_NONE, BACKFACE_CW and BACKFACE_CCW). Back-facing triangles are culled
	 *        and will not be rasterized. You may use BACKFACE_NONE to disable culling for
	 *        double sided geometry
	 * \param clipPlaneMask A mask indicating which clip planes should be considered by the
	 *        triangle clipper. Can be used as an optimization if your application can
	 *        determine (for example during culling) that a group of triangles does not
	 *        intersect a certain frustum plane. However, setting an incorrect mask may
	 *        cause out of bounds memory accesses.
	 * \param vtxLayout A struct specifying the vertex layout (see struct for detailed
	 *        description). For best performance, it is advisable to store position data
	 *        as compactly in memory as possible.
	 * \return Will return VIEW_CULLED if all triangles are either outside the frustum or
	 *         backface culled, returns VISIBLE otherwise.
	 */
	virtual CullingResult RenderTriangles( const float* inVtx, const unsigned int* inTris, int nTris, const float* modelToClipMatrix = nullptr, BackfaceWinding bfWinding = BACKFACE_CW, ClipPlanes clipPlaneMask = CLIP_PLANE_ALL, const VertexLayout& vtxLayout = VertexLayout( 16, 4, 12 ) ) = 0;

	/*!
	 * \brief Occlusion query for a rectangle with a given depth. The rectangle is given
	 *        in normalized device coordinates where (x,y) coordinates between [-1,1] map
	 *        to the visible screen area. The query uses a GREATER_EQUAL (reversed) depth
	 *        test meaning that depth values equal to the contents of the depth buffer are
	 *        counted as visible.
	 *
	 * \param xmin NDC coordinate of the left side of the rectangle.
	 * \param ymin NDC coordinate of the bottom side of the rectangle.
	 * \param xmax NDC coordinate of the right side of the rectangle.
	 * \param ymax NDC coordinate of the top side of the rectangle.
	 * \param wmin Clip space W coordinate for the rectangle.
	 * \return The query will return VISIBLE if the rectangle may be visible, OCCLUDED
	 *         if the rectangle is occluded by a previously rendered object, or VIEW_CULLED
	 *         if the rectangle is outside the view frustum.
	 */
	virtual CullingResult TestRect( float xmin, float ymin, float xmax, float ymax, float wmin ) const = 0;

	/*!
	 * \brief This function is similar to RenderTriangles(), but performs an occlusion
	 *        query instead and does not update the hierarchical z buffer. The query uses
	 *        a GREATER_EQUAL (reversed) depth test meaning that depth values equal to the
	 *        contents of the depth buffer are counted as visible.
	 *
	 * This function is optimized for vertex layouts with stride 16 and y and w
	 * offsets of 4 and 12 bytes, respectively.
	 *
	 * \param inVtx Pointer to an array of input vertices, should point to the x component
	 *        of the first vertex. The input vertices are given as (x,y,w) coordinates
	 *        in clip space. The memory layout can be changed using vtxLayout.
	 * \param inTris Pointer to an array of triangle indices. Each triangle is created
	 *        from three indices consecutively fetched from the array.
	 * \param nTris The number of triangles to render (inTris must contain at least 3*nTris
	 *        entries)
	 * \param modelToClipMatrix all vertices will be transformed by this matrix before
	 *        performing projection. If nullptr is passed the transform step will be skipped
	 * \param bfWinding Sets triangle winding order to consider backfacing, must be one
	 *        of (BACKFACE_NONE, BACKFACE_CW and BACKFACE_CCW). Back-facing triangles are culled
	 *        and will not be occlusion tested. You may use BACKFACE_NONE to disable culling
	 *        for double sided geometry
	 * \param clipPlaneMask A mask indicating which clip planes should be considered by the
	 *        triangle clipper. Can be used as an optimization if your application can
	 *        determine (for example during culling) that a group of triangles does not
	 *        intersect a certain frustum plane. However, setting an incorrect mask may
	 *        cause out of bounds memory accesses.
	 * \param vtxLayout A struct specifying the vertex layout (see struct for detailed
	 *        description). For best performance, it is advisable to store position data
	 *        as compactly in memory as possible.
	 * \return The query will return VISIBLE if the triangle mesh may be visible, OCCLUDED
	 *         if the mesh is occluded by a previously rendered object, or VIEW_CULLED if all
	 *         triangles are entirely outside the view frustum or backface culled.
	 */
	virtual CullingResult TestTriangles( const float* inVtx, const unsigned int* inTris, int nTris, const float* modelToClipMatrix = nullptr, BackfaceWinding bfWinding = BACKFACE_CW, ClipPlanes clipPlaneMask = CLIP_PLANE_ALL, const VertexLayout& vtxLayout = VertexLayout( 16, 4, 12 ) ) = 0;

	/*!
	 * \brief Perform input assembly, clipping, projection, triangle setup, and write
	 *        triangles to the screen space bins they overlap. This function can be used to
	 *        distribute work for threading (see the CullingThreadpool class for an example)
	 *
	 * \param inVtx Pointer to an array of input vertices, should point to the x component
	 *        of the first vertex. The input vertices are given as (x,y,w) coordinates
	 *        in clip space. The memory layout can be changed using vtxLayout.
	 * \param inTris Pointer to an array of vertex indices. Each triangle is created
	 *        from three indices consecutively fetched from the array.
	 * \param nTris The number of triangles to render (inTris must contain at least 3*nTris
	 *        entries)
	 * \param triLists Pointer to an array of TriList objects with one TriList object per
	 *        bin. If a triangle overlaps a bin, it will be written to the corresponding
	 *        trilist. Note that this method appends the triangles to the current list, to
	 *        start writing from the beginning of the list, set triList.mTriIdx = 0
	 * \param nBinsW Number of vertical bins, the screen is divided into nBinsW x nBinsH
	 *        rectangular bins.
	 * \param nBinsH Number of horizontal bins, the screen is divided into nBinsW x nBinsH
	 *        rectangular bins.
	 * \param modelToClipMatrix all vertices will be transformed by this matrix before
	 *        performing projection. If nullptr is passed the transform step will be skipped
	 * \param clipPlaneMask A mask indicating which clip planes should be considered by the
	 *        triangle clipper. Can be used as an optimization if your application can
	 *        determine (for example during culling) that a group of triangles does not
	 *        intersect a certain frustum plane. However, setting an incorrect mask may
	 *        cause out of bounds memory accesses.
	 * \param vtxLayout A struct specifying the vertex layout (see struct for detailed
	 *        description). For best performance, it is advisable to store position data
	 *        as compactly in memory as possible.
	 * \param bfWinding Sets triangle winding order to consider backfacing, must be one
	 *        of (BACKFACE_NONE, BACKFACE_CW and BACKFACE_CCW). Back-facing triangles are culled
	 *        and will not be binned / rasterized. You may use BACKFACE_NONE to disable culling
	 *        for double sided geometry
	 */
	virtual void BinTriangles( const float* inVtx, const unsigned int* inTris, int nTris, TriList* triLists, unsigned int nBinsW, unsigned int nBinsH, const float* modelToClipMatrix = nullptr, BackfaceWinding bfWinding = BACKFACE_CW, ClipPlanes clipPlaneMask = CLIP_PLANE_ALL, const VertexLayout& vtxLayout = VertexLayout( 16, 4, 12 ) ) = 0;

	/*!
	 * \brief Renders all occluder triangles in a trilist. This function can be used in
	 *        combination with BinTriangles() to create a threaded (binning) rasterizer. The
	 *        bins can be processed independently by different threads without risking writing
	 *        to overlapping memory regions.
	 *
	 * \param triList A triangle list, filled using the BinTriangles() function, that is to
	 *        be rendered.
	 * \param scissor A scissor box limiting the rendering region to the bin. The size of each
	 *        bin must be a multiple of 32x8 pixels due to implementation constraints. For a
	 *        render target with (width, height) resolution and (nBinsW, nBinsH) bins, the
	 *        size of a bin is:
	 *          binWidth = (width / nBinsW) - (width / nBinsW) % 32;
	 *          binHeight = (height / nBinsH) - (height / nBinsH) % 8;
	 *        The last row and column of tiles have a different size:
	 *          lastColBinWidth = width - (nBinsW-1)*binWidth;
	 *          lastRowBinHeight = height - (nBinsH-1)*binHeight;
	 */
	virtual void RenderTrilist( const TriList& triList, const ScissorRect* scissor ) = 0;

	/*!
	 * \brief Creates a per-pixel depth buffer from the hierarchical z buffer representation.
	 *        Intended for visualizing the hierarchical depth buffer for debugging. The
	 *        buffer is written in scanline order, from the top to bottom (D3D) or bottom to
	 *        top (OGL) of the surface. See the USE_D3D define.
	 *
	 * \param depthData Pointer to memory where the per-pixel depth data is written. Must
	 *        hold storage for at least width*height elements as set by SetResolution.
	 * \param flipY Whether to flip the output vertically.
	 */
	virtual void ComputePixelDepthBuffer( float* depthData, bool flipY ) = 0;

	/*!
	 * \brief Fetch occlusion culling statistics, returns zeroes if the ENABLE_STATS define is
	 *        not set. The statistics can be used for profiling or debugging.
	 */
	virtual OcclusionCullingStatistics GetStatistics() = 0;

	/*!
	 * \brief Returns the implementation (CPU instruction set) version of this object.
	 */
	virtual Implementation GetImplementation() = 0;

	/*!
	 * \brief Utility function for transforming vertices and outputting them to an (x,y,z,w)
	 *        format suitable for the occluder rasterization and occludee testing functions.
	 *
	 * \param mtx Pointer to matrix data. The matrix should be column major for post
	 *        multiplication (OGL) and row major for pre-multiplication (DX). This is
	 *        consistent with OpenGL / DirectX behavior.
	 * \param inVtx Pointer to an array of input vertices. The input vertices are given as
	 *        (x,y,z) coordinates. The memory layout can be changed using vtxLayout.
	 * \param xfVtx Pointer to an array to store transformed vertices. The transformed
	 *        vertices are always stored as array of structs (AoS) (x,y,z,w) packed in memory.
	 * \param nVtx Number of vertices to transform.
	 * \param vtxLayout A struct specifying the vertex layout (see struct for detailed
	 *        description). For best performance, it is advisable to store position data
	 *        as compactly in memory as possible. Note that for this function, the
	 *        w-component is assumed to be 1.0.
	 */
	static void TransformVertices( const float* mtx, const float* inVtx, float* xfVtx, unsigned int nVtx, const VertexLayout& vtxLayout = VertexLayout( 12, 4, 8 ) );

	/*!
	 * \brief Get the memory alloc/free callbacks in use by this object.
	 */
	void GetAllocFreeCallback( pfnAlignedAlloc& allocCallback, pfnAlignedFree& freeCallback )
	{
		allocCallback = mAlignedAllocCallback, freeCallback = mAlignedFreeCallback;
	}

#if MOC_RECORDER_ENABLE
	/*!
	 * \brief Start recording subsequent rasterization and testing calls using the FrameRecorder.
	 *        The function calls that are recorded are:
	 *          - ClearBuffer
	 *          - RenderTriangles
	 *          - TestTriangles
	 *          - TestRect
	 *        All inputs and outputs are recorded, which can be used for correctness validation
	 *        and performance testing.
	 *
	 * \param outputFilePath Pointer to name of the output file.
	 * \return 'true' if recording was started successfully, 'false' otherwise (file access error).
	 */
	bool RecorderStart( const char* outputFilePath ) const;

	/*!
	 * \brief Stop recording, flush output and release used memory.
	 */
	void RecorderStop( ) const;

	/*!
	 * \brief Manually record triangles. This is called automatically from MaskedOcclusionCulling::RenderTriangles
	 *        if recording is started, but not from BinTriangles/RenderTrilist (used in the multithreaded codepath), in
	 *        which case it has to be called manually from your threadpool implementation (already added to
	 *        CullingThreadpool). If recording is not enabled, calling this function will do nothing.
	 *
	 * \param inVtx Pointer to an array of input vertices, should point to the x component
	 *        of the first vertex. The input vertices are given as (x,y,w) coordinates
	 *        in clip space. The memory layout can be changed using vtxLayout.
	 * \param inTris Pointer to an array of triangle indices. Each triangle is created
	 *        from three indices consecutively fetched from the array.
	 * \param nTris The number of triangles to render (inTris must contain at least 3*nTris
	 *        entries)
	 * \param modelToClipMatrix all vertices will be transformed by this matrix before
	 *        performing projection. If nullptr is passed the transform step will be skipped
	 * \param clipPlaneMask A mask indicating which clip planes should be considered by the
	 *        triangle clipper. Can be used as an optimization if your application can
	 *        determine (for example during culling) that a group of triangles does not
	 *        intersect a certain frustum plane. However, setting an incorrect mask may
	 *        cause out of bounds memory accesses.
	 * \param bfWinding Sets triangle winding order to consider backfacing, must be one
	 *        of (BACKFACE_NONE, BACKFACE_CW and BACKFACE_CCW). Back-facing triangles are culled
	 *        and will not be occlusion tested. You may use BACKFACE_NONE to disable culling
	 *        for double sided geometry
	 * \param vtxLayout A struct specifying the vertex layout (see struct for detailed
	 *        description). For best performance, it is advisable to store position data
	 *        as compactly in memory as possible.
	 * \param cullingResult cull result value expected to be returned by executing the
	 *        RenderTriangles call with recorded parameters.
	 */
	void RecordRenderTriangles( const float* inVtx, const unsigned int* inTris, int nTris, const float* modelToClipMatrix = nullptr, ClipPlanes clipPlaneMask = CLIP_PLANE_ALL, BackfaceWinding bfWinding = BACKFACE_CW, const VertexLayout& vtxLayout = VertexLayout( 16, 4, 12 ), CullingResult cullingResult = ( CullingResult ) - 1 );
#endif // #if MOC_RECORDER_ENABLE

protected:
	pfnAlignedAlloc mAlignedAllocCallback; // callback used for all aligned allocations
	pfnAlignedFree  mAlignedFreeCallback;  // callback used to free mAlignedAllocCallback memory

	// mutable so const query functions can still update counters (see ENABLE_STATS)
	mutable OcclusionCullingStatistics mStats;

#if MOC_RECORDER_ENABLE
	mutable FrameRecorder* mRecorder;      // active recorder, when recording is started
	mutable std::mutex mRecorderMutex;     // serializes recorder access from const query paths
#endif // #if MOC_RECORDER_ENABLE

	// Non-public: objects must be released through Destroy(), never via delete.
	virtual ~MaskedOcclusionCulling() {}
};
280
neo/libs/moc/MaskedOcclusionCullingAVX2.cpp
Normal file
280
neo/libs/moc/MaskedOcclusionCullingAVX2.cpp
Normal file
|
@ -0,0 +1,280 @@
|
|||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Copyright 2017 Intel Corporation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
#include <float.h>
|
||||
#include "MaskedOcclusionCulling.h"
|
||||
#include "CompilerSpecific.inl"
|
||||
|
||||
#if MOC_RECORDER_ENABLE
|
||||
#include "FrameRecorder.h"
|
||||
#endif
|
||||
|
||||
#if defined(__MICROSOFT_COMPILER) && _MSC_VER < 1900
|
||||
// If you remove/comment this error, the code will compile & use the SSE41 version instead.
|
||||
#error Older versions than visual studio 2015 not supported due to compiler bug(s)
|
||||
#endif
|
||||
|
||||
#if !defined(__MICROSOFT_COMPILER) || _MSC_VER >= 1900
|
||||
|
||||
// For performance reasons, the MaskedOcclusionCullingAVX2.cpp file should be compiled with VEX encoding for SSE instructions (to avoid
|
||||
// AVX-SSE transition penalties, see https://software.intel.com/en-us/articles/avoiding-avx-sse-transition-penalties). However, the SSE
|
||||
// version in MaskedOcclusionCulling.cpp _must_ be compiled without VEX encoding to allow backwards compatibility. Best practice is to
|
||||
// use lowest supported target platform (e.g. /arch:SSE2) as project default, and elevate only the MaskedOcclusionCullingAVX2/512.cpp files.
|
||||
#ifndef __AVX2__
|
||||
#error For best performance, MaskedOcclusionCullingAVX2.cpp should be compiled with /arch:AVX2
|
||||
#endif
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// AVX specific defines and constants
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#define SIMD_LANES 8
|
||||
#define TILE_HEIGHT_SHIFT 3
|
||||
|
||||
#define SIMD_LANE_IDX _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7)
|
||||
|
||||
#define SIMD_SUB_TILE_COL_OFFSET _mm256_setr_epi32(0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3, 0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3)
|
||||
#define SIMD_SUB_TILE_ROW_OFFSET _mm256_setr_epi32(0, 0, 0, 0, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT)
|
||||
#define SIMD_SUB_TILE_COL_OFFSET_F _mm256_setr_ps(0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3, 0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3)
|
||||
#define SIMD_SUB_TILE_ROW_OFFSET_F _mm256_setr_ps(0, 0, 0, 0, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT)
|
||||
|
||||
#define SIMD_SHUFFLE_SCANLINE_TO_SUBTILES _mm256_setr_epi8(0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF, 0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF)
|
||||
|
||||
#define SIMD_LANE_YCOORD_I _mm256_setr_epi32(128, 384, 640, 896, 1152, 1408, 1664, 1920)
|
||||
#define SIMD_LANE_YCOORD_F _mm256_setr_ps(128.0f, 384.0f, 640.0f, 896.0f, 1152.0f, 1408.0f, 1664.0f, 1920.0f)
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// AVX specific typedefs and functions
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
typedef __m256 __mw;
|
||||
typedef __m256i __mwi;
|
||||
|
||||
#define _mmw_set1_ps _mm256_set1_ps
|
||||
#define _mmw_setzero_ps _mm256_setzero_ps
|
||||
#define _mmw_and_ps _mm256_and_ps
|
||||
#define _mmw_or_ps _mm256_or_ps
|
||||
#define _mmw_xor_ps _mm256_xor_ps
|
||||
#define _mmw_not_ps(a) _mm256_xor_ps((a), _mm256_castsi256_ps(_mm256_set1_epi32(~0)))
|
||||
#define _mmw_andnot_ps _mm256_andnot_ps
|
||||
#define _mmw_neg_ps(a) _mm256_xor_ps((a), _mm256_set1_ps(-0.0f))
|
||||
#define _mmw_abs_ps(a) _mm256_and_ps((a), _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)))
|
||||
#define _mmw_add_ps _mm256_add_ps
|
||||
#define _mmw_sub_ps _mm256_sub_ps
|
||||
#define _mmw_mul_ps _mm256_mul_ps
|
||||
#define _mmw_div_ps _mm256_div_ps
|
||||
#define _mmw_min_ps _mm256_min_ps
|
||||
#define _mmw_max_ps _mm256_max_ps
|
||||
#define _mmw_fmadd_ps _mm256_fmadd_ps
|
||||
#define _mmw_fmsub_ps _mm256_fmsub_ps
|
||||
#define _mmw_movemask_ps _mm256_movemask_ps
|
||||
#define _mmw_blendv_ps _mm256_blendv_ps
|
||||
#define _mmw_cmpge_ps(a,b) _mm256_cmp_ps(a, b, _CMP_GE_OQ)
|
||||
#define _mmw_cmpgt_ps(a,b) _mm256_cmp_ps(a, b, _CMP_GT_OQ)
|
||||
#define _mmw_cmpeq_ps(a,b) _mm256_cmp_ps(a, b, _CMP_EQ_OQ)
|
||||
#define _mmw_floor_ps(x) _mm256_round_ps(x, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)
|
||||
#define _mmw_ceil_ps(x) _mm256_round_ps(x, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)
|
||||
#define _mmw_shuffle_ps _mm256_shuffle_ps
|
||||
#define _mmw_insertf32x4_ps _mm256_insertf128_ps
|
||||
#define _mmw_cvtepi32_ps _mm256_cvtepi32_ps
|
||||
#define _mmw_blendv_epi32(a,b,c) simd_cast<__mwi>(_mmw_blendv_ps(simd_cast<__mw>(a), simd_cast<__mw>(b), simd_cast<__mw>(c)))
|
||||
|
||||
#define _mmw_set1_epi32 _mm256_set1_epi32
|
||||
#define _mmw_setzero_epi32 _mm256_setzero_si256
|
||||
#define _mmw_and_epi32 _mm256_and_si256
|
||||
#define _mmw_or_epi32 _mm256_or_si256
|
||||
#define _mmw_xor_epi32 _mm256_xor_si256
|
||||
#define _mmw_not_epi32(a) _mm256_xor_si256((a), _mm256_set1_epi32(~0))
|
||||
#define _mmw_andnot_epi32 _mm256_andnot_si256
|
||||
#define _mmw_neg_epi32(a) _mm256_sub_epi32(_mm256_set1_epi32(0), (a))
|
||||
#define _mmw_add_epi32 _mm256_add_epi32
|
||||
#define _mmw_sub_epi32 _mm256_sub_epi32
|
||||
#define _mmw_min_epi32 _mm256_min_epi32
|
||||
#define _mmw_max_epi32 _mm256_max_epi32
|
||||
#define _mmw_subs_epu16 _mm256_subs_epu16
|
||||
#define _mmw_mullo_epi32 _mm256_mullo_epi32
|
||||
#define _mmw_cmpeq_epi32 _mm256_cmpeq_epi32
|
||||
#define _mmw_testz_epi32 _mm256_testz_si256
|
||||
#define _mmw_cmpgt_epi32 _mm256_cmpgt_epi32
|
||||
#define _mmw_srai_epi32 _mm256_srai_epi32
|
||||
#define _mmw_srli_epi32 _mm256_srli_epi32
|
||||
#define _mmw_slli_epi32 _mm256_slli_epi32
|
||||
#define _mmw_sllv_ones(x) _mm256_sllv_epi32(SIMD_BITS_ONE, x)
|
||||
#define _mmw_transpose_epi8(x) _mm256_shuffle_epi8(x, SIMD_SHUFFLE_SCANLINE_TO_SUBTILES)
|
||||
#define _mmw_abs_epi32 _mm256_abs_epi32
|
||||
#define _mmw_cvtps_epi32 _mm256_cvtps_epi32
|
||||
#define _mmw_cvttps_epi32 _mm256_cvttps_epi32
|
||||
|
||||
#define _mmx_dp4_ps(a, b) _mm_dp_ps(a, b, 0xFF)
|
||||
#define _mmx_fmadd_ps _mm_fmadd_ps
|
||||
#define _mmx_max_epi32 _mm_max_epi32
|
||||
#define _mmx_min_epi32 _mm_min_epi32
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// SIMD casting functions
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Compile-time cast/splat between SIMD register types. The unspecialized
// template is never instantiated; every supported conversion is provided as an
// explicit specialization below.
template<typename T, typename Y> FORCE_INLINE T simd_cast( Y value );

// 128-bit conversions
template<> FORCE_INLINE __m128 simd_cast<__m128>( float value )
{
	return _mm_set1_ps( value );
}
template<> FORCE_INLINE __m128 simd_cast<__m128>( __m128i value )
{
	return _mm_castsi128_ps( value );
}
template<> FORCE_INLINE __m128 simd_cast<__m128>( __m128 value )
{
	return value;
}
template<> FORCE_INLINE __m128i simd_cast<__m128i>( int value )
{
	return _mm_set1_epi32( value );
}
template<> FORCE_INLINE __m128i simd_cast<__m128i>( __m128 value )
{
	return _mm_castps_si128( value );
}
template<> FORCE_INLINE __m128i simd_cast<__m128i>( __m128i value )
{
	return value;
}

// 256-bit conversions
template<> FORCE_INLINE __m256 simd_cast<__m256>( float value )
{
	return _mm256_set1_ps( value );
}
template<> FORCE_INLINE __m256 simd_cast<__m256>( __m256i value )
{
	return _mm256_castsi256_ps( value );
}
template<> FORCE_INLINE __m256 simd_cast<__m256>( __m256 value )
{
	return value;
}
template<> FORCE_INLINE __m256i simd_cast<__m256i>( int value )
{
	return _mm256_set1_epi32( value );
}
template<> FORCE_INLINE __m256i simd_cast<__m256i>( __m256 value )
{
	return _mm256_castps_si256( value );
}
template<> FORCE_INLINE __m256i simd_cast<__m256i>( __m256i value )
{
	return value;
}
|
||||
|
||||
#define MAKE_ACCESSOR(name, simd_type, base_type, is_const, elements) \
|
||||
FORCE_INLINE is_const base_type * name(is_const simd_type &a) { \
|
||||
union accessor { simd_type m_native; base_type m_array[elements]; }; \
|
||||
is_const accessor *acs = reinterpret_cast<is_const accessor*>(&a); \
|
||||
return acs->m_array; \
|
||||
}
|
||||
|
||||
MAKE_ACCESSOR( simd_f32, __m128, float,, 4 )
|
||||
MAKE_ACCESSOR( simd_f32, __m128, float, const, 4 )
|
||||
MAKE_ACCESSOR( simd_i32, __m128i, int,, 4 )
|
||||
MAKE_ACCESSOR( simd_i32, __m128i, int, const, 4 )
|
||||
|
||||
MAKE_ACCESSOR( simd_f32, __m256, float,, 8 )
|
||||
MAKE_ACCESSOR( simd_f32, __m256, float, const, 8 )
|
||||
MAKE_ACCESSOR( simd_i32, __m256i, int,, 8 )
|
||||
MAKE_ACCESSOR( simd_i32, __m256i, int, const, 8 )
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Specialized AVX input assembly function for general vertex gather
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
typedef MaskedOcclusionCulling::VertexLayout VertexLayout;
|
||||
|
||||
FORCE_INLINE void GatherVertices( __m256* vtxX, __m256* vtxY, __m256* vtxW, const float* inVtx, const unsigned int* inTrisPtr, int numLanes, const VertexLayout& vtxLayout )
|
||||
{
|
||||
assert( numLanes >= 1 );
|
||||
|
||||
const __m256i SIMD_TRI_IDX_OFFSET = _mm256_setr_epi32( 0, 3, 6, 9, 12, 15, 18, 21 );
|
||||
static const __m256i SIMD_LANE_MASK[9] =
|
||||
{
|
||||
_mm256_setr_epi32( 0, 0, 0, 0, 0, 0, 0, 0 ),
|
||||
_mm256_setr_epi32( ~0, 0, 0, 0, 0, 0, 0, 0 ),
|
||||
_mm256_setr_epi32( ~0, ~0, 0, 0, 0, 0, 0, 0 ),
|
||||
_mm256_setr_epi32( ~0, ~0, ~0, 0, 0, 0, 0, 0 ),
|
||||
_mm256_setr_epi32( ~0, ~0, ~0, ~0, 0, 0, 0, 0 ),
|
||||
_mm256_setr_epi32( ~0, ~0, ~0, ~0, ~0, 0, 0, 0 ),
|
||||
_mm256_setr_epi32( ~0, ~0, ~0, ~0, ~0, ~0, 0, 0 ),
|
||||
_mm256_setr_epi32( ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0 ),
|
||||
_mm256_setr_epi32( ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 )
|
||||
};
|
||||
|
||||
// Compute per-lane index list offset that guards against out of bounds memory accesses
|
||||
__m256i safeTriIdxOffset = _mm256_and_si256( SIMD_TRI_IDX_OFFSET, SIMD_LANE_MASK[numLanes] );
|
||||
|
||||
// Fetch triangle indices.
|
||||
__m256i vtxIdx[3];
|
||||
vtxIdx[0] = _mmw_mullo_epi32( _mm256_i32gather_epi32( ( const int* )inTrisPtr + 0, safeTriIdxOffset, 4 ), _mmw_set1_epi32( vtxLayout.mStride ) );
|
||||
vtxIdx[1] = _mmw_mullo_epi32( _mm256_i32gather_epi32( ( const int* )inTrisPtr + 1, safeTriIdxOffset, 4 ), _mmw_set1_epi32( vtxLayout.mStride ) );
|
||||
vtxIdx[2] = _mmw_mullo_epi32( _mm256_i32gather_epi32( ( const int* )inTrisPtr + 2, safeTriIdxOffset, 4 ), _mmw_set1_epi32( vtxLayout.mStride ) );
|
||||
|
||||
char* vPtr = ( char* )inVtx;
|
||||
|
||||
// Fetch triangle vertices
|
||||
for( int i = 0; i < 3; i++ )
|
||||
{
|
||||
vtxX[i] = _mm256_i32gather_ps( ( float* )vPtr, vtxIdx[i], 1 );
|
||||
vtxY[i] = _mm256_i32gather_ps( ( float* )( vPtr + vtxLayout.mOffsetY ), vtxIdx[i], 1 );
|
||||
vtxW[i] = _mm256_i32gather_ps( ( float* )( vPtr + vtxLayout.mOffsetW ), vtxIdx[i], 1 );
|
||||
}
|
||||
}
|
||||
|
||||
namespace MaskedOcclusionCullingAVX2
{
	// Identifies this translation unit's SIMD flavor to the common code.
	static MaskedOcclusionCulling::Implementation gInstructionSet = MaskedOcclusionCulling::AVX2;

	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	// Pull in the SIMD-width independent algorithm, specialized by the macros and helpers defined above
	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

	#include "MaskedOcclusionCullingCommon.inl"

	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	// Factory: creates an AVX2 occlusion culling object with the supplied allocator callbacks
	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

	typedef MaskedOcclusionCulling::pfnAlignedAlloc pfnAlignedAlloc;
	typedef MaskedOcclusionCulling::pfnAlignedFree pfnAlignedFree;

	MaskedOcclusionCulling* CreateMaskedOcclusionCulling( pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree )
	{
		// 64-byte aligned storage, constructed in place via placement new
		void* storage = alignedAlloc( 64, sizeof( MaskedOcclusionCullingPrivate ) );
		return new( storage ) MaskedOcclusionCullingPrivate( alignedAlloc, alignedFree );
	}
};
|
||||
|
||||
#else
|
||||
|
||||
namespace MaskedOcclusionCullingAVX2
|
||||
{
|
||||
typedef MaskedOcclusionCulling::pfnAlignedAlloc pfnAlignedAlloc;
|
||||
typedef MaskedOcclusionCulling::pfnAlignedFree pfnAlignedFree;
|
||||
|
||||
MaskedOcclusionCulling* CreateMaskedOcclusionCulling( pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree )
|
||||
{
|
||||
return nullptr;
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
364
neo/libs/moc/MaskedOcclusionCullingAVX512.cpp
Normal file
364
neo/libs/moc/MaskedOcclusionCullingAVX512.cpp
Normal file
|
@ -0,0 +1,364 @@
|
|||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Copyright 2017 Intel Corporation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
#include <float.h>
|
||||
#include "MaskedOcclusionCulling.h"
|
||||
#include "CompilerSpecific.inl"
|
||||
|
||||
#if MOC_RECORDER_ENABLE
|
||||
#include "FrameRecorder.h"
|
||||
#endif
|
||||
|
||||
// Make sure compiler supports AVX-512 intrinsics: Visual Studio 2017 (Update 3) || Intel C++ Compiler 16.0 || Clang 4.0 || GCC 5.0
|
||||
#if USE_AVX512 != 0 && ((defined(_MSC_VER) && _MSC_VER >= 1911) || (defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1600) || (defined(__clang__) && __clang_major__ >= 4) || (defined(__GNUC__) && __GNUC__ >= 5))
|
||||
|
||||
// The MaskedOcclusionCullingAVX512.cpp file should be compiled with avx2/avx512 architecture options turned on in the compiler. However, the SSE
|
||||
// version in MaskedOcclusionCulling.cpp _must_ be compiled with SSE2 architecture to allow backwards compatibility. Best practice is to
|
||||
// use lowest supported target platform (e.g. /arch:SSE2) as project default, and elevate only the MaskedOcclusionCullingAVX2/512.cpp files.
|
||||
#ifndef __AVX2__
|
||||
#error For best performance, MaskedOcclusionCullingAVX512.cpp should be compiled with /arch:AVX2
|
||||
#endif
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// AVX specific defines and constants
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#define SIMD_LANES 16
|
||||
#define TILE_HEIGHT_SHIFT 4
|
||||
|
||||
#define SIMD_LANE_IDX _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
|
||||
|
||||
#define SIMD_SUB_TILE_COL_OFFSET _mm512_setr_epi32(0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3, 0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3, 0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3, 0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3)
|
||||
#define SIMD_SUB_TILE_ROW_OFFSET _mm512_setr_epi32(0, 0, 0, 0, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT * 2, SUB_TILE_HEIGHT * 2, SUB_TILE_HEIGHT * 2, SUB_TILE_HEIGHT * 2, SUB_TILE_HEIGHT * 3, SUB_TILE_HEIGHT * 3, SUB_TILE_HEIGHT * 3, SUB_TILE_HEIGHT * 3)
|
||||
#define SIMD_SUB_TILE_COL_OFFSET_F _mm512_setr_ps(0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3, 0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3, 0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3, 0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3)
|
||||
#define SIMD_SUB_TILE_ROW_OFFSET_F _mm512_setr_ps(0, 0, 0, 0, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT * 2, SUB_TILE_HEIGHT * 2, SUB_TILE_HEIGHT * 2, SUB_TILE_HEIGHT * 2, SUB_TILE_HEIGHT * 3, SUB_TILE_HEIGHT * 3, SUB_TILE_HEIGHT * 3, SUB_TILE_HEIGHT * 3)
|
||||
|
||||
#define SIMD_SHUFFLE_SCANLINE_TO_SUBTILES _mm512_set_epi32(0x0F0B0703, 0x0E0A0602, 0x0D090501, 0x0C080400, 0x0F0B0703, 0x0E0A0602, 0x0D090501, 0x0C080400, 0x0F0B0703, 0x0E0A0602, 0x0D090501, 0x0C080400, 0x0F0B0703, 0x0E0A0602, 0x0D090501, 0x0C080400)
|
||||
|
||||
#define SIMD_LANE_YCOORD_I _mm512_setr_epi32(128, 384, 640, 896, 1152, 1408, 1664, 1920, 2176, 2432, 2688, 2944, 3200, 3456, 3712, 3968)
|
||||
#define SIMD_LANE_YCOORD_F _mm512_setr_ps(128.0f, 384.0f, 640.0f, 896.0f, 1152.0f, 1408.0f, 1664.0f, 1920.0f, 2176.0f, 2432.0f, 2688.0f, 2944.0f, 3200.0f, 3456.0f, 3712.0f, 3968.0f)
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// AVX specific typedefs and functions
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
typedef __m512 __mw;
|
||||
typedef __m512i __mwi;
|
||||
|
||||
#define _mmw_set1_ps _mm512_set1_ps
|
||||
#define _mmw_setzero_ps _mm512_setzero_ps
|
||||
#define _mmw_and_ps _mm512_and_ps
|
||||
#define _mmw_or_ps _mm512_or_ps
|
||||
#define _mmw_xor_ps _mm512_xor_ps
|
||||
#define _mmw_not_ps(a) _mm512_xor_ps((a), _mm512_castsi512_ps(_mm512_set1_epi32(~0)))
|
||||
#define _mmw_andnot_ps _mm512_andnot_ps
|
||||
#define _mmw_neg_ps(a) _mm512_xor_ps((a), _mm512_set1_ps(-0.0f))
|
||||
#define _mmw_abs_ps(a) _mm512_and_ps((a), _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF)))
|
||||
#define _mmw_add_ps _mm512_add_ps
|
||||
#define _mmw_sub_ps _mm512_sub_ps
|
||||
#define _mmw_mul_ps _mm512_mul_ps
|
||||
#define _mmw_div_ps _mm512_div_ps
|
||||
#define _mmw_min_ps _mm512_min_ps
|
||||
#define _mmw_max_ps _mm512_max_ps
|
||||
#define _mmw_fmadd_ps _mm512_fmadd_ps
|
||||
#define _mmw_fmsub_ps _mm512_fmsub_ps
|
||||
#define _mmw_shuffle_ps _mm512_shuffle_ps
|
||||
#define _mmw_insertf32x4_ps _mm512_insertf32x4
|
||||
#define _mmw_cvtepi32_ps _mm512_cvtepi32_ps
|
||||
#define _mmw_blendv_epi32(a,b,c) simd_cast<__mwi>(_mmw_blendv_ps(simd_cast<__mw>(a), simd_cast<__mw>(b), simd_cast<__mw>(c)))
|
||||
|
||||
#define _mmw_set1_epi32 _mm512_set1_epi32
|
||||
#define _mmw_setzero_epi32 _mm512_setzero_si512
|
||||
#define _mmw_and_epi32 _mm512_and_si512
|
||||
#define _mmw_or_epi32 _mm512_or_si512
|
||||
#define _mmw_xor_epi32 _mm512_xor_si512
|
||||
#define _mmw_not_epi32(a) _mm512_xor_si512((a), _mm512_set1_epi32(~0))
|
||||
#define _mmw_andnot_epi32 _mm512_andnot_si512
|
||||
#define _mmw_neg_epi32(a) _mm512_sub_epi32(_mm512_set1_epi32(0), (a))
|
||||
#define _mmw_add_epi32 _mm512_add_epi32
|
||||
#define _mmw_sub_epi32 _mm512_sub_epi32
|
||||
#define _mmw_min_epi32 _mm512_min_epi32
|
||||
#define _mmw_max_epi32 _mm512_max_epi32
|
||||
#define _mmw_subs_epu16 _mm512_subs_epu16
|
||||
#define _mmw_mullo_epi32 _mm512_mullo_epi32
|
||||
#define _mmw_srai_epi32 _mm512_srai_epi32
|
||||
#define _mmw_srli_epi32 _mm512_srli_epi32
|
||||
#define _mmw_slli_epi32 _mm512_slli_epi32
|
||||
#define _mmw_sllv_ones(x) _mm512_sllv_epi32(SIMD_BITS_ONE, x)
|
||||
#define _mmw_transpose_epi8(x) _mm512_shuffle_epi8(x, SIMD_SHUFFLE_SCANLINE_TO_SUBTILES)
|
||||
#define _mmw_abs_epi32 _mm512_abs_epi32
|
||||
#define _mmw_cvtps_epi32 _mm512_cvtps_epi32
|
||||
#define _mmw_cvttps_epi32 _mm512_cvttps_epi32
|
||||
|
||||
#define _mmx_dp4_ps(a, b) _mm_dp_ps(a, b, 0xFF)
|
||||
#define _mmx_fmadd_ps _mm_fmadd_ps
|
||||
#define _mmx_max_epi32 _mm_max_epi32
|
||||
#define _mmx_min_epi32 _mm_min_epi32
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// SIMD casting functions
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Compile-time cast/splat between SIMD register types. The unspecialized
// template is never instantiated; every supported conversion is provided as an
// explicit specialization below.
template<typename T, typename Y> FORCE_INLINE T simd_cast( Y value );

// 128-bit conversions
template<> FORCE_INLINE __m128 simd_cast<__m128>( float value )
{
	return _mm_set1_ps( value );
}
template<> FORCE_INLINE __m128 simd_cast<__m128>( __m128i value )
{
	return _mm_castsi128_ps( value );
}
template<> FORCE_INLINE __m128 simd_cast<__m128>( __m128 value )
{
	return value;
}
template<> FORCE_INLINE __m128i simd_cast<__m128i>( int value )
{
	return _mm_set1_epi32( value );
}
template<> FORCE_INLINE __m128i simd_cast<__m128i>( __m128 value )
{
	return _mm_castps_si128( value );
}
template<> FORCE_INLINE __m128i simd_cast<__m128i>( __m128i value )
{
	return value;
}

// 256-bit conversions
template<> FORCE_INLINE __m256 simd_cast<__m256>( float value )
{
	return _mm256_set1_ps( value );
}
template<> FORCE_INLINE __m256 simd_cast<__m256>( __m256i value )
{
	return _mm256_castsi256_ps( value );
}
template<> FORCE_INLINE __m256 simd_cast<__m256>( __m256 value )
{
	return value;
}
template<> FORCE_INLINE __m256i simd_cast<__m256i>( int value )
{
	return _mm256_set1_epi32( value );
}
template<> FORCE_INLINE __m256i simd_cast<__m256i>( __m256 value )
{
	return _mm256_castps_si256( value );
}
template<> FORCE_INLINE __m256i simd_cast<__m256i>( __m256i value )
{
	return value;
}

// 512-bit conversions
template<> FORCE_INLINE __m512 simd_cast<__m512>( float value )
{
	return _mm512_set1_ps( value );
}
template<> FORCE_INLINE __m512 simd_cast<__m512>( __m512i value )
{
	return _mm512_castsi512_ps( value );
}
template<> FORCE_INLINE __m512 simd_cast<__m512>( __m512 value )
{
	return value;
}
template<> FORCE_INLINE __m512i simd_cast<__m512i>( int value )
{
	return _mm512_set1_epi32( value );
}
template<> FORCE_INLINE __m512i simd_cast<__m512i>( __m512 value )
{
	return _mm512_castps_si512( value );
}
template<> FORCE_INLINE __m512i simd_cast<__m512i>( __m512i value )
{
	return value;
}
|
||||
|
||||
#define MAKE_ACCESSOR(name, simd_type, base_type, is_const, elements) \
|
||||
FORCE_INLINE is_const base_type * name(is_const simd_type &a) { \
|
||||
union accessor { simd_type m_native; base_type m_array[elements]; }; \
|
||||
is_const accessor *acs = reinterpret_cast<is_const accessor*>(&a); \
|
||||
return acs->m_array; \
|
||||
}
|
||||
|
||||
MAKE_ACCESSOR( simd_f32, __m128, float,, 4 )
|
||||
MAKE_ACCESSOR( simd_f32, __m128, float, const, 4 )
|
||||
MAKE_ACCESSOR( simd_i32, __m128i, int,, 4 )
|
||||
MAKE_ACCESSOR( simd_i32, __m128i, int, const, 4 )
|
||||
|
||||
MAKE_ACCESSOR( simd_f32, __m256, float,, 8 )
|
||||
MAKE_ACCESSOR( simd_f32, __m256, float, const, 8 )
|
||||
MAKE_ACCESSOR( simd_i32, __m256i, int,, 8 )
|
||||
MAKE_ACCESSOR( simd_i32, __m256i, int, const, 8 )
|
||||
|
||||
MAKE_ACCESSOR( simd_f32, __m512, float,, 16 )
|
||||
MAKE_ACCESSOR( simd_f32, __m512, float, const, 16 )
|
||||
MAKE_ACCESSOR( simd_i32, __m512i, int,, 16 )
|
||||
MAKE_ACCESSOR( simd_i32, __m512i, int, const, 16 )
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Specialized AVX input assembly function for general vertex gather
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
typedef MaskedOcclusionCulling::VertexLayout VertexLayout;
|
||||
|
||||
FORCE_INLINE void GatherVertices( __m512* vtxX, __m512* vtxY, __m512* vtxW, const float* inVtx, const unsigned int* inTrisPtr, int numLanes, const VertexLayout& vtxLayout )
|
||||
{
|
||||
assert( numLanes >= 1 );
|
||||
|
||||
const __m512i SIMD_TRI_IDX_OFFSET = _mm512_setr_epi32( 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45 );
|
||||
static const __m512i SIMD_LANE_MASK[17] =
|
||||
{
|
||||
_mm512_setr_epi32( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ),
|
||||
_mm512_setr_epi32( ~0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ),
|
||||
_mm512_setr_epi32( ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ),
|
||||
_mm512_setr_epi32( ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ),
|
||||
_mm512_setr_epi32( ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ),
|
||||
_mm512_setr_epi32( ~0, ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ),
|
||||
_mm512_setr_epi32( ~0, ~0, ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ),
|
||||
_mm512_setr_epi32( ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ),
|
||||
_mm512_setr_epi32( ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0 ),
|
||||
_mm512_setr_epi32( ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0 ),
|
||||
_mm512_setr_epi32( ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0 ),
|
||||
_mm512_setr_epi32( ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0 ),
|
||||
_mm512_setr_epi32( ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0, 0, 0, 0 ),
|
||||
_mm512_setr_epi32( ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0, 0, 0 ),
|
||||
_mm512_setr_epi32( ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0, 0 ),
|
||||
_mm512_setr_epi32( ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0 ),
|
||||
_mm512_setr_epi32( ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 )
|
||||
};
|
||||
|
||||
// Compute per-lane index list offset that guards against out of bounds memory accesses
|
||||
__m512i safeTriIdxOffset = _mm512_and_si512( SIMD_TRI_IDX_OFFSET, SIMD_LANE_MASK[numLanes] );
|
||||
|
||||
// Fetch triangle indices.
|
||||
__m512i vtxIdx[3];
|
||||
vtxIdx[0] = _mmw_mullo_epi32( _mm512_i32gather_epi32( safeTriIdxOffset, ( const int* )inTrisPtr + 0, 4 ), _mmw_set1_epi32( vtxLayout.mStride ) );
|
||||
vtxIdx[1] = _mmw_mullo_epi32( _mm512_i32gather_epi32( safeTriIdxOffset, ( const int* )inTrisPtr + 1, 4 ), _mmw_set1_epi32( vtxLayout.mStride ) );
|
||||
vtxIdx[2] = _mmw_mullo_epi32( _mm512_i32gather_epi32( safeTriIdxOffset, ( const int* )inTrisPtr + 2, 4 ), _mmw_set1_epi32( vtxLayout.mStride ) );
|
||||
|
||||
char* vPtr = ( char* )inVtx;
|
||||
|
||||
// Fetch triangle vertices
|
||||
for( int i = 0; i < 3; i++ )
|
||||
{
|
||||
vtxX[i] = _mm512_i32gather_ps( vtxIdx[i], ( float* )vPtr, 1 );
|
||||
vtxY[i] = _mm512_i32gather_ps( vtxIdx[i], ( float* )( vPtr + vtxLayout.mOffsetY ), 1 );
|
||||
vtxW[i] = _mm512_i32gather_ps( vtxIdx[i], ( float* )( vPtr + vtxLayout.mOffsetW ), 1 );
|
||||
}
|
||||
}
|
||||
|
||||
namespace MaskedOcclusionCullingAVX512
|
||||
{
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Poorly implemented functions. TODO: fix common (maskedOcclusionCullingCommon.inl) code to improve perf
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
FORCE_INLINE __m512 _mmw_floor_ps( __m512 x )
|
||||
{
|
||||
return _mm512_roundscale_ps( x, 1 ); // 1 = floor
|
||||
}
|
||||
|
||||
FORCE_INLINE __m512 _mmw_ceil_ps( __m512 x )
|
||||
{
|
||||
return _mm512_roundscale_ps( x, 2 ); // 2 = ceil
|
||||
}
|
||||
|
||||
FORCE_INLINE __m512i _mmw_cmpeq_epi32( __m512i a, __m512i b )
|
||||
{
|
||||
__mmask16 mask = _mm512_cmpeq_epi32_mask( a, b );
|
||||
return _mm512_mask_mov_epi32( _mm512_set1_epi32( 0 ), mask, _mm512_set1_epi32( ~0 ) );
|
||||
}
|
||||
|
||||
FORCE_INLINE __m512i _mmw_cmpgt_epi32( __m512i a, __m512i b )
|
||||
{
|
||||
__mmask16 mask = _mm512_cmpgt_epi32_mask( a, b );
|
||||
return _mm512_mask_mov_epi32( _mm512_set1_epi32( 0 ), mask, _mm512_set1_epi32( ~0 ) );
|
||||
}
|
||||
|
||||
FORCE_INLINE bool _mmw_testz_epi32( __m512i a, __m512i b )
|
||||
{
|
||||
__mmask16 mask = _mm512_cmpeq_epi32_mask( _mm512_and_si512( a, b ), _mm512_set1_epi32( 0 ) );
|
||||
return mask == 0xFFFF;
|
||||
}
|
||||
|
||||
FORCE_INLINE __m512 _mmw_cmpge_ps( __m512 a, __m512 b )
|
||||
{
|
||||
__mmask16 mask = _mm512_cmp_ps_mask( a, b, _CMP_GE_OQ );
|
||||
return _mm512_castsi512_ps( _mm512_mask_mov_epi32( _mm512_set1_epi32( 0 ), mask, _mm512_set1_epi32( ~0 ) ) );
|
||||
}
|
||||
|
||||
FORCE_INLINE __m512 _mmw_cmpgt_ps( __m512 a, __m512 b )
|
||||
{
|
||||
__mmask16 mask = _mm512_cmp_ps_mask( a, b, _CMP_GT_OQ );
|
||||
return _mm512_castsi512_ps( _mm512_mask_mov_epi32( _mm512_set1_epi32( 0 ), mask, _mm512_set1_epi32( ~0 ) ) );
|
||||
}
|
||||
|
||||
FORCE_INLINE __m512 _mmw_cmpeq_ps( __m512 a, __m512 b )
|
||||
{
|
||||
__mmask16 mask = _mm512_cmp_ps_mask( a, b, _CMP_EQ_OQ );
|
||||
return _mm512_castsi512_ps( _mm512_mask_mov_epi32( _mm512_set1_epi32( 0 ), mask, _mm512_set1_epi32( ~0 ) ) );
|
||||
}
|
||||
|
||||
FORCE_INLINE __mmask16 _mmw_movemask_ps( const __m512& a )
|
||||
{
|
||||
__mmask16 mask = _mm512_cmp_epi32_mask( _mm512_and_si512( _mm512_castps_si512( a ), _mm512_set1_epi32( 0x80000000 ) ), _mm512_set1_epi32( 0 ), 4 ); // a & 0x8000000 != 0
|
||||
return mask;
|
||||
}
|
||||
|
||||
FORCE_INLINE __m512 _mmw_blendv_ps( const __m512& a, const __m512& b, const __m512& c )
|
||||
{
|
||||
__mmask16 mask = _mmw_movemask_ps( c );
|
||||
return _mm512_mask_mov_ps( a, mask, b );
|
||||
}
|
||||
|
||||
static MaskedOcclusionCulling::Implementation gInstructionSet = MaskedOcclusionCulling::AVX512;
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Include common algorithm implementation (general, SIMD independent code)
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "MaskedOcclusionCullingCommon.inl"
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Utility function to create a new object using the allocator callbacks
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
typedef MaskedOcclusionCulling::pfnAlignedAlloc pfnAlignedAlloc;
typedef MaskedOcclusionCulling::pfnAlignedFree pfnAlignedFree;

// Factory for the AVX-512 implementation: obtain 64-byte aligned storage
// through the caller-supplied allocator and construct the implementation
// object in place. The callbacks are forwarded so the instance can later
// free itself with the matching deallocator.
MaskedOcclusionCulling* CreateMaskedOcclusionCulling( pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree )
{
	void* storage = alignedAlloc( 64, sizeof( MaskedOcclusionCullingPrivate ) );
	return new( storage ) MaskedOcclusionCullingPrivate( alignedAlloc, alignedFree );
}
|
||||
};
|
||||
|
||||
#else
|
||||
|
||||
// Fallback branch compiled when AVX-512 support is unavailable (#else of the
// feature-detection preprocessor check above).
namespace MaskedOcclusionCullingAVX512
{
// Same callback typedefs as the real implementation so the factory signature matches.
typedef MaskedOcclusionCulling::pfnAlignedAlloc pfnAlignedAlloc;
typedef MaskedOcclusionCulling::pfnAlignedFree pfnAlignedFree;

// Stub factory: always returns nullptr to signal that no AVX-512
// implementation could be built. NOTE(review): callers presumably treat a
// null return as "unsupported" and pick another SIMD path — confirm at the
// dispatch site.
MaskedOcclusionCulling* CreateMaskedOcclusionCulling( pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree )
{
return nullptr;
}
};
|
||||
|
||||
#endif
|
2050
neo/libs/moc/MaskedOcclusionCullingCommon.inl
Normal file
2050
neo/libs/moc/MaskedOcclusionCullingCommon.inl
Normal file
File diff suppressed because it is too large
Load diff
450
neo/libs/moc/README.md
Normal file
450
neo/libs/moc/README.md
Normal file
|
@ -0,0 +1,450 @@
|
|||
# MaskedOcclusionCulling
|
||||
|
||||
This code accompanies the research paper ["Masked Software Occlusion Culling"](https://software.intel.com/en-us/articles/masked-software-occlusion-culling),
|
||||
and implements an efficient alternative to the hierarchical depth buffer algorithm. Our algorithm decouples depth values and coverage, and operates directly
|
||||
on the hierarchical depth buffer. It lets us efficiently parallelize both coverage computations and hierarchical depth buffer updates.
|
||||
|
||||
## Update May 2018
|
||||
|
||||
Added the ability to merge 2 depth buffers, this allows both an alterative method for parallelizing buffer creation and a way to reduce silhouette bleed when input data cannot be roughly sorted from front to back, for example rendering large terrain patches with foreground occluders in an open world game engine.
|
||||
|
||||
## Requirements
|
||||
|
||||
This code is mainly optimized for AVX capable CPUs. However, we also provide SSE 4.1 and SSE 2 implementations for backwards compatibility. The appropriate
|
||||
implementation will be chosen during run-time based on the CPU's capabilities.
|
||||
|
||||
## Notes on build time
|
||||
|
||||
The code is optimized for runtime performance and may require a long time to compile due to heavy code inlining. This can be worked around by compiling
|
||||
a library file. An alternative solution is to disable *whole program optimizations* for the `MaskedOcclusionCulling.cpp`,
|
||||
`MaskedOcclusionCullingAVX2.cpp` and `MaskedOcclusionCullingAVX512.cpp` files. It does not impact runtime performance, but greatly reduces the time of program linking.
|
||||
|
||||
## <a name="cs"></a>Notes on coordinate systems and winding
|
||||
|
||||
Most inputs are given as clip space (x,y,w) coordinates assuming the same right handed coordinate system as used by DirectX and OpenGL (x positive right, y
|
||||
positive up and w positive in the view direction). Note that we use the clip space w coordinate for depth and disregard the z coordinate. Internally our
|
||||
masked hierarchical depth buffer stores *depth = 1 / w*.
|
||||
|
||||
The `TestRect()` function is an exception and instead accepts normalized device coordinates (NDC), *(x' = x/w, y' = y/w)*, where the visible screen region
|
||||
maps to the range [-1,1] for *x'* and *y'* (x positive right and y positive up). Again, this is consistent with both DirectX and OpenGL behavior.
|
||||
|
||||
By default, the screen space coordinate system used internally to access our hierarchical depth buffer follows DirectX conventions (y positive down), which is
|
||||
**not** consistent with OpenGL (y positive up). This can be configured by changing the `USE_D3D` define. The screen space coordinate system affects the layout
|
||||
of the buffer returned by the `ComputePixelDepthBuffer()` function, scissor rectangles (which are specified in screen space coordinates), and rasterization
|
||||
tie-breaker rules if `PRECISE_COVERAGE` is enabled.
|
||||
|
||||
## API / Tutorial
|
||||
|
||||
We have made an effort to keep the API as simple and minimal as possible. The rendering functions are quite similar to submitting DirectX or OpenGL drawcalls
|
||||
and we hope they will feel natural to anyone with graphics programming experience. In the following we will use the example project as a tutorial to showcase
|
||||
the API. Please refer to the documentation in the header file for further details.
|
||||
|
||||
### Setup
|
||||
|
||||
We begin by creating a new instance of the occlusion culling object. The object is created using the static `Create()` function rather than a standard
|
||||
constructor, and can be destroyed using the `Destroy()` function. The reason for using the factory `Create()`/`Destroy()` design pattern is that we want to
|
||||
support custom (aligned) memory allocators, and that the library chooses either the AVX-512, AVX or SSE implementation based on the CPU's capabilities.
|
||||
|
||||
```C++
|
||||
MaskedOcclusionCulling *moc = MaskedOcclusionCulling::Create();
|
||||
|
||||
...
|
||||
|
||||
MaskedOcclusionCulling::Destroy(moc);
|
||||
```
|
||||
|
||||
The created object is empty and has no hierarchical depth buffer attached, so we must first allocate a buffer using the `SetResolution()` function. This function can
|
||||
also be used later to resize the hierarchical depth buffer, causing it to be re-allocated. Note that the resolution width must be a multiple of 8, and the height
|
||||
a multiple of 4. This is a limitation of the occlusion culling algorithm.
|
||||
|
||||
```C++
|
||||
int width = 1920;
|
||||
int height = 1080;
|
||||
moc.SetResolution(width, height); // Set full HD resolution
|
||||
```
|
||||
After setting the resolution we can start rendering occluders and performing occlusion queries. We must first clear the hierarchical depth buffer
|
||||
|
||||
```C++
|
||||
// Clear hierarchical depth buffer to far depth
|
||||
moc.ClearDepthBuffer();
|
||||
```
|
||||
|
||||
**Optional** The `SetNearClipPlane()` function can be used to configure the distance to the near clipping plane to make the occlusion culling renderer match your DX/GL
|
||||
renderer. The default value for the near plane is 0 which should work as expected unless your application relies on having onscreen geometry clipped by
|
||||
the near plane.
|
||||
|
||||
```C++
|
||||
float nearClipDist = 1.0f;
|
||||
moc.SetNearClipPlane(nearClipDist); // Set near clipping dist (optional)
|
||||
```
|
||||
|
||||
### Occluder rendering
|
||||
|
||||
The `RenderTriangles()` function renders triangle meshes to the hierarchical depth buffer. Similar to DirectX/OpenGL, meshes are constructed from a vertex array
|
||||
and a triangle index array. By default, the vertices are given as *(x,y,z,w)* floating point clip space coordinates, but the *z*-coordinate is ignored and
|
||||
instead we use *depth = 1 / w*. We expose a `TransformVertices()` utility function to transform vertices from *(x,y,z,1)* model/world space to *(x,y,z,w)* clip
|
||||
space, but you can use your own transform code as well. For more information on the `TransformVertices()` function, please refer to the documentation in the
|
||||
header file.
|
||||
|
||||
The triangle index array is identical to a DirectX or OpenGL triangle list and connects vertices to form triangles. Every three indices in the array form a new
|
||||
triangle, so the size of the array must be a multiple of 3. Note that we only support triangle lists, and we currently have no plans on supporting other primitives
|
||||
such as strips or fans.
|
||||
|
||||
```C++
|
||||
struct ClipspaceVertex { float x, y, z, w; };
|
||||
|
||||
// Create an example triangle. The z component of each vertex is not used by the
|
||||
// occlusion culling system.
|
||||
ClipspaceVertex triVerts[] = { { 5, 0, 0, 10 }, { 30, 0, 0, 20 }, { 10, 50, 0, 40 } };
|
||||
unsigned int triIndices[] = { 0, 1, 2 };
|
||||
unsigned int nTris = 1;
|
||||
|
||||
// Render an example triangle
|
||||
moc.RenderTriangles(triVerts, triIndices, nTris);
|
||||
```
|
||||
|
||||
**Transform** It is possible to include a transform when calling `RenderTriangles()`, by passing the modelToClipSpace parameter. This is equivalent to calling `TransformVertices()`, followed
|
||||
by `RenderTriangles()`, but performing the transform as shown in the example below typically
|
||||
leads to better performance.
|
||||
|
||||
```C++
|
||||
// Example matrix swapping the x and y coordinates
|
||||
float swapxyMatrix[4][4] = {
|
||||
{0,1,0,0},
|
||||
{1,0,0,0},
|
||||
{0,0,1,0},
|
||||
{0,0,0,1}};
|
||||
|
||||
// Render triangle with transform.
|
||||
moc.RenderTriangles(triVerts, triIndices, nTris, swapxyMatrix);
|
||||
```
|
||||
|
||||
**Backface Culling** By default, clockwise winded triangles are considered backfacing and are culled when rasterizing occluders. However, you can
|
||||
configure the `RenderTriangles()` function to backface cull either clockwise or counter-clockwise winded triangles, or to disable backface culling
|
||||
for two-sided rendering.
|
||||
|
||||
```C++
|
||||
// A clockwise winded (normally backfacing) triangle
|
||||
ClipspaceVertex cwTriVerts[] = { { 7, -7, 0, 20 },{ 7.5, -7, 0, 20 },{ 7, -7.5, 0, 20 } };
|
||||
unsigned int cwTriIndices[] = { 0, 1, 2 };
|
||||
|
||||
// Render with counter-clockwise backface culling, the triangle is drawn
|
||||
moc->RenderTriangles((float*)cwTriVerts, cwTriIndices, 1, nullptr, BACKFACE_CCW);
|
||||
```
|
||||
|
||||
The rasterization code only handles counter-clockwise winded triangles, so configurable backface culling is implemented by re-winding clockwise winded triangles
|
||||
on the fly. Therefore, other culling modes than `BACKFACE_CW` may decrease performance slightly.
|
||||
|
||||
**Clip Flags** `RenderTriangles()` accepts an additional parameter to optimize polygon clipping. The calling application may disable any clipping plane if it can
|
||||
guarantee that the mesh does not intersect said clipping plane. In the example below we have a quad which is entirely on screen, and we can disable
|
||||
all clipping planes. **Warning** it is unsafe to incorrectly disable clipping planes and this may cause the program to crash or perform out of bounds
|
||||
memory accesses. Consider this a power user feature (use `CLIP_PLANE_ALL` to clip against the full frustum when in doubt).
|
||||
|
||||
```C++
|
||||
// Create a quad completely within the view frustum
|
||||
ClipspaceVertex quadVerts[]
|
||||
= { { -150, -150, 0, 200 },{ -10, -65, 0, 75 },{ 0, 0, 0, 20 },{ -40, 10, 0, 50 } };
|
||||
unsigned int quadIndices[] = { 0, 1, 2, 0, 2, 3 };
|
||||
unsigned int nTris = 2;
|
||||
|
||||
// Render the quad. As an optimization, indicate that clipping is not required
|
||||
moc.RenderTriangles((float*)quadVerts, quadIndices, nTris, nullptr, BACKFACE_CW, CLIP_PLANE_NONE);
|
||||
```
|
||||
|
||||
**Vertex Storage Layout** Finally, the `RenderTriangles()` supports configurable vertex storage layout. The code so far has used an array of structs (AoS) layout based
|
||||
on the `ClipSpaceVertex` struct, and this is the default behaviour. You may use the `VertexLayout` struct to configure the memory layout of the vertex data. Note that
|
||||
the vertex pointer passed to the `RenderTriangles()` should point at the *x* coordinate of the first vertex, so there is no x coordinate offset specified in the struct.
|
||||
|
||||
```C++
|
||||
struct VertexLayout
|
||||
{
|
||||
int mStride; // Stride between vertices
|
||||
int mOffsetY; // Offset to vertex y coordinate
|
||||
int mOffsetW; // Offset to vertex w coordinate
|
||||
};
|
||||
```
|
||||
|
||||
For example, you can configure a struct of arrays (SoA) layout as follows
|
||||
|
||||
```C++
|
||||
// A triangle specified on struct of arrays (SoA) form
|
||||
float SoAVerts[] = {
|
||||
10, 10, 7, // x-coordinates
|
||||
-10, -7, -10, // y-coordinates
|
||||
10, 10, 10 // w-coordinates
|
||||
};
|
||||
|
||||
// Set vertex layout (stride, y offset, w offset)
|
||||
VertexLayout SoAVertexLayout(sizeof(float), 3 * sizeof(float), 6 * sizeof(float));
|
||||
|
||||
// Render triangle with SoA layout
|
||||
moc.RenderTriangles((float*)SoAVerts, triIndices, 1, nullptr, BACKFACE_CW, CLIP_PLANE_ALL, SoAVertexLayout);
|
||||
```
|
||||
|
||||
Vertex layout may affect performance. We have seen no large performance impact when using either SoA or AoS layout, but generally speaking the
|
||||
vertex position data should be packed as compactly into memory as possible to minimize number of cache misses. It is, for example, not advisable to bundle vertex
|
||||
position data together with normals, texture coordinates, etc. and using a large stride.
|
||||
|
||||
### Occlusion queries
|
||||
|
||||
After rendering a few occluder meshes you can begin to perform occlusion queries. There are two functions for occlusion queries, called `TestTriangles()` and
|
||||
`TestRect()`. The `TestTriangles()` function is identical to `RenderTriangles()` with the exception being that it performs an occlusion query and does not
|
||||
update the hierarchical depth buffer. The result of the occlusion query is returned as an enum, which indicates if the triangles are `VISIBLE`, `OCCLUDED`, or were
|
||||
`VIEW_CULLED`. Here, `VIEW_CULLED` means that all triangles were either frustum culled or back face culled, so no occlusion culling test had to be performed.
|
||||
|
||||
```C++
|
||||
// A triangle that is partly, but not completely, overlapped by the quad rendered before
|
||||
ClipspaceVertex oqTriVerts[] = { { 0, 50, 0, 200 },{ -60, -60, 0, 200 },{ 20, -40, 0, 200 } };
|
||||
unsigned int oqTriIndices[] = { 0, 1, 2 };
|
||||
unsigned int nTris = 1;
|
||||
|
||||
// Perform an occlusion query. The triangle is visible and the query should return VISIBLE
|
||||
CullingResult result = moc.TestTriangles((float*)oqTriVerts, oqTriIndices, nTris);
|
||||
```
|
||||
|
||||
The `TestRect()` function performs an occlusion query for a rectangular screen space region with a given depth. It can be used to, for example, quickly test
|
||||
the projected bounding box of an object to determine if the entire object is visible or not. The function is considerably faster than `TestTriangles()` because
|
||||
it does not require input assembly, clipping, or triangle rasterization. The queries are typically less accurate as screen space bounding rectangles tend to
|
||||
grow quite large, but we've personally seen best overall performance using this type of culling.
|
||||
|
||||
```C++
|
||||
// Perform an occlusion query testing if a rectangle is visible. The rectangle is completely
|
||||
// behind the previously drawn quad, so the query should indicate that it's occluded
|
||||
result = moc.TestRect(-0.6f, -0.6f, -0.4f, -0.4f, 100);
|
||||
```
|
||||
|
||||
Unlike the other functions the input to `TestRect()` is normalized device coordinates (NDC). Normalized device coordinates are projected clip space coordinates
|
||||
*(x' = x/w, y' = y/w)* and the visible screen maps to the range [-1,1] for both the *x'* and *y'* coordinate. The w coordinate is still given in clip space,
|
||||
however. It is up to the application to compute projected bounding rectangles from the object's bounding shapes.
|
||||
|
||||
### Debugging and visualization
|
||||
|
||||
We expose a utility function, `ComputePixelDepthBuffer()` that can be used to visualize the hierarchical depth buffer used internally by the occlusion culling
|
||||
system. The function fills in a complete per-pixel depth buffer, but the internal representation is hierarchical with just two depth values and a mask stored per
|
||||
tile. It is not reasonable to expect the image to completely match the exact depth buffer, and you may notice some areas where background objects leak through
|
||||
the foreground. Leakage is part of the algorithm (and one reason for the high performance), and we have
|
||||
not found it to be problematic. However, if you experience issues due to leakage you may want to disable the `QUICK_MASK` define, described in more detail in the
|
||||
section on [hierarchical depth buffer updates](#update).
|
||||
|
||||
```C++
|
||||
// Compute a per pixel depth buffer from the hierarchical depth buffer, used for visualization.
|
||||
float *perPixelZBuffer = new float[width * height];
|
||||
moc.ComputePixelDepthBuffer(perPixelZBuffer);
|
||||
```
|
||||
|
||||
We also support basic instrumentation to help with profiling and debugging. By defining `ENABLE_STATS` in the header file, the occlusion culling code will
|
||||
gather statistics about the number of occluders rendered and occlusion queries performed. For more details about the statistics, see the
|
||||
`OcclusionCullingStatistics` struct. The statistics can be queried using the `GetStatistics()` function, which will simply return a zeroed struct if `ENABLE_STATS`
|
||||
is not defined. Note that instrumentation reduces performance somewhat and should generally be disabled in release builds.
|
||||
|
||||
```C++
|
||||
OcclusionCullingStatistics stats = moc.GetStatistics();
|
||||
```
|
||||
|
||||
### Memory management
|
||||
|
||||
As shown in the example below, you may optionally provide callback functions for allocating and freeing memory when creating a
|
||||
`MaskedOcclusionCulling` object. The functions must support aligned allocations.
|
||||
|
||||
```C++
|
||||
void *alignedAllocCallback(size_t alignment, size_t size)
|
||||
{
|
||||
...
|
||||
}
|
||||
|
||||
void alignedFreeCallback(void *ptr)
|
||||
{
|
||||
...
|
||||
}
|
||||
|
||||
MaskedOcclusionCulling *moc = MaskedOcclusionCulling::Create(alignedAllocCallback, alignedFreeCallback);
|
||||
```
|
||||
|
||||
## <a name="update"></a>Hierarchical depth buffer update algorithm and render order
|
||||
|
||||
The library contains two update algorithms / heuristics for the hierarchical depth buffer, one focused on speed and one focused on accuracy. The
|
||||
active algorithm can be configured using the `QUICK_MASK` define. Setting the define (default) enables the algorithm described in the research paper
|
||||
["Masked Software Occlusion Culling"](https://software.intel.com/en-us/articles/masked-software-occlusion-culling), which has a good balance between low
|
||||
leakage and good performance. Not defining `QUICK_MASK` enables the merging heuristic used in the paper
|
||||
["Masked Depth Culling for Graphics Hardware"](http://dl.acm.org/citation.cfm?id=2818138). It is more accurate, with less leakage, but also has lower performance.
|
||||
|
||||
If you experience problems due to leakage you may want to use the more accurate update algorithm. However, rendering order can also affect the quality
|
||||
of the hierarchical depth buffer, with the best order being rendering objects front-to-back. We perform early depth culling tests during occluder
|
||||
|
||||
rendering, so rendering in front-to-back order will not only improve quality, but also greatly improve performance of occluder rendering. If your scene
|
||||
is stored in a hierarchical data structure, it is often possible to modify the traversal algorithm to traverse nodes in approximate front-to-back order,
|
||||
see the research paper ["Masked Software Occlusion Culling"](https://software.intel.com/en-us/articles/masked-software-occlusion-culling) for an example.
|
||||
|
||||
## <a name="interleaved"></a>Interleaving occluder rendering and occlusion queries
|
||||
|
||||
The library supports *light weight* switching between occluder rendering and occlusion queries. While it is still possible to do occlusion culling
|
||||
as a standard two pass algorithm (first render all occluders, then perform all queries) it is typically beneficial to interleave occluder rendering with
|
||||
queries.
|
||||
|
||||
This is especially powerful when rendering objects in front-to-back order. After drawing the first few occluder triangles, you can start performing
|
||||
occlusion queries, and if the occlusion query indicate that an object is occluded there is no need to draw the occluder mesh for that object. This
|
||||
can greatly improve the performance of the occlusion culling pass in itself. As described in further detail in the research paper
|
||||
["Masked Software Occlusion Culling"](https://software.intel.com/en-us/articles/masked-software-occlusion-culling), this may be used to perform early exits in
|
||||
BVH traversal code.
|
||||
|
||||
## Rasterization precision
|
||||
|
||||
The library supports high precision rasterization through Bresenham interpolation, and this may be enabled by changing the `PRECISE_COVERAGE` define in
|
||||
the header file. The high precision rasterizer is somewhat slower (5-15%) than using the default rasterizer, but is compliant with DirectX 11 and OpenGL
|
||||
rasterization rules. We have empirically verified it on a large set of randomly generated on-screen triangles. While there still may be differences to GPU
|
||||
rasterization due to clipping or vertex transform precision differences, we have not noticed any differences in rasterized coverage in our test scenes. Note
|
||||
that tie breaker rules and vertex rounding behaves differently between DirectX and OpenGL due to the direction of the screen space Y axis. The `USE_D3D` define
|
||||
(enabled by default) can be used to toggle between DirectX or OpenGL behaviour.
|
||||
|
||||
## Multi-threading and binned rendering
|
||||
|
||||
Multi-threading is supported through a binning rasterizer. The `MaskedOcclusionCulling` class exposes two functions, `BinTriangles()` and `RenderTrilist()`
|
||||
that may be used to perform binning, and render all triangles assigned to a bin. Using binned rasterization makes it simple to guarantee that no two threads are
|
||||
accessing the same part of the framebuffer, as rendering is limited to a particular bin, or region of the screen.
|
||||
|
||||
Binned rendering starts by performing geometry processing (primitive assembly, vertex transform, clipping, and projection) followed by a binning step, where
|
||||
triangles are written to all bins they overlap. This is performed using the `BinTriangles()` function, which is very similar to the `RenderTriangles()`
|
||||
function, but provides some additional parameters for specifying the number of bins the screen is split into. The calling application also needs to pass a
|
||||
pointer to an array of `TriList` object, with one instance per bin. Each `TriList` object points to a "scratchpad" memory buffer, and all triangles overlapping
|
||||
that bin will be written to the buffer.
|
||||
|
||||
```C++
|
||||
const int binsW = 4;
|
||||
const int binsH = 4;
|
||||
|
||||
float *dataBuffer = new float[binsW*binsH*1024*3*3]; // Allocate storage for 1k triangles in each trilist
|
||||
TriList *triLists = new TriList[binsW*binsH]; // Allocate trilists for 4x4 = 16 bins
|
||||
for (int i = 0; i < binsW*binsH; ++i)
|
||||
{
|
||||
triLists[i].mNumTriangles = 1024; // triangle list capacity
|
||||
triLists[i].mTriIdx = 0; // Set triangle write pointer to first element
|
||||
triLists[i].mData = dataBuffer + i*1024*3*3; // 1024 triangles * 3 vertices * 3 floats per list
|
||||
}
|
||||
|
||||
// Perform geometry processing and write triangles to the triLists of all bins they overlap.
|
||||
moc.BinTriangles(triVerts, triIndices, nTris, triLists, binsW, binsH);
|
||||
```
|
||||
|
||||
After generating the triangle lists for each bin, the triangles may be rendered using the `RenderTrilist()` function and the rendering region should be
|
||||
limited using a scissor rectangle. It should be noted that the `BinTriangles()` function makes assumptions on the size of the bins, and the calling
|
||||
application must therefore always compute the scissor region of each bin, relying on the `ComputeBinWidthHeight()` utility function as shown in the
|
||||
example below. Note that the scissor rectangle is specified in screen space coordinates which depends on the `USE_D3D` define.
|
||||
|
||||
```C++
|
||||
unsigned int binWidth, binHeight;
|
||||
moc.ComputeBinWidthHeight(binsW, binsH, binWidth, binHeight);
|
||||
|
||||
for (int by = 0; by < binsH; ++by)
|
||||
{
|
||||
for (int bx = 0; bx < binsW ; ++bx)
|
||||
{
|
||||
// Compute scissor rectangle that matches the one assumed by BinTriangles()
|
||||
// note that the ScissorRect is specified in pixel coordinates, with (0,0)
|
||||
// being the bottom left corner
|
||||
ScissorRect binRect;
|
||||
binRect.minX = bx*binWidth;
|
||||
binRect.maxX = bx + 1 == binsW ? screenWidth : (bx + 1) * binWidth;
|
||||
binRect.minY = by*binHeight;
|
||||
binRect.maxY = by + 1 == binsH ? screenHeight : (by + 1) * binHeight;
|
||||
|
||||
// Render all triangles overlapping the current bin.
|
||||
moc.RenderTrilist(triLists[bx + by*4], &binRect);
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Multi-threading example
|
||||
|
||||
This library includes a multi-threading example in the `CullingThreadpool` class. The class interface is similar to that of `MaskedOcclusionCulling`, but occluder
|
||||
rendering is performed asynchronously. Calling the `CullingThreadpool::RenderTriangles()` function adds a render job to a command queue and immediately returns
|
||||
to the calling thread, rather than immediately performing the rendering work. Internally, the class uses the `BinTriangles()` and `RenderTrilist()` functions to
|
||||
bin all triangles of the `CullingThreadpool::RenderTriangles()` call, and distribute the tiles. At any time, there may be a number of binning jobs, and tile
|
||||
rendering jobs unprocessed, and the scheduler picks the most urgent job and process it first. If a thread runs out of available jobs, task stealing is used as a
|
||||
means of improving load-balancing.
|
||||
|
||||
The occlusion query functions `CullingThreadpool::TestTriangles()` and `CullingThreadpool::TestRect()` immediately return the result of the query. However, the
|
||||
query depends on the contents of the hierarchical depth buffer, so you may need to wait for the worker threads to finish to make sure the query is performed on the
|
||||
most up to date version of the buffer, this can be accomplished by calling `CullingThreadpool::Flush()`. It is not always necessary to work with the most up to
|
||||
date version of the hierarchical depth buffer for a query. While the result may be incorrect, it is still always conservative in that occluded objects may be
|
||||
classified as visible, but not the other way around. Since the `CullingThreadpool::Flush()` causes a wait it may be beneficial to work against a slightly out of
|
||||
date version of the hierarchical depth buffer if your application will cause a lot of flushes. We found this particularly true when implementing threading in
|
||||
our interleaved BVH traversal algorithm (see the ["Masked Software Occlusion Culling"](https://software.intel.com/en-us/articles/masked-software-occlusion-culling)
|
||||
paper) where each BVH traversal step is based on the outcome of an occlusion query interleaved with occluder rendering for the BVH-leaves.
|
||||
|
||||
The `CullingThreadpool` class was written as an example and not the de-facto threading approach. In some cases we believe it is possible to improve performance
|
||||
further by threading occlusion queries, or thread the entire occlusion culling system, including scene graph traversal. However, it does provide a simple means
|
||||
of enabling multi-threading in a traditional single threaded application as the APIs is very similar to the `MaskedOcclusionCulling` class, and may be called from
|
||||
a single threaded application. As previously mentioned we integrated this implementation in our interleaved BVH traversal algorithm (see the ["Masked Software Occlusion Culling"](https://software.intel.com/en-us/articles/masked-software-occlusion-culling)
|
||||
paper) and noted speedup of roughly *3x*, running on four threads, compared to our previous single threaded implementation.
|
||||
|
||||
## Compiling
|
||||
|
||||
The code has been reworked to support more platforms and compilers, such as [Intel C++ Compiler](https://software.intel.com/en-us/intel-compilers), [G++](https://gcc.gnu.org/)
|
||||
and [LLVM/Clang](http://releases.llvm.org/download.html). The original Visual Studio 2015 projects remain and works with both ICC and Microsoft's compilers. Other compilers
|
||||
are supported through [CMake](https://cmake.org/). See the `CMakeLists.txt` files in the `Example` and `FillrateTest` folders. You can use CMake to generate a
|
||||
Visual Studio project for Clang on Windows:
|
||||
|
||||
```
|
||||
md <path to library>\Example\Clang
|
||||
cd <path to library>\Example\Clang
|
||||
cmake -G"Visual Studio 14 2015 Win64" -T"LLVM-vs2014" ..
|
||||
```
|
||||
|
||||
or build the library with G++/Clang on linux systems (the `D3DValidate` sample only works on Windows as it relies on Direct 3D)
|
||||
|
||||
```
|
||||
mkdir <path to library>/Example/Release
|
||||
cd <path to library>/Example/Release
|
||||
cmake -DCMAKE_BUILD_TYPE=Release ..
|
||||
make
|
||||
```
|
||||
|
||||
Note that AVX-512 support is only experimental at the moment, and has only been verified through [Intel SDE](https://software.intel.com/en-us/articles/pre-release-license-agreement-for-intel-software-development-emulator-accept-end-user-license-agreement-and-download).
|
||||
If using the original visual studio project, you need to "opt in" for AVX-512 support by setting `#define USE_AVX512 1`. When building with CMake you can
|
||||
enable AVX support using the `-DUSE_AVX512=ON` option:
|
||||
|
||||
```
|
||||
cmake -DUSE_AVX512=ON -G"Visual Studio 14 2015 Win64" -T"LLVM-vs2014" ..
|
||||
```
|
||||
|
||||
## Version History
|
||||
|
||||
* Version 1.4:
|
||||
* Added support for merging 2 depth buffers as detailed in GDC 2018 presentation.
|
||||
* Fixed Profiling counters to be thread safe, removing a race condition when running the CullingThreadpool class.
|
||||
* Version 1.3:
|
||||
* **Experimental**: Added support for AVX-512 capable CPUs. Currently only verified through [emulator](https://software.intel.com/en-us/articles/intel-software-development-emulator).
|
||||
* Added multiplatform support. Code now compiles on Visual C++ Compiler, Intel C++ Compiler, GCC, and Clang.
|
||||
* Added configurable backface culling, to support two-sided occluder rendering.
|
||||
* Version 1.2:
|
||||
* Added support for threading, through a binning rasterizer. The `CullingThreadpool` class implements an example multi-threaded task system with a very similar
|
||||
API to the `MaskedOcclusionCulling`class.
|
||||
* Added support for higher precision rasterization, with DirectX and OpenGL compliant rasterization rules.
|
||||
* **Note:** The default screen space coordinate system has been changed from OpenGL to DirectX conventions. If you upgrade from an older version of the library
|
||||
this will flip the y coordinate of scissor boxes and the images returned by `ComputePixelDepthBuffer()`. Disabling the `USE_D3D` define changes back to OpenGL conventions.
|
||||
* Version 1.1:
|
||||
* Added support for SSE4.1 and SSE2 capable CPUs for backwards compatibility. The SSE versions must emulate some operations using
|
||||
simpler instructions, and are therefore less efficient, with the SSE2 version having the lowest performance.
|
||||
* Version 1.0:
|
||||
* Initial revision, only support for AVX2 capable CPUs
|
||||
|
||||
## Differences to the research paper
|
||||
|
||||
This code does not exactly match implementation used in
|
||||
["Masked Software Occlusion Culling"](https://software.intel.com/en-us/articles/masked-software-occlusion-culling), and performance may vary slightly
|
||||
from what is presented in the research paper. We aimed for making the API as simple as possible and have removed many limitations, in particular
|
||||
requirements on input data being aligned to SIMD boundaries. This affects performance slightly in both directions. Unaligned loads and
|
||||
gathers are more costly, but unaligned data may be packed more efficiently in memory leading to fewer cache misses.
|
||||
|
||||
## License agreement
|
||||
|
||||
See the Apache 2.0 license.txt for full license agreement details.
|
||||
|
||||
Disclaimer:
|
||||
|
||||
This software is subject to the U.S. Export Administration Regulations and other U.S.
|
||||
law, and may not be exported or re-exported to certain countries (Cuba, Iran, North
|
||||
Korea, Sudan, and Syria) or to persons or entities prohibited from receiving U.S.
|
||||
exports (including Denied Parties, Specially Designated Nationals, and entities on the
|
||||
Bureau of Export Administration Entity List or involved with missile technology or
|
||||
nuclear, chemical or biological weapons).
|
181
neo/libs/moc/license.txt
Normal file
181
neo/libs/moc/license.txt
Normal file
|
@ -0,0 +1,181 @@
|
|||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction, and
|
||||
distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by the copyright
|
||||
owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all other entities
|
||||
that control, are controlled by, or are under common control with that entity.
|
||||
For the purposes of this definition, "control" means (i) the power, direct or
|
||||
indirect, to cause the direction or management of such entity, whether by
|
||||
contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity exercising
|
||||
permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications, including
|
||||
but not limited to software source code, documentation source, and configuration
|
||||
files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical transformation or
|
||||
translation of a Source form, including but not limited to compiled object code,
|
||||
generated documentation, and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or Object form, made
|
||||
available under the License, as indicated by a copyright notice that is included
|
||||
in or attached to the work (an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object form, that
|
||||
is based on (or derived from) the Work and for which the editorial revisions,
|
||||
annotations, elaborations, or other modifications represent, as a whole, an
|
||||
original work of authorship. For the purposes of this License, Derivative Works
|
||||
shall not include works that remain separable from, or merely link (or bind by
|
||||
name) to the interfaces of, the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including the original version
|
||||
of the Work and any modifications or additions to that Work or Derivative Works
|
||||
thereof, that is intentionally submitted to Licensor for inclusion in the Work
|
||||
by the copyright owner or by an individual or Legal Entity authorized to submit
|
||||
on behalf of the copyright owner. For the purposes of this definition,
|
||||
"submitted" means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems, and
|
||||
issue tracking systems that are managed by, or on behalf of, the Licensor for
|
||||
the purpose of discussing and improving the Work, but excluding communication
|
||||
that is conspicuously marked or otherwise designated in writing by the copyright
|
||||
owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf
|
||||
of whom a Contribution has been received by Licensor and subsequently
|
||||
incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of this
|
||||
License, each Contributor hereby grants to You a perpetual, worldwide,
|
||||
non-exclusive, no-charge, royalty-free, irrevocable copyright license to
|
||||
reproduce, prepare Derivative Works of, publicly display, publicly perform,
|
||||
sublicense, and distribute the Work and such Derivative Works in Source or
|
||||
Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of this License,
|
||||
each Contributor hereby grants to You a perpetual, worldwide, non-exclusive,
|
||||
no-charge, royalty-free, irrevocable (except as stated in this section) patent
|
||||
license to make, have made, use, offer to sell, sell, import, and otherwise
|
||||
transfer the Work, where such license applies only to those patent claims
|
||||
licensable by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s) with the Work
|
||||
to which such Contribution(s) was submitted. If You institute patent litigation
|
||||
against any entity (including a cross-claim or counterclaim in a lawsuit)
|
||||
alleging that the Work or a Contribution incorporated within the Work
|
||||
constitutes direct or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate as of the date
|
||||
such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the Work or
|
||||
Derivative Works thereof in any medium, with or without modifications, and in
|
||||
Source or Object form, provided that You meet the following conditions:
|
||||
You must give any other recipients of the Work or Derivative Works a copy of
|
||||
this License; and
|
||||
|
||||
|
||||
You must cause any modified files to carry prominent notices stating that You
|
||||
changed the files; and
|
||||
|
||||
|
||||
You must retain, in the Source form of any Derivative Works that You
|
||||
distribute, all copyright, patent, trademark, and attribution notices from the
|
||||
Source form of the Work, excluding those notices that do not pertain to any
|
||||
part of the Derivative Works; and
|
||||
|
||||
|
||||
If the Work includes a "NOTICE" text file as part of its distribution, then
|
||||
any Derivative Works that You distribute must include a readable copy of the
|
||||
attribution notices contained within such NOTICE file, excluding those notices
|
||||
that do not pertain to any part of the Derivative Works, in at least one of
|
||||
the following places: within a NOTICE text file distributed as part of the
|
||||
Derivative Works; within the Source form or documentation, if provided along
|
||||
with the Derivative Works; or, within a display generated by the Derivative
|
||||
Works, if and wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and do not modify the
|
||||
License. You may add Your own attribution notices within Derivative Works that
|
||||
You distribute, alongside or as an addendum to the NOTICE text from the Work,
|
||||
provided that such additional attribution notices cannot be construed as
|
||||
modifying the License.
|
||||
You may add Your own copyright statement to Your modifications and may provide
|
||||
additional or different license terms and conditions for use, reproduction, or
|
||||
distribution of Your modifications, or for any such Derivative Works as a whole,
|
||||
provided Your use, reproduction, and distribution of the Work otherwise complies
|
||||
with the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise, any
|
||||
Contribution intentionally submitted for inclusion in the Work by You to the
|
||||
Licensor shall be under the terms and conditions of this License, without any
|
||||
additional terms or conditions. Notwithstanding the above, nothing herein shall
|
||||
supersede or modify the terms of any separate license agreement you may have
|
||||
executed with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade names,
|
||||
trademarks, service marks, or product names of the Licensor, except as required
|
||||
for reasonable and customary use in describing the origin of the Work and
|
||||
reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or agreed to in
|
||||
writing, Licensor provides the Work (and each Contributor provides its
|
||||
Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
KIND, either express or implied, including, without limitation, any warranties
|
||||
or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any risks
|
||||
associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory, whether in
|
||||
tort (including negligence), contract, or otherwise, unless required by
|
||||
applicable law (such as deliberate and grossly negligent acts) or agreed to in
|
||||
writing, shall any Contributor be liable to You for damages, including any
|
||||
direct, indirect, special, incidental, or consequential damages of any character
|
||||
arising as a result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill, work stoppage,
|
||||
computer failure or malfunction, or any and all other commercial damages or
|
||||
losses), even if such Contributor has been advised of the possibility of such
|
||||
damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing the Work or
|
||||
Derivative Works thereof, You may choose to offer, and charge a fee for,
|
||||
acceptance of support, warranty, indemnity, or other liability obligations
|
||||
and/or rights consistent with this License. However, in accepting such
|
||||
obligations, You may act only on Your own behalf and on Your sole
|
||||
responsibility, not on behalf of any other Contributor, and only if You agree to
|
||||
indemnify, defend, and hold each Contributor harmless for any liability incurred
|
||||
by, or claims asserted against, such Contributor by reason of your accepting any
|
||||
such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work
|
||||
|
||||
To apply the Apache License to your work, attach the following boilerplate
|
||||
notice, with the fields enclosed by brackets "[]" replaced with your own
|
||||
identifying information. (Don't include the brackets!) The text should be
|
||||
enclosed in the appropriate comment syntax for the file format. We also
|
||||
recommend that a file or class name and description of purpose be included on
|
||||
the same "printed page" as the copyright notice for easier identification within
|
||||
third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner] Licensed under the Apache License,
|
||||
Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or
|
||||
agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
or implied. See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
|
@ -1546,6 +1546,16 @@ void R_LinkDrawSurfToView( drawSurf_t* drawSurf, viewDef_t* viewDef );
|
|||
|
||||
void R_AddModels();
|
||||
|
||||
/*
|
||||
============================================================
|
||||
|
||||
TR_FRONTEND_MASKED_OCCLUSION_CULLING
|
||||
|
||||
============================================================
|
||||
*/
|
||||
|
||||
void R_FillMaskedOcclusionBufferWithModels();
|
||||
|
||||
/*
|
||||
=============================================================
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
|
||||
Doom 3 BFG Edition GPL Source Code
|
||||
Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company.
|
||||
Copyright (C) 2014-2016 Robert Beckebans
|
||||
Copyright (C) 2014-2024 Robert Beckebans
|
||||
Copyright (C) 2014-2016 Kot in Action Creative Artel
|
||||
|
||||
This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code").
|
||||
|
@ -46,64 +46,7 @@ idCVar r_lodMaterialDistance( "r_lodMaterialDistance", "500", CVAR_RENDERER | CV
|
|||
|
||||
static const float CHECK_BOUNDS_EPSILON = 1.0f;
|
||||
|
||||
/*
|
||||
==================
|
||||
R_SortViewEntities
|
||||
==================
|
||||
*/
|
||||
viewEntity_t* R_SortViewEntities( viewEntity_t* vEntities )
|
||||
{
|
||||
SCOPED_PROFILE_EVENT( "R_SortViewEntities" );
|
||||
|
||||
// We want to avoid having a single AddModel for something complex be
|
||||
// the last thing processed and hurt the parallel occupancy, so
|
||||
// sort dynamic models first, _area models second, then everything else.
|
||||
viewEntity_t* dynamics = NULL;
|
||||
viewEntity_t* areas = NULL;
|
||||
viewEntity_t* others = NULL;
|
||||
for( viewEntity_t* vEntity = vEntities; vEntity != NULL; )
|
||||
{
|
||||
viewEntity_t* next = vEntity->next;
|
||||
const idRenderModel* model = vEntity->entityDef->parms.hModel;
|
||||
if( model->IsDynamicModel() != DM_STATIC )
|
||||
{
|
||||
vEntity->next = dynamics;
|
||||
dynamics = vEntity;
|
||||
}
|
||||
else if( model->IsStaticWorldModel() )
|
||||
{
|
||||
vEntity->next = areas;
|
||||
areas = vEntity;
|
||||
}
|
||||
else
|
||||
{
|
||||
vEntity->next = others;
|
||||
others = vEntity;
|
||||
}
|
||||
vEntity = next;
|
||||
}
|
||||
|
||||
// concatenate the lists
|
||||
viewEntity_t* all = others;
|
||||
|
||||
for( viewEntity_t* vEntity = areas; vEntity != NULL; )
|
||||
{
|
||||
viewEntity_t* next = vEntity->next;
|
||||
vEntity->next = all;
|
||||
all = vEntity;
|
||||
vEntity = next;
|
||||
}
|
||||
|
||||
for( viewEntity_t* vEntity = dynamics; vEntity != NULL; )
|
||||
{
|
||||
viewEntity_t* next = vEntity->next;
|
||||
vEntity->next = all;
|
||||
all = vEntity;
|
||||
vEntity = next;
|
||||
}
|
||||
|
||||
return all;
|
||||
}
|
||||
|
||||
/*
|
||||
==================
|
||||
|
@ -1115,7 +1058,8 @@ void R_AddModels()
|
|||
{
|
||||
SCOPED_PROFILE_EVENT( "R_AddModels" );
|
||||
|
||||
tr.viewDef->viewEntitys = R_SortViewEntities( tr.viewDef->viewEntitys );
|
||||
// RB: already done in R_FillMaskedOcclusionBufferWithModels
|
||||
// tr.viewDef->viewEntitys = R_SortViewEntities( tr.viewDef->viewEntitys );
|
||||
|
||||
//-------------------------------------------------
|
||||
// Go through each view entity that is either visible to the view, or to
|
||||
|
|
|
@ -645,6 +645,9 @@ void R_RenderView( viewDef_t* parms )
|
|||
// wait for any shadow volume jobs from the previous frame to finish
|
||||
tr.frontEndJobList->Wait();
|
||||
|
||||
// RB: render worldspawn geometry to the software culling buffer
|
||||
R_FillMaskedOcclusionBufferWithModels();
|
||||
|
||||
// make sure that interactions exist for all light / entity combinations that are visible
|
||||
// add any pre-generated light shadows, and calculate the light shader values
|
||||
R_AddLights();
|
||||
|
|
557
neo/renderer/tr_frontend_masked_occlusion_culling.cpp
Normal file
557
neo/renderer/tr_frontend_masked_occlusion_culling.cpp
Normal file
|
@ -0,0 +1,557 @@
|
|||
/*
|
||||
===========================================================================
|
||||
|
||||
Doom 3 BFG Edition GPL Source Code
|
||||
Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company.
|
||||
Copyright (C) 2024 Robert Beckebans
|
||||
|
||||
This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code").
|
||||
|
||||
Doom 3 BFG Edition Source Code is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
Doom 3 BFG Edition Source Code is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with Doom 3 BFG Edition Source Code. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
In addition, the Doom 3 BFG Edition Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 BFG Edition Source Code. If not, please request a copy in writing from id Software at the address below.
|
||||
|
||||
If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
|
||||
|
||||
===========================================================================
|
||||
*/
|
||||
#include "precompiled.h"
|
||||
#pragma hdrstop
|
||||
|
||||
#include "../libs/moc/MaskedOcclusionCulling.h"
|
||||
|
||||
#include "RenderCommon.h"
|
||||
#include "Model_local.h"
|
||||
|
||||
static const float CHECK_BOUNDS_EPSILON = 1.0f;
|
||||
|
||||
/*
|
||||
==================
|
||||
R_SortViewEntities
|
||||
==================
|
||||
*/
|
||||
viewEntity_t* R_SortViewEntities( viewEntity_t* vEntities )
|
||||
{
|
||||
SCOPED_PROFILE_EVENT( "R_SortViewEntities" );
|
||||
|
||||
// We want to avoid having a single AddModel for something complex be
|
||||
// the last thing processed and hurt the parallel occupancy, so
|
||||
// sort dynamic models first, _area models second, then everything else.
|
||||
viewEntity_t* dynamics = NULL;
|
||||
viewEntity_t* areas = NULL;
|
||||
viewEntity_t* others = NULL;
|
||||
for( viewEntity_t* vEntity = vEntities; vEntity != NULL; )
|
||||
{
|
||||
viewEntity_t* next = vEntity->next;
|
||||
const idRenderModel* model = vEntity->entityDef->parms.hModel;
|
||||
if( model->IsDynamicModel() != DM_STATIC )
|
||||
{
|
||||
vEntity->next = dynamics;
|
||||
dynamics = vEntity;
|
||||
}
|
||||
else if( model->IsStaticWorldModel() )
|
||||
{
|
||||
vEntity->next = areas;
|
||||
areas = vEntity;
|
||||
}
|
||||
else
|
||||
{
|
||||
vEntity->next = others;
|
||||
others = vEntity;
|
||||
}
|
||||
vEntity = next;
|
||||
}
|
||||
|
||||
// concatenate the lists
|
||||
viewEntity_t* all = others;
|
||||
|
||||
for( viewEntity_t* vEntity = areas; vEntity != NULL; )
|
||||
{
|
||||
viewEntity_t* next = vEntity->next;
|
||||
vEntity->next = all;
|
||||
all = vEntity;
|
||||
vEntity = next;
|
||||
}
|
||||
|
||||
for( viewEntity_t* vEntity = dynamics; vEntity != NULL; )
|
||||
{
|
||||
viewEntity_t* next = vEntity->next;
|
||||
vEntity->next = all;
|
||||
all = vEntity;
|
||||
vEntity = next;
|
||||
}
|
||||
|
||||
return all;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
===================
|
||||
R_RenderSingleModel
|
||||
|
||||
May be run in parallel.
|
||||
|
||||
Here is where dynamic models actually get instantiated, and necessary
|
||||
interaction surfaces get created. This is all done on a sort-by-model
|
||||
basis to keep source data in cache (most likely L2) as any interactions
|
||||
and shadows are generated, since dynamic models will typically be lit by
|
||||
two or more lights.
|
||||
===================
|
||||
*/
|
||||
void R_RenderSingleModel( viewEntity_t* vEntity )
|
||||
{
|
||||
// we will add all interaction surfs here, to be chained to the lights in later serial code
|
||||
vEntity->drawSurfs = NULL;
|
||||
|
||||
// RB
|
||||
vEntity->useLightGrid = false;
|
||||
|
||||
// globals we really should pass in...
|
||||
const viewDef_t* viewDef = tr.viewDef;
|
||||
|
||||
idRenderEntityLocal* entityDef = vEntity->entityDef;
|
||||
const renderEntity_t* renderEntity = &entityDef->parms;
|
||||
const idRenderWorldLocal* world = entityDef->world;
|
||||
|
||||
if( viewDef->isXraySubview && entityDef->parms.xrayIndex == 1 )
|
||||
{
|
||||
return;
|
||||
}
|
||||
else if( !viewDef->isXraySubview && entityDef->parms.xrayIndex == 2 )
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
SCOPED_PROFILE_EVENT( renderEntity->hModel == NULL ? "Unknown Model" : renderEntity->hModel->Name() );
|
||||
|
||||
// calculate the znear for testing whether or not the view is inside a shadow projection
|
||||
const float znear = ( viewDef->renderView.cramZNear ) ? ( r_znear.GetFloat() * 0.25f ) : r_znear.GetFloat();
|
||||
|
||||
// if the entity wasn't seen through a portal chain, it was added just for light shadows
|
||||
const bool modelIsVisible = !vEntity->scissorRect.IsEmpty();
|
||||
const bool addInteractions = modelIsVisible && ( !viewDef->isXraySubview || entityDef->parms.xrayIndex == 2 );
|
||||
const int entityIndex = entityDef->index;
|
||||
|
||||
extern idCVar r_lodMaterialDistance;
|
||||
|
||||
//---------------------------
|
||||
// Find which of the visible lights contact this entity
|
||||
//
|
||||
// If the entity doesn't accept light or cast shadows from any surface,
|
||||
// this can be skipped.
|
||||
//
|
||||
// OPTIMIZE: world areas can assume all referenced lights are used
|
||||
//---------------------------
|
||||
int numContactedLights = 0;
|
||||
static const int MAX_CONTACTED_LIGHTS = 128;
|
||||
viewLight_t* contactedLights[MAX_CONTACTED_LIGHTS];
|
||||
idInteraction* staticInteractions[MAX_CONTACTED_LIGHTS];
|
||||
|
||||
if( renderEntity->hModel == NULL ||
|
||||
renderEntity->hModel->ModelHasInteractingSurfaces() ||
|
||||
renderEntity->hModel->ModelHasShadowCastingSurfaces() )
|
||||
{
|
||||
SCOPED_PROFILE_EVENT( "Find lights" );
|
||||
for( viewLight_t* vLight = viewDef->viewLights; vLight != NULL; vLight = vLight->next )
|
||||
{
|
||||
if( vLight->scissorRect.IsEmpty() )
|
||||
{
|
||||
continue;
|
||||
}
|
||||
if( vLight->entityInteractionState != NULL )
|
||||
{
|
||||
// new code path, everything was done in AddLight
|
||||
if( vLight->entityInteractionState[entityIndex] == viewLight_t::INTERACTION_YES )
|
||||
{
|
||||
contactedLights[numContactedLights] = vLight;
|
||||
staticInteractions[numContactedLights] = world->interactionTable[vLight->lightDef->index * world->interactionTableWidth + entityIndex];
|
||||
if( ++numContactedLights == MAX_CONTACTED_LIGHTS )
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
const idRenderLightLocal* lightDef = vLight->lightDef;
|
||||
|
||||
if( !lightDef->globalLightBounds.IntersectsBounds( entityDef->globalReferenceBounds ) )
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
if( R_CullModelBoundsToLight( lightDef, entityDef->localReferenceBounds, entityDef->modelRenderMatrix ) )
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
if( !modelIsVisible )
|
||||
{
|
||||
// some lights have their center of projection outside the world
|
||||
if( lightDef->areaNum != -1 )
|
||||
{
|
||||
// if no part of the model is in an area that is connected to
|
||||
// the light center (it is behind a solid, closed door), we can ignore it
|
||||
bool areasConnected = false;
|
||||
for( areaReference_t* ref = entityDef->entityRefs; ref != NULL; ref = ref->ownerNext )
|
||||
{
|
||||
if( world->AreasAreConnected( lightDef->areaNum, ref->area->areaNum, PS_BLOCK_VIEW ) )
|
||||
{
|
||||
areasConnected = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if( areasConnected == false )
|
||||
{
|
||||
// can't possibly be seen or shadowed
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// check more precisely for shadow visibility
|
||||
idBounds shadowBounds;
|
||||
R_ShadowBounds( entityDef->globalReferenceBounds, lightDef->globalLightBounds, lightDef->globalLightOrigin, shadowBounds );
|
||||
|
||||
// this doesn't say that the shadow can't effect anything, only that it can't
|
||||
// effect anything in the view
|
||||
if( idRenderMatrix::CullBoundsToMVP( viewDef->worldSpace.mvp, shadowBounds ) )
|
||||
{
|
||||
continue;
|
||||
}
|
||||
}
|
||||
contactedLights[numContactedLights] = vLight;
|
||||
staticInteractions[numContactedLights] = world->interactionTable[vLight->lightDef->index * world->interactionTableWidth + entityIndex];
|
||||
if( ++numContactedLights == MAX_CONTACTED_LIGHTS )
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// if we aren't visible and none of the shadows stretch into the view,
|
||||
// we don't need to do anything else
|
||||
if( !modelIsVisible && numContactedLights == 0 )
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
//---------------------------
|
||||
// create a dynamic model if the geometry isn't static
|
||||
//---------------------------
|
||||
idRenderModel* model = R_EntityDefDynamicModel( entityDef );
|
||||
if( model == NULL || model->NumSurfaces() <= 0 )
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
//---------------------------
|
||||
// copy matrix related stuff for back-end use
|
||||
// and setup a render matrix for faster culling
|
||||
//---------------------------
|
||||
vEntity->modelDepthHack = renderEntity->modelDepthHack;
|
||||
vEntity->weaponDepthHack = renderEntity->weaponDepthHack;
|
||||
vEntity->skipMotionBlur = renderEntity->skipMotionBlur;
|
||||
|
||||
memcpy( vEntity->modelMatrix, entityDef->modelMatrix, sizeof( vEntity->modelMatrix ) );
|
||||
R_MatrixMultiply( entityDef->modelMatrix, viewDef->worldSpace.modelViewMatrix, vEntity->modelViewMatrix );
|
||||
|
||||
idRenderMatrix viewMat;
|
||||
idRenderMatrix::Transpose( *( idRenderMatrix* )vEntity->modelViewMatrix, viewMat );
|
||||
idRenderMatrix::Multiply( viewDef->projectionRenderMatrix, viewMat, vEntity->mvp );
|
||||
if( renderEntity->weaponDepthHack )
|
||||
{
|
||||
idRenderMatrix::ApplyDepthHack( vEntity->mvp );
|
||||
}
|
||||
if( renderEntity->modelDepthHack != 0.0f )
|
||||
{
|
||||
idRenderMatrix::ApplyModelDepthHack( vEntity->mvp, renderEntity->modelDepthHack );
|
||||
}
|
||||
|
||||
// local light and view origins are used to determine if the view is definitely outside
|
||||
// an extruded shadow volume, which means we can skip drawing the end caps
|
||||
idVec3 localViewOrigin;
|
||||
R_GlobalPointToLocal( vEntity->modelMatrix, viewDef->renderView.vieworg, localViewOrigin );
|
||||
|
||||
//---------------------------
|
||||
// add all the model surfaces
|
||||
//---------------------------
|
||||
for( int surfaceNum = 0; surfaceNum < model->NumSurfaces(); surfaceNum++ )
|
||||
{
|
||||
const modelSurface_t* surf = model->Surface( surfaceNum );
|
||||
|
||||
// for debugging, only show a single surface at a time
|
||||
if( r_singleSurface.GetInteger() >= 0 && surfaceNum != r_singleSurface.GetInteger() )
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
srfTriangles_t* tri = surf->geometry;
|
||||
if( tri == NULL )
|
||||
{
|
||||
continue;
|
||||
}
|
||||
if( tri->numIndexes == 0 )
|
||||
{
|
||||
continue; // happens for particles
|
||||
}
|
||||
const idMaterial* shader = surf->shader;
|
||||
if( shader == NULL )
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
// motorsep 11-24-2014; checking for LOD surface for LOD1 iteration
|
||||
if( shader->IsLOD() )
|
||||
{
|
||||
// foresthale 2014-11-24: calculate the bounds and get the distance from camera to bounds
|
||||
idBounds& localBounds = tri->bounds;
|
||||
if( tri->staticModelWithJoints )
|
||||
{
|
||||
// skeletal models have difficult to compute bounds for surfaces, so use the whole entity
|
||||
localBounds = vEntity->entityDef->localReferenceBounds;
|
||||
}
|
||||
const float* bounds = localBounds.ToFloatPtr();
|
||||
idVec3 nearestPointOnBounds = localViewOrigin;
|
||||
nearestPointOnBounds.x = Max( nearestPointOnBounds.x, bounds[0] );
|
||||
nearestPointOnBounds.x = Min( nearestPointOnBounds.x, bounds[3] );
|
||||
nearestPointOnBounds.y = Max( nearestPointOnBounds.y, bounds[1] );
|
||||
nearestPointOnBounds.y = Min( nearestPointOnBounds.y, bounds[4] );
|
||||
nearestPointOnBounds.z = Max( nearestPointOnBounds.z, bounds[2] );
|
||||
nearestPointOnBounds.z = Min( nearestPointOnBounds.z, bounds[5] );
|
||||
idVec3 delta = nearestPointOnBounds - localViewOrigin;
|
||||
float distance = delta.LengthFast();
|
||||
|
||||
if( !shader->IsLODVisibleForDistance( distance, r_lodMaterialDistance.GetFloat() ) )
|
||||
{
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// foresthale 2014-09-01: don't skip surfaces that use the "forceShadows" flag
|
||||
if( !shader->IsDrawn() && !shader->SurfaceCastsShadow() )
|
||||
{
|
||||
continue; // collision hulls, etc
|
||||
}
|
||||
|
||||
// RemapShaderBySkin
|
||||
if( entityDef->parms.customShader != NULL )
|
||||
{
|
||||
// this is sort of a hack, but causes deformed surfaces to map to empty surfaces,
|
||||
// so the item highlight overlay doesn't highlight the autosprite surface
|
||||
if( shader->Deform() )
|
||||
{
|
||||
continue;
|
||||
}
|
||||
shader = entityDef->parms.customShader;
|
||||
}
|
||||
else if( entityDef->parms.customSkin )
|
||||
{
|
||||
shader = entityDef->parms.customSkin->RemapShaderBySkin( shader );
|
||||
if( shader == NULL )
|
||||
{
|
||||
continue;
|
||||
}
|
||||
// foresthale 2014-09-01: don't skip surfaces that use the "forceShadows" flag
|
||||
if( !shader->IsDrawn() && !shader->SurfaceCastsShadow() )
|
||||
{
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// optionally override with the renderView->globalMaterial
|
||||
if( tr.primaryRenderView.globalMaterial != NULL )
|
||||
{
|
||||
shader = tr.primaryRenderView.globalMaterial;
|
||||
}
|
||||
|
||||
SCOPED_PROFILE_EVENT( shader->GetName() );
|
||||
|
||||
// debugging tool to make sure we have the correct pre-calculated bounds
|
||||
if( r_checkBounds.GetBool() )
|
||||
{
|
||||
for( int j = 0; j < tri->numVerts; j++ )
|
||||
{
|
||||
int k;
|
||||
for( k = 0; k < 3; k++ )
|
||||
{
|
||||
if( tri->verts[j].xyz[k] > tri->bounds[1][k] + CHECK_BOUNDS_EPSILON
|
||||
|| tri->verts[j].xyz[k] < tri->bounds[0][k] - CHECK_BOUNDS_EPSILON )
|
||||
{
|
||||
common->Printf( "bad tri->bounds on %s:%s\n", entityDef->parms.hModel->Name(), shader->GetName() );
|
||||
break;
|
||||
}
|
||||
if( tri->verts[j].xyz[k] > entityDef->localReferenceBounds[1][k] + CHECK_BOUNDS_EPSILON
|
||||
|| tri->verts[j].xyz[k] < entityDef->localReferenceBounds[0][k] - CHECK_BOUNDS_EPSILON )
|
||||
{
|
||||
common->Printf( "bad referenceBounds on %s:%s\n", entityDef->parms.hModel->Name(), shader->GetName() );
|
||||
break;
|
||||
}
|
||||
}
|
||||
if( k != 3 )
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// view frustum culling for the precise surface bounds, which is tighter
|
||||
// than the entire entity reference bounds
|
||||
// If the entire model wasn't visible, there is no need to check the
|
||||
// individual surfaces.
|
||||
const bool surfaceDirectlyVisible = modelIsVisible && !idRenderMatrix::CullBoundsToMVP( vEntity->mvp, tri->bounds );
|
||||
|
||||
// RB: added check wether GPU skinning is available at all
|
||||
const bool gpuSkinned = ( tri->staticModelWithJoints != NULL && r_useGPUSkinning.GetBool() );
|
||||
// RB end
|
||||
|
||||
//--------------------------
|
||||
// base drawing surface
|
||||
//--------------------------
|
||||
const float* shaderRegisters = NULL;
|
||||
drawSurf_t* baseDrawSurf = NULL;
|
||||
if( surfaceDirectlyVisible && shader->IsDrawn() )
|
||||
{
|
||||
// TODO render to masked occlusion buffer
|
||||
|
||||
/*
|
||||
// make sure we have an ambient cache and all necessary normals / tangents
|
||||
if( !vertexCache.CacheIsCurrent( tri->indexCache ) )
|
||||
{
|
||||
tri->indexCache = vertexCache.AllocIndex( tri->indexes, tri->numIndexes );
|
||||
}
|
||||
|
||||
if( !vertexCache.CacheIsCurrent( tri->ambientCache ) )
|
||||
{
|
||||
// we are going to use it for drawing, so make sure we have the tangents and normals
|
||||
if( shader->ReceivesLighting() && !tri->tangentsCalculated )
|
||||
{
|
||||
assert( tri->staticModelWithJoints == NULL );
|
||||
R_DeriveTangents( tri );
|
||||
|
||||
// RB: this was hit by parametric particle models ..
|
||||
//assert( false ); // this should no longer be hit
|
||||
// RB end
|
||||
}
|
||||
tri->ambientCache = vertexCache.AllocVertex( tri->verts, tri->numVerts );
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
// add the surface for drawing
|
||||
// we can re-use some of the values for light interaction surfaces
|
||||
baseDrawSurf = ( drawSurf_t* )R_FrameAlloc( sizeof( *baseDrawSurf ), FRAME_ALLOC_DRAW_SURFACE );
|
||||
baseDrawSurf->frontEndGeo = tri;
|
||||
baseDrawSurf->space = vEntity;
|
||||
baseDrawSurf->scissorRect = vEntity->scissorRect;
|
||||
baseDrawSurf->extraGLState = 0;
|
||||
|
||||
R_SetupDrawSurfShader( baseDrawSurf, shader, renderEntity );
|
||||
|
||||
shaderRegisters = baseDrawSurf->shaderRegisters;
|
||||
|
||||
// Check for deformations (eyeballs, flares, etc)
|
||||
const deform_t shaderDeform = shader->Deform();
|
||||
if( shaderDeform != DFRM_NONE )
|
||||
{
|
||||
drawSurf_t* deformDrawSurf = R_DeformDrawSurf( baseDrawSurf );
|
||||
if( deformDrawSurf != NULL )
|
||||
{
|
||||
// any deforms may have created multiple draw surfaces
|
||||
for( drawSurf_t* surf = deformDrawSurf, * next = NULL; surf != NULL; surf = next )
|
||||
{
|
||||
next = surf->nextOnLight;
|
||||
|
||||
surf->linkChain = NULL;
|
||||
surf->nextOnLight = vEntity->drawSurfs;
|
||||
vEntity->drawSurfs = surf;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Most deform source surfaces do not need to be rendered.
|
||||
// However, particles are rendered in conjunction with the source surface.
|
||||
if( shaderDeform == DFRM_NONE || shaderDeform == DFRM_PARTICLE || shaderDeform == DFRM_PARTICLE2 )
|
||||
{
|
||||
// copy verts and indexes to this frame's hardware memory if they aren't already there
|
||||
if( !vertexCache.CacheIsCurrent( tri->ambientCache ) )
|
||||
{
|
||||
tri->ambientCache = vertexCache.AllocVertex( tri->verts, tri->numVerts );
|
||||
}
|
||||
if( !vertexCache.CacheIsCurrent( tri->indexCache ) )
|
||||
{
|
||||
tri->indexCache = vertexCache.AllocIndex( tri->indexes, tri->numIndexes );
|
||||
}
|
||||
|
||||
R_SetupDrawSurfJoints( baseDrawSurf, tri, shader );
|
||||
|
||||
baseDrawSurf->numIndexes = tri->numIndexes;
|
||||
baseDrawSurf->ambientCache = tri->ambientCache;
|
||||
baseDrawSurf->indexCache = tri->indexCache;
|
||||
|
||||
baseDrawSurf->linkChain = NULL; // link to the view
|
||||
baseDrawSurf->nextOnLight = vEntity->drawSurfs;
|
||||
vEntity->drawSurfs = baseDrawSurf;
|
||||
}
|
||||
*/
|
||||
}
|
||||
}
|
||||
}

//REGISTER_PARALLEL_JOB( R_AddSingleModel, "R_AddSingleModel" );

||||
/*
|
||||
===================
|
||||
R_FillMaskedOcclusionBufferWithModels
|
||||
===================
|
||||
*/
|
||||
void R_FillMaskedOcclusionBufferWithModels()
|
||||
{
|
||||
SCOPED_PROFILE_EVENT( "R_FillMaskedOcclusionBufferWithModels" );
|
||||
|
||||
tr.viewDef->viewEntitys = R_SortViewEntities( tr.viewDef->viewEntitys );
|
||||
|
||||
//-------------------------------------------------
|
||||
// Go through each view entity that is either visible to the view, or to
|
||||
// any light that intersects the view (for shadows).
|
||||
//-------------------------------------------------
|
||||
|
||||
/*
|
||||
if( r_useParallelAddModels.GetBool() )
|
||||
{
|
||||
for( viewEntity_t* vEntity = tr.viewDef->viewEntitys; vEntity != NULL; vEntity = vEntity->next )
|
||||
{
|
||||
tr.frontEndJobList->AddJob( ( jobRun_t )R_AddSingleModel, vEntity );
|
||||
}
|
||||
tr.frontEndJobList->Submit();
|
||||
tr.frontEndJobList->Wait();
|
||||
}
|
||||
else
|
||||
*/
|
||||
{
|
||||
for( viewEntity_t* vEntity = tr.viewDef->viewEntitys; vEntity != NULL; vEntity = vEntity->next )
|
||||
{
|
||||
const idRenderModel* model = vEntity->entityDef->parms.hModel;
|
||||
|
||||
// skip after rendering BSP area models
|
||||
if( !model->IsStaticWorldModel() )
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
R_RenderSingleModel( vEntity );
|
||||
}
|
||||
}
|
||||
}