From 9e919c8d7693d0dc7f422809dd17371eae334ccc Mon Sep 17 00:00:00 2001 From: Robert Beckebans Date: Wed, 21 Aug 2024 18:39:52 +0200 Subject: [PATCH] Added Masked Software Occlusion Culling lib by Intel --- neo/CMakeLists.txt | 2 + neo/libs/moc/CMakeLists.txt | 85 + neo/libs/moc/CompilerSpecific.inl | 98 + neo/libs/moc/CullingThreadpool.cpp | 503 ++++ neo/libs/moc/CullingThreadpool.h | 311 +++ neo/libs/moc/MaskedOcclusionCulling.cpp | 528 +++++ neo/libs/moc/MaskedOcclusionCulling.h | 596 +++++ neo/libs/moc/MaskedOcclusionCullingAVX2.cpp | 280 +++ neo/libs/moc/MaskedOcclusionCullingAVX512.cpp | 364 +++ neo/libs/moc/MaskedOcclusionCullingCommon.inl | 2050 +++++++++++++++++ neo/libs/moc/README.md | 450 ++++ neo/libs/moc/license.txt | 181 ++ neo/renderer/RenderCommon.h | 10 + neo/renderer/tr_frontend_addmodels.cpp | 62 +- neo/renderer/tr_frontend_main.cpp | 3 + .../tr_frontend_masked_occlusion_culling.cpp | 557 +++++ 16 files changed, 6021 insertions(+), 59 deletions(-) create mode 100644 neo/libs/moc/CMakeLists.txt create mode 100644 neo/libs/moc/CompilerSpecific.inl create mode 100644 neo/libs/moc/CullingThreadpool.cpp create mode 100644 neo/libs/moc/CullingThreadpool.h create mode 100644 neo/libs/moc/MaskedOcclusionCulling.cpp create mode 100644 neo/libs/moc/MaskedOcclusionCulling.h create mode 100644 neo/libs/moc/MaskedOcclusionCullingAVX2.cpp create mode 100644 neo/libs/moc/MaskedOcclusionCullingAVX512.cpp create mode 100644 neo/libs/moc/MaskedOcclusionCullingCommon.inl create mode 100644 neo/libs/moc/README.md create mode 100644 neo/libs/moc/license.txt create mode 100644 neo/renderer/tr_frontend_masked_occlusion_culling.cpp diff --git a/neo/CMakeLists.txt b/neo/CMakeLists.txt index 0af02487..a6bc842c 100644 --- a/neo/CMakeLists.txt +++ b/neo/CMakeLists.txt @@ -428,6 +428,8 @@ if(NOT APPLE) add_subdirectory(tools/compilers) endif() +add_subdirectory(libs/moc) + file(GLOB NATVIS_SOURCES .natvis) file(GLOB AAS_INCLUDES aas/*.h) diff --git a/neo/libs/moc/CMakeLists.txt b/neo/libs/moc/CMakeLists.txt new file mode 100644 index 00000000..33c3fab0 --- /dev/null +++ b/neo/libs/moc/CMakeLists.txt @@ -0,0 +1,85 @@ +# +# CMake file for the masked occlusion culling library +# +set(CMAKE_SUPPRESS_REGENERATION true) +option(USE_AVX512 "Enable experimental AVX-512 support" OFF) +set(CMAKE_CONFIGURATION_TYPES Debug Release) + +# +# Lists of all files included in the library +# +set( MOC_AVX512_FILES MaskedOcclusionCullingAVX512.cpp ) +set( MOC_AVX2_FILES MaskedOcclusionCullingAVX2.cpp ) +set( MOC_SSE_FILES MaskedOcclusionCulling.cpp CullingThreadpool.cpp ) +set( MOC_INCLUDE_FILES MaskedOcclusionCulling.h CullingThreadpool.h CompilerSpecific.inl MaskedOcclusionCullingCommon.inl ) +set( MOC_FILES ${MOC_AVX512_FILES} ${MOC_AVX2_FILES} ${MOC_SSE_FILES} ${MOC_INCLUDE_FILES} ) + +# +# Common compiler flags +# +if(MSVC) + if(MSVC_VERSION LESS 1900) + set(CMAKE_CXX_FLAGS "-std=c++11") + endif() +else() + set(CMAKE_CXX_FLAGS "-std=c++11 -m64") +endif() + +if(MSVC) +# +# Setup compiler flags for AVX-512 files (MSVC) +# + +if (USE_AVX512) + SET_SOURCE_FILES_PROPERTIES( ${MOC_AVX512_FILES} PROPERTIES COMPILE_FLAGS "-DUSE_AVX512=1 /arch:AVX2" ) +else() + SET_SOURCE_FILES_PROPERTIES( ${MOC_AVX512_FILES} PROPERTIES COMPILE_FLAGS "/arch:AVX2" ) +endif() + +# +# Setup compiler flags for AVX2 files (MSVC) +# +SET_SOURCE_FILES_PROPERTIES( ${MOC_AVX2_FILES} PROPERTIES COMPILE_FLAGS "/arch:AVX2" ) + +# +# Setup compiler flags for SSE4.1 / SSE2 files (MSVC) +# +if(NOT "${CMAKE_GENERATOR}" MATCHES "(Win64|IA64)") + # SSE2 is 
always enabled on 64-bit architectures, specifying redundant flag produces a compiler warning + if(MSVC_VERSION LESS 1900) + SET_SOURCE_FILES_PROPERTIES( ${MOC_SSE_FILES} PROPERTIES COMPILE_FLAGS "/arch:SSE2" ) + endif() +endif() + +else() + +# +# Setup compiler flags for AVX-512 files +# +if (USE_AVX512) + SET_SOURCE_FILES_PROPERTIES( ${MOC_AVX512_FILES} PROPERTIES COMPILE_FLAGS "-DUSE_AVX512=1 -mavx512f -mavx512bw -mavx512dq -mavx2 -mfma -msse4.1" ) +else() + SET_SOURCE_FILES_PROPERTIES( ${MOC_AVX512_FILES} PROPERTIES COMPILE_FLAGS "-mavx2 -mfma -msse4.1" ) +endif() + +# +# Setup compiler flags for AVX2 files +# +SET_SOURCE_FILES_PROPERTIES( ${MOC_AVX2_FILES} PROPERTIES COMPILE_FLAGS "-mavx2 -mfma -msse4.1" ) + +# +# Setup compiler flags for SSE4.1 / SSE2 files +# +SET_SOURCE_FILES_PROPERTIES( ${MOC_SSE_FILES} PROPERTIES COMPILE_FLAGS "-msse4.1" ) + +endif() + +# +# Create masked occlusion culling library +# +add_library( MaskedOcclusionCulling ${MOC_FILES} ) + +# +# Add folder to include path +# +target_include_directories(MaskedOcclusionCulling PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/neo/libs/moc/CompilerSpecific.inl b/neo/libs/moc/CompilerSpecific.inl new file mode 100644 index 00000000..a6203ff9 --- /dev/null +++ b/neo/libs/moc/CompilerSpecific.inl @@ -0,0 +1,98 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright 2017 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +//////////////////////////////////////////////////////////////////////////////// + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Common shared include file to hide compiler/os specific functions from the rest of the code. 
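+// Provides FORCE_INLINE, find_clear_lsb(), aligned_alloc()/aligned_free() wrappers and, for non-MSVC
+// compilers, __cpuidex()/_xgetbv() shims used by the CPU feature detection in MaskedOcclusionCulling.cpp.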
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) && !defined(__clang__) + #define __MICROSOFT_COMPILER +#endif + +#if defined(_WIN32) && (defined(_MSC_VER) || defined(__INTEL_COMPILER) || defined(__clang__)) // Windows: MSVC / Intel compiler / clang + #include + #include + + #define FORCE_INLINE __forceinline + + FORCE_INLINE unsigned long find_clear_lsb(unsigned int *mask) + { + unsigned long idx; + _BitScanForward(&idx, *mask); + *mask &= *mask - 1; + return idx; + } + + FORCE_INLINE void *aligned_alloc(size_t alignment, size_t size) + { + return _aligned_malloc(size, alignment); + } + + FORCE_INLINE void aligned_free(void *ptr) + { + _aligned_free(ptr); + } + +#elif defined(__GNUG__) || defined(__clang__) // G++ or clang + #include +#if defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) + #include // memalign +#else + #include // memalign +#endif + #include + #include + #include + + #define FORCE_INLINE inline + + FORCE_INLINE unsigned long find_clear_lsb(unsigned int *mask) + { + unsigned long idx; + idx = __builtin_ctzl(*mask); + *mask &= *mask - 1; + return idx; + } + + FORCE_INLINE void *aligned_alloc(size_t alignment, size_t size) + { + return memalign(alignment, size); + } + + FORCE_INLINE void aligned_free(void *ptr) + { + free(ptr); + } + + FORCE_INLINE void __cpuidex(int* cpuinfo, int function, int subfunction) + { + __cpuid_count(function, subfunction, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]); + } + + FORCE_INLINE unsigned long long _xgetbv(unsigned int index) + { + unsigned int eax, edx; + __asm__ __volatile__( + "xgetbv;" + : "=a" (eax), "=d"(edx) + : "c" (index) + ); + return ((unsigned long long)edx << 32) | eax; + } + +#else + #error Unsupported compiler +#endif diff --git a/neo/libs/moc/CullingThreadpool.cpp b/neo/libs/moc/CullingThreadpool.cpp new file mode 100644 index 00000000..390592dc --- /dev/null +++ b/neo/libs/moc/CullingThreadpool.cpp @@ -0,0 +1,503 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright 2017 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. 
+//////////////////////////////////////////////////////////////////////////////// +#include +#include "CullingThreadpool.h" + +#define SAFE_DELETE(X) {if (X != nullptr) delete X; X = nullptr;} +#define SAFE_DELETE_ARRAY(X) {if (X != nullptr) delete[] X; X = nullptr;} + +template CullingThreadpool::StateData::StateData( unsigned int maxJobs ) : + mMaxJobs( maxJobs ), + mCurrentIdx( ~0 ) +{ + mData = new T[mMaxJobs]; +} + +template CullingThreadpool::StateData::~StateData() +{ + SAFE_DELETE_ARRAY( mData ); +} + +template void CullingThreadpool::StateData::AddData( const T& data ) +{ + mCurrentIdx++; + mData[mCurrentIdx % mMaxJobs] = data; +} + +template const T* CullingThreadpool::StateData::GetData() const +{ + return &mData[mCurrentIdx % mMaxJobs]; +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Helper class: Mostly lockless queue for render jobs +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +CullingThreadpool::RenderJobQueue::RenderJobQueue( unsigned int nBins, unsigned int maxJobs ) : + mNumBins( nBins ), + mMaxJobs( maxJobs ) +{ + mRenderPtrs = new std::atomic_uint[mNumBins]; + mBinMutexes = new std::atomic_uint[mNumBins]; + for( unsigned int i = 0; i < mNumBins; ++i ) + { + mBinMutexes[i] = 0; + } + + mJobs = new Job[mMaxJobs]; + for( unsigned int i = 0; i < mMaxJobs; ++i ) + { + mJobs[i].mRenderJobs = new TriList[mNumBins]; + } + + // Compute worst case job size (we allocate memory for the worst case) + const unsigned int TriSize = 3 * 3; + const unsigned int MaxTrisPerJob = TRIS_PER_JOB * 6; + const unsigned int MaxJobSize = MaxTrisPerJob * TriSize; + mTrilistData = new float[MaxJobSize * mMaxJobs * mNumBins]; + + // Setup trilist objects used for binning + for( unsigned int i = 0; i < mMaxJobs; ++i ) + { + for( unsigned int j = 0; j < mNumBins; ++j ) + { + int idx = i * mNumBins + j; + TriList& tList = mJobs[i].mRenderJobs[j]; + tList.mNumTriangles = MaxTrisPerJob; + tList.mTriIdx = 0; + tList.mPtr = mTrilistData + idx * MaxJobSize; + } + } + + // Clear render queue + Reset(); +} + +CullingThreadpool::RenderJobQueue::~RenderJobQueue() +{ + SAFE_DELETE_ARRAY( mRenderPtrs ); + SAFE_DELETE_ARRAY( mBinMutexes ); + for( unsigned int i = 0; i < mMaxJobs; ++i ) + { + SAFE_DELETE_ARRAY( mJobs[i].mRenderJobs ); + } + SAFE_DELETE_ARRAY( mJobs ); + SAFE_DELETE_ARRAY( mTrilistData ); +} + +inline unsigned int CullingThreadpool::RenderJobQueue::GetMinRenderPtr() const +{ + unsigned int minRenderPtr = mRenderPtrs[0]; + for( unsigned int i = 1; i < mNumBins; ++i ) + { + unsigned int renderPtr = mRenderPtrs[i]; + minRenderPtr = renderPtr < minRenderPtr ? 
renderPtr : minRenderPtr; + } + return minRenderPtr; +} + +inline void CullingThreadpool::RenderJobQueue::AdvanceRenderJob( int binIdx ) +{ + mRenderPtrs[binIdx]++; + mBinMutexes[binIdx] = 0; +} + +inline unsigned int CullingThreadpool::RenderJobQueue::GetBestGlobalQueue() const +{ + // Find least advanced queue + unsigned int bestBin = ~0, bestPtr = mWritePtr; + for( unsigned int i = 0; i < mNumBins; ++i ) + { + if( mRenderPtrs[i] < bestPtr && mBinMutexes[i] == 0 ) + { + bestBin = i; + bestPtr = mRenderPtrs[i]; + } + } + return bestBin; +} + +inline bool CullingThreadpool::RenderJobQueue::IsPipelineEmpty() const +{ + return GetMinRenderPtr() == mWritePtr; +} + +inline bool CullingThreadpool::RenderJobQueue::CanWrite() const +{ + return mWritePtr - GetMinRenderPtr() < mMaxJobs; +} + +inline bool CullingThreadpool::RenderJobQueue::CanBin() const +{ + return mBinningPtr < mWritePtr && mBinningPtr - GetMinRenderPtr() < mMaxJobs; +} + +inline CullingThreadpool::RenderJobQueue::Job* CullingThreadpool::RenderJobQueue::GetWriteJob() +{ + return &mJobs[mWritePtr % mMaxJobs]; +} + +inline void CullingThreadpool::RenderJobQueue::AdvanceWriteJob() +{ + mWritePtr++; +} + +inline CullingThreadpool::RenderJobQueue::Job* CullingThreadpool::RenderJobQueue::GetBinningJob() +{ + unsigned int binningPtr = mBinningPtr; + if( binningPtr < mWritePtr && binningPtr - GetMinRenderPtr() < mMaxJobs ) + { + if( mBinningPtr.compare_exchange_strong( binningPtr, binningPtr + 1 ) ) + { + mJobs[binningPtr % mMaxJobs].mBinningJobStartedIdx = binningPtr; + return &mJobs[binningPtr % mMaxJobs]; + } + } + return nullptr; +} + +inline void CullingThreadpool::RenderJobQueue::FinishedBinningJob( Job* job ) +{ + job->mBinningJobCompletedIdx = job->mBinningJobStartedIdx; +} + +inline CullingThreadpool::RenderJobQueue::Job* CullingThreadpool::RenderJobQueue::GetRenderJob( int binIdx ) +{ + // Attempt to lock bin mutex + unsigned int expected = 0; + if( !mBinMutexes[binIdx].compare_exchange_strong( expected, 1 ) ) + { + return nullptr; + } + + // Check any items in the queue, and bail if empty + if( mRenderPtrs[binIdx] != mJobs[mRenderPtrs[binIdx] % mMaxJobs].mBinningJobCompletedIdx ) + { + mBinMutexes[binIdx] = 0; + return nullptr; + } + + return &mJobs[mRenderPtrs[binIdx] % mMaxJobs]; +} + +void CullingThreadpool::RenderJobQueue::Reset() +{ + mWritePtr = 0; + mBinningPtr = 0; + + for( unsigned int i = 0; i < mNumBins; ++i ) + { + mRenderPtrs[i] = 0; + } + + for( unsigned int i = 0; i < mMaxJobs; ++i ) + { + mJobs[i].mBinningJobCompletedIdx = -1; + mJobs[i].mBinningJobStartedIdx = -1; + } +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Culling threadpool private helper functions +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +void CullingThreadpool::SetupScissors() +{ + unsigned int width, height; + mMOC->GetResolution( width, height ); + + unsigned int binWidth; + unsigned int binHeight; + mMOC->ComputeBinWidthHeight( mBinsW, mBinsH, binWidth, binHeight ); + + for( unsigned int ty = 0; ty < mBinsH; ++ty ) + { + for( unsigned int tx = 0; tx < mBinsW; ++tx ) + { + unsigned int threadIdx = tx + ty * mBinsW; + + // Adjust rects on final row / col to match resolution + mRects[threadIdx].mMinX = tx * binWidth; + mRects[threadIdx].mMaxX = tx + 1 == mBinsW ? 
width : ( tx + 1 ) * binWidth; + mRects[threadIdx].mMinY = ty * binHeight; + mRects[threadIdx].mMaxY = ty + 1 == mBinsH ? height : ( ty + 1 ) * binHeight; + } + } +} + +void CullingThreadpool::ThreadRun( CullingThreadpool* threadPool, unsigned int threadId ) +{ + threadPool->ThreadMain( threadId ); +} + +void CullingThreadpool::ThreadMain( unsigned int threadIdx ) +{ + while( true ) + { + bool threadIsIdle = true; + unsigned int threadBinIdx = threadIdx; + + // Wait for threads to be woken up (low CPU load sleep) + std::unique_lock lock( mSuspendedMutex ); + mNumSuspendedThreads++; + mSuspendedCV.wait( lock, [&] {return !mSuspendThreads; } ); + mNumSuspendedThreads--; + lock.unlock(); + + // Loop until suspended again + while( !mSuspendThreads || !threadIsIdle ) + { + if( mKillThreads ) + { + return; + } + + threadIsIdle = false; + + // Prio 1: Process any render jobs local to this thread + unsigned int binIdx = threadBinIdx; + threadBinIdx = threadBinIdx + mNumThreads < mNumBins ? threadBinIdx + mNumThreads : threadIdx; + RenderJobQueue::Job* job = mRenderQueue->GetRenderJob( binIdx ); + if( job != nullptr ) + { + if( job->mRenderJobs[binIdx].mTriIdx > 0 ) + { + mMOC->RenderTrilist( job->mRenderJobs[binIdx], &mRects[binIdx] ); + } + + mRenderQueue->AdvanceRenderJob( binIdx ); + continue; + } + + // Prio 2: Process any outstanding setup/binning jobs + if( mRenderQueue->CanBin() ) + { + // If no more rasterization jobs, get next binning job + RenderJobQueue::Job* job = mRenderQueue->GetBinningJob(); + if( job != nullptr ) + { + RenderJobQueue::BinningJob& sjob = job->mBinningJob; + for( unsigned int i = 0; i < mNumBins; ++i ) + { + job->mRenderJobs[i].mTriIdx = 0; + } + mMOC->BinTriangles( sjob.mVerts, sjob.mTris, sjob.nTris, job->mRenderJobs, mBinsW, mBinsH, sjob.mMatrix, sjob.mBfWinding, sjob.mClipPlanes, *sjob.mVtxLayout ); + mRenderQueue->FinishedBinningJob( job ); + } + continue; + } + + // Prio 3: No work is available, work steal from another thread's queue + if( mNumBins > mNumThreads ) + { + binIdx = mRenderQueue->GetBestGlobalQueue(); + if( binIdx < mRenderQueue->mNumBins ) + { + RenderJobQueue::Job* job = mRenderQueue->GetRenderJob( binIdx ); + if( job != nullptr ) + { + if( job->mRenderJobs[binIdx].mTriIdx > 0 ) + { + mMOC->RenderTrilist( job->mRenderJobs[binIdx], &mRects[binIdx] ); + } + + mRenderQueue->AdvanceRenderJob( binIdx ); + } + continue; + } + } + + // No work available: Yield this thread + std::this_thread::yield(); + threadIsIdle = true; + } + } +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Culling threadpool public API, similar to the MaskedOcclusionCulling class +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +CullingThreadpool::CullingThreadpool( unsigned int numThreads, unsigned int binsW, unsigned int binsH, unsigned int maxJobs ) : + mNumThreads( numThreads ), + mMaxJobs( maxJobs ), + mBinsW( binsW ), + mBinsH( binsH ), + mKillThreads( false ), + mSuspendThreads( true ), + mNumSuspendedThreads( 0 ), + mModelToClipMatrices( maxJobs ), + mVertexLayouts( maxJobs ), + mMOC( nullptr ) +{ + mNumBins = mBinsW * mBinsH; + assert( mNumBins >= mNumThreads ); // Having less bins than threads is a bad idea! 
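+	// Example (hypothetical values): CullingThreadpool( 4, 2, 2 ) gives four worker threads and a 2x2 = 4 bin
+	// screen subdivision, satisfying mNumBins >= mNumThreads. More bins than threads is fine and enables the
+	// work-stealing path in ThreadMain(); fewer bins than threads would leave workers idle and trip the assert above.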
+ + mRects = new ScissorRect[mNumBins]; + mRenderQueue = new RenderJobQueue( mNumBins, mMaxJobs ); + + // Add default vertex layout and matrix + mVertexLayouts.AddData( VertexLayout( 16, 4, 12 ) ); + mCurrentMatrix = nullptr; + + mThreads = new std::thread[mNumThreads]; + for( unsigned int i = 0; i < mNumThreads; ++i ) + { + mThreads[i] = std::thread( ThreadRun, this, i ); + } + +} + +CullingThreadpool::~CullingThreadpool() +{ + // Wait for threads to terminate + if( mThreads != nullptr || !mKillThreads ) + { + WakeThreads(); + mKillThreads = true; + for( unsigned int i = 0; i < mNumThreads; ++i ) + { + mThreads[i].join(); + } + + } + + // Free memory + SAFE_DELETE( mRenderQueue ); + SAFE_DELETE_ARRAY( mRects ); + SAFE_DELETE_ARRAY( mThreads ); +} + +void CullingThreadpool::WakeThreads() +{ + // Wait for all threads to be in suspended mode + while( mNumSuspendedThreads < mNumThreads ) + { + std::this_thread::yield(); + } + + // Send wake up event + std::unique_lock lock( mSuspendedMutex ); + mSuspendThreads = false; + lock.unlock(); + mSuspendedCV.notify_all(); +} + +void CullingThreadpool::SuspendThreads() +{ + // Signal threads to go into suspended mode (after finishing all outstanding work) + mSuspendThreads = true; +} + +void CullingThreadpool::Flush() +{ + // Wait for pipeline to be empty (i.e. all work is finished) + while( !mRenderQueue->IsPipelineEmpty() ) + { + std::this_thread::yield(); + } + + // Reset queue counters + mRenderQueue->Reset(); +} + +void CullingThreadpool::SetBuffer( MaskedOcclusionCulling* moc ) +{ + Flush(); + mMOC = moc; + SetupScissors(); +} + +void CullingThreadpool::SetResolution( unsigned int width, unsigned int height ) +{ + Flush(); + mMOC->SetResolution( width, height ); + SetupScissors(); +} + +void CullingThreadpool::SetNearClipPlane( float nearDist ) +{ + Flush(); + mMOC->SetNearClipPlane( nearDist ); +} + +void CullingThreadpool::SetMatrix( const float* modelToClipMatrix ) +{ + // Treat nullptr matrix as a special case, otherwise copy the contents of the pointer and add to state + if( modelToClipMatrix == nullptr ) + { + mCurrentMatrix = nullptr; + } + else + { + mModelToClipMatrices.AddData( Matrix4x4( modelToClipMatrix ) ); + mCurrentMatrix = mModelToClipMatrices.GetData()->mValues; + } +} + +void CullingThreadpool::SetVertexLayout( const VertexLayout& vtxLayout ) +{ + mVertexLayouts.AddData( vtxLayout ); +} + +void CullingThreadpool::ClearBuffer() +{ + Flush(); + mMOC->ClearBuffer(); +} + +void CullingThreadpool::RenderTriangles( const float* inVtx, const unsigned int* inTris, int nTris, BackfaceWinding bfWinding, ClipPlanes clipPlaneMask ) +{ +#if MOC_RECORDER_ENABLE != 0 + mMOC->RecordRenderTriangles( inVtx, inTris, nTris, mCurrentMatrix, clipPlaneMask, bfWinding, *mVertexLayouts.GetData( ) ); +#endif + + for( int i = 0; i < nTris; i += TRIS_PER_JOB ) + { + // Yield if work queue is full + while( !mRenderQueue->CanWrite() ) + { + std::this_thread::yield(); + } + + // Create new renderjob + RenderJobQueue::Job* job = mRenderQueue->GetWriteJob(); + job->mBinningJob.mVerts = inVtx; + job->mBinningJob.mTris = inTris + i * 3; + job->mBinningJob.nTris = nTris - i < TRIS_PER_JOB ? 
nTris - i : TRIS_PER_JOB; + job->mBinningJob.mMatrix = mCurrentMatrix; + job->mBinningJob.mClipPlanes = clipPlaneMask; + job->mBinningJob.mBfWinding = bfWinding; + job->mBinningJob.mVtxLayout = mVertexLayouts.GetData(); + mRenderQueue->AdvanceWriteJob(); + } +} + +CullingThreadpool::CullingResult CullingThreadpool::TestRect( float xmin, float ymin, float xmax, float ymax, float wmin ) +{ + return mMOC->TestRect( xmin, ymin, xmax, ymax, wmin ); +} + +CullingThreadpool::CullingResult CullingThreadpool::TestTriangles( const float* inVtx, const unsigned int* inTris, int nTris, BackfaceWinding bfWinding, ClipPlanes clipPlaneMask ) +{ + return mMOC->TestTriangles( inVtx, inTris, nTris, mCurrentMatrix, bfWinding, clipPlaneMask, *mVertexLayouts.GetData() ); +} + +void CullingThreadpool::ComputePixelDepthBuffer( float* depthData, bool flipY ) +{ + Flush(); + mMOC->ComputePixelDepthBuffer( depthData, flipY ); +} diff --git a/neo/libs/moc/CullingThreadpool.h b/neo/libs/moc/CullingThreadpool.h new file mode 100644 index 00000000..7d4f8052 --- /dev/null +++ b/neo/libs/moc/CullingThreadpool.h @@ -0,0 +1,311 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright 2017 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +//////////////////////////////////////////////////////////////////////////////// +#pragma once + +/*! + * \file CullingThreadpool.h + * \brief Worker threadpool example for threaded masked occlusion culling. + * + * This class implements a threadpool for occluder rendering. Calls to CullingThreadpool::RenderTriangle() + * will immediately return, after adding work items to a queue, and occluder rendering is performed + * by worker threads as quickly as possible. Occlusion queries are performed directly on the calling + * threadand can be performed either synchronosly, by calling Flush() before executing the query, or + * asynchronosly, by performing the query without waiting for the worker threads to finish. + * + * Note that this implementation should be considered an example rather than the best threading + * solution. You may want to integrate threading in your own task system, and it may also be beneficial + * to thread the traversal code. Refer to MaskedOcclusionCulling::BinTriangles() and + * MaskedOcclusionCulling::RenderTrilist() for functions that can be used to make your own + * threaded culling system. 
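+ *
+ * A minimal usage sketch (vertex/index buffers, matrix and query bounds are placeholders, not part of this API):
+ *
+ *     MaskedOcclusionCulling *moc = MaskedOcclusionCulling::Create();
+ *     moc->SetResolution( 1920, 1080 );
+ *     moc->SetNearClipPlane( 1.0f );
+ *
+ *     CullingThreadpool ctp( 4, 2, 2 );         // 4 worker threads, 2x2 screen bins
+ *     ctp.SetBuffer( moc );
+ *     ctp.WakeThreads();
+ *
+ *     ctp.ClearBuffer();
+ *     ctp.SetMatrix( modelToClipMatrix );       // occluder pass, rendering is asynchronous
+ *     ctp.RenderTriangles( occluderVerts, occluderTris, numOccluderTris );
+ *
+ *     ctp.Flush();                              // wait here for synchronous (exact) queries
+ *     bool visible = ctp.TestRect( xmin, ymin, xmax, ymax, wmin ) == MaskedOcclusionCulling::VISIBLE;
+ *
+ *     ctp.SuspendThreads();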
+ */ + +#include +#include +#include +#include + +#include "MaskedOcclusionCulling.h" + +class CullingThreadpool +{ +protected: + static const int TRIS_PER_JOB = 1024; // Maximum number of triangles per job (bigger drawcalls are split), affects memory requirements + + typedef MaskedOcclusionCulling::CullingResult CullingResult; + typedef MaskedOcclusionCulling::ClipPlanes ClipPlanes; + typedef MaskedOcclusionCulling::BackfaceWinding BackfaceWinding; + typedef MaskedOcclusionCulling::ScissorRect ScissorRect; + typedef MaskedOcclusionCulling::VertexLayout VertexLayout; + typedef MaskedOcclusionCulling::TriList TriList; + + // Small utility class for 4x4 matrices + struct Matrix4x4 + { + float mValues[16]; + Matrix4x4() {} + Matrix4x4( const float* matrix ) + { + for( int i = 0; i < 16; ++i ) + { + mValues[i] = matrix[i]; + } + } + }; + + // Internal utility class for a (mostly) lockless queue for binning & rendering jobs + struct RenderJobQueue + { + struct BinningJob + { + const float* mVerts; + const unsigned int* mTris; + unsigned int nTris; + + const float* mMatrix; + ClipPlanes mClipPlanes; + BackfaceWinding mBfWinding; + const VertexLayout* mVtxLayout; + }; + + struct Job + { + volatile unsigned int mBinningJobStartedIdx; + volatile unsigned int mBinningJobCompletedIdx; + BinningJob mBinningJob; + TriList* mRenderJobs; + }; + + unsigned int mNumBins; + unsigned int mMaxJobs; + + volatile unsigned int mWritePtr; + std::atomic_uint mBinningPtr; + std::atomic_uint* mRenderPtrs; + std::atomic_uint* mBinMutexes; + + float* mTrilistData; + Job* mJobs; + + RenderJobQueue( unsigned int nBins, unsigned int maxJobs ); + ~RenderJobQueue(); + + unsigned int GetMinRenderPtr() const; + unsigned int GetBestGlobalQueue() const; + bool IsPipelineEmpty() const; + + bool CanWrite() const; + bool CanBin() const; + + Job* GetWriteJob(); + void AdvanceWriteJob(); + + Job* GetBinningJob(); + void FinishedBinningJob( Job* job ); + + Job* GetRenderJob( int binIdx ); + void AdvanceRenderJob( int binIdx ); + + void Reset(); + }; + + // Internal utility class for state (matrix / vertex layout) + template struct StateData + { + unsigned int mMaxJobs; + unsigned int mCurrentIdx; + T* mData; + + StateData( unsigned int maxJobs ); + ~StateData(); + void AddData( const T& data ); + const T* GetData() const; + }; + + // Number of worker threads and bins + unsigned int mNumThreads; + unsigned int mNumBins; + unsigned int mMaxJobs; + unsigned int mBinsW; + unsigned int mBinsH; + + // Threads and control variables + std::mutex mSuspendedMutex; + std::condition_variable mSuspendedCV; + volatile bool mKillThreads; + volatile bool mSuspendThreads; + volatile unsigned int mNumSuspendedThreads; + std::thread* mThreads; + + // State variables and command queue + const float* mCurrentMatrix; + StateData mModelToClipMatrices; + StateData mVertexLayouts; + RenderJobQueue* mRenderQueue; + + // Occlusion culling object and related scissor rectangles + ScissorRect* mRects; + MaskedOcclusionCulling* mMOC; + + void SetupScissors(); + + static void ThreadRun( CullingThreadpool* threadPool, unsigned int threadId ); + void ThreadMain( unsigned int threadIdx ); + +public: + /*! + * \brief Creates a new threadpool for masked occlusion culling. This object has a + * similar API to the MaskedOcclusionCulling class, but performs occluder + * rendering asynchronously on worker threads (similar to how DX/GL works). + * + * \param numThreads Number of worker threads to perform occluder rendering. 
Best + * balance may be scene/machine dependent, but it's good practice to leave at + * least one full core (2 threads with hyperthreading) for the main thread. + * \param binsW The screen is divided into binsW x binsH rectangular bins for load + * balancing. The number of bins should be atleast equal to the number of + * worker threads. + * \param binsH See description for the binsW parameter. + * \param maxJobs Maximum number of jobs that may be in flight at any given time. If + * the caller thread generates jobs faster than the worker threads can finish + * them, then the job queue will fill up and the caller thread will stall once + * "maxJobs" items have been queued up. For culling systems interleaving occlusion + * queries and rendering, this value should be kept quite low to minimize false + * positives (see TestRect()). We've observed that 32 [default] items typically + * works well for our interleaved queries, while also allowing good load-balancing, + * and this is the recommended setting. + */ + CullingThreadpool( unsigned int numThreads, unsigned int binsW, unsigned int binsH, unsigned int maxJobs = 32 ); + + /*! + * \brief Destroys the threadpool and terminates all worker threads. + */ + ~CullingThreadpool(); + + /*! + * \brief Wakes up culling worker threads from suspended sleep, and puts them in a + * ready state (using an idle spinlock with significantly higher CPU overhead). + * + * It may take on the order of 100us to wake up the threads, so this function should + * preferably be called slightly ahead of starting occlusion culling work. + */ + void WakeThreads(); + + /*! + * \brief Suspend all culling worker threads to a low CPU overhead sleep state. + * + * For performance and latency reasons, the culling work is performed in an active + * processing loop (with no thread sleeping) with high CPU overhead. In a system + * with more worker threads it's important to put the culling worker threads in a + * low overhead sleep state after occlusion culling work has completed. + */ + void SuspendThreads(); + + /*! + * \brief Waits for all outstanding occluder rendering work to complete. Can be used + * to ensure that rendering has completed before performing a TestRect() or + * TestTriangles() call. + */ + void Flush(); + + /* + * \brief Sets the MaskedOcclusionCulling object (buffer) to be used for rendering and + * testing calls. This method causes a Flush() to ensure that all unfinished + * rendering is completed. + */ + void SetBuffer( MaskedOcclusionCulling* moc ); + + /* + * \brief Changes the resolution of the occlusion buffer, see MaskedOcclusionCulling::SetResolution(). + * This method causes a Flush() to ensure that all unfinished rendering is completed. + */ + void SetResolution( unsigned int width, unsigned int height ); + + /* + * \brief Sets the near clipping plane, see MaskedOcclusionCulling::SetNearClipPlane(). This + * method causes a Flush() to ensure that all unfinished rendering is completed. + */ + void SetNearClipPlane( float nearDist ); + + /* + * \brief Sets the model to clipspace transform matrix used for the RenderTriangles() and TestTriangles() + * function calls. The contents of the matrix is copied, and it's safe to modify it without calling + * Flush(). The copy may be costly, which is the reason for passing this parameter as "state". + * + * \param modelToClipMatrix All vertices will be transformed by the specified model to clipspace matrix. + * Passing nullptr [default] disables the transform (equivalent to using an identity matrix). 
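+ *
+ * Example (hypothetical data): float modelToClip[16] = { ... };  ctp.SetMatrix( modelToClip );  // contents copied immediately,
+ * so the source may be modified right away. Calling ctp.SetMatrix() reverts to pre-transformed clip-space input.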
+ */ + void SetMatrix( const float* modelToClipMatrix = nullptr ); + + /* + * \brief Sets the vertex layout used for the RenderTriangles() and TestTriangles() function calls. + * The vertex layout is copied, and it's safe to modify it without calling Flush(). The copy + * may be costly, which is the reason for passing this parameter as "state". + * + * \param vtxLayout A struct specifying the vertex layout (see struct for detailed + * description). For best performance, it is advicable to store position data + * as compactly in memory as possible. + */ + void SetVertexLayout( const VertexLayout& vtxLayout = VertexLayout( 16, 4, 12 ) ); + + /* + * \brief Clears the occlusion buffer, see MaskedOcclusionCulling::ClearBuffer(). This method + * causes a Flush() to ensure that all unfinished rendering is completed. + */ + void ClearBuffer(); + + /* + * \brief Asynchronously render occluder triangles, see MaskedOcclusionCulling::RenderTriangles(). + * + * This method puts the drawcall into a command queue, and immediately returns. The rendering is + * performed by the worker threads at the earliest opportunity. + * + * Important: As rendering is performed asynchronously, the application is not allowed to + * change the contents of the *inVtx or *inTris buffers until after rendering is completed. If + * you wish to use dynamic buffers, the application must perform a Flush() to ensure that rendering + * is finished, or make sure to rotate between more buffers than the maximum number of outstanding + * render jobs (see the CullingThreadpool() constructor). + */ + void RenderTriangles( const float* inVtx, const unsigned int* inTris, int nTris, BackfaceWinding bfWinding = MaskedOcclusionCulling::BACKFACE_CW, ClipPlanes clipPlaneMask = MaskedOcclusionCulling::CLIP_PLANE_ALL ); + + /* + * \brief Occlusion query for a rectangle with a given depth, see MaskedOcclusionCulling::TestRect(). + * + * Important: This method is performed on the main thread and does not wait for outstanding + * occluder rendering to be finished. To ensure that all occluder rendering is completed you must + * perform a Flush() prior to calling this function. + * + * It is conservatively correct to perform occlusion queries without calling Flush() (it may only + * lead to objects being incorrectly classified as visible), and it can lead to much better performance + * if occlusion queries are used for traversing a BVH or similar data structure. It's possible to + * use "asynchronous" queries during traversal, and removing false positives later, when rendering + * has completed. + */ + CullingResult TestRect( float xmin, float ymin, float xmax, float ymax, float wmin ); + + /* + * \brief Occlusion query for a mesh, see MaskedOcclusionCulling::TestTriangles(). + * + * Important: See the TestRect() method for a brief discussion about asynchronous occlusion + * queries. + */ + CullingResult TestTriangles( const float* inVtx, const unsigned int* inTris, int nTris, BackfaceWinding bfWinding = MaskedOcclusionCulling::BACKFACE_CW, ClipPlanes clipPlaneMask = MaskedOcclusionCulling::CLIP_PLANE_ALL ); + + /*! + * \brief Creates a per-pixel depth buffer from the hierarchical z buffer representation, see + * MaskedOcclusionCulling::ComputePixelDepthBuffer(). This method causes a Flush() to + * ensure that all unfinished rendering is completed. 
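+ *
+ * Note: the destination array is assumed to hold one float per pixel at the current resolution
+ * (width * height entries); this is primarily intended for debug visualization of the occlusion buffer.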
+ */ + void ComputePixelDepthBuffer( float* depthData, bool flipY ); +}; diff --git a/neo/libs/moc/MaskedOcclusionCulling.cpp b/neo/libs/moc/MaskedOcclusionCulling.cpp new file mode 100644 index 00000000..043e6716 --- /dev/null +++ b/neo/libs/moc/MaskedOcclusionCulling.cpp @@ -0,0 +1,528 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright 2017 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +//////////////////////////////////////////////////////////////////////////////// +#include +#include +#include +#include +#include "MaskedOcclusionCulling.h" +#include "CompilerSpecific.inl" + +#if MOC_RECORDER_ENABLE + #include "FrameRecorder.h" +#endif + +#if defined(__AVX__) || defined(__AVX2__) + // For performance reasons, the MaskedOcclusionCullingAVX2/512.cpp files should be compiled with VEX encoding for SSE instructions (to avoid + // AVX-SSE transition penalties, see https://software.intel.com/en-us/articles/avoiding-avx-sse-transition-penalties). However, this file + // _must_ be compiled without VEX encoding to allow backwards compatibility. Best practice is to use lowest supported target platform + // (/arch:SSE2) as project default, and elevate only the MaskedOcclusionCullingAVX2/512.cpp files. + #error The MaskedOcclusionCulling.cpp should be compiled with lowest supported target platform, e.g. 
/arch:SSE2 +#endif + +static MaskedOcclusionCulling::Implementation DetectCPUFeatures( MaskedOcclusionCulling::pfnAlignedAlloc alignedAlloc, MaskedOcclusionCulling::pfnAlignedFree alignedFree ) +{ + struct CpuInfo + { + int regs[4]; + }; + + // Get regular CPUID values + int regs[4]; + __cpuidex( regs, 0, 0 ); + + // MOCVectorAllocator mocalloc( alignedAlloc, alignedFree ); + // std::vector> cpuId( mocalloc ), cpuIdEx( mocalloc ); + // cpuId.resize( regs[0] ); + size_t cpuIdCount = regs[0]; + CpuInfo* cpuId = ( CpuInfo* )alignedAlloc( 64, sizeof( CpuInfo ) * cpuIdCount ); + + for( size_t i = 0; i < cpuIdCount; ++i ) + { + __cpuidex( cpuId[i].regs, ( int )i, 0 ); + } + + // Get extended CPUID values + __cpuidex( regs, 0x80000000, 0 ); + + //cpuIdEx.resize(regs[0] - 0x80000000); + size_t cpuIdExCount = regs[0] - 0x80000000; + CpuInfo* cpuIdEx = ( CpuInfo* )alignedAlloc( 64, sizeof( CpuInfo ) * cpuIdExCount ); + + for( size_t i = 0; i < cpuIdExCount; ++i ) + { + __cpuidex( cpuIdEx[i].regs, 0x80000000 + ( int )i, 0 ); + } + +#define TEST_BITS(A, B) (((A) & (B)) == (B)) +#define TEST_FMA_MOVE_OXSAVE (cpuIdCount >= 1 && TEST_BITS(cpuId[1].regs[2], (1 << 12) | (1 << 22) | (1 << 27))) +#define TEST_LZCNT (cpuIdExCount >= 1 && TEST_BITS(cpuIdEx[1].regs[2], 0x20)) +#define TEST_SSE41 (cpuIdCount >= 1 && TEST_BITS(cpuId[1].regs[2], (1 << 19))) +#define TEST_XMM_YMM (cpuIdCount >= 1 && TEST_BITS(_xgetbv(0), (1 << 2) | (1 << 1))) +#define TEST_OPMASK_ZMM (cpuIdCount >= 1 && TEST_BITS(_xgetbv(0), (1 << 7) | (1 << 6) | (1 << 5))) +#define TEST_BMI1_BMI2_AVX2 (cpuIdCount >= 7 && TEST_BITS(cpuId[7].regs[1], (1 << 3) | (1 << 5) | (1 << 8))) +#define TEST_AVX512_F_BW_DQ (cpuIdCount >= 7 && TEST_BITS(cpuId[7].regs[1], (1 << 16) | (1 << 17) | (1 << 30))) + + MaskedOcclusionCulling::Implementation retVal = MaskedOcclusionCulling::SSE2; + if( TEST_FMA_MOVE_OXSAVE && TEST_LZCNT && TEST_SSE41 ) + { + if( TEST_XMM_YMM && TEST_OPMASK_ZMM && TEST_BMI1_BMI2_AVX2 && TEST_AVX512_F_BW_DQ ) + { + retVal = MaskedOcclusionCulling::AVX512; + } + else if( TEST_XMM_YMM && TEST_BMI1_BMI2_AVX2 ) + { + retVal = MaskedOcclusionCulling::AVX2; + } + } + else if( TEST_SSE41 ) + { + retVal = MaskedOcclusionCulling::SSE41; + } + alignedFree( cpuId ); + alignedFree( cpuIdEx ); + return retVal; +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Utility functions (not directly related to the algorithm/rasterizer) +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +void MaskedOcclusionCulling::TransformVertices( const float* mtx, const float* inVtx, float* xfVtx, unsigned int nVtx, const VertexLayout& vtxLayout ) +{ + // This function pretty slow, about 10-20% slower than if the vertices are stored in aligned SOA form. + if( nVtx == 0 ) + { + return; + } + + // Load matrix and swizzle out the z component. For post-multiplication (OGL), the matrix is assumed to be column + // major, with one column per SSE register. For pre-multiplication (DX), the matrix is assumed to be row major. 
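+	// Each output vertex is computed as mtxCol0*x + mtxCol1*y + mtxCol2*z + mtxCol3, i.e. the input w
+	// component is implicitly treated as 1.0 and the full 4-component clip-space position is stored.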
+ __m128 mtxCol0 = _mm_loadu_ps( mtx ); + __m128 mtxCol1 = _mm_loadu_ps( mtx + 4 ); + __m128 mtxCol2 = _mm_loadu_ps( mtx + 8 ); + __m128 mtxCol3 = _mm_loadu_ps( mtx + 12 ); + + int stride = vtxLayout.mStride; + const char* vPtr = ( const char* )inVtx; + float* outPtr = xfVtx; + + // Iterate through all vertices and transform + for( unsigned int vtx = 0; vtx < nVtx; ++vtx ) + { + __m128 xVal = _mm_load1_ps( ( float* )( vPtr ) ); + __m128 yVal = _mm_load1_ps( ( float* )( vPtr + vtxLayout.mOffsetY ) ); + __m128 zVal = _mm_load1_ps( ( float* )( vPtr + vtxLayout.mOffsetZ ) ); + + __m128 xform = _mm_add_ps( _mm_mul_ps( mtxCol0, xVal ), _mm_add_ps( _mm_mul_ps( mtxCol1, yVal ), _mm_add_ps( _mm_mul_ps( mtxCol2, zVal ), mtxCol3 ) ) ); + _mm_storeu_ps( outPtr, xform ); + vPtr += stride; + outPtr += 4; + } +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Typedefs +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +typedef MaskedOcclusionCulling::pfnAlignedAlloc pfnAlignedAlloc; +typedef MaskedOcclusionCulling::pfnAlignedFree pfnAlignedFree; +typedef MaskedOcclusionCulling::VertexLayout VertexLayout; + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Common SSE2/SSE4.1 defines +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#define SIMD_LANES 4 +#define TILE_HEIGHT_SHIFT 2 + +#define SIMD_LANE_IDX _mm_setr_epi32(0, 1, 2, 3) + +#define SIMD_SUB_TILE_COL_OFFSET _mm_setr_epi32(0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3) +#define SIMD_SUB_TILE_ROW_OFFSET _mm_setzero_si128() +#define SIMD_SUB_TILE_COL_OFFSET_F _mm_setr_ps(0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3) +#define SIMD_SUB_TILE_ROW_OFFSET_F _mm_setzero_ps() + +#define SIMD_LANE_YCOORD_I _mm_setr_epi32(128, 384, 640, 896) +#define SIMD_LANE_YCOORD_F _mm_setr_ps(128.0f, 384.0f, 640.0f, 896.0f) + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Common SSE2/SSE4.1 functions +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +typedef __m128 __mw; +typedef __m128i __mwi; + +#define _mmw_set1_ps _mm_set1_ps +#define _mmw_setzero_ps _mm_setzero_ps +#define _mmw_and_ps _mm_and_ps +#define _mmw_or_ps _mm_or_ps +#define _mmw_xor_ps _mm_xor_ps +#define _mmw_not_ps(a) _mm_xor_ps((a), _mm_castsi128_ps(_mm_set1_epi32(~0))) +#define _mmw_andnot_ps _mm_andnot_ps +#define _mmw_neg_ps(a) _mm_xor_ps((a), _mm_set1_ps(-0.0f)) +#define _mmw_abs_ps(a) _mm_and_ps((a), _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF))) +#define _mmw_add_ps _mm_add_ps +#define _mmw_sub_ps _mm_sub_ps +#define _mmw_mul_ps _mm_mul_ps +#define _mmw_div_ps _mm_div_ps +#define _mmw_min_ps _mm_min_ps +#define _mmw_max_ps _mm_max_ps +#define _mmw_movemask_ps _mm_movemask_ps +#define _mmw_cmpge_ps(a,b) _mm_cmpge_ps(a, b) +#define _mmw_cmpgt_ps(a,b) _mm_cmpgt_ps(a, b) +#define _mmw_cmpeq_ps(a,b) _mm_cmpeq_ps(a, b) +#define _mmw_fmadd_ps(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c) +#define _mmw_fmsub_ps(a,b,c) _mm_sub_ps(_mm_mul_ps(a,b), c) +#define _mmw_shuffle_ps _mm_shuffle_ps 
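+// For the 4-lane SSE paths a __mw holds exactly one 128-bit block, so the insert below reduces to returning (b)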
+#define _mmw_insertf32x4_ps(a,b,c) (b) +#define _mmw_cvtepi32_ps _mm_cvtepi32_ps +#define _mmw_blendv_epi32(a,b,c) simd_cast<__mwi>(_mmw_blendv_ps(simd_cast<__mw>(a), simd_cast<__mw>(b), simd_cast<__mw>(c))) + +#define _mmw_set1_epi32 _mm_set1_epi32 +#define _mmw_setzero_epi32 _mm_setzero_si128 +#define _mmw_and_epi32 _mm_and_si128 +#define _mmw_or_epi32 _mm_or_si128 +#define _mmw_xor_epi32 _mm_xor_si128 +#define _mmw_not_epi32(a) _mm_xor_si128((a), _mm_set1_epi32(~0)) +#define _mmw_andnot_epi32 _mm_andnot_si128 +#define _mmw_neg_epi32(a) _mm_sub_epi32(_mm_set1_epi32(0), (a)) +#define _mmw_add_epi32 _mm_add_epi32 +#define _mmw_sub_epi32 _mm_sub_epi32 +#define _mmw_subs_epu16 _mm_subs_epu16 +#define _mmw_cmpeq_epi32 _mm_cmpeq_epi32 +#define _mmw_cmpgt_epi32 _mm_cmpgt_epi32 +#define _mmw_srai_epi32 _mm_srai_epi32 +#define _mmw_srli_epi32 _mm_srli_epi32 +#define _mmw_slli_epi32 _mm_slli_epi32 +#define _mmw_cvtps_epi32 _mm_cvtps_epi32 +#define _mmw_cvttps_epi32 _mm_cvttps_epi32 + +#define _mmx_fmadd_ps _mmw_fmadd_ps +#define _mmx_max_epi32 _mmw_max_epi32 +#define _mmx_min_epi32 _mmw_min_epi32 + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// SIMD casting functions +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +template FORCE_INLINE T simd_cast( Y A ); +template<> FORCE_INLINE __m128 simd_cast<__m128>( float A ) +{ + return _mm_set1_ps( A ); +} +template<> FORCE_INLINE __m128 simd_cast<__m128>( __m128i A ) +{ + return _mm_castsi128_ps( A ); +} +template<> FORCE_INLINE __m128 simd_cast<__m128>( __m128 A ) +{ + return A; +} +template<> FORCE_INLINE __m128i simd_cast<__m128i>( int A ) +{ + return _mm_set1_epi32( A ); +} +template<> FORCE_INLINE __m128i simd_cast<__m128i>( __m128 A ) +{ + return _mm_castps_si128( A ); +} +template<> FORCE_INLINE __m128i simd_cast<__m128i>( __m128i A ) +{ + return A; +} + +#define MAKE_ACCESSOR(name, simd_type, base_type, is_const, elements) \ + FORCE_INLINE is_const base_type * name(is_const simd_type &a) { \ + union accessor { simd_type m_native; base_type m_array[elements]; }; \ + is_const accessor *acs = reinterpret_cast(&a); \ + return acs->m_array; \ + } + +MAKE_ACCESSOR( simd_f32, __m128, float,, 4 ) +MAKE_ACCESSOR( simd_f32, __m128, float, const, 4 ) +MAKE_ACCESSOR( simd_i32, __m128i, int,, 4 ) +MAKE_ACCESSOR( simd_i32, __m128i, int, const, 4 ) + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Specialized SSE input assembly function for general vertex gather +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +FORCE_INLINE void GatherVertices( __m128* vtxX, __m128* vtxY, __m128* vtxW, const float* inVtx, const unsigned int* inTrisPtr, int numLanes, const VertexLayout& vtxLayout ) +{ + for( int lane = 0; lane < numLanes; lane++ ) + { + for( int i = 0; i < 3; i++ ) + { + char* vPtrX = ( char* )inVtx + inTrisPtr[lane * 3 + i] * vtxLayout.mStride; + char* vPtrY = vPtrX + vtxLayout.mOffsetY; + char* vPtrW = vPtrX + vtxLayout.mOffsetW; + + simd_f32( vtxX[i] )[lane] = *( ( float* )vPtrX ); + simd_f32( vtxY[i] )[lane] = *( ( float* )vPtrY ); + simd_f32( vtxW[i] )[lane] = *( ( float* )vPtrW ); + } + } +} + 
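+// Note on GatherVertices() above: the SSE paths gather one float per lane with scalar loads, so tightly
+// packed position data (small mStride) keeps this loop cache friendly, matching the vertex layout advice
+// in MaskedOcclusionCulling.h.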
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// SSE4.1 version +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace MaskedOcclusionCullingSSE41 +{ +FORCE_INLINE __m128i _mmw_mullo_epi32( const __m128i& a, const __m128i& b ) +{ + return _mm_mullo_epi32( a, b ); +} +FORCE_INLINE __m128i _mmw_min_epi32( const __m128i& a, const __m128i& b ) +{ + return _mm_min_epi32( a, b ); +} +FORCE_INLINE __m128i _mmw_max_epi32( const __m128i& a, const __m128i& b ) +{ + return _mm_max_epi32( a, b ); +} +FORCE_INLINE __m128i _mmw_abs_epi32( const __m128i& a ) +{ + return _mm_abs_epi32( a ); +} +FORCE_INLINE __m128 _mmw_blendv_ps( const __m128& a, const __m128& b, const __m128& c ) +{ + return _mm_blendv_ps( a, b, c ); +} +FORCE_INLINE int _mmw_testz_epi32( const __m128i& a, const __m128i& b ) +{ + return _mm_testz_si128( a, b ); +} +FORCE_INLINE __m128 _mmx_dp4_ps( const __m128& a, const __m128& b ) +{ + return _mm_dp_ps( a, b, 0xFF ); +} +FORCE_INLINE __m128 _mmw_floor_ps( const __m128& a ) +{ + return _mm_round_ps( a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC ); +} +FORCE_INLINE __m128 _mmw_ceil_ps( const __m128& a ) +{ + return _mm_round_ps( a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC ); +} +FORCE_INLINE __m128i _mmw_transpose_epi8( const __m128i& a ) +{ + const __m128i shuff = _mm_setr_epi8( 0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF ); + return _mm_shuffle_epi8( a, shuff ); +} +FORCE_INLINE __m128i _mmw_sllv_ones( const __m128i& ishift ) +{ + __m128i shift = _mm_min_epi32( ishift, _mm_set1_epi32( 32 ) ); + + // Uses lookup tables and _mm_shuffle_epi8 to perform _mm_sllv_epi32(~0, shift) + const __m128i byteShiftLUT = _mm_setr_epi8( ( char )0xFF, ( char )0xFE, ( char )0xFC, ( char )0xF8, ( char )0xF0, ( char )0xE0, ( char )0xC0, ( char )0x80, 0, 0, 0, 0, 0, 0, 0, 0 ); + const __m128i byteShiftOffset = _mm_setr_epi8( 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24 ); + const __m128i byteShiftShuffle = _mm_setr_epi8( 0x0, 0x0, 0x0, 0x0, 0x4, 0x4, 0x4, 0x4, 0x8, 0x8, 0x8, 0x8, 0xC, 0xC, 0xC, 0xC ); + + __m128i byteShift = _mm_shuffle_epi8( shift, byteShiftShuffle ); + byteShift = _mm_min_epi8( _mm_subs_epu8( byteShift, byteShiftOffset ), _mm_set1_epi8( 8 ) ); + __m128i retMask = _mm_shuffle_epi8( byteShiftLUT, byteShift ); + + return retMask; +} + +static MaskedOcclusionCulling::Implementation gInstructionSet = MaskedOcclusionCulling::SSE41; + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Include common algorithm implementation (general, SIMD independent code) +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "MaskedOcclusionCullingCommon.inl" + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Utility function to create a new object using the allocator callbacks +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +MaskedOcclusionCulling* CreateMaskedOcclusionCulling( pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree ) +{ + 
MaskedOcclusionCullingPrivate* object = ( MaskedOcclusionCullingPrivate* )alignedAlloc( 64, sizeof( MaskedOcclusionCullingPrivate ) ); + new( object ) MaskedOcclusionCullingPrivate( alignedAlloc, alignedFree ); + return object; +} +}; + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// SSE2 version +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace MaskedOcclusionCullingSSE2 +{ +FORCE_INLINE __m128i _mmw_mullo_epi32( const __m128i& a, const __m128i& b ) +{ + // Do products for even / odd lanes & merge the result + __m128i even = _mm_and_si128( _mm_mul_epu32( a, b ), _mm_setr_epi32( ~0, 0, ~0, 0 ) ); + __m128i odd = _mm_slli_epi64( _mm_mul_epu32( _mm_srli_epi64( a, 32 ), _mm_srli_epi64( b, 32 ) ), 32 ); + return _mm_or_si128( even, odd ); +} +FORCE_INLINE __m128i _mmw_min_epi32( const __m128i& a, const __m128i& b ) +{ + __m128i cond = _mm_cmpgt_epi32( a, b ); + return _mm_or_si128( _mm_andnot_si128( cond, a ), _mm_and_si128( cond, b ) ); +} +FORCE_INLINE __m128i _mmw_max_epi32( const __m128i& a, const __m128i& b ) +{ + __m128i cond = _mm_cmpgt_epi32( b, a ); + return _mm_or_si128( _mm_andnot_si128( cond, a ), _mm_and_si128( cond, b ) ); +} +FORCE_INLINE __m128i _mmw_abs_epi32( const __m128i& a ) +{ + __m128i mask = _mm_cmplt_epi32( a, _mm_setzero_si128() ); + return _mm_add_epi32( _mm_xor_si128( a, mask ), _mm_srli_epi32( mask, 31 ) ); +} +FORCE_INLINE int _mmw_testz_epi32( const __m128i& a, const __m128i& b ) +{ + return _mm_movemask_epi8( _mm_cmpeq_epi8( _mm_and_si128( a, b ), _mm_setzero_si128() ) ) == 0xFFFF; +} +FORCE_INLINE __m128 _mmw_blendv_ps( const __m128& a, const __m128& b, const __m128& c ) +{ + __m128 cond = _mm_castsi128_ps( _mm_srai_epi32( _mm_castps_si128( c ), 31 ) ); + return _mm_or_ps( _mm_andnot_ps( cond, a ), _mm_and_ps( cond, b ) ); +} +FORCE_INLINE __m128 _mmx_dp4_ps( const __m128& a, const __m128& b ) +{ + // Product and two shuffle/adds pairs (similar to hadd_ps) + __m128 prod = _mm_mul_ps( a, b ); + __m128 dp = _mm_add_ps( prod, _mm_shuffle_ps( prod, prod, _MM_SHUFFLE( 2, 3, 0, 1 ) ) ); + dp = _mm_add_ps( dp, _mm_shuffle_ps( dp, dp, _MM_SHUFFLE( 0, 1, 2, 3 ) ) ); + return dp; +} +FORCE_INLINE __m128 _mmw_floor_ps( const __m128& a ) +{ + int originalMode = _MM_GET_ROUNDING_MODE(); + _MM_SET_ROUNDING_MODE( _MM_ROUND_DOWN ); + __m128 rounded = _mm_cvtepi32_ps( _mm_cvtps_epi32( a ) ); + _MM_SET_ROUNDING_MODE( originalMode ); + return rounded; +} +FORCE_INLINE __m128 _mmw_ceil_ps( const __m128& a ) +{ + int originalMode = _MM_GET_ROUNDING_MODE(); + _MM_SET_ROUNDING_MODE( _MM_ROUND_UP ); + __m128 rounded = _mm_cvtepi32_ps( _mm_cvtps_epi32( a ) ); + _MM_SET_ROUNDING_MODE( originalMode ); + return rounded; +} +FORCE_INLINE __m128i _mmw_transpose_epi8( const __m128i& a ) +{ + // Perform transpose through two 16->8 bit pack and byte shifts + __m128i res = a; + const __m128i mask = _mm_setr_epi8( ~0, 0, ~0, 0, ~0, 0, ~0, 0, ~0, 0, ~0, 0, ~0, 0, ~0, 0 ); + res = _mm_packus_epi16( _mm_and_si128( res, mask ), _mm_srli_epi16( res, 8 ) ); + res = _mm_packus_epi16( _mm_and_si128( res, mask ), _mm_srli_epi16( res, 8 ) ); + return res; +} +FORCE_INLINE __m128i _mmw_sllv_ones( const __m128i& ishift ) +{ + __m128i shift = _mmw_min_epi32( ishift, _mm_set1_epi32( 32 ) ); + + // Uses scalar approach to perform _mm_sllv_epi32(~0, shift) + static const unsigned int maskLUT[33] = + { 
+ ~0U << 0, ~0U << 1, ~0U << 2, ~0U << 3, ~0U << 4, ~0U << 5, ~0U << 6, ~0U << 7, ~0U << 8, ~0U << 9, ~0U << 10, ~0U << 11, ~0U << 12, ~0U << 13, ~0U << 14, ~0U << 15, + ~0U << 16, ~0U << 17, ~0U << 18, ~0U << 19, ~0U << 20, ~0U << 21, ~0U << 22, ~0U << 23, ~0U << 24, ~0U << 25, ~0U << 26, ~0U << 27, ~0U << 28, ~0U << 29, ~0U << 30, ~0U << 31, + 0U + }; + + __m128i retMask; + simd_i32( retMask )[0] = ( int )maskLUT[simd_i32( shift )[0]]; + simd_i32( retMask )[1] = ( int )maskLUT[simd_i32( shift )[1]]; + simd_i32( retMask )[2] = ( int )maskLUT[simd_i32( shift )[2]]; + simd_i32( retMask )[3] = ( int )maskLUT[simd_i32( shift )[3]]; + return retMask; +} + +static MaskedOcclusionCulling::Implementation gInstructionSet = MaskedOcclusionCulling::SSE2; + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Include common algorithm implementation (general, SIMD independent code) +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "MaskedOcclusionCullingCommon.inl" + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Utility function to create a new object using the allocator callbacks +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +MaskedOcclusionCulling* CreateMaskedOcclusionCulling( pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree ) +{ + MaskedOcclusionCullingPrivate* object = ( MaskedOcclusionCullingPrivate* )alignedAlloc( 64, sizeof( MaskedOcclusionCullingPrivate ) ); + new( object ) MaskedOcclusionCullingPrivate( alignedAlloc, alignedFree ); + return object; +} +}; + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Object construction and allocation +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +namespace MaskedOcclusionCullingAVX512 +{ +extern MaskedOcclusionCulling* CreateMaskedOcclusionCulling( pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree ); +} + +namespace MaskedOcclusionCullingAVX2 +{ +extern MaskedOcclusionCulling* CreateMaskedOcclusionCulling( pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree ); +} + +MaskedOcclusionCulling* MaskedOcclusionCulling::Create( Implementation RequestedSIMD ) +{ + return Create( RequestedSIMD, aligned_alloc, aligned_free ); +} + +MaskedOcclusionCulling* MaskedOcclusionCulling::Create( Implementation RequestedSIMD, pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree ) +{ + MaskedOcclusionCulling* object = nullptr; + + MaskedOcclusionCulling::Implementation impl = DetectCPUFeatures( alignedAlloc, alignedFree ); + + if( RequestedSIMD < impl ) + { + impl = RequestedSIMD; + } + + // Return best supported version + if( object == nullptr && impl >= AVX512 ) + { + object = MaskedOcclusionCullingAVX512::CreateMaskedOcclusionCulling( alignedAlloc, alignedFree ); // Use AVX512 version + } + if( object == nullptr && impl >= AVX2 ) + { + object = MaskedOcclusionCullingAVX2::CreateMaskedOcclusionCulling( alignedAlloc, alignedFree ); // Use AVX2 version + } + if( object == nullptr && impl >= SSE41 ) + { + object = 
MaskedOcclusionCullingSSE41::CreateMaskedOcclusionCulling( alignedAlloc, alignedFree ); // Use SSE4.1 version + } + if( object == nullptr ) + { + object = MaskedOcclusionCullingSSE2::CreateMaskedOcclusionCulling( alignedAlloc, alignedFree ); // Use SSE2 (slow) version + } + + return object; +} + +void MaskedOcclusionCulling::Destroy( MaskedOcclusionCulling* moc ) +{ + pfnAlignedFree alignedFreeCallback = moc->mAlignedFreeCallback; + moc->~MaskedOcclusionCulling(); + alignedFreeCallback( moc ); +} diff --git a/neo/libs/moc/MaskedOcclusionCulling.h b/neo/libs/moc/MaskedOcclusionCulling.h new file mode 100644 index 00000000..5e7ea995 --- /dev/null +++ b/neo/libs/moc/MaskedOcclusionCulling.h @@ -0,0 +1,596 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright 2017 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +//////////////////////////////////////////////////////////////////////////////// +#pragma once + +/*! + * \file MaskedOcclusionCulling.h + * \brief Masked Occlusion Culling + * + * General information + * - Input to all API functions are (x,y,w) clip-space coordinates (x positive left, y positive up, w positive away from camera). + * We entirely skip the z component and instead compute it as 1 / w, see next bullet. For TestRect the input is NDC (x/w, y/w). + * - We use a simple z = 1 / w transform, which is a bit faster than OGL/DX depth transforms. Thus, depth is REVERSED and z = 0 at + * the far plane and z = inf at w = 0. We also have to use a GREATER depth function, which explains why all the conservative + * tests will be reversed compared to what you might be used to (for example zMaxTri >= zMinBuffer is a visibility test) + * - We support different layouts for vertex data (basic AoS and SoA), but note that it's beneficial to store the position data + * as tightly in memory as possible to reduce cache misses. Big strides are bad, so it's beneficial to keep position as a separate + * stream (rather than bundled with attributes) or to keep a copy of the position data for the occlusion culling system. + * - The resolution width must be a multiple of 8 and height a multiple of 4. + * - The hierarchical Z buffer is stored OpenGL-style with the y axis pointing up. This includes the scissor box. + * - This code is only tested with Visual Studio 2015, but should hopefully be easy to port to other compilers. + */ + + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Defines used to configure the implementation +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifndef QUICK_MASK + /*! + * Configure the algorithm used for updating and merging hierarchical z buffer entries. 
If QUICK_MASK + * is defined to 1, use the algorithm from the paper "Masked Software Occlusion Culling", which has good + * balance between performance and low leakage. If QUICK_MASK is defined to 0, use the algorithm from + * "Masked Depth Culling for Graphics Hardware" which has less leakage, but also lower performance. + */ + #define QUICK_MASK 1 + +#endif + +#ifndef USE_D3D + /*! + * Configures the library for use with Direct3D (default) or OpenGL rendering. This changes whether the + * screen space Y axis points downwards (D3D) or upwards (OGL), and is primarily important in combination + * with the PRECISE_COVERAGE define, where this is important to ensure correct rounding and tie-breaker + * behaviour. It also affects the ScissorRect screen space coordinates. + */ + #define USE_D3D 1 + +#endif + +#ifndef PRECISE_COVERAGE + /*! + * Define PRECISE_COVERAGE to 1 to more closely match GPU rasterization rules. The increased precision comes + * at a cost of slightly lower performance. + */ + #define PRECISE_COVERAGE 1 + +#endif + +#ifndef USE_AVX512 + /*! + * Define USE_AVX512 to 1 to enable experimental AVX-512 support. It's currently mostly untested and only + * validated on simple examples using Intel SDE. Older compilers may not support AVX-512 intrinsics. + */ + #define USE_AVX512 0 + +#endif + +#ifndef CLIPPING_PRESERVES_ORDER + /*! + * Define CLIPPING_PRESERVES_ORDER to 1 to prevent clipping from reordering triangle rasterization + * order; This comes at a cost (approx 3-4%) but removes one source of temporal frame-to-frame instability. + */ + #define CLIPPING_PRESERVES_ORDER 1 + +#endif + +#ifndef ENABLE_STATS + /*! + * Define ENABLE_STATS to 1 to gather various statistics during occlusion culling. Can be used for profiling + * and debugging. Note that enabling this function will reduce performance significantly. + */ + #define ENABLE_STATS 0 + +#endif + +#ifndef MOC_RECORDER_ENABLE + /*! + * Define MOC_RECORDER_ENABLE to 1 to enable frame recorder (see FrameRecorder.h/cpp for details) + */ + #define MOC_RECORDER_ENABLE 0 + +#endif + +#if MOC_RECORDER_ENABLE + #ifndef MOC_RECORDER_ENABLE_PLAYBACK + /*! 
+ * Define MOC_RECORDER_ENABLE_PLAYBACK to 1 to enable compilation of the playback code (not needed + for recording) + */ + #define MOC_RECORDER_ENABLE_PLAYBACK 0 + #endif +#endif + + +#if MOC_RECORDER_ENABLE + + #include + + class FrameRecorder; + +#endif // #if MOC_RECORDER_ENABLE + + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Masked occlusion culling class +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +class MaskedOcclusionCulling +{ +public: + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Memory management callback functions + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + typedef void* ( *pfnAlignedAlloc )( size_t alignment, size_t size ); + typedef void ( *pfnAlignedFree )( void* ptr ); + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Enums + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + enum Implementation + { + SSE2 = 0, + SSE41 = 1, + AVX2 = 2, + AVX512 = 3 + }; + + enum BackfaceWinding + { + BACKFACE_NONE = 0, + BACKFACE_CW = 1, + BACKFACE_CCW = 2, + }; + + enum CullingResult + { + VISIBLE = 0x0, + OCCLUDED = 0x1, + VIEW_CULLED = 0x3 + }; + + enum ClipPlanes + { + CLIP_PLANE_NONE = 0x00, + CLIP_PLANE_NEAR = 0x01, + CLIP_PLANE_LEFT = 0x02, + CLIP_PLANE_RIGHT = 0x04, + CLIP_PLANE_BOTTOM = 0x08, + CLIP_PLANE_TOP = 0x10, + CLIP_PLANE_SIDES = ( CLIP_PLANE_LEFT | CLIP_PLANE_RIGHT | CLIP_PLANE_BOTTOM | CLIP_PLANE_TOP ), + CLIP_PLANE_ALL = ( CLIP_PLANE_LEFT | CLIP_PLANE_RIGHT | CLIP_PLANE_BOTTOM | CLIP_PLANE_TOP | CLIP_PLANE_NEAR ) + }; + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Structs + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + /*! + * Used to specify custom vertex layout. Memory offsets to y and z coordinates are set through + * mOffsetY and mOffsetW, and vertex stride is given by mStride. It's possible to configure both + * AoS and SoA layouts. Note that large strides may cause more cache misses and decrease + * performance. It is advisable to store position data as compactly in memory as possible. + */ + struct VertexLayout + { + VertexLayout() {} + VertexLayout( int stride, int offsetY, int offsetZW ) : + mStride( stride ), mOffsetY( offsetY ), mOffsetW( offsetZW ) {} + + int mStride; //!< byte stride between vertices + int mOffsetY; //!< byte offset from X to Y coordinate + union + { + int mOffsetZ; //!< byte offset from X to Z coordinate + int mOffsetW; //!< byte offset from X to W coordinate + }; + }; + + /*! + * Used to control scissoring during rasterization. Note that we only provide coarse scissor support. + * The scissor box x coordinates must be a multiple of 32, and the y coordinates a multiple of 8. 
+ * Scissoring is mainly meant as a means of enabling binning (sort middle) rasterizers in case + * application developers want to use that approach for multithreading. + */ + struct ScissorRect + { + ScissorRect() {} + ScissorRect( int minX, int minY, int maxX, int maxY ) : + mMinX( minX ), mMinY( minY ), mMaxX( maxX ), mMaxY( maxY ) {} + + int mMinX; //!< Screen space X coordinate for left side of scissor rect, inclusive and must be a multiple of 32 + int mMinY; //!< Screen space Y coordinate for bottom side of scissor rect, inclusive and must be a multiple of 8 + int mMaxX; //!< Screen space X coordinate for right side of scissor rect, non inclusive and must be a multiple of 32 + int mMaxY; //!< Screen space Y coordinate for top side of scissor rect, non inclusive and must be a multiple of 8 + }; + + /*! + * Used to specify storage area for a binlist, containing triangles. This struct is used for binning + * and multithreading. The host application is responsible for allocating memory for the binlists. + */ + struct TriList + { + unsigned int mNumTriangles; //!< Maximum number of triangles that may be stored in mPtr + unsigned int mTriIdx; //!< Index of next triangle to be written, clear before calling BinTriangles to start from the beginning of the list + float* mPtr; //!< Scratchpad buffer allocated by the host application + }; + + /*! + * Statistics that can be gathered during occluder rendering and visibility to aid debugging + * and profiling. Must be enabled by changing the ENABLE_STATS define. + */ + struct OcclusionCullingStatistics + { + struct + { + long long mNumProcessedTriangles; //!< Number of occluder triangles processed in total + long long mNumRasterizedTriangles; //!< Number of occluder triangles passing view frustum and backface culling + long long mNumTilesTraversed; //!< Number of tiles traversed by the rasterizer + long long mNumTilesUpdated; //!< Number of tiles where the hierarchical z buffer was updated + long long mNumTilesMerged; //!< Number of tiles where the hierarchical z buffer was updated + } mOccluders; + + struct + { + long long mNumProcessedRectangles; //!< Number of rects processed (TestRect()) + long long mNumProcessedTriangles; //!< Number of ocludee triangles processed (TestTriangles()) + long long mNumRasterizedTriangles; //!< Number of ocludee triangle passing view frustum and backface culling + long long mNumTilesTraversed; //!< Number of tiles traversed by triangle & rect rasterizers + } mOccludees; + }; + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Functions + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + /*! + * \brief Creates a new object with default state, no z buffer attached/allocated. + */ + static MaskedOcclusionCulling* Create( Implementation RequestedSIMD = AVX512 ); + + /*! + * \brief Creates a new object with default state, no z buffer attached/allocated. + * \param alignedAlloc Pointer to a callback function used when allocating memory + * \param alignedFree Pointer to a callback function used when freeing memory + */ + static MaskedOcclusionCulling* Create( Implementation RequestedSIMD, pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree ); + + /*! + * \brief Destroys an object and frees the z buffer memory. Note that you cannot + * use the delete operator, and should rather use this function to free up memory. 
+ */ + static void Destroy( MaskedOcclusionCulling* moc ); + + /*! + * \brief Sets the resolution of the hierarchical depth buffer. This function will + * re-allocate the current depth buffer (if present). The contents of the + * buffer is undefined until ClearBuffer() is called. + * + * \param witdh The width of the buffer in pixels, must be a multiple of 8 + * \param height The height of the buffer in pixels, must be a multiple of 4 + */ + virtual void SetResolution( unsigned int width, unsigned int height ) = 0; + + /*! + * \brief Gets the resolution of the hierarchical depth buffer. + * + * \param witdh Output: The width of the buffer in pixels + * \param height Output: The height of the buffer in pixels + */ + virtual void GetResolution( unsigned int& width, unsigned int& height ) const = 0; + + /*! + * \brief Returns the tile size for the current implementation. + * + * \param nBinsW Number of vertical bins, the screen is divided into nBinsW x nBinsH + * rectangular bins. + * \param nBinsH Number of horizontal bins, the screen is divided into nBinsW x nBinsH + * rectangular bins. + * \param outBinWidth Output: The width of the single bin in pixels (except for the + * rightmost bin width, which is extended to resolution width) + * \param outBinHeight Output: The height of the single bin in pixels (except for the + * bottommost bin height, which is extended to resolution height) + */ + virtual void ComputeBinWidthHeight( unsigned int nBinsW, unsigned int nBinsH, unsigned int& outBinWidth, unsigned int& outBinHeight ) = 0; + + /*! + * \brief Sets the distance for the near clipping plane. Default is nearDist = 0. + * + * \param nearDist The distance to the near clipping plane, given as clip space w + */ + virtual void SetNearClipPlane( float nearDist ) = 0; + + /*! + * \brief Gets the distance for the near clipping plane. + */ + virtual float GetNearClipPlane() const = 0; + + /*! + * \brief Clears the hierarchical depth buffer. + */ + virtual void ClearBuffer() = 0; + + /*! + * \brief Merge a second hierarchical depth buffer into the main buffer. + */ + virtual void MergeBuffer( MaskedOcclusionCulling* BufferB ) = 0; + + /*! + * \brief Renders a mesh of occluder triangles and updates the hierarchical z buffer + * with conservative depth values. + * + * This function is optimized for vertex layouts with stride 16 and y and w + * offsets of 4 and 12 bytes, respectively. + * + * \param inVtx Pointer to an array of input vertices, should point to the x component + * of the first vertex. The input vertices are given as (x,y,w) coordinates + * in clip space. The memory layout can be changed using vtxLayout. + * \param inTris Pointer to an array of vertex indices. Each triangle is created + * from three indices consecutively fetched from the array. + * \param nTris The number of triangles to render (inTris must contain atleast 3*nTris + * entries) + * \param modelToClipMatrix all vertices will be transformed by this matrix before + * performing projection. If nullptr is passed the transform step will be skipped + * \param bfWinding Sets triangle winding order to consider backfacing, must be one one + * of (BACKFACE_NONE, BACKFACE_CW and BACKFACE_CCW). Back-facing triangles are culled + * and will not be rasterized. You may use BACKFACE_NONE to disable culling for + * double sided geometry + * \param clipPlaneMask A mask indicating which clip planes should be considered by the + * triangle clipper. 
Can be used as an optimization if your application can + * determine (for example during culling) that a group of triangles does not + * intersect a certain frustum plane. However, setting an incorrect mask may + * cause out of bounds memory accesses. + * \param vtxLayout A struct specifying the vertex layout (see struct for detailed + * description). For best performance, it is advisable to store position data + * as compactly in memory as possible. + * \return Will return VIEW_CULLED if all triangles are either outside the frustum or + * backface culled, returns VISIBLE otherwise. + */ + virtual CullingResult RenderTriangles( const float* inVtx, const unsigned int* inTris, int nTris, const float* modelToClipMatrix = nullptr, BackfaceWinding bfWinding = BACKFACE_CW, ClipPlanes clipPlaneMask = CLIP_PLANE_ALL, const VertexLayout& vtxLayout = VertexLayout( 16, 4, 12 ) ) = 0; + + /*! + * \brief Occlusion query for a rectangle with a given depth. The rectangle is given + * in normalized device coordinates where (x,y) coordinates between [-1,1] map + * to the visible screen area. The query uses a GREATER_EQUAL (reversed) depth + * test meaning that depth values equal to the contents of the depth buffer are + * counted as visible. + * + * \param xmin NDC coordinate of the left side of the rectangle. + * \param ymin NDC coordinate of the bottom side of the rectangle. + * \param xmax NDC coordinate of the right side of the rectangle. + * \param ymax NDC coordinate of the top side of the rectangle. + * \param ymax NDC coordinate of the top side of the rectangle. + * \param wmin Clip space W coordinate for the rectangle. + * \return The query will return VISIBLE if the rectangle may be visible, OCCLUDED + * if the rectangle is occluded by a previously rendered object, or VIEW_CULLED + * if the rectangle is outside the view frustum. + */ + virtual CullingResult TestRect( float xmin, float ymin, float xmax, float ymax, float wmin ) const = 0; + + /*! + * \brief This function is similar to RenderTriangles(), but performs an occlusion + * query instead and does not update the hierarchical z buffer. The query uses + * a GREATER_EQUAL (reversed) depth test meaning that depth values equal to the + * contents of the depth buffer are counted as visible. + * + * This function is optimized for vertex layouts with stride 16 and y and w + * offsets of 4 and 12 bytes, respectively. + * + * \param inVtx Pointer to an array of input vertices, should point to the x component + * of the first vertex. The input vertices are given as (x,y,w) coordinates + * in clip space. The memory layout can be changed using vtxLayout. + * \param inTris Pointer to an array of triangle indices. Each triangle is created + * from three indices consecutively fetched from the array. + * \param nTris The number of triangles to render (inTris must contain atleast 3*nTris + * entries) + * \param modelToClipMatrix all vertices will be transformed by this matrix before + * performing projection. If nullptr is passed the transform step will be skipped + * \param bfWinding Sets triangle winding order to consider backfacing, must be one one + * of (BACKFACE_NONE, BACKFACE_CW and BACKFACE_CCW). Back-facing triangles are culled + * and will not be occlusion tested. You may use BACKFACE_NONE to disable culling + * for double sided geometry + * \param clipPlaneMask A mask indicating which clip planes should be considered by the + * triangle clipper. 
Can be used as an optimization if your application can + * determine (for example during culling) that a group of triangles does not + * intersect a certain frustum plane. However, setting an incorrect mask may + * cause out of bounds memory accesses. + * \param vtxLayout A struct specifying the vertex layout (see struct for detailed + * description). For best performance, it is advisable to store position data + * as compactly in memory as possible. + * \return The query will return VISIBLE if the triangle mesh may be visible, OCCLUDED + * if the mesh is occluded by a previously rendered object, or VIEW_CULLED if all + * triangles are entirely outside the view frustum or backface culled. + */ + virtual CullingResult TestTriangles( const float* inVtx, const unsigned int* inTris, int nTris, const float* modelToClipMatrix = nullptr, BackfaceWinding bfWinding = BACKFACE_CW, ClipPlanes clipPlaneMask = CLIP_PLANE_ALL, const VertexLayout& vtxLayout = VertexLayout( 16, 4, 12 ) ) = 0; + + /*! + * \brief Perform input assembly, clipping , projection, triangle setup, and write + * triangles to the screen space bins they overlap. This function can be used to + * distribute work for threading (See the CullingThreadpool class for an example) + * + * \param inVtx Pointer to an array of input vertices, should point to the x component + * of the first vertex. The input vertices are given as (x,y,w) coordinates + * in clip space. The memory layout can be changed using vtxLayout. + * \param inTris Pointer to an array of vertex indices. Each triangle is created + * from three indices consecutively fetched from the array. + * \param nTris The number of triangles to render (inTris must contain atleast 3*nTris + * entries) + * \param triLists Pointer to an array of TriList objects with one TriList object per + * bin. If a triangle overlaps a bin, it will be written to the corresponding + * trilist. Note that this method appends the triangles to the current list, to + * start writing from the beginning of the list, set triList.mTriIdx = 0 + * \param nBinsW Number of vertical bins, the screen is divided into nBinsW x nBinsH + * rectangular bins. + * \param nBinsH Number of horizontal bins, the screen is divided into nBinsW x nBinsH + * rectangular bins. + * \param modelToClipMatrix all vertices will be transformed by this matrix before + * performing projection. If nullptr is passed the transform step will be skipped + * \param clipPlaneMask A mask indicating which clip planes should be considered by the + * triangle clipper. Can be used as an optimization if your application can + * determine (for example during culling) that a group of triangles does not + * intersect a certain frustum plane. However, setting an incorrect mask may + * cause out of bounds memory accesses. + * \param vtxLayout A struct specifying the vertex layout (see struct for detailed + * description). For best performance, it is advisable to store position data + * as compactly in memory as possible. + * \param bfWinding Sets triangle winding order to consider backfacing, must be one one + * of (BACKFACE_NONE, BACKFACE_CW and BACKFACE_CCW). Back-facing triangles are culled + * and will not be binned / rasterized. 
You may use BACKFACE_NONE to disable culling + * for double sided geometry + */ + virtual void BinTriangles( const float* inVtx, const unsigned int* inTris, int nTris, TriList* triLists, unsigned int nBinsW, unsigned int nBinsH, const float* modelToClipMatrix = nullptr, BackfaceWinding bfWinding = BACKFACE_CW, ClipPlanes clipPlaneMask = CLIP_PLANE_ALL, const VertexLayout& vtxLayout = VertexLayout( 16, 4, 12 ) ) = 0; + + /*! + * \brief Renders all occluder triangles in a trilist. This function can be used in + * combination with BinTriangles() to create a threded (binning) rasterizer. The + * bins can be processed independently by different threads without risking writing + * to overlapping memory regions. + * + * \param triLists A triangle list, filled using the BinTriangles() function that is to + * be rendered. + * \param scissor A scissor box limiting the rendering region to the bin. The size of each + * bin must be a multiple of 32x8 pixels due to implementation constraints. For a + * render target with (width, height) resolution and (nBinsW, nBinsH) bins, the + * size of a bin is: + * binWidth = (width / nBinsW) - (width / nBinsW) % 32; + * binHeight = (height / nBinsH) - (height / nBinsH) % 8; + * The last row and column of tiles have a different size: + * lastColBinWidth = width - (nBinsW-1)*binWidth; + * lastRowBinHeight = height - (nBinsH-1)*binHeight; + */ + virtual void RenderTrilist( const TriList& triList, const ScissorRect* scissor ) = 0; + + /*! + * \brief Creates a per-pixel depth buffer from the hierarchical z buffer representation. + * Intended for visualizing the hierarchical depth buffer for debugging. The + * buffer is written in scanline order, from the top to bottom (D3D) or bottom to + * top (OGL) of the surface. See the USE_D3D define. + * + * \param depthData Pointer to memory where the per-pixel depth data is written. Must + * hold storage for atleast width*height elements as set by setResolution. + */ + virtual void ComputePixelDepthBuffer( float* depthData, bool flipY ) = 0; + + /*! + * \brief Fetch occlusion culling statistics, returns zeroes if ENABLE_STATS define is + * not defined. The statistics can be used for profiling or debugging. + */ + virtual OcclusionCullingStatistics GetStatistics() = 0; + + /*! + * \brief Returns the implementation (CPU instruction set) version of this object. + */ + virtual Implementation GetImplementation() = 0; + + /*! + * \brief Utility function for transforming vertices and outputting them to an (x,y,z,w) + * format suitable for the occluder rasterization and occludee testing functions. + * + * \param mtx Pointer to matrix data. The matrix should column major for post + * multiplication (OGL) and row major for pre-multiplication (DX). This is + * consistent with OpenGL / DirectX behavior. + * \param inVtx Pointer to an array of input vertices. The input vertices are given as + * (x,y,z) coordinates. The memory layout can be changed using vtxLayout. + * \param xfVtx Pointer to an array to store transformed vertices. The transformed + * vertices are always stored as array of structs (AoS) (x,y,z,w) packed in memory. + * \param nVtx Number of vertices to transform. + * \param vtxLayout A struct specifying the vertex layout (see struct for detailed + * description). For best performance, it is advisable to store position data + * as compactly in memory as possible. Note that for this function, the + * w-component is assumed to be 1.0. 
+ */ + static void TransformVertices( const float* mtx, const float* inVtx, float* xfVtx, unsigned int nVtx, const VertexLayout& vtxLayout = VertexLayout( 12, 4, 8 ) ); + + /*! + * \brief Get used memory alloc/free callbacks. + */ + void GetAllocFreeCallback( pfnAlignedAlloc& allocCallback, pfnAlignedFree& freeCallback ) + { + allocCallback = mAlignedAllocCallback, freeCallback = mAlignedFreeCallback; + } + +#if MOC_RECORDER_ENABLE + /*! + * \brief Start recording subsequent rasterization and testing calls using the FrameRecorder. + * The function calls that are recorded are: + * - ClearBuffer + * - RenderTriangles + * - TestTriangles + * - TestRect + * All inputs and outputs are recorded, which can be used for correctness validation + * and performance testing. + * + * \param outputFilePath Pointer to name of the output file. + * \return 'true' if recording was started successfully, 'false' otherwise (file access error). + */ + bool RecorderStart( const char* outputFilePath ) const; + + /*! + * \brief Stop recording, flush output and release used memory. + */ + void RecorderStop( ) const; + + /*! + * \brief Manually record triangles. This is called automatically from MaskedOcclusionCulling::RenderTriangles + * if the recording is started, but not from BinTriangles/RenderTrilist (used in multithreaded codepath), in + * which case it has to be called manually. + * + * \param inVtx Pointer to an array of input vertices, should point to the x component + * of the first vertex. The input vertices are given as (x,y,w) coordinates + * in clip space. The memory layout can be changed using vtxLayout. + * \param inTris Pointer to an array of triangle indices. Each triangle is created + * from three indices consecutively fetched from the array. + * \param nTris The number of triangles to render (inTris must contain atleast 3*nTris + * entries) + * \param modelToClipMatrix all vertices will be transformed by this matrix before + * performing projection. If nullptr is passed the transform step will be skipped + * \param bfWinding Sets triangle winding order to consider backfacing, must be one one + * of (BACKFACE_NONE, BACKFACE_CW and BACKFACE_CCW). Back-facing triangles are culled + * and will not be occlusion tested. You may use BACKFACE_NONE to disable culling + * for double sided geometry + * \param clipPlaneMask A mask indicating which clip planes should be considered by the + * triangle clipper. Can be used as an optimization if your application can + * determine (for example during culling) that a group of triangles does not + * intersect a certain frustum plane. However, setting an incorrect mask may + * cause out of bounds memory accesses. + * \param vtxLayout A struct specifying the vertex layout (see struct for detailed + * description). For best performance, it is advisable to store position data + * as compactly in memory as possible. + * \param cullingResult cull result value expected to be returned by executing the + * RenderTriangles call with recorded parameters. + */ + // + // merge the binned data back into original layout; in this case, call it manually from your Threadpool implementation (already added to CullingThreadpool). + // If recording is not enabled, calling this function will do nothing. 
+ void RecordRenderTriangles( const float* inVtx, const unsigned int* inTris, int nTris, const float* modelToClipMatrix = nullptr, ClipPlanes clipPlaneMask = CLIP_PLANE_ALL, BackfaceWinding bfWinding = BACKFACE_CW, const VertexLayout& vtxLayout = VertexLayout( 16, 4, 12 ), CullingResult cullingResult = ( CullingResult ) - 1 ); +#endif // #if MOC_RECORDER_ENABLE + +protected: + pfnAlignedAlloc mAlignedAllocCallback; + pfnAlignedFree mAlignedFreeCallback; + + mutable OcclusionCullingStatistics mStats; + +#if MOC_RECORDER_ENABLE + mutable FrameRecorder* mRecorder; + mutable std::mutex mRecorderMutex; +#endif // #if MOC_RECORDER_ENABLE + + virtual ~MaskedOcclusionCulling() {} +}; diff --git a/neo/libs/moc/MaskedOcclusionCullingAVX2.cpp b/neo/libs/moc/MaskedOcclusionCullingAVX2.cpp new file mode 100644 index 00000000..149247e4 --- /dev/null +++ b/neo/libs/moc/MaskedOcclusionCullingAVX2.cpp @@ -0,0 +1,280 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright 2017 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +//////////////////////////////////////////////////////////////////////////////// +#include +#include +#include +#include "MaskedOcclusionCulling.h" +#include "CompilerSpecific.inl" + +#if MOC_RECORDER_ENABLE + #include "FrameRecorder.h" +#endif + +#if defined(__MICROSOFT_COMPILER) && _MSC_VER < 1900 + // If you remove/comment this error, the code will compile & use the SSE41 version instead. + #error Older versions than visual studio 2015 not supported due to compiler bug(s) +#endif + +#if !defined(__MICROSOFT_COMPILER) || _MSC_VER >= 1900 + +// For performance reasons, the MaskedOcclusionCullingAVX2.cpp file should be compiled with VEX encoding for SSE instructions (to avoid +// AVX-SSE transition penalties, see https://software.intel.com/en-us/articles/avoiding-avx-sse-transition-penalties). However, the SSE +// version in MaskedOcclusionCulling.cpp _must_ be compiled without VEX encoding to allow backwards compatibility. Best practice is to +// use lowest supported target platform (e.g. /arch:SSE2) as project default, and elevate only the MaskedOcclusionCullingAVX2/512.cpp files. 
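+
+// NOTE: illustrative sketch, not part of the original library. Because each SIMD backend lives in
+// its own translation unit compiled with its own architecture flags, the engine never calls this
+// file directly; it asks MaskedOcclusionCulling::Create() for a backend and the library falls back
+// to the best one the CPU supports. A minimal usage sketch (the function name and resolution
+// values are arbitrary):
+#if 0 // illustrative only, never compiled
+static void ExampleUsage()
+{
+	// Request the AVX2 path; Create() falls back to SSE4.1 / SSE2 if the CPU lacks AVX2 support
+	MaskedOcclusionCulling* moc = MaskedOcclusionCulling::Create( MaskedOcclusionCulling::AVX2 );
+	moc->SetResolution( 1280, 720 );	// width must be a multiple of 8, height a multiple of 4
+	moc->ClearBuffer();
+	// ... RenderTriangles() for occluders, then TestRect() / TestTriangles() for occludees ...
+	MaskedOcclusionCulling::Destroy( moc );	// never use operator delete on a MOC object
+}
+#endif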
+#ifndef __AVX2__ + #error For best performance, MaskedOcclusionCullingAVX2.cpp should be compiled with /arch:AVX2 +#endif + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// AVX specific defines and constants +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#define SIMD_LANES 8 +#define TILE_HEIGHT_SHIFT 3 + +#define SIMD_LANE_IDX _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7) + +#define SIMD_SUB_TILE_COL_OFFSET _mm256_setr_epi32(0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3, 0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3) +#define SIMD_SUB_TILE_ROW_OFFSET _mm256_setr_epi32(0, 0, 0, 0, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT) +#define SIMD_SUB_TILE_COL_OFFSET_F _mm256_setr_ps(0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3, 0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3) +#define SIMD_SUB_TILE_ROW_OFFSET_F _mm256_setr_ps(0, 0, 0, 0, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT) + +#define SIMD_SHUFFLE_SCANLINE_TO_SUBTILES _mm256_setr_epi8(0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF, 0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF) + +#define SIMD_LANE_YCOORD_I _mm256_setr_epi32(128, 384, 640, 896, 1152, 1408, 1664, 1920) +#define SIMD_LANE_YCOORD_F _mm256_setr_ps(128.0f, 384.0f, 640.0f, 896.0f, 1152.0f, 1408.0f, 1664.0f, 1920.0f) + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// AVX specific typedefs and functions +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +typedef __m256 __mw; +typedef __m256i __mwi; + +#define _mmw_set1_ps _mm256_set1_ps +#define _mmw_setzero_ps _mm256_setzero_ps +#define _mmw_and_ps _mm256_and_ps +#define _mmw_or_ps _mm256_or_ps +#define _mmw_xor_ps _mm256_xor_ps +#define _mmw_not_ps(a) _mm256_xor_ps((a), _mm256_castsi256_ps(_mm256_set1_epi32(~0))) +#define _mmw_andnot_ps _mm256_andnot_ps +#define _mmw_neg_ps(a) _mm256_xor_ps((a), _mm256_set1_ps(-0.0f)) +#define _mmw_abs_ps(a) _mm256_and_ps((a), _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF))) +#define _mmw_add_ps _mm256_add_ps +#define _mmw_sub_ps _mm256_sub_ps +#define _mmw_mul_ps _mm256_mul_ps +#define _mmw_div_ps _mm256_div_ps +#define _mmw_min_ps _mm256_min_ps +#define _mmw_max_ps _mm256_max_ps +#define _mmw_fmadd_ps _mm256_fmadd_ps +#define _mmw_fmsub_ps _mm256_fmsub_ps +#define _mmw_movemask_ps _mm256_movemask_ps +#define _mmw_blendv_ps _mm256_blendv_ps +#define _mmw_cmpge_ps(a,b) _mm256_cmp_ps(a, b, _CMP_GE_OQ) +#define _mmw_cmpgt_ps(a,b) _mm256_cmp_ps(a, b, _CMP_GT_OQ) +#define _mmw_cmpeq_ps(a,b) _mm256_cmp_ps(a, b, _CMP_EQ_OQ) +#define _mmw_floor_ps(x) _mm256_round_ps(x, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) +#define _mmw_ceil_ps(x) _mm256_round_ps(x, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) +#define _mmw_shuffle_ps _mm256_shuffle_ps +#define _mmw_insertf32x4_ps _mm256_insertf128_ps +#define _mmw_cvtepi32_ps _mm256_cvtepi32_ps +#define _mmw_blendv_epi32(a,b,c) simd_cast<__mwi>(_mmw_blendv_ps(simd_cast<__mw>(a), simd_cast<__mw>(b), simd_cast<__mw>(c))) + +#define _mmw_set1_epi32 _mm256_set1_epi32 +#define _mmw_setzero_epi32 _mm256_setzero_si256 
+#define _mmw_and_epi32 _mm256_and_si256 +#define _mmw_or_epi32 _mm256_or_si256 +#define _mmw_xor_epi32 _mm256_xor_si256 +#define _mmw_not_epi32(a) _mm256_xor_si256((a), _mm256_set1_epi32(~0)) +#define _mmw_andnot_epi32 _mm256_andnot_si256 +#define _mmw_neg_epi32(a) _mm256_sub_epi32(_mm256_set1_epi32(0), (a)) +#define _mmw_add_epi32 _mm256_add_epi32 +#define _mmw_sub_epi32 _mm256_sub_epi32 +#define _mmw_min_epi32 _mm256_min_epi32 +#define _mmw_max_epi32 _mm256_max_epi32 +#define _mmw_subs_epu16 _mm256_subs_epu16 +#define _mmw_mullo_epi32 _mm256_mullo_epi32 +#define _mmw_cmpeq_epi32 _mm256_cmpeq_epi32 +#define _mmw_testz_epi32 _mm256_testz_si256 +#define _mmw_cmpgt_epi32 _mm256_cmpgt_epi32 +#define _mmw_srai_epi32 _mm256_srai_epi32 +#define _mmw_srli_epi32 _mm256_srli_epi32 +#define _mmw_slli_epi32 _mm256_slli_epi32 +#define _mmw_sllv_ones(x) _mm256_sllv_epi32(SIMD_BITS_ONE, x) +#define _mmw_transpose_epi8(x) _mm256_shuffle_epi8(x, SIMD_SHUFFLE_SCANLINE_TO_SUBTILES) +#define _mmw_abs_epi32 _mm256_abs_epi32 +#define _mmw_cvtps_epi32 _mm256_cvtps_epi32 +#define _mmw_cvttps_epi32 _mm256_cvttps_epi32 + +#define _mmx_dp4_ps(a, b) _mm_dp_ps(a, b, 0xFF) +#define _mmx_fmadd_ps _mm_fmadd_ps +#define _mmx_max_epi32 _mm_max_epi32 +#define _mmx_min_epi32 _mm_min_epi32 + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// SIMD casting functions +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +template FORCE_INLINE T simd_cast( Y A ); +template<> FORCE_INLINE __m128 simd_cast<__m128>( float A ) +{ + return _mm_set1_ps( A ); +} +template<> FORCE_INLINE __m128 simd_cast<__m128>( __m128i A ) +{ + return _mm_castsi128_ps( A ); +} +template<> FORCE_INLINE __m128 simd_cast<__m128>( __m128 A ) +{ + return A; +} +template<> FORCE_INLINE __m128i simd_cast<__m128i>( int A ) +{ + return _mm_set1_epi32( A ); +} +template<> FORCE_INLINE __m128i simd_cast<__m128i>( __m128 A ) +{ + return _mm_castps_si128( A ); +} +template<> FORCE_INLINE __m128i simd_cast<__m128i>( __m128i A ) +{ + return A; +} +template<> FORCE_INLINE __m256 simd_cast<__m256>( float A ) +{ + return _mm256_set1_ps( A ); +} +template<> FORCE_INLINE __m256 simd_cast<__m256>( __m256i A ) +{ + return _mm256_castsi256_ps( A ); +} +template<> FORCE_INLINE __m256 simd_cast<__m256>( __m256 A ) +{ + return A; +} +template<> FORCE_INLINE __m256i simd_cast<__m256i>( int A ) +{ + return _mm256_set1_epi32( A ); +} +template<> FORCE_INLINE __m256i simd_cast<__m256i>( __m256 A ) +{ + return _mm256_castps_si256( A ); +} +template<> FORCE_INLINE __m256i simd_cast<__m256i>( __m256i A ) +{ + return A; +} + +#define MAKE_ACCESSOR(name, simd_type, base_type, is_const, elements) \ + FORCE_INLINE is_const base_type * name(is_const simd_type &a) { \ + union accessor { simd_type m_native; base_type m_array[elements]; }; \ + is_const accessor *acs = reinterpret_cast(&a); \ + return acs->m_array; \ + } + +MAKE_ACCESSOR( simd_f32, __m128, float,, 4 ) +MAKE_ACCESSOR( simd_f32, __m128, float, const, 4 ) +MAKE_ACCESSOR( simd_i32, __m128i, int,, 4 ) +MAKE_ACCESSOR( simd_i32, __m128i, int, const, 4 ) + +MAKE_ACCESSOR( simd_f32, __m256, float,, 8 ) +MAKE_ACCESSOR( simd_f32, __m256, float, const, 8 ) +MAKE_ACCESSOR( simd_i32, __m256i, int,, 8 ) +MAKE_ACCESSOR( simd_i32, __m256i, int, const, 8 ) + 
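+// NOTE: illustrative sketch, not part of the original library. The MAKE_ACCESSOR macro above
+// generates the simd_f32() / simd_i32() helpers that view a SIMD register as a plain scalar array
+// through a union; this is how the scalar fallbacks (e.g. _mmw_sllv_ones in the SSE2 backend)
+// address individual lanes. A minimal sketch of the same idiom, assuming only what the macro expands to:
+#if 0 // illustrative only, never compiled
+static int SumLanes( const __m256i& v )
+{
+	const int* lane = simd_i32( v );	// const accessor generated by MAKE_ACCESSOR above
+	int sum = 0;
+	for( int i = 0; i < 8; i++ )
+	{
+		sum += lane[i];
+	}
+	return sum;
+}
+#endif
+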
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Specialized AVX input assembly function for general vertex gather +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +typedef MaskedOcclusionCulling::VertexLayout VertexLayout; + +FORCE_INLINE void GatherVertices( __m256* vtxX, __m256* vtxY, __m256* vtxW, const float* inVtx, const unsigned int* inTrisPtr, int numLanes, const VertexLayout& vtxLayout ) +{ + assert( numLanes >= 1 ); + + const __m256i SIMD_TRI_IDX_OFFSET = _mm256_setr_epi32( 0, 3, 6, 9, 12, 15, 18, 21 ); + static const __m256i SIMD_LANE_MASK[9] = + { + _mm256_setr_epi32( 0, 0, 0, 0, 0, 0, 0, 0 ), + _mm256_setr_epi32( ~0, 0, 0, 0, 0, 0, 0, 0 ), + _mm256_setr_epi32( ~0, ~0, 0, 0, 0, 0, 0, 0 ), + _mm256_setr_epi32( ~0, ~0, ~0, 0, 0, 0, 0, 0 ), + _mm256_setr_epi32( ~0, ~0, ~0, ~0, 0, 0, 0, 0 ), + _mm256_setr_epi32( ~0, ~0, ~0, ~0, ~0, 0, 0, 0 ), + _mm256_setr_epi32( ~0, ~0, ~0, ~0, ~0, ~0, 0, 0 ), + _mm256_setr_epi32( ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0 ), + _mm256_setr_epi32( ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 ) + }; + + // Compute per-lane index list offset that guards against out of bounds memory accesses + __m256i safeTriIdxOffset = _mm256_and_si256( SIMD_TRI_IDX_OFFSET, SIMD_LANE_MASK[numLanes] ); + + // Fetch triangle indices. + __m256i vtxIdx[3]; + vtxIdx[0] = _mmw_mullo_epi32( _mm256_i32gather_epi32( ( const int* )inTrisPtr + 0, safeTriIdxOffset, 4 ), _mmw_set1_epi32( vtxLayout.mStride ) ); + vtxIdx[1] = _mmw_mullo_epi32( _mm256_i32gather_epi32( ( const int* )inTrisPtr + 1, safeTriIdxOffset, 4 ), _mmw_set1_epi32( vtxLayout.mStride ) ); + vtxIdx[2] = _mmw_mullo_epi32( _mm256_i32gather_epi32( ( const int* )inTrisPtr + 2, safeTriIdxOffset, 4 ), _mmw_set1_epi32( vtxLayout.mStride ) ); + + char* vPtr = ( char* )inVtx; + + // Fetch triangle vertices + for( int i = 0; i < 3; i++ ) + { + vtxX[i] = _mm256_i32gather_ps( ( float* )vPtr, vtxIdx[i], 1 ); + vtxY[i] = _mm256_i32gather_ps( ( float* )( vPtr + vtxLayout.mOffsetY ), vtxIdx[i], 1 ); + vtxW[i] = _mm256_i32gather_ps( ( float* )( vPtr + vtxLayout.mOffsetW ), vtxIdx[i], 1 ); + } +} + +namespace MaskedOcclusionCullingAVX2 +{ +static MaskedOcclusionCulling::Implementation gInstructionSet = MaskedOcclusionCulling::AVX2; + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Include common algorithm implementation (general, SIMD independent code) +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "MaskedOcclusionCullingCommon.inl" + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Utility function to create a new object using the allocator callbacks +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +typedef MaskedOcclusionCulling::pfnAlignedAlloc pfnAlignedAlloc; +typedef MaskedOcclusionCulling::pfnAlignedFree pfnAlignedFree; + +MaskedOcclusionCulling* CreateMaskedOcclusionCulling( pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree ) +{ + MaskedOcclusionCullingPrivate* object = ( MaskedOcclusionCullingPrivate* )alignedAlloc( 64, sizeof( 
MaskedOcclusionCullingPrivate ) ); + new( object ) MaskedOcclusionCullingPrivate( alignedAlloc, alignedFree ); + return object; +} +}; + +#else + +namespace MaskedOcclusionCullingAVX2 +{ +typedef MaskedOcclusionCulling::pfnAlignedAlloc pfnAlignedAlloc; +typedef MaskedOcclusionCulling::pfnAlignedFree pfnAlignedFree; + +MaskedOcclusionCulling* CreateMaskedOcclusionCulling( pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree ) +{ + return nullptr; +} +}; + +#endif diff --git a/neo/libs/moc/MaskedOcclusionCullingAVX512.cpp b/neo/libs/moc/MaskedOcclusionCullingAVX512.cpp new file mode 100644 index 00000000..17092025 --- /dev/null +++ b/neo/libs/moc/MaskedOcclusionCullingAVX512.cpp @@ -0,0 +1,364 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright 2017 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +//////////////////////////////////////////////////////////////////////////////// +#include +#include +#include +#include "MaskedOcclusionCulling.h" +#include "CompilerSpecific.inl" + +#if MOC_RECORDER_ENABLE + #include "FrameRecorder.h" +#endif + +// Make sure compiler supports AVX-512 intrinsics: Visual Studio 2017 (Update 3) || Intel C++ Compiler 16.0 || Clang 4.0 || GCC 5.0 +#if USE_AVX512 != 0 && ((defined(_MSC_VER) && _MSC_VER >= 1911) || (defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1600) || (defined(__clang__) && __clang_major__ >= 4) || (defined(__GNUC__) && __GNUC__ >= 5)) + +// The MaskedOcclusionCullingAVX512.cpp file should be compiled avx2/avx512 architecture options turned on in the compiler. However, the SSE +// version in MaskedOcclusionCulling.cpp _must_ be compiled with SSE2 architecture allow backwards compatibility. Best practice is to +// use lowest supported target platform (e.g. /arch:SSE2) as project default, and elevate only the MaskedOcclusionCullingAVX2/512.cpp files. 
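+
+// NOTE: illustrative sketch, not part of the original library. When this translation unit is built
+// without AVX-512 support (USE_AVX512 == 0, or a compiler older than the versions checked above),
+// the #else branch at the end of this file compiles a stub CreateMaskedOcclusionCulling() that
+// returns nullptr, so MaskedOcclusionCulling::Create() falls through to the AVX2 / SSE4.1 / SSE2
+// factories. A hedged sketch of that selection order (the real Create() also clamps the request
+// against DetectCPUFeatures(); PickBackend is a made-up name):
+#if 0 // illustrative only, never compiled
+static MaskedOcclusionCulling* PickBackend( MaskedOcclusionCulling::pfnAlignedAlloc allocCb, MaskedOcclusionCulling::pfnAlignedFree freeCb )
+{
+	MaskedOcclusionCulling* moc = MaskedOcclusionCullingAVX512::CreateMaskedOcclusionCulling( allocCb, freeCb );
+	if( moc == nullptr )
+	{
+		// stub returned nullptr: AVX-512 path not compiled in, try the next best implementation
+		moc = MaskedOcclusionCullingAVX2::CreateMaskedOcclusionCulling( allocCb, freeCb );
+	}
+	return moc;	// Create() continues with the SSE4.1 and SSE2 factories in the same way
+}
+#endif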
+#ifndef __AVX2__ + #error For best performance, MaskedOcclusionCullingAVX512.cpp should be compiled with /arch:AVX2 +#endif + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// AVX specific defines and constants +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#define SIMD_LANES 16 +#define TILE_HEIGHT_SHIFT 4 + +#define SIMD_LANE_IDX _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) + +#define SIMD_SUB_TILE_COL_OFFSET _mm512_setr_epi32(0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3, 0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3, 0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3, 0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3) +#define SIMD_SUB_TILE_ROW_OFFSET _mm512_setr_epi32(0, 0, 0, 0, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT * 2, SUB_TILE_HEIGHT * 2, SUB_TILE_HEIGHT * 2, SUB_TILE_HEIGHT * 2, SUB_TILE_HEIGHT * 3, SUB_TILE_HEIGHT * 3, SUB_TILE_HEIGHT * 3, SUB_TILE_HEIGHT * 3) +#define SIMD_SUB_TILE_COL_OFFSET_F _mm512_setr_ps(0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3, 0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3, 0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3, 0, SUB_TILE_WIDTH, SUB_TILE_WIDTH * 2, SUB_TILE_WIDTH * 3) +#define SIMD_SUB_TILE_ROW_OFFSET_F _mm512_setr_ps(0, 0, 0, 0, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT, SUB_TILE_HEIGHT * 2, SUB_TILE_HEIGHT * 2, SUB_TILE_HEIGHT * 2, SUB_TILE_HEIGHT * 2, SUB_TILE_HEIGHT * 3, SUB_TILE_HEIGHT * 3, SUB_TILE_HEIGHT * 3, SUB_TILE_HEIGHT * 3) + +#define SIMD_SHUFFLE_SCANLINE_TO_SUBTILES _mm512_set_epi32(0x0F0B0703, 0x0E0A0602, 0x0D090501, 0x0C080400, 0x0F0B0703, 0x0E0A0602, 0x0D090501, 0x0C080400, 0x0F0B0703, 0x0E0A0602, 0x0D090501, 0x0C080400, 0x0F0B0703, 0x0E0A0602, 0x0D090501, 0x0C080400) + +#define SIMD_LANE_YCOORD_I _mm512_setr_epi32(128, 384, 640, 896, 1152, 1408, 1664, 1920, 2176, 2432, 2688, 2944, 3200, 3456, 3712, 3968) +#define SIMD_LANE_YCOORD_F _mm512_setr_ps(128.0f, 384.0f, 640.0f, 896.0f, 1152.0f, 1408.0f, 1664.0f, 1920.0f, 2176.0f, 2432.0f, 2688.0f, 2944.0f, 3200.0f, 3456.0f, 3712.0f, 3968.0f) + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// AVX specific typedefs and functions +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +typedef __m512 __mw; +typedef __m512i __mwi; + +#define _mmw_set1_ps _mm512_set1_ps +#define _mmw_setzero_ps _mm512_setzero_ps +#define _mmw_and_ps _mm512_and_ps +#define _mmw_or_ps _mm512_or_ps +#define _mmw_xor_ps _mm512_xor_ps +#define _mmw_not_ps(a) _mm512_xor_ps((a), _mm512_castsi512_ps(_mm512_set1_epi32(~0))) +#define _mmw_andnot_ps _mm512_andnot_ps +#define _mmw_neg_ps(a) _mm512_xor_ps((a), _mm512_set1_ps(-0.0f)) +#define _mmw_abs_ps(a) _mm512_and_ps((a), _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF))) +#define _mmw_add_ps _mm512_add_ps +#define _mmw_sub_ps _mm512_sub_ps +#define _mmw_mul_ps _mm512_mul_ps +#define _mmw_div_ps _mm512_div_ps +#define _mmw_min_ps _mm512_min_ps +#define _mmw_max_ps _mm512_max_ps +#define _mmw_fmadd_ps _mm512_fmadd_ps +#define _mmw_fmsub_ps _mm512_fmsub_ps +#define _mmw_shuffle_ps _mm512_shuffle_ps 
+#define _mmw_insertf32x4_ps _mm512_insertf32x4 +#define _mmw_cvtepi32_ps _mm512_cvtepi32_ps +#define _mmw_blendv_epi32(a,b,c) simd_cast<__mwi>(_mmw_blendv_ps(simd_cast<__mw>(a), simd_cast<__mw>(b), simd_cast<__mw>(c))) + +#define _mmw_set1_epi32 _mm512_set1_epi32 +#define _mmw_setzero_epi32 _mm512_setzero_si512 +#define _mmw_and_epi32 _mm512_and_si512 +#define _mmw_or_epi32 _mm512_or_si512 +#define _mmw_xor_epi32 _mm512_xor_si512 +#define _mmw_not_epi32(a) _mm512_xor_si512((a), _mm512_set1_epi32(~0)) +#define _mmw_andnot_epi32 _mm512_andnot_si512 +#define _mmw_neg_epi32(a) _mm512_sub_epi32(_mm512_set1_epi32(0), (a)) +#define _mmw_add_epi32 _mm512_add_epi32 +#define _mmw_sub_epi32 _mm512_sub_epi32 +#define _mmw_min_epi32 _mm512_min_epi32 +#define _mmw_max_epi32 _mm512_max_epi32 +#define _mmw_subs_epu16 _mm512_subs_epu16 +#define _mmw_mullo_epi32 _mm512_mullo_epi32 +#define _mmw_srai_epi32 _mm512_srai_epi32 +#define _mmw_srli_epi32 _mm512_srli_epi32 +#define _mmw_slli_epi32 _mm512_slli_epi32 +#define _mmw_sllv_ones(x) _mm512_sllv_epi32(SIMD_BITS_ONE, x) +#define _mmw_transpose_epi8(x) _mm512_shuffle_epi8(x, SIMD_SHUFFLE_SCANLINE_TO_SUBTILES) +#define _mmw_abs_epi32 _mm512_abs_epi32 +#define _mmw_cvtps_epi32 _mm512_cvtps_epi32 +#define _mmw_cvttps_epi32 _mm512_cvttps_epi32 + +#define _mmx_dp4_ps(a, b) _mm_dp_ps(a, b, 0xFF) +#define _mmx_fmadd_ps _mm_fmadd_ps +#define _mmx_max_epi32 _mm_max_epi32 +#define _mmx_min_epi32 _mm_min_epi32 + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// SIMD casting functions +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +template FORCE_INLINE T simd_cast( Y A ); +template<> FORCE_INLINE __m128 simd_cast<__m128>( float A ) +{ + return _mm_set1_ps( A ); +} +template<> FORCE_INLINE __m128 simd_cast<__m128>( __m128i A ) +{ + return _mm_castsi128_ps( A ); +} +template<> FORCE_INLINE __m128 simd_cast<__m128>( __m128 A ) +{ + return A; +} +template<> FORCE_INLINE __m128i simd_cast<__m128i>( int A ) +{ + return _mm_set1_epi32( A ); +} +template<> FORCE_INLINE __m128i simd_cast<__m128i>( __m128 A ) +{ + return _mm_castps_si128( A ); +} +template<> FORCE_INLINE __m128i simd_cast<__m128i>( __m128i A ) +{ + return A; +} +template<> FORCE_INLINE __m256 simd_cast<__m256>( float A ) +{ + return _mm256_set1_ps( A ); +} +template<> FORCE_INLINE __m256 simd_cast<__m256>( __m256i A ) +{ + return _mm256_castsi256_ps( A ); +} +template<> FORCE_INLINE __m256 simd_cast<__m256>( __m256 A ) +{ + return A; +} +template<> FORCE_INLINE __m256i simd_cast<__m256i>( int A ) +{ + return _mm256_set1_epi32( A ); +} +template<> FORCE_INLINE __m256i simd_cast<__m256i>( __m256 A ) +{ + return _mm256_castps_si256( A ); +} +template<> FORCE_INLINE __m256i simd_cast<__m256i>( __m256i A ) +{ + return A; +} +template<> FORCE_INLINE __m512 simd_cast<__m512>( float A ) +{ + return _mm512_set1_ps( A ); +} +template<> FORCE_INLINE __m512 simd_cast<__m512>( __m512i A ) +{ + return _mm512_castsi512_ps( A ); +} +template<> FORCE_INLINE __m512 simd_cast<__m512>( __m512 A ) +{ + return A; +} +template<> FORCE_INLINE __m512i simd_cast<__m512i>( int A ) +{ + return _mm512_set1_epi32( A ); +} +template<> FORCE_INLINE __m512i simd_cast<__m512i>( __m512 A ) +{ + return _mm512_castps_si512( A ); +} +template<> FORCE_INLINE __m512i simd_cast<__m512i>( __m512i A ) +{ + return A; +} + +#define MAKE_ACCESSOR(name, 
simd_type, base_type, is_const, elements) \ + FORCE_INLINE is_const base_type * name(is_const simd_type &a) { \ + union accessor { simd_type m_native; base_type m_array[elements]; }; \ + is_const accessor *acs = reinterpret_cast(&a); \ + return acs->m_array; \ + } + +MAKE_ACCESSOR( simd_f32, __m128, float,, 4 ) +MAKE_ACCESSOR( simd_f32, __m128, float, const, 4 ) +MAKE_ACCESSOR( simd_i32, __m128i, int,, 4 ) +MAKE_ACCESSOR( simd_i32, __m128i, int, const, 4 ) + +MAKE_ACCESSOR( simd_f32, __m256, float,, 8 ) +MAKE_ACCESSOR( simd_f32, __m256, float, const, 8 ) +MAKE_ACCESSOR( simd_i32, __m256i, int,, 8 ) +MAKE_ACCESSOR( simd_i32, __m256i, int, const, 8 ) + +MAKE_ACCESSOR( simd_f32, __m512, float,, 16 ) +MAKE_ACCESSOR( simd_f32, __m512, float, const, 16 ) +MAKE_ACCESSOR( simd_i32, __m512i, int,, 16 ) +MAKE_ACCESSOR( simd_i32, __m512i, int, const, 16 ) + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Specialized AVX input assembly function for general vertex gather +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +typedef MaskedOcclusionCulling::VertexLayout VertexLayout; + +FORCE_INLINE void GatherVertices( __m512* vtxX, __m512* vtxY, __m512* vtxW, const float* inVtx, const unsigned int* inTrisPtr, int numLanes, const VertexLayout& vtxLayout ) +{ + assert( numLanes >= 1 ); + + const __m512i SIMD_TRI_IDX_OFFSET = _mm512_setr_epi32( 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45 ); + static const __m512i SIMD_LANE_MASK[17] = + { + _mm512_setr_epi32( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ), + _mm512_setr_epi32( ~0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ), + _mm512_setr_epi32( ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ), + _mm512_setr_epi32( ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ), + _mm512_setr_epi32( ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ), + _mm512_setr_epi32( ~0, ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ), + _mm512_setr_epi32( ~0, ~0, ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ), + _mm512_setr_epi32( ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ), + _mm512_setr_epi32( ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0, 0 ), + _mm512_setr_epi32( ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0, 0 ), + _mm512_setr_epi32( ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0, 0 ), + _mm512_setr_epi32( ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0, 0, 0, 0, 0 ), + _mm512_setr_epi32( ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0, 0, 0, 0 ), + _mm512_setr_epi32( ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0, 0, 0 ), + _mm512_setr_epi32( ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0, 0 ), + _mm512_setr_epi32( ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0 ), + _mm512_setr_epi32( ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 ) + }; + + // Compute per-lane index list offset that guards against out of bounds memory accesses + __m512i safeTriIdxOffset = _mm512_and_si512( SIMD_TRI_IDX_OFFSET, SIMD_LANE_MASK[numLanes] ); + + // Fetch triangle indices. 
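+	// Each gather below reads the i'th corner index of one triangle per lane (out-of-range lanes were
+	// clamped to offset 0 above) and converts it to a byte offset by multiplying with the vertex stride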
+ __m512i vtxIdx[3]; + vtxIdx[0] = _mmw_mullo_epi32( _mm512_i32gather_epi32( safeTriIdxOffset, ( const int* )inTrisPtr + 0, 4 ), _mmw_set1_epi32( vtxLayout.mStride ) ); + vtxIdx[1] = _mmw_mullo_epi32( _mm512_i32gather_epi32( safeTriIdxOffset, ( const int* )inTrisPtr + 1, 4 ), _mmw_set1_epi32( vtxLayout.mStride ) ); + vtxIdx[2] = _mmw_mullo_epi32( _mm512_i32gather_epi32( safeTriIdxOffset, ( const int* )inTrisPtr + 2, 4 ), _mmw_set1_epi32( vtxLayout.mStride ) ); + + char* vPtr = ( char* )inVtx; + + // Fetch triangle vertices + for( int i = 0; i < 3; i++ ) + { + vtxX[i] = _mm512_i32gather_ps( vtxIdx[i], ( float* )vPtr, 1 ); + vtxY[i] = _mm512_i32gather_ps( vtxIdx[i], ( float* )( vPtr + vtxLayout.mOffsetY ), 1 ); + vtxW[i] = _mm512_i32gather_ps( vtxIdx[i], ( float* )( vPtr + vtxLayout.mOffsetW ), 1 ); + } +} + +namespace MaskedOcclusionCullingAVX512 +{ +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Poorly implemented functions. TODO: fix common (maskedOcclusionCullingCommon.inl) code to improve perf +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +FORCE_INLINE __m512 _mmw_floor_ps( __m512 x ) +{ + return _mm512_roundscale_ps( x, 1 ); // 1 = floor +} + +FORCE_INLINE __m512 _mmw_ceil_ps( __m512 x ) +{ + return _mm512_roundscale_ps( x, 2 ); // 2 = ceil +} + +FORCE_INLINE __m512i _mmw_cmpeq_epi32( __m512i a, __m512i b ) +{ + __mmask16 mask = _mm512_cmpeq_epi32_mask( a, b ); + return _mm512_mask_mov_epi32( _mm512_set1_epi32( 0 ), mask, _mm512_set1_epi32( ~0 ) ); +} + +FORCE_INLINE __m512i _mmw_cmpgt_epi32( __m512i a, __m512i b ) +{ + __mmask16 mask = _mm512_cmpgt_epi32_mask( a, b ); + return _mm512_mask_mov_epi32( _mm512_set1_epi32( 0 ), mask, _mm512_set1_epi32( ~0 ) ); +} + +FORCE_INLINE bool _mmw_testz_epi32( __m512i a, __m512i b ) +{ + __mmask16 mask = _mm512_cmpeq_epi32_mask( _mm512_and_si512( a, b ), _mm512_set1_epi32( 0 ) ); + return mask == 0xFFFF; +} + +FORCE_INLINE __m512 _mmw_cmpge_ps( __m512 a, __m512 b ) +{ + __mmask16 mask = _mm512_cmp_ps_mask( a, b, _CMP_GE_OQ ); + return _mm512_castsi512_ps( _mm512_mask_mov_epi32( _mm512_set1_epi32( 0 ), mask, _mm512_set1_epi32( ~0 ) ) ); +} + +FORCE_INLINE __m512 _mmw_cmpgt_ps( __m512 a, __m512 b ) +{ + __mmask16 mask = _mm512_cmp_ps_mask( a, b, _CMP_GT_OQ ); + return _mm512_castsi512_ps( _mm512_mask_mov_epi32( _mm512_set1_epi32( 0 ), mask, _mm512_set1_epi32( ~0 ) ) ); +} + +FORCE_INLINE __m512 _mmw_cmpeq_ps( __m512 a, __m512 b ) +{ + __mmask16 mask = _mm512_cmp_ps_mask( a, b, _CMP_EQ_OQ ); + return _mm512_castsi512_ps( _mm512_mask_mov_epi32( _mm512_set1_epi32( 0 ), mask, _mm512_set1_epi32( ~0 ) ) ); +} + +FORCE_INLINE __mmask16 _mmw_movemask_ps( const __m512& a ) +{ + __mmask16 mask = _mm512_cmp_epi32_mask( _mm512_and_si512( _mm512_castps_si512( a ), _mm512_set1_epi32( 0x80000000 ) ), _mm512_set1_epi32( 0 ), 4 ); // a & 0x8000000 != 0 + return mask; +} + +FORCE_INLINE __m512 _mmw_blendv_ps( const __m512& a, const __m512& b, const __m512& c ) +{ + __mmask16 mask = _mmw_movemask_ps( c ); + return _mm512_mask_mov_ps( a, mask, b ); +} + +static MaskedOcclusionCulling::Implementation gInstructionSet = MaskedOcclusionCulling::AVX512; + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Include common algorithm implementation (general, SIMD independent 
code) +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "MaskedOcclusionCullingCommon.inl" + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Utility function to create a new object using the allocator callbacks +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +typedef MaskedOcclusionCulling::pfnAlignedAlloc pfnAlignedAlloc; +typedef MaskedOcclusionCulling::pfnAlignedFree pfnAlignedFree; + +MaskedOcclusionCulling* CreateMaskedOcclusionCulling( pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree ) +{ + MaskedOcclusionCullingPrivate* object = ( MaskedOcclusionCullingPrivate* )alignedAlloc( 64, sizeof( MaskedOcclusionCullingPrivate ) ); + new( object ) MaskedOcclusionCullingPrivate( alignedAlloc, alignedFree ); + return object; +} +}; + +#else + +namespace MaskedOcclusionCullingAVX512 +{ +typedef MaskedOcclusionCulling::pfnAlignedAlloc pfnAlignedAlloc; +typedef MaskedOcclusionCulling::pfnAlignedFree pfnAlignedFree; + +MaskedOcclusionCulling* CreateMaskedOcclusionCulling( pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree ) +{ + return nullptr; +} +}; + +#endif diff --git a/neo/libs/moc/MaskedOcclusionCullingCommon.inl b/neo/libs/moc/MaskedOcclusionCullingCommon.inl new file mode 100644 index 00000000..db19bd0f --- /dev/null +++ b/neo/libs/moc/MaskedOcclusionCullingCommon.inl @@ -0,0 +1,2050 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright 2017 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +//////////////////////////////////////////////////////////////////////////////// + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Common SIMD math utility functions +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +template FORCE_INLINE T max(const T &a, const T &b) { return a > b ? a : b; } +template FORCE_INLINE T min(const T &a, const T &b) { return a < b ? a : b; } + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Common defines and constants +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#define SIMD_ALL_LANES_MASK ((1 << SIMD_LANES) - 1) + +// Tile dimensions are 32xN pixels. 
These values are not tweakable and the code must also be modified +// to support different tile sizes as it is tightly coupled with the SSE/AVX register size +#define TILE_WIDTH_SHIFT 5 +#define TILE_WIDTH (1 << TILE_WIDTH_SHIFT) +#define TILE_HEIGHT (1 << TILE_HEIGHT_SHIFT) + +// Sub-tiles (used for updating the masked HiZ buffer) are 8x4 tiles, so there are 4x2 sub-tiles in a tile +#define SUB_TILE_WIDTH 8 +#define SUB_TILE_HEIGHT 4 + +// The number of fixed point bits used to represent vertex coordinates / edge slopes. +#if PRECISE_COVERAGE != 0 + #define FP_BITS 8 + #define FP_HALF_PIXEL (1 << (FP_BITS - 1)) + #define FP_INV (1.0f / (float)(1 << FP_BITS)) +#else + // Note that too low precision, without precise coverage, may cause overshoots / false coverage during rasterization. + // This is configured for 14 bits for AVX512 and 16 bits for SSE. Max tile slope delta is roughly + // (screenWidth + 2*(GUARD_BAND_PIXEL_SIZE + 1)) * (2^FP_BITS * (TILE_HEIGHT + GUARD_BAND_PIXEL_SIZE + 1)) + // and must fit in 31 bits. With this config, max image resolution (width) is ~3272, so stay well clear of this limit. + #define FP_BITS (19 - TILE_HEIGHT_SHIFT) +#endif + +// Tile dimensions in fixed point coordinates +#define FP_TILE_HEIGHT_SHIFT (FP_BITS + TILE_HEIGHT_SHIFT) +#define FP_TILE_HEIGHT (1 << FP_TILE_HEIGHT_SHIFT) + +// Maximum number of triangles that may be generated during clipping. We process SIMD_LANES triangles at a time and +// clip against 5 planes, so the max should be 5*8 = 40 (we immediately draw the first clipped triangle). +// This number must be a power of two. +#define MAX_CLIPPED (8*SIMD_LANES) +#define MAX_CLIPPED_WRAP (MAX_CLIPPED - 1) + +// Size of guard band in pixels. Clipping doesn't seem to be very expensive so we use a small guard band +// to improve rasterization performance. It's not recommended to set the guard band to zero, as this may +// cause leakage along the screen border due to precision/rounding. +#define GUARD_BAND_PIXEL_SIZE 1.0f + +// We classify triangles as big if the bounding box is wider than this given threshold and use a tighter +// but slightly more expensive traversal algorithm. This improves performance greatly for sliver triangles +#define BIG_TRIANGLE 3 + +// Only gather statistics if enabled. 
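+// STATS_ADD() compiles away completely when ENABLE_STATS is 0; when enabled it uses an interlocked
+// 64-bit add so the counters remain consistent if several threads rasterize into the same buffer
+// (for example through CullingThreadpool).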
+#if ENABLE_STATS != 0 + #define STATS_ADD(var, val) _InterlockedExchangeAdd64( &var, val ) +#else + #define STATS_ADD(var, val) +#endif + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// SIMD common defines (constant values) +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#define SIMD_BITS_ONE _mmw_set1_epi32(~0) +#define SIMD_BITS_ZERO _mmw_setzero_epi32() +#define SIMD_TILE_WIDTH _mmw_set1_epi32(TILE_WIDTH) + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Vertex fetch utility function, need to be in global namespace due to template specialization +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +template FORCE_INLINE void VtxFetch4(__mw *v, const unsigned int *inTrisPtr, int triVtx, const float *inVtx, int numLanes) +{ + // Fetch 4 vectors (matching 1 sse part of the SIMD register), and continue to the next + const int ssePart = (SIMD_LANES / 4) - N; + for (int k = 0; k < 4; k++) + { + int lane = 4 * ssePart + k; + if (numLanes > lane) + v[k] = _mmw_insertf32x4_ps(v[k], _mm_loadu_ps(&inVtx[inTrisPtr[lane * 3 + triVtx] << 2]), ssePart); + } + VtxFetch4(v, inTrisPtr, triVtx, inVtx, numLanes); +} + +template<> FORCE_INLINE void VtxFetch4<0>(__mw *v, const unsigned int *inTrisPtr, int triVtx, const float *inVtx, int numLanes) +{ + // Workaround for unused parameter warning + (void)v; (void)inTrisPtr; (void)triVtx; (void)inVtx; (void)numLanes; +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Private class containing the implementation +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +class MaskedOcclusionCullingPrivate : public MaskedOcclusionCulling +{ +public: + struct ZTile + { + __mw mZMin[2]; + __mwi mMask; + }; + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Member variables + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + __mw mHalfWidth; + __mw mHalfHeight; + __mw mCenterX; + __mw mCenterY; + __m128 mCSFrustumPlanes[5]; + __m128 mIHalfSize; + __m128 mICenter; + __m128i mIScreenSize; + + float mNearDist; + int mWidth; + int mHeight; + int mTilesWidth; + int mTilesHeight; + + ZTile *mMaskedHiZBuffer; + ScissorRect mFullscreenScissor; + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Constructors and state handling + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + MaskedOcclusionCullingPrivate(pfnAlignedAlloc alignedAlloc, pfnAlignedFree alignedFree) : mFullscreenScissor(0, 0, 0, 0) + { + mMaskedHiZBuffer = nullptr; + mAlignedAllocCallback = alignedAlloc; + mAlignedFreeCallback = alignedFree; +#if MOC_RECORDER_ENABLE + mRecorder = nullptr; +#endif + + SetNearClipPlane(0.0f); 
+ mCSFrustumPlanes[0] = _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f); + mCSFrustumPlanes[1] = _mm_setr_ps(1.0f, 0.0f, 1.0f, 0.0f); + mCSFrustumPlanes[2] = _mm_setr_ps(-1.0f, 0.0f, 1.0f, 0.0f); + mCSFrustumPlanes[3] = _mm_setr_ps(0.0f, 1.0f, 1.0f, 0.0f); + mCSFrustumPlanes[4] = _mm_setr_ps(0.0f, -1.0f, 1.0f, 0.0f); + + memset(&mStats, 0, sizeof(OcclusionCullingStatistics)); + + SetResolution(0, 0); + } + + ~MaskedOcclusionCullingPrivate() override + { + if (mMaskedHiZBuffer != nullptr) + mAlignedFreeCallback(mMaskedHiZBuffer); + mMaskedHiZBuffer = nullptr; + +#if MOC_RECORDER_ENABLE + assert( mRecorder == nullptr ); // forgot to call StopRecording()? +#endif + } + + void SetResolution(unsigned int width, unsigned int height) override + { + // Resolution must be a multiple of the subtile size + assert(width % SUB_TILE_WIDTH == 0 && height % SUB_TILE_HEIGHT == 0); +#if PRECISE_COVERAGE == 0 + // Test if combination of resolution & SLOPE_FP_BITS bits may cause 32-bit overflow. Note that the maximum resolution estimate + // is only an estimate (not conservative). It's advicable to stay well below the limit. + assert(width < ((1U << 31) - 1U) / ((1U << FP_BITS) * (TILE_HEIGHT + (unsigned int)(GUARD_BAND_PIXEL_SIZE + 1.0f))) - (2U * (unsigned int)(GUARD_BAND_PIXEL_SIZE + 1.0f))); +#endif + + // Delete current masked hierarchical Z buffer + if (mMaskedHiZBuffer != nullptr) + mAlignedFreeCallback(mMaskedHiZBuffer); + mMaskedHiZBuffer = nullptr; + + // Setup various resolution dependent constant values + mWidth = (int)width; + mHeight = (int)height; + mTilesWidth = (int)(width + TILE_WIDTH - 1) >> TILE_WIDTH_SHIFT; + mTilesHeight = (int)(height + TILE_HEIGHT - 1) >> TILE_HEIGHT_SHIFT; + mCenterX = _mmw_set1_ps((float)mWidth * 0.5f); + mCenterY = _mmw_set1_ps((float)mHeight * 0.5f); + mICenter = _mm_setr_ps((float)mWidth * 0.5f, (float)mWidth * 0.5f, (float)mHeight * 0.5f, (float)mHeight * 0.5f); + mHalfWidth = _mmw_set1_ps((float)mWidth * 0.5f); +#if USE_D3D != 0 + mHalfHeight = _mmw_set1_ps((float)-mHeight * 0.5f); + mIHalfSize = _mm_setr_ps((float)mWidth * 0.5f, (float)mWidth * 0.5f, (float)-mHeight * 0.5f, (float)-mHeight * 0.5f); +#else + mHalfHeight = _mmw_set1_ps((float)mHeight * 0.5f); + mIHalfSize = _mm_setr_ps((float)mWidth * 0.5f, (float)mWidth * 0.5f, (float)mHeight * 0.5f, (float)mHeight * 0.5f); +#endif + mIScreenSize = _mm_setr_epi32(mWidth - 1, mWidth - 1, mHeight - 1, mHeight - 1); + + // Setup a full screen scissor rectangle + mFullscreenScissor.mMinX = 0; + mFullscreenScissor.mMinY = 0; + mFullscreenScissor.mMaxX = mTilesWidth << TILE_WIDTH_SHIFT; + mFullscreenScissor.mMaxY = mTilesHeight << TILE_HEIGHT_SHIFT; + + // Adjust clip planes to include a small guard band to avoid clipping leaks + float guardBandWidth = (2.0f / (float)mWidth) * GUARD_BAND_PIXEL_SIZE; + float guardBandHeight = (2.0f / (float)mHeight) * GUARD_BAND_PIXEL_SIZE; + mCSFrustumPlanes[1] = _mm_setr_ps(1.0f - guardBandWidth, 0.0f, 1.0f, 0.0f); + mCSFrustumPlanes[2] = _mm_setr_ps(-1.0f + guardBandWidth, 0.0f, 1.0f, 0.0f); + mCSFrustumPlanes[3] = _mm_setr_ps(0.0f, 1.0f - guardBandHeight, 1.0f, 0.0f); + mCSFrustumPlanes[4] = _mm_setr_ps(0.0f, -1.0f + guardBandHeight, 1.0f, 0.0f); + + // Allocate masked hierarchical Z buffer (if zero size leave at nullptr) + if(mTilesWidth * mTilesHeight > 0) + mMaskedHiZBuffer = (ZTile *)mAlignedAllocCallback(64, sizeof(ZTile) * mTilesWidth * mTilesHeight); + } + + void GetResolution(unsigned int &width, unsigned int &height) const override + { + width = mWidth; + height = mHeight; + } + + 
void ComputeBinWidthHeight(unsigned int nBinsW, unsigned int nBinsH, unsigned int & outBinWidth, unsigned int & outBinHeight) override + { + outBinWidth = (mWidth / nBinsW) - ((mWidth / nBinsW) % TILE_WIDTH); + outBinHeight = (mHeight / nBinsH) - ((mHeight / nBinsH) % TILE_HEIGHT); + } + + void SetNearClipPlane(float nearDist) override + { + // Setup the near frustum plane + mNearDist = nearDist; + mCSFrustumPlanes[0] = _mm_setr_ps(0.0f, 0.0f, 1.0f, -nearDist); + } + + float GetNearClipPlane() const override + { + return mNearDist; + } + + void ClearBuffer() override + { + assert(mMaskedHiZBuffer != nullptr); + + // Iterate through all depth tiles and clear to default values + for (int i = 0; i < mTilesWidth * mTilesHeight; i++) + { + mMaskedHiZBuffer[i].mMask = _mmw_setzero_epi32(); + + // Clear z0 to beyond infinity to ensure we never merge with clear data + mMaskedHiZBuffer[i].mZMin[0] = _mmw_set1_ps(-1.0f); +#if QUICK_MASK != 0 + // Clear z1 to nearest depth value as it is pushed back on each update + mMaskedHiZBuffer[i].mZMin[1] = _mmw_set1_ps(FLT_MAX); +#else + mMaskedHiZBuffer[i].mZMin[1] = _mmw_setzero_ps(); +#endif + } + +#if ENABLE_STATS != 0 + memset(&mStats, 0, sizeof(OcclusionCullingStatistics)); +#endif + +#if MOC_RECORDER_ENABLE != 0 + { + std::lock_guard lock( mRecorderMutex ); + if( mRecorder != nullptr ) mRecorder->RecordClearBuffer(); + } +#endif + } + + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // MergeBuffer + // Utility Function merges another MOC buffer into the existing one + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + void MergeBuffer(MaskedOcclusionCulling* BufferB) override + { + assert(mMaskedHiZBuffer != nullptr); + + //// Iterate through all depth tiles and merge the 2 tiles + for (int i = 0; i < mTilesWidth * mTilesHeight; i++) + { + __mw *zMinB = ((MaskedOcclusionCullingPrivate*)BufferB)->mMaskedHiZBuffer[i].mZMin; + __mw *zMinA = mMaskedHiZBuffer[i].mZMin; + __mwi RastMaskB = ((MaskedOcclusionCullingPrivate*)BufferB)->mMaskedHiZBuffer[i].mMask; + +#if QUICK_MASK != 0 + // Clear z0 to beyond infinity to ensure we never merge with clear data + __mwi sign0 = _mmw_srai_epi32(simd_cast<__mwi>(zMinB[0]), 31); + // Only merge tiles that have data in zMinB[0], use the sign bit to determine if they are still in a clear state + sign0 = _mmw_cmpeq_epi32(sign0, SIMD_BITS_ZERO); + if (!_mmw_testz_epi32(sign0, sign0)) + { + STATS_ADD(mStats.mOccluders.mNumTilesMerged, 1); + zMinA[0] = _mmw_max_ps(zMinA[0], zMinB[0]); + + __mwi rastMask = mMaskedHiZBuffer[i].mMask; + __mwi deadLane = _mmw_cmpeq_epi32(rastMask, SIMD_BITS_ZERO); + // Mask out all subtiles failing the depth test (don't update these subtiles) + deadLane = _mmw_or_epi32(deadLane, _mmw_srai_epi32(simd_cast<__mwi>(_mmw_sub_ps(zMinA[1], zMinA[0])), 31)); + mMaskedHiZBuffer[i].mMask = _mmw_andnot_epi32(deadLane, rastMask); + } + + // Set 32bit value to -1 if any pixels are set incide the coverage mask for a subtile + __mwi LiveTile = _mmw_cmpeq_epi32(RastMaskB, SIMD_BITS_ZERO); + // invert to have bits set for clear subtiles + __mwi t0inv = _mmw_not_epi32(LiveTile); + // VPTEST sets the ZF flag if all the resulting bits are 0 (ie if all tiles are clear) + if (!_mmw_testz_epi32(t0inv, t0inv)) + { + STATS_ADD(mStats.mOccluders.mNumTilesMerged, 1); + UpdateTileQuick(i, RastMaskB, zMinB[1]); + } +#else + // 
Clear z0 to beyond infinity to ensure we never merge with clear data + __mwi sign1 = _mmw_srai_epi32(simd_cast<__mwi>(mMaskedHiZBuffer[i].mZMin[0]), 31); + // Only merge tiles that have data in zMinB[0], use the sign bit to determine if they are still in a clear state + sign1 = _mmw_cmpeq_epi32(sign1, SIMD_BITS_ZERO); + + // Set 32bit value to -1 if any pixels are set incide the coverage mask for a subtile + __mwi LiveTile1 = _mmw_cmpeq_epi32(mMaskedHiZBuffer[i].mMask, SIMD_BITS_ZERO); + // invert to have bits set for clear subtiles + __mwi t1inv = _mmw_not_epi32(LiveTile1); + // VPTEST sets the ZF flag if all the resulting bits are 0 (ie if all tiles are clear) + if (_mmw_testz_epi32(sign1, sign1) && _mmw_testz_epi32(t1inv, t1inv)) + { + mMaskedHiZBuffer[i].mMask = ((MaskedOcclusionCullingPrivate*)BufferB)->mMaskedHiZBuffer[i].mMask; + mMaskedHiZBuffer[i].mZMin[0] = zMinB[0]; + mMaskedHiZBuffer[i].mZMin[1] = zMinB[1]; + } + else + { + // Clear z0 to beyond infinity to ensure we never merge with clear data + __mwi sign0 = _mmw_srai_epi32(simd_cast<__mwi>(zMinB[0]), 31); + sign0 = _mmw_cmpeq_epi32(sign0, SIMD_BITS_ZERO); + // Only merge tiles that have data in zMinB[0], use the sign bit to determine if they are still in a clear state + if (!_mmw_testz_epi32(sign0, sign0)) + { + // build a mask for Zmin[0], full if the layer has been completed, or partial if tile is still partly filled. + // cant just use the completement of the mask, as tiles might not get updated by merge + __mwi sign1 = _mmw_srai_epi32(simd_cast<__mwi>(zMinB[1]), 31); + __mwi LayerMask0 = _mmw_not_epi32(sign1); + __mwi LayerMask1 = _mmw_not_epi32(((MaskedOcclusionCullingPrivate*)BufferB)->mMaskedHiZBuffer[i].mMask); + __mwi rastMask = _mmw_or_epi32(LayerMask0, LayerMask1); + + UpdateTileAccurate(i, rastMask, zMinB[0]); + } + + // Set 32bit value to -1 if any pixels are set incide the coverage mask for a subtile + __mwi LiveTile = _mmw_cmpeq_epi32(((MaskedOcclusionCullingPrivate*)BufferB)->mMaskedHiZBuffer[i].mMask, SIMD_BITS_ZERO); + // invert to have bits set for clear subtiles + __mwi t0inv = _mmw_not_epi32(LiveTile); + // VPTEST sets the ZF flag if all the resulting bits are 0 (ie if all tiles are clear) + if (!_mmw_testz_epi32(t0inv, t0inv)) + { + UpdateTileAccurate(i, ((MaskedOcclusionCullingPrivate*)BufferB)->mMaskedHiZBuffer[i].mMask, zMinB[1]); + } + + if (_mmw_testz_epi32(sign0, sign0) && _mmw_testz_epi32(t0inv, t0inv)) + STATS_ADD(mStats.mOccluders.mNumTilesMerged, 1); + + } + +#endif + } + } + + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Polygon clipping functions + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + FORCE_INLINE int ClipPolygon(__m128 *outVtx, __m128 *inVtx, const __m128 &plane, int n) const + { + __m128 p0 = inVtx[n - 1]; + __m128 dist0 = _mmx_dp4_ps(p0, plane); + + // Loop over all polygon edges and compute intersection with clip plane (if any) + int nout = 0; + for (int k = 0; k < n; k++) + { + __m128 p1 = inVtx[k]; + __m128 dist1 = _mmx_dp4_ps(p1, plane); + int dist0Neg = _mm_movemask_ps(dist0); + if (!dist0Neg) // dist0 > 0.0f + outVtx[nout++] = p0; + + // Edge intersects the clip plane if dist0 and dist1 have opposing signs + if (_mm_movemask_ps(_mm_xor_ps(dist0, dist1))) + { + // Always clip from the positive side to avoid T-junctions + if (!dist0Neg) + { + __m128 t = 
_mm_div_ps(dist0, _mm_sub_ps(dist0, dist1)); + outVtx[nout++] = _mmx_fmadd_ps(_mm_sub_ps(p1, p0), t, p0); + } + else + { + __m128 t = _mm_div_ps(dist1, _mm_sub_ps(dist1, dist0)); + outVtx[nout++] = _mmx_fmadd_ps(_mm_sub_ps(p0, p1), t, p1); + } + } + + dist0 = dist1; + p0 = p1; + } + return nout; + } + + template void TestClipPlane(__mw *vtxX, __mw *vtxY, __mw *vtxW, unsigned int &straddleMask, unsigned int &triMask, ClipPlanes clipPlaneMask) + { + straddleMask = 0; + // Skip masked clip planes + if (!(clipPlaneMask & CLIP_PLANE)) + return; + + // Evaluate all 3 vertices against the frustum plane + __mw planeDp[3]; + for (int i = 0; i < 3; ++i) + { + switch (CLIP_PLANE) + { + case ClipPlanes::CLIP_PLANE_LEFT: planeDp[i] = _mmw_add_ps(vtxW[i], vtxX[i]); break; + case ClipPlanes::CLIP_PLANE_RIGHT: planeDp[i] = _mmw_sub_ps(vtxW[i], vtxX[i]); break; + case ClipPlanes::CLIP_PLANE_BOTTOM: planeDp[i] = _mmw_add_ps(vtxW[i], vtxY[i]); break; + case ClipPlanes::CLIP_PLANE_TOP: planeDp[i] = _mmw_sub_ps(vtxW[i], vtxY[i]); break; + case ClipPlanes::CLIP_PLANE_NEAR: planeDp[i] = _mmw_sub_ps(vtxW[i], _mmw_set1_ps(mNearDist)); break; + } + } + + // Look at FP sign and determine if tri is inside, outside or straddles the frustum plane + __mw inside = _mmw_andnot_ps(planeDp[0], _mmw_andnot_ps(planeDp[1], _mmw_not_ps(planeDp[2]))); + __mw outside = _mmw_and_ps(planeDp[0], _mmw_and_ps(planeDp[1], planeDp[2])); + unsigned int inMask = (unsigned int)_mmw_movemask_ps(inside); + unsigned int outMask = (unsigned int)_mmw_movemask_ps(outside); + straddleMask = (~outMask) & (~inMask); + triMask &= ~outMask; + } + + FORCE_INLINE void ClipTriangleAndAddToBuffer(__mw *vtxX, __mw *vtxY, __mw *vtxW, __m128 *clippedTrisBuffer, int &clipWriteIdx, unsigned int &triMask, unsigned int triClipMask, ClipPlanes clipPlaneMask) + { + if (!triClipMask) + return; + + // Inside test all 3 triangle vertices against all active frustum planes + unsigned int straddleMask[5]; + TestClipPlane(vtxX, vtxY, vtxW, straddleMask[0], triMask, clipPlaneMask); + TestClipPlane(vtxX, vtxY, vtxW, straddleMask[1], triMask, clipPlaneMask); + TestClipPlane(vtxX, vtxY, vtxW, straddleMask[2], triMask, clipPlaneMask); + TestClipPlane(vtxX, vtxY, vtxW, straddleMask[3], triMask, clipPlaneMask); + TestClipPlane(vtxX, vtxY, vtxW, straddleMask[4], triMask, clipPlaneMask); + + // Clip triangle against straddling planes and add to the clipped triangle buffer + __m128 vtxBuf[2][8]; + +#if CLIPPING_PRESERVES_ORDER != 0 + unsigned int clipMask = triClipMask & triMask; + unsigned int clipAndStraddleMask = (straddleMask[0] | straddleMask[1] | straddleMask[2] | straddleMask[3] | straddleMask[4]) & clipMask; + // no clipping needed after all - early out + if (clipAndStraddleMask == 0) + return; + while( clipMask ) + { + // Find and setup next triangle to clip + unsigned int triIdx = find_clear_lsb(&clipMask); + unsigned int triBit = (1U << triIdx); + assert(triIdx < SIMD_LANES); + + int bufIdx = 0; + int nClippedVerts = 3; + for (int i = 0; i < 3; i++) + vtxBuf[0][i] = _mm_setr_ps(simd_f32(vtxX[i])[triIdx], simd_f32(vtxY[i])[triIdx], simd_f32(vtxW[i])[triIdx], 1.0f); + + // Clip triangle with straddling planes. + for (int i = 0; i < 5; ++i) + { + if ((straddleMask[i] & triBit) && (clipPlaneMask & (1 << i))) // <- second part maybe not needed? 
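+				// Note: straddleMask[i] can only be nonzero for planes enabled in clipPlaneMask, since
+				// TestClipPlane() early-outs for masked planes, so the second test is most likely
+				// redundant and only kept as a cheap safety check.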
+ { + nClippedVerts = ClipPolygon(vtxBuf[bufIdx ^ 1], vtxBuf[bufIdx], mCSFrustumPlanes[i], nClippedVerts); + bufIdx ^= 1; + } + } + + if (nClippedVerts >= 3) + { + // Write all triangles into the clip buffer and process them next loop iteration + clippedTrisBuffer[clipWriteIdx * 3 + 0] = vtxBuf[bufIdx][0]; + clippedTrisBuffer[clipWriteIdx * 3 + 1] = vtxBuf[bufIdx][1]; + clippedTrisBuffer[clipWriteIdx * 3 + 2] = vtxBuf[bufIdx][2]; + clipWriteIdx = (clipWriteIdx + 1) & (MAX_CLIPPED - 1); + for (int i = 2; i < nClippedVerts - 1; i++) + { + clippedTrisBuffer[clipWriteIdx * 3 + 0] = vtxBuf[bufIdx][0]; + clippedTrisBuffer[clipWriteIdx * 3 + 1] = vtxBuf[bufIdx][i]; + clippedTrisBuffer[clipWriteIdx * 3 + 2] = vtxBuf[bufIdx][i + 1]; + clipWriteIdx = (clipWriteIdx + 1) & (MAX_CLIPPED - 1); + } + } + } + // since all triangles were copied to clip buffer for next iteration, skip further processing + triMask = 0; +#else + unsigned int clipMask = (straddleMask[0] | straddleMask[1] | straddleMask[2] | straddleMask[3] | straddleMask[4]) & (triClipMask & triMask); + while (clipMask) + { + // Find and setup next triangle to clip + unsigned int triIdx = find_clear_lsb(&clipMask); + unsigned int triBit = (1U << triIdx); + assert(triIdx < SIMD_LANES); + + int bufIdx = 0; + int nClippedVerts = 3; + for (int i = 0; i < 3; i++) + vtxBuf[0][i] = _mm_setr_ps(simd_f32(vtxX[i])[triIdx], simd_f32(vtxY[i])[triIdx], simd_f32(vtxW[i])[triIdx], 1.0f); + + // Clip triangle with straddling planes. + for (int i = 0; i < 5; ++i) + { + if ((straddleMask[i] & triBit) && (clipPlaneMask & (1 << i))) + { + nClippedVerts = ClipPolygon(vtxBuf[bufIdx ^ 1], vtxBuf[bufIdx], mCSFrustumPlanes[i], nClippedVerts); + bufIdx ^= 1; + } + } + + if (nClippedVerts >= 3) + { + // Write the first triangle back into the list of currently processed triangles + for (int i = 0; i < 3; i++) + { + simd_f32(vtxX[i])[triIdx] = simd_f32(vtxBuf[bufIdx][i])[0]; + simd_f32(vtxY[i])[triIdx] = simd_f32(vtxBuf[bufIdx][i])[1]; + simd_f32(vtxW[i])[triIdx] = simd_f32(vtxBuf[bufIdx][i])[2]; + } + // Write the remaining triangles into the clip buffer and process them next loop iteration + for (int i = 2; i < nClippedVerts - 1; i++) + { + clippedTrisBuffer[clipWriteIdx * 3 + 0] = vtxBuf[bufIdx][0]; + clippedTrisBuffer[clipWriteIdx * 3 + 1] = vtxBuf[bufIdx][i]; + clippedTrisBuffer[clipWriteIdx * 3 + 2] = vtxBuf[bufIdx][i + 1]; + clipWriteIdx = (clipWriteIdx + 1) & (MAX_CLIPPED - 1); + } + } + else // Kill triangles that was removed by clipping + triMask &= ~triBit; + } +#endif + } + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Vertex transform & projection + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + FORCE_INLINE void TransformVerts(__mw *vtxX, __mw *vtxY, __mw *vtxW, const float *modelToClipMatrix) + { + if (modelToClipMatrix != nullptr) + { + for (int i = 0; i < 3; ++i) + { + __mw tmpX, tmpY, tmpW; + tmpX = _mmw_fmadd_ps(vtxX[i], _mmw_set1_ps(modelToClipMatrix[0]), _mmw_fmadd_ps(vtxY[i], _mmw_set1_ps(modelToClipMatrix[4]), _mmw_fmadd_ps(vtxW[i], _mmw_set1_ps(modelToClipMatrix[8]), _mmw_set1_ps(modelToClipMatrix[12])))); + tmpY = _mmw_fmadd_ps(vtxX[i], _mmw_set1_ps(modelToClipMatrix[1]), _mmw_fmadd_ps(vtxY[i], _mmw_set1_ps(modelToClipMatrix[5]), _mmw_fmadd_ps(vtxW[i], _mmw_set1_ps(modelToClipMatrix[9]), _mmw_set1_ps(modelToClipMatrix[13])))); + tmpW = 
_mmw_fmadd_ps(vtxX[i], _mmw_set1_ps(modelToClipMatrix[3]), _mmw_fmadd_ps(vtxY[i], _mmw_set1_ps(modelToClipMatrix[7]), _mmw_fmadd_ps(vtxW[i], _mmw_set1_ps(modelToClipMatrix[11]), _mmw_set1_ps(modelToClipMatrix[15])))); + vtxX[i] = tmpX; vtxY[i] = tmpY; vtxW[i] = tmpW; + } + } + } + +#if PRECISE_COVERAGE != 0 + FORCE_INLINE void ProjectVertices(__mwi *ipVtxX, __mwi *ipVtxY, __mw *pVtxX, __mw *pVtxY, __mw *pVtxZ, const __mw *vtxX, const __mw *vtxY, const __mw *vtxW) + { +#if USE_D3D != 0 + static const int vertexOrder[] = {2, 1, 0}; +#else + static const int vertexOrder[] = {0, 1, 2}; +#endif + + // Project vertices and transform to screen space. Snap to sub-pixel coordinates with FP_BITS precision. + for (int i = 0; i < 3; i++) + { + int idx = vertexOrder[i]; + __mw rcpW = _mmw_div_ps(_mmw_set1_ps(1.0f), vtxW[i]); + __mw screenX = _mmw_fmadd_ps(_mmw_mul_ps(vtxX[i], mHalfWidth), rcpW, mCenterX); + __mw screenY = _mmw_fmadd_ps(_mmw_mul_ps(vtxY[i], mHalfHeight), rcpW, mCenterY); + ipVtxX[idx] = _mmw_cvtps_epi32(_mmw_mul_ps(screenX, _mmw_set1_ps(float(1 << FP_BITS)))); + ipVtxY[idx] = _mmw_cvtps_epi32(_mmw_mul_ps(screenY, _mmw_set1_ps(float(1 << FP_BITS)))); + pVtxX[idx] = _mmw_mul_ps(_mmw_cvtepi32_ps(ipVtxX[idx]), _mmw_set1_ps(FP_INV)); + pVtxY[idx] = _mmw_mul_ps(_mmw_cvtepi32_ps(ipVtxY[idx]), _mmw_set1_ps(FP_INV)); + pVtxZ[idx] = rcpW; + } + } +#else + FORCE_INLINE void ProjectVertices(__mw *pVtxX, __mw *pVtxY, __mw *pVtxZ, const __mw *vtxX, const __mw *vtxY, const __mw *vtxW) + { +#if USE_D3D != 0 + static const int vertexOrder[] = {2, 1, 0}; +#else + static const int vertexOrder[] = {0, 1, 2}; +#endif + // Project vertices and transform to screen space. Round to nearest integer pixel coordinate + for (int i = 0; i < 3; i++) + { + int idx = vertexOrder[i]; + __mw rcpW = _mmw_div_ps(_mmw_set1_ps(1.0f), vtxW[i]); + + // The rounding modes are set to match HW rasterization with OpenGL. In practice our samples are placed + // in the (1,0) corner of each pixel, while HW rasterizer uses (0.5, 0.5). We get (1,0) because of the + // floor used when interpolating along triangle edges. The rounding modes match an offset of (0.5, -0.5) + pVtxX[idx] = _mmw_ceil_ps(_mmw_fmadd_ps(_mmw_mul_ps(vtxX[i], mHalfWidth), rcpW, mCenterX)); + pVtxY[idx] = _mmw_floor_ps(_mmw_fmadd_ps(_mmw_mul_ps(vtxY[i], mHalfHeight), rcpW, mCenterY)); + pVtxZ[idx] = rcpW; + } + } +#endif + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Common SSE/AVX input assembly functions, note that there are specialized gathers for the general case in the SSE/AVX specific files + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + FORCE_INLINE void GatherVerticesFast(__mw *vtxX, __mw *vtxY, __mw *vtxW, const float *inVtx, const unsigned int *inTrisPtr, int numLanes) + { + // This function assumes that the vertex layout is four packed x, y, z, w-values. + // Since the layout is known we can get some additional performance by using a + // more optimized gather strategy. 
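+		// For each of the three triangle corners, the loop below loads one packed (x,y,z,w) vertex per
+		// SIMD lane and then transposes each 4-wide SSE part (similar to _MM_TRANSPOSE4_PS) into separate
+		// X, Y and W vectors; the z component is discarded since the occlusion rasterizer only needs
+		// x, y and w.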
+ assert(numLanes >= 1); + + // Gather vertices + __mw v[4], swz[4]; + for (int i = 0; i < 3; i++) + { + // Load 4 (x,y,z,w) vectors per SSE part of the SIMD register (so 4 vectors for SSE, 8 vectors for AVX) + // this fetch uses templates to unroll the loop + VtxFetch4(v, inTrisPtr, i, inVtx, numLanes); + + // Transpose each individual SSE part of the SSE/AVX register (similar to _MM_TRANSPOSE4_PS) + swz[0] = _mmw_shuffle_ps(v[0], v[1], 0x44); + swz[2] = _mmw_shuffle_ps(v[0], v[1], 0xEE); + swz[1] = _mmw_shuffle_ps(v[2], v[3], 0x44); + swz[3] = _mmw_shuffle_ps(v[2], v[3], 0xEE); + + vtxX[i] = _mmw_shuffle_ps(swz[0], swz[1], 0x88); + vtxY[i] = _mmw_shuffle_ps(swz[0], swz[1], 0xDD); + vtxW[i] = _mmw_shuffle_ps(swz[2], swz[3], 0xDD); + } + } + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Rasterization functions + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + FORCE_INLINE void ComputeBoundingBox(__mwi &bbminX, __mwi &bbminY, __mwi &bbmaxX, __mwi &bbmaxY, const __mw *vX, const __mw *vY, const ScissorRect *scissor) + { + static const __mwi SIMD_PAD_W_MASK = _mmw_set1_epi32(~(TILE_WIDTH - 1)); + static const __mwi SIMD_PAD_H_MASK = _mmw_set1_epi32(~(TILE_HEIGHT - 1)); + + // Find Min/Max vertices + bbminX = _mmw_cvttps_epi32(_mmw_min_ps(vX[0], _mmw_min_ps(vX[1], vX[2]))); + bbminY = _mmw_cvttps_epi32(_mmw_min_ps(vY[0], _mmw_min_ps(vY[1], vY[2]))); + bbmaxX = _mmw_cvttps_epi32(_mmw_max_ps(vX[0], _mmw_max_ps(vX[1], vX[2]))); + bbmaxY = _mmw_cvttps_epi32(_mmw_max_ps(vY[0], _mmw_max_ps(vY[1], vY[2]))); + + // Clamp to tile boundaries + bbminX = _mmw_and_epi32(bbminX, SIMD_PAD_W_MASK); + bbmaxX = _mmw_and_epi32(_mmw_add_epi32(bbmaxX, _mmw_set1_epi32(TILE_WIDTH)), SIMD_PAD_W_MASK); + bbminY = _mmw_and_epi32(bbminY, SIMD_PAD_H_MASK); + bbmaxY = _mmw_and_epi32(_mmw_add_epi32(bbmaxY, _mmw_set1_epi32(TILE_HEIGHT)), SIMD_PAD_H_MASK); + + // Clip to scissor + bbminX = _mmw_max_epi32(bbminX, _mmw_set1_epi32(scissor->mMinX)); + bbmaxX = _mmw_min_epi32(bbmaxX, _mmw_set1_epi32(scissor->mMaxX)); + bbminY = _mmw_max_epi32(bbminY, _mmw_set1_epi32(scissor->mMinY)); + bbmaxY = _mmw_min_epi32(bbmaxY, _mmw_set1_epi32(scissor->mMaxY)); + } + +#if PRECISE_COVERAGE != 0 + FORCE_INLINE void SortVertices(__mwi *vX, __mwi *vY) + { + // Rotate the triangle in the winding order until v0 is the vertex with lowest Y value + for (int i = 0; i < 2; i++) + { + __mwi ey1 = _mmw_sub_epi32(vY[1], vY[0]); + __mwi ey2 = _mmw_sub_epi32(vY[2], vY[0]); + __mwi swapMask = _mmw_or_epi32(_mmw_or_epi32(ey1, ey2), _mmw_cmpeq_epi32(simd_cast<__mwi>(ey2), SIMD_BITS_ZERO)); + __mwi sX, sY; + sX = _mmw_blendv_epi32(vX[2], vX[0], swapMask); + vX[0] = _mmw_blendv_epi32(vX[0], vX[1], swapMask); + vX[1] = _mmw_blendv_epi32(vX[1], vX[2], swapMask); + vX[2] = sX; + sY = _mmw_blendv_epi32(vY[2], vY[0], swapMask); + vY[0] = _mmw_blendv_epi32(vY[0], vY[1], swapMask); + vY[1] = _mmw_blendv_epi32(vY[1], vY[2], swapMask); + vY[2] = sY; + } + } + + FORCE_INLINE int CullBackfaces(__mwi *ipVtxX, __mwi *ipVtxY, __mw *pVtxX, __mw *pVtxY, __mw *pVtxZ, const __mw &ccwMask, BackfaceWinding bfWinding) + { + // Reverse vertex order if non cw faces are considered front facing (rasterizer code requires CCW order) + if (!(bfWinding & BACKFACE_CW)) + { + __mw tmpX, tmpY, tmpZ; + __mwi itmpX, itmpY; + itmpX = _mmw_blendv_epi32(ipVtxX[2], ipVtxX[0], 
simd_cast<__mwi>(ccwMask)); + itmpY = _mmw_blendv_epi32(ipVtxY[2], ipVtxY[0], simd_cast<__mwi>(ccwMask)); + tmpX = _mmw_blendv_ps(pVtxX[2], pVtxX[0], ccwMask); + tmpY = _mmw_blendv_ps(pVtxY[2], pVtxY[0], ccwMask); + tmpZ = _mmw_blendv_ps(pVtxZ[2], pVtxZ[0], ccwMask); + ipVtxX[2] = _mmw_blendv_epi32(ipVtxX[0], ipVtxX[2], simd_cast<__mwi>(ccwMask)); + ipVtxY[2] = _mmw_blendv_epi32(ipVtxY[0], ipVtxY[2], simd_cast<__mwi>(ccwMask)); + pVtxX[2] = _mmw_blendv_ps(pVtxX[0], pVtxX[2], ccwMask); + pVtxY[2] = _mmw_blendv_ps(pVtxY[0], pVtxY[2], ccwMask); + pVtxZ[2] = _mmw_blendv_ps(pVtxZ[0], pVtxZ[2], ccwMask); + ipVtxX[0] = itmpX; + ipVtxY[0] = itmpY; + pVtxX[0] = tmpX; + pVtxY[0] = tmpY; + pVtxZ[0] = tmpZ; + } + + // Return a lane mask with all front faces set + return ((bfWinding & BACKFACE_CCW) ? 0 : _mmw_movemask_ps(ccwMask)) | ((bfWinding & BACKFACE_CW) ? 0 : ~_mmw_movemask_ps(ccwMask)); + } +#else + FORCE_INLINE void SortVertices(__mw *vX, __mw *vY) + { + // Rotate the triangle in the winding order until v0 is the vertex with lowest Y value + for (int i = 0; i < 2; i++) + { + __mw ey1 = _mmw_sub_ps(vY[1], vY[0]); + __mw ey2 = _mmw_sub_ps(vY[2], vY[0]); + __mw swapMask = _mmw_or_ps(_mmw_or_ps(ey1, ey2), simd_cast<__mw>(_mmw_cmpeq_epi32(simd_cast<__mwi>(ey2), SIMD_BITS_ZERO))); + __mw sX, sY; + sX = _mmw_blendv_ps(vX[2], vX[0], swapMask); + vX[0] = _mmw_blendv_ps(vX[0], vX[1], swapMask); + vX[1] = _mmw_blendv_ps(vX[1], vX[2], swapMask); + vX[2] = sX; + sY = _mmw_blendv_ps(vY[2], vY[0], swapMask); + vY[0] = _mmw_blendv_ps(vY[0], vY[1], swapMask); + vY[1] = _mmw_blendv_ps(vY[1], vY[2], swapMask); + vY[2] = sY; + } + } + + FORCE_INLINE int CullBackfaces(__mw *pVtxX, __mw *pVtxY, __mw *pVtxZ, const __mw &ccwMask, BackfaceWinding bfWinding) + { + // Reverse vertex order if non cw faces are considered front facing (rasterizer code requires CCW order) + if (!(bfWinding & BACKFACE_CW)) + { + __mw tmpX, tmpY, tmpZ; + tmpX = _mmw_blendv_ps(pVtxX[2], pVtxX[0], ccwMask); + tmpY = _mmw_blendv_ps(pVtxY[2], pVtxY[0], ccwMask); + tmpZ = _mmw_blendv_ps(pVtxZ[2], pVtxZ[0], ccwMask); + pVtxX[2] = _mmw_blendv_ps(pVtxX[0], pVtxX[2], ccwMask); + pVtxY[2] = _mmw_blendv_ps(pVtxY[0], pVtxY[2], ccwMask); + pVtxZ[2] = _mmw_blendv_ps(pVtxZ[0], pVtxZ[2], ccwMask); + pVtxX[0] = tmpX; + pVtxY[0] = tmpY; + pVtxZ[0] = tmpZ; + } + + // Return a lane mask with all front faces set + return ((bfWinding & BACKFACE_CCW) ? 0 : _mmw_movemask_ps(ccwMask)) | ((bfWinding & BACKFACE_CW) ? 
0 : ~_mmw_movemask_ps(ccwMask)); + } +#endif + + FORCE_INLINE void ComputeDepthPlane(const __mw *pVtxX, const __mw *pVtxY, const __mw *pVtxZ, __mw &zPixelDx, __mw &zPixelDy) const + { + // Setup z(x,y) = z0 + dx*x + dy*y screen space depth plane equation + __mw x2 = _mmw_sub_ps(pVtxX[2], pVtxX[0]); + __mw x1 = _mmw_sub_ps(pVtxX[1], pVtxX[0]); + __mw y1 = _mmw_sub_ps(pVtxY[1], pVtxY[0]); + __mw y2 = _mmw_sub_ps(pVtxY[2], pVtxY[0]); + __mw z1 = _mmw_sub_ps(pVtxZ[1], pVtxZ[0]); + __mw z2 = _mmw_sub_ps(pVtxZ[2], pVtxZ[0]); + __mw d = _mmw_div_ps(_mmw_set1_ps(1.0f), _mmw_fmsub_ps(x1, y2, _mmw_mul_ps(y1, x2))); + zPixelDx = _mmw_mul_ps(_mmw_fmsub_ps(z1, y2, _mmw_mul_ps(y1, z2)), d); + zPixelDy = _mmw_mul_ps(_mmw_fmsub_ps(x1, z2, _mmw_mul_ps(z1, x2)), d); + } + + FORCE_INLINE void UpdateTileQuick(int tileIdx, const __mwi &coverage, const __mw &zTriv) + { + // Update heuristic used in the paper "Masked Software Occlusion Culling", + // good balance between performance and accuracy + STATS_ADD(mStats.mOccluders.mNumTilesUpdated, 1); + assert(tileIdx >= 0 && tileIdx < mTilesWidth*mTilesHeight); + + __mwi mask = mMaskedHiZBuffer[tileIdx].mMask; + __mw *zMin = mMaskedHiZBuffer[tileIdx].mZMin; + + // Swizzle coverage mask to 8x4 subtiles and test if any subtiles are not covered at all + __mwi rastMask = coverage; + __mwi deadLane = _mmw_cmpeq_epi32(rastMask, SIMD_BITS_ZERO); + + // Mask out all subtiles failing the depth test (don't update these subtiles) + deadLane = _mmw_or_epi32(deadLane, _mmw_srai_epi32(simd_cast<__mwi>(_mmw_sub_ps(zTriv, zMin[0])), 31)); + rastMask = _mmw_andnot_epi32(deadLane, rastMask); + + // Use distance heuristic to discard layer 1 if incoming triangle is significantly nearer to observer + // than the buffer contents. See Section 3.2 in "Masked Software Occlusion Culling" + __mwi coveredLane = _mmw_cmpeq_epi32(rastMask, SIMD_BITS_ONE); + __mw diff = _mmw_fmsub_ps(zMin[1], _mmw_set1_ps(2.0f), _mmw_add_ps(zTriv, zMin[0])); + __mwi discardLayerMask = _mmw_andnot_epi32(deadLane, _mmw_or_epi32(_mmw_srai_epi32(simd_cast<__mwi>(diff), 31), coveredLane)); + + // Update the mask with incoming triangle coverage + mask = _mmw_or_epi32(_mmw_andnot_epi32(discardLayerMask, mask), rastMask); + + __mwi maskFull = _mmw_cmpeq_epi32(mask, SIMD_BITS_ONE); + + // Compute new value for zMin[1]. 
This has one of four outcomes: zMin[1] = min(zMin[1], zTriv), zMin[1] = zTriv, + // zMin[1] = FLT_MAX or unchanged, depending on if the layer is updated, discarded, fully covered, or not updated + __mw opA = _mmw_blendv_ps(zTriv, zMin[1], simd_cast<__mw>(deadLane)); + __mw opB = _mmw_blendv_ps(zMin[1], zTriv, simd_cast<__mw>(discardLayerMask)); + __mw z1min = _mmw_min_ps(opA, opB); + zMin[1] = _mmw_blendv_ps(z1min, _mmw_set1_ps(FLT_MAX), simd_cast<__mw>(maskFull)); + + // Propagate zMin[1] back to zMin[0] if tile was fully covered, and update the mask + zMin[0] = _mmw_blendv_ps(zMin[0], z1min, simd_cast<__mw>(maskFull)); + mMaskedHiZBuffer[tileIdx].mMask = _mmw_andnot_epi32(maskFull, mask); + } + + FORCE_INLINE void UpdateTileAccurate(int tileIdx, const __mwi &coverage, const __mw &zTriv) + { + assert(tileIdx >= 0 && tileIdx < mTilesWidth*mTilesHeight); + + __mw *zMin = mMaskedHiZBuffer[tileIdx].mZMin; + __mwi &mask = mMaskedHiZBuffer[tileIdx].mMask; + + // Swizzle coverage mask to 8x4 subtiles + __mwi rastMask = coverage; + + // Perform individual depth tests with layer 0 & 1 and mask out all failing pixels + __mw sdist0 = _mmw_sub_ps(zMin[0], zTriv); + __mw sdist1 = _mmw_sub_ps(zMin[1], zTriv); + __mwi sign0 = _mmw_srai_epi32(simd_cast<__mwi>(sdist0), 31); + __mwi sign1 = _mmw_srai_epi32(simd_cast<__mwi>(sdist1), 31); + __mwi triMask = _mmw_and_epi32(rastMask, _mmw_or_epi32(_mmw_andnot_epi32(mask, sign0), _mmw_and_epi32(mask, sign1))); + + // Early out if no pixels survived the depth test (this test is more accurate than + // the early culling test in TraverseScanline()) + __mwi t0 = _mmw_cmpeq_epi32(triMask, SIMD_BITS_ZERO); + __mwi t0inv = _mmw_not_epi32(t0); + if (_mmw_testz_epi32(t0inv, t0inv)) + return; + + STATS_ADD(mStats.mOccluders.mNumTilesUpdated, 1); + + __mw zTri = _mmw_blendv_ps(zTriv, zMin[0], simd_cast<__mw>(t0)); + + // Test if incoming triangle completely overwrites layer 0 or 1 + __mwi layerMask0 = _mmw_andnot_epi32(triMask, _mmw_not_epi32(mask)); + __mwi layerMask1 = _mmw_andnot_epi32(triMask, mask); + __mwi lm0 = _mmw_cmpeq_epi32(layerMask0, SIMD_BITS_ZERO); + __mwi lm1 = _mmw_cmpeq_epi32(layerMask1, SIMD_BITS_ZERO); + __mw z0 = _mmw_blendv_ps(zMin[0], zTri, simd_cast<__mw>(lm0)); + __mw z1 = _mmw_blendv_ps(zMin[1], zTri, simd_cast<__mw>(lm1)); + + // Compute distances used for merging heuristic + __mw d0 = _mmw_abs_ps(sdist0); + __mw d1 = _mmw_abs_ps(sdist1); + __mw d2 = _mmw_abs_ps(_mmw_sub_ps(z0, z1)); + + // Find minimum distance + __mwi c01 = simd_cast<__mwi>(_mmw_sub_ps(d0, d1)); + __mwi c02 = simd_cast<__mwi>(_mmw_sub_ps(d0, d2)); + __mwi c12 = simd_cast<__mwi>(_mmw_sub_ps(d1, d2)); + // Two tests indicating which layer the incoming triangle will merge with or + // overwrite. d0min indicates that the triangle will overwrite layer 0, and + // d1min flags that the triangle will overwrite layer 1. + __mwi d0min = _mmw_or_epi32(_mmw_and_epi32(c01, c02), _mmw_or_epi32(lm0, t0)); + __mwi d1min = _mmw_andnot_epi32(d0min, _mmw_or_epi32(c12, lm1)); + + /////////////////////////////////////////////////////////////////////////////// + // Update depth buffer entry. NOTE: we always merge into layer 0, so if the + // triangle should be merged with layer 1, we first swap layer 0 & 1 and then + // merge into layer 0. 
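+		// In short: d0 and d1 measure how close the incoming triangle is to layer 0 and layer 1, and d2
+		// measures how close the two layers are to each other. The triangle is merged into whichever
+		// layer it is nearest; if the two existing layers are nearer to each other than either is to the
+		// triangle, they are merged together and the triangle takes over the freed working layer. This is
+		// the layer-merging distance heuristic described in the "Masked Software Occlusion Culling" paper.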
+ /////////////////////////////////////////////////////////////////////////////// + + // Update mask based on which layer the triangle overwrites or was merged into + __mw inner = _mmw_blendv_ps(simd_cast<__mw>(triMask), simd_cast<__mw>(layerMask1), simd_cast<__mw>(d0min)); + mask = simd_cast<__mwi>(_mmw_blendv_ps(inner, simd_cast<__mw>(layerMask0), simd_cast<__mw>(d1min))); + + // Update the zMin[0] value. There are four outcomes: overwrite with layer 1, + // merge with layer 1, merge with zTri or overwrite with layer 1 and then merge + // with zTri. + __mw e0 = _mmw_blendv_ps(z0, z1, simd_cast<__mw>(d1min)); + __mw e1 = _mmw_blendv_ps(z1, zTri, simd_cast<__mw>(_mmw_or_epi32(d1min, d0min))); + zMin[0] = _mmw_min_ps(e0, e1); + + // Update the zMin[1] value. There are three outcomes: keep current value, + // overwrite with zTri, or overwrite with z1 + __mw z1t = _mmw_blendv_ps(zTri, z1, simd_cast<__mw>(d0min)); + zMin[1] = _mmw_blendv_ps(z1t, z0, simd_cast<__mw>(d1min)); + } + + template + FORCE_INLINE int TraverseScanline(int leftOffset, int rightOffset, int tileIdx, int rightEvent, int leftEvent, const __mwi *events, const __mw &zTriMin, const __mw &zTriMax, const __mw &iz0, float zx) + { + // Floor edge events to integer pixel coordinates (shift out fixed point bits) + int eventOffset = leftOffset << TILE_WIDTH_SHIFT; + __mwi right[NRIGHT], left[NLEFT]; + for (int i = 0; i < NRIGHT; ++i) + right[i] = _mmw_max_epi32(_mmw_sub_epi32(_mmw_srai_epi32(events[rightEvent + i], FP_BITS), _mmw_set1_epi32(eventOffset)), SIMD_BITS_ZERO); + for (int i = 0; i < NLEFT; ++i) + left[i] = _mmw_max_epi32(_mmw_sub_epi32(_mmw_srai_epi32(events[leftEvent - i], FP_BITS), _mmw_set1_epi32(eventOffset)), SIMD_BITS_ZERO); + + __mw z0 = _mmw_add_ps(iz0, _mmw_set1_ps(zx*leftOffset)); + int tileIdxEnd = tileIdx + rightOffset; + tileIdx += leftOffset; + for (;;) + { + if (TEST_Z) + STATS_ADD(mStats.mOccludees.mNumTilesTraversed, 1); + else + STATS_ADD(mStats.mOccluders.mNumTilesTraversed, 1); + + // Perform a coarse test to quickly discard occluded tiles +#if QUICK_MASK != 0 + // Only use the reference layer (layer 0) to cull as it is always conservative + __mw zMinBuf = mMaskedHiZBuffer[tileIdx].mZMin[0]; +#else + // Compute zMin for the overlapped layers + __mwi mask = mMaskedHiZBuffer[tileIdx].mMask; + __mw zMin0 = _mmw_blendv_ps(mMaskedHiZBuffer[tileIdx].mZMin[0], mMaskedHiZBuffer[tileIdx].mZMin[1], simd_cast<__mw>(_mmw_cmpeq_epi32(mask, _mmw_set1_epi32(~0)))); + __mw zMin1 = _mmw_blendv_ps(mMaskedHiZBuffer[tileIdx].mZMin[1], mMaskedHiZBuffer[tileIdx].mZMin[0], simd_cast<__mw>(_mmw_cmpeq_epi32(mask, _mmw_setzero_epi32()))); + __mw zMinBuf = _mmw_min_ps(zMin0, zMin1); +#endif + __mw dist0 = _mmw_sub_ps(zTriMax, zMinBuf); + if (_mmw_movemask_ps(dist0) != SIMD_ALL_LANES_MASK) + { + // Compute coverage mask for entire 32xN using shift operations + __mwi accumulatedMask = _mmw_sllv_ones(left[0]); + for (int i = 1; i < NLEFT; ++i) + accumulatedMask = _mmw_and_epi32(accumulatedMask, _mmw_sllv_ones(left[i])); + for (int i = 0; i < NRIGHT; ++i) + accumulatedMask = _mmw_andnot_epi32(_mmw_sllv_ones(right[i]), accumulatedMask); + + if (TEST_Z) + { + // Perform a conservative visibility test (test zMax against buffer for each covered 8x4 subtile) + __mw zSubTileMax = _mmw_min_ps(z0, zTriMax); + __mwi zPass = simd_cast<__mwi>(_mmw_cmpge_ps(zSubTileMax, zMinBuf)); + + __mwi rastMask = _mmw_transpose_epi8(accumulatedMask); + __mwi deadLane = _mmw_cmpeq_epi32(rastMask, SIMD_BITS_ZERO); + zPass = _mmw_andnot_epi32(deadLane, zPass); 
+ + if (!_mmw_testz_epi32(zPass, zPass)) + return CullingResult::VISIBLE; + } + else + { + // Compute interpolated min for each 8x4 subtile and update the masked hierarchical z buffer entry + __mw zSubTileMin = _mmw_max_ps(z0, zTriMin); +#if QUICK_MASK != 0 + UpdateTileQuick(tileIdx, _mmw_transpose_epi8(accumulatedMask), zSubTileMin); +#else + UpdateTileAccurate(tileIdx, _mmw_transpose_epi8(accumulatedMask), zSubTileMin); +#endif + } + } + + // Update buffer address, interpolate z and edge events + tileIdx++; + if (tileIdx >= tileIdxEnd) + break; + z0 = _mmw_add_ps(z0, _mmw_set1_ps(zx)); + for (int i = 0; i < NRIGHT; ++i) + right[i] = _mmw_subs_epu16(right[i], SIMD_TILE_WIDTH); // Trick, use sub saturated to avoid checking against < 0 for shift (values should fit in 16 bits) + for (int i = 0; i < NLEFT; ++i) + left[i] = _mmw_subs_epu16(left[i], SIMD_TILE_WIDTH); + } + + return TEST_Z ? CullingResult::OCCLUDED : CullingResult::VISIBLE; + } + + + template +#if PRECISE_COVERAGE != 0 + FORCE_INLINE int RasterizeTriangle(unsigned int triIdx, int bbWidth, int tileRowIdx, int tileMidRowIdx, int tileEndRowIdx, const __mwi *eventStart, const __mw *slope, const __mwi *slopeTileDelta, const __mw &zTriMin, const __mw &zTriMax, __mw &z0, float zx, float zy, const __mwi *edgeY, const __mwi *absEdgeX, const __mwi *slopeSign, const __mwi *eventStartRemainder, const __mwi *slopeTileRemainder) +#else + FORCE_INLINE int RasterizeTriangle(unsigned int triIdx, int bbWidth, int tileRowIdx, int tileMidRowIdx, int tileEndRowIdx, const __mwi *eventStart, const __mwi *slope, const __mwi *slopeTileDelta, const __mw &zTriMin, const __mw &zTriMax, __mw &z0, float zx, float zy) +#endif + { + if (TEST_Z) + STATS_ADD(mStats.mOccludees.mNumRasterizedTriangles, 1); + else + STATS_ADD(mStats.mOccluders.mNumRasterizedTriangles, 1); + + int cullResult; + +#if PRECISE_COVERAGE != 0 + #define LEFT_EDGE_BIAS -1 + #define RIGHT_EDGE_BIAS 1 + #define UPDATE_TILE_EVENTS_Y(i) \ + triEventRemainder[i] = _mmw_sub_epi32(triEventRemainder[i], triSlopeTileRemainder[i]); \ + __mwi overflow##i = _mmw_srai_epi32(triEventRemainder[i], 31); \ + triEventRemainder[i] = _mmw_add_epi32(triEventRemainder[i], _mmw_and_epi32(overflow##i, triEdgeY[i])); \ + triEvent[i] = _mmw_add_epi32(triEvent[i], _mmw_add_epi32(triSlopeTileDelta[i], _mmw_and_epi32(overflow##i, triSlopeSign[i]))) + + __mwi triEvent[3], triSlopeSign[3], triSlopeTileDelta[3], triEdgeY[3], triSlopeTileRemainder[3], triEventRemainder[3]; + for (int i = 0; i < 3; ++i) + { + triSlopeSign[i] = _mmw_set1_epi32(simd_i32(slopeSign[i])[triIdx]); + triSlopeTileDelta[i] = _mmw_set1_epi32(simd_i32(slopeTileDelta[i])[triIdx]); + triEdgeY[i] = _mmw_set1_epi32(simd_i32(edgeY[i])[triIdx]); + triSlopeTileRemainder[i] = _mmw_set1_epi32(simd_i32(slopeTileRemainder[i])[triIdx]); + + __mw triSlope = _mmw_set1_ps(simd_f32(slope[i])[triIdx]); + __mwi triAbsEdgeX = _mmw_set1_epi32(simd_i32(absEdgeX[i])[triIdx]); + __mwi triStartRemainder = _mmw_set1_epi32(simd_i32(eventStartRemainder[i])[triIdx]); + __mwi triEventStart = _mmw_set1_epi32(simd_i32(eventStart[i])[triIdx]); + + __mwi scanlineDelta = _mmw_cvttps_epi32(_mmw_mul_ps(triSlope, SIMD_LANE_YCOORD_F)); + __mwi scanlineSlopeRemainder = _mmw_sub_epi32(_mmw_mullo_epi32(triAbsEdgeX, SIMD_LANE_YCOORD_I), _mmw_mullo_epi32(_mmw_abs_epi32(scanlineDelta), triEdgeY[i])); + + triEventRemainder[i] = _mmw_sub_epi32(triStartRemainder, scanlineSlopeRemainder); + __mwi overflow = _mmw_srai_epi32(triEventRemainder[i], 31); + triEventRemainder[i] = 
_mmw_add_epi32(triEventRemainder[i], _mmw_and_epi32(overflow, triEdgeY[i])); + triEvent[i] = _mmw_add_epi32(_mmw_add_epi32(triEventStart, scanlineDelta), _mmw_and_epi32(overflow, triSlopeSign[i])); + } + +#else + #define LEFT_EDGE_BIAS 0 + #define RIGHT_EDGE_BIAS 0 + #define UPDATE_TILE_EVENTS_Y(i) triEvent[i] = _mmw_add_epi32(triEvent[i], triSlopeTileDelta[i]); + + // Get deltas used to increment edge events each time we traverse one scanline of tiles + __mwi triSlopeTileDelta[3]; + triSlopeTileDelta[0] = _mmw_set1_epi32(simd_i32(slopeTileDelta[0])[triIdx]); + triSlopeTileDelta[1] = _mmw_set1_epi32(simd_i32(slopeTileDelta[1])[triIdx]); + triSlopeTileDelta[2] = _mmw_set1_epi32(simd_i32(slopeTileDelta[2])[triIdx]); + + // Setup edge events for first batch of SIMD_LANES scanlines + __mwi triEvent[3]; + triEvent[0] = _mmw_add_epi32(_mmw_set1_epi32(simd_i32(eventStart[0])[triIdx]), _mmw_mullo_epi32(SIMD_LANE_IDX, _mmw_set1_epi32(simd_i32(slope[0])[triIdx]))); + triEvent[1] = _mmw_add_epi32(_mmw_set1_epi32(simd_i32(eventStart[1])[triIdx]), _mmw_mullo_epi32(SIMD_LANE_IDX, _mmw_set1_epi32(simd_i32(slope[1])[triIdx]))); + triEvent[2] = _mmw_add_epi32(_mmw_set1_epi32(simd_i32(eventStart[2])[triIdx]), _mmw_mullo_epi32(SIMD_LANE_IDX, _mmw_set1_epi32(simd_i32(slope[2])[triIdx]))); +#endif + + // For big triangles track start & end tile for each scanline and only traverse the valid region + int startDelta, endDelta, topDelta, startEvent, endEvent, topEvent; + if (TIGHT_TRAVERSAL) + { + startDelta = simd_i32(slopeTileDelta[2])[triIdx] + LEFT_EDGE_BIAS; + endDelta = simd_i32(slopeTileDelta[0])[triIdx] + RIGHT_EDGE_BIAS; + topDelta = simd_i32(slopeTileDelta[1])[triIdx] + (MID_VTX_RIGHT ? RIGHT_EDGE_BIAS : LEFT_EDGE_BIAS); + + // Compute conservative bounds for the edge events over a 32xN tile + startEvent = simd_i32(eventStart[2])[triIdx] + min(0, startDelta); + endEvent = simd_i32(eventStart[0])[triIdx] + max(0, endDelta) + (TILE_WIDTH << FP_BITS); + if (MID_VTX_RIGHT) + topEvent = simd_i32(eventStart[1])[triIdx] + max(0, topDelta) + (TILE_WIDTH << FP_BITS); + else + topEvent = simd_i32(eventStart[1])[triIdx] + min(0, topDelta); + } + + if (tileRowIdx <= tileMidRowIdx) + { + int tileStopIdx = min(tileEndRowIdx, tileMidRowIdx); + // Traverse the bottom half of the triangle + while (tileRowIdx < tileStopIdx) + { + int start = 0, end = bbWidth; + if (TIGHT_TRAVERSAL) + { + // Compute tighter start and endpoints to avoid traversing empty space + start = max(0, min(bbWidth - 1, startEvent >> (TILE_WIDTH_SHIFT + FP_BITS))); + end = min(bbWidth, ((int)endEvent >> (TILE_WIDTH_SHIFT + FP_BITS))); + startEvent += startDelta; + endEvent += endDelta; + } + + // Traverse the scanline and update the masked hierarchical z buffer + cullResult = TraverseScanline(start, end, tileRowIdx, 0, 2, triEvent, zTriMin, zTriMax, z0, zx); + + if (TEST_Z && cullResult == CullingResult::VISIBLE) // Early out if performing occlusion query + return CullingResult::VISIBLE; + + // move to the next scanline of tiles, update edge events and interpolate z + tileRowIdx += mTilesWidth; + z0 = _mmw_add_ps(z0, _mmw_set1_ps(zy)); + UPDATE_TILE_EVENTS_Y(0); + UPDATE_TILE_EVENTS_Y(2); + } + + // Traverse the middle scanline of tiles. 
We must consider all three edges only in this region + if (tileRowIdx < tileEndRowIdx) + { + int start = 0, end = bbWidth; + if (TIGHT_TRAVERSAL) + { + // Compute tighter start and endpoints to avoid traversing lots of empty space + start = max(0, min(bbWidth - 1, startEvent >> (TILE_WIDTH_SHIFT + FP_BITS))); + end = min(bbWidth, ((int)endEvent >> (TILE_WIDTH_SHIFT + FP_BITS))); + + // Switch the traversal start / end to account for the upper side edge + endEvent = MID_VTX_RIGHT ? topEvent : endEvent; + endDelta = MID_VTX_RIGHT ? topDelta : endDelta; + startEvent = MID_VTX_RIGHT ? startEvent : topEvent; + startDelta = MID_VTX_RIGHT ? startDelta : topDelta; + startEvent += startDelta; + endEvent += endDelta; + } + + // Traverse the scanline and update the masked hierarchical z buffer. + if (MID_VTX_RIGHT) + cullResult = TraverseScanline(start, end, tileRowIdx, 0, 2, triEvent, zTriMin, zTriMax, z0, zx); + else + cullResult = TraverseScanline(start, end, tileRowIdx, 0, 2, triEvent, zTriMin, zTriMax, z0, zx); + + if (TEST_Z && cullResult == CullingResult::VISIBLE) // Early out if performing occlusion query + return CullingResult::VISIBLE; + + tileRowIdx += mTilesWidth; + } + + // Traverse the top half of the triangle + if (tileRowIdx < tileEndRowIdx) + { + // move to the next scanline of tiles, update edge events and interpolate z + z0 = _mmw_add_ps(z0, _mmw_set1_ps(zy)); + int i0 = MID_VTX_RIGHT + 0; + int i1 = MID_VTX_RIGHT + 1; + UPDATE_TILE_EVENTS_Y(i0); + UPDATE_TILE_EVENTS_Y(i1); + for (;;) + { + int start = 0, end = bbWidth; + if (TIGHT_TRAVERSAL) + { + // Compute tighter start and endpoints to avoid traversing lots of empty space + start = max(0, min(bbWidth - 1, startEvent >> (TILE_WIDTH_SHIFT + FP_BITS))); + end = min(bbWidth, ((int)endEvent >> (TILE_WIDTH_SHIFT + FP_BITS))); + startEvent += startDelta; + endEvent += endDelta; + } + + // Traverse the scanline and update the masked hierarchical z buffer + cullResult = TraverseScanline(start, end, tileRowIdx, MID_VTX_RIGHT + 0, MID_VTX_RIGHT + 1, triEvent, zTriMin, zTriMax, z0, zx); + + if (TEST_Z && cullResult == CullingResult::VISIBLE) // Early out if performing occlusion query + return CullingResult::VISIBLE; + + // move to the next scanline of tiles, update edge events and interpolate z + tileRowIdx += mTilesWidth; + if (tileRowIdx >= tileEndRowIdx) + break; + z0 = _mmw_add_ps(z0, _mmw_set1_ps(zy)); + UPDATE_TILE_EVENTS_Y(i0); + UPDATE_TILE_EVENTS_Y(i1); + } + } + } + else + { + if (TIGHT_TRAVERSAL) + { + // For large triangles, switch the traversal start / end to account for the upper side edge + endEvent = MID_VTX_RIGHT ? topEvent : endEvent; + endDelta = MID_VTX_RIGHT ? topDelta : endDelta; + startEvent = MID_VTX_RIGHT ? startEvent : topEvent; + startDelta = MID_VTX_RIGHT ? 
startDelta : topDelta; + } + + // Traverse the top half of the triangle + if (tileRowIdx < tileEndRowIdx) + { + int i0 = MID_VTX_RIGHT + 0; + int i1 = MID_VTX_RIGHT + 1; + for (;;) + { + int start = 0, end = bbWidth; + if (TIGHT_TRAVERSAL) + { + // Compute tighter start and endpoints to avoid traversing lots of empty space + start = max(0, min(bbWidth - 1, startEvent >> (TILE_WIDTH_SHIFT + FP_BITS))); + end = min(bbWidth, ((int)endEvent >> (TILE_WIDTH_SHIFT + FP_BITS))); + startEvent += startDelta; + endEvent += endDelta; + } + + // Traverse the scanline and update the masked hierarchical z buffer + cullResult = TraverseScanline(start, end, tileRowIdx, MID_VTX_RIGHT + 0, MID_VTX_RIGHT + 1, triEvent, zTriMin, zTriMax, z0, zx); + + if (TEST_Z && cullResult == CullingResult::VISIBLE) // Early out if performing occlusion query + return CullingResult::VISIBLE; + + // move to the next scanline of tiles, update edge events and interpolate z + tileRowIdx += mTilesWidth; + if (tileRowIdx >= tileEndRowIdx) + break; + z0 = _mmw_add_ps(z0, _mmw_set1_ps(zy)); + UPDATE_TILE_EVENTS_Y(i0); + UPDATE_TILE_EVENTS_Y(i1); + } + } + } + + return TEST_Z ? CullingResult::OCCLUDED : CullingResult::VISIBLE; + } + + template +#if PRECISE_COVERAGE != 0 + FORCE_INLINE int RasterizeTriangleBatch(__mwi ipVtxX[3], __mwi ipVtxY[3], __mw pVtxX[3], __mw pVtxY[3], __mw pVtxZ[3], unsigned int triMask, const ScissorRect *scissor) +#else + FORCE_INLINE int RasterizeTriangleBatch(__mw pVtxX[3], __mw pVtxY[3], __mw pVtxZ[3], unsigned int triMask, const ScissorRect *scissor) +#endif + { + int cullResult = CullingResult::VIEW_CULLED; + + ////////////////////////////////////////////////////////////////////////////// + // Compute bounding box and clamp to tile coordinates + ////////////////////////////////////////////////////////////////////////////// + + __mwi bbPixelMinX, bbPixelMinY, bbPixelMaxX, bbPixelMaxY; + ComputeBoundingBox(bbPixelMinX, bbPixelMinY, bbPixelMaxX, bbPixelMaxY, pVtxX, pVtxY, scissor); + + // Clamp bounding box to tiles (it's already padded in computeBoundingBox) + __mwi bbTileMinX = _mmw_srai_epi32(bbPixelMinX, TILE_WIDTH_SHIFT); + __mwi bbTileMinY = _mmw_srai_epi32(bbPixelMinY, TILE_HEIGHT_SHIFT); + __mwi bbTileMaxX = _mmw_srai_epi32(bbPixelMaxX, TILE_WIDTH_SHIFT); + __mwi bbTileMaxY = _mmw_srai_epi32(bbPixelMaxY, TILE_HEIGHT_SHIFT); + __mwi bbTileSizeX = _mmw_sub_epi32(bbTileMaxX, bbTileMinX); + __mwi bbTileSizeY = _mmw_sub_epi32(bbTileMaxY, bbTileMinY); + + // Cull triangles with zero bounding box + __mwi bboxSign = _mmw_or_epi32(_mmw_sub_epi32(bbTileSizeX, _mmw_set1_epi32(1)), _mmw_sub_epi32(bbTileSizeY, _mmw_set1_epi32(1))); + triMask &= ~_mmw_movemask_ps(simd_cast<__mw>(bboxSign)) & SIMD_ALL_LANES_MASK; + if (triMask == 0x0) + return cullResult; + + if (!TEST_Z) + cullResult = CullingResult::VISIBLE; + + ////////////////////////////////////////////////////////////////////////////// + // Set up screen space depth plane + ////////////////////////////////////////////////////////////////////////////// + + __mw zPixelDx, zPixelDy; + ComputeDepthPlane(pVtxX, pVtxY, pVtxZ, zPixelDx, zPixelDy); + + // Compute z value at min corner of bounding box. 
Offset to make sure z is conservative for all 8x4 subtiles + __mw bbMinXV0 = _mmw_sub_ps(_mmw_cvtepi32_ps(bbPixelMinX), pVtxX[0]); + __mw bbMinYV0 = _mmw_sub_ps(_mmw_cvtepi32_ps(bbPixelMinY), pVtxY[0]); + __mw zPlaneOffset = _mmw_fmadd_ps(zPixelDx, bbMinXV0, _mmw_fmadd_ps(zPixelDy, bbMinYV0, pVtxZ[0])); + __mw zTileDx = _mmw_mul_ps(zPixelDx, _mmw_set1_ps((float)TILE_WIDTH)); + __mw zTileDy = _mmw_mul_ps(zPixelDy, _mmw_set1_ps((float)TILE_HEIGHT)); + if (TEST_Z) + { + zPlaneOffset = _mmw_add_ps(zPlaneOffset, _mmw_max_ps(_mmw_setzero_ps(), _mmw_mul_ps(zPixelDx, _mmw_set1_ps(SUB_TILE_WIDTH)))); + zPlaneOffset = _mmw_add_ps(zPlaneOffset, _mmw_max_ps(_mmw_setzero_ps(), _mmw_mul_ps(zPixelDy, _mmw_set1_ps(SUB_TILE_HEIGHT)))); + } + else + { + zPlaneOffset = _mmw_add_ps(zPlaneOffset, _mmw_min_ps(_mmw_setzero_ps(), _mmw_mul_ps(zPixelDx, _mmw_set1_ps(SUB_TILE_WIDTH)))); + zPlaneOffset = _mmw_add_ps(zPlaneOffset, _mmw_min_ps(_mmw_setzero_ps(), _mmw_mul_ps(zPixelDy, _mmw_set1_ps(SUB_TILE_HEIGHT)))); + } + + // Compute Zmin and Zmax for the triangle (used to narrow the range for difficult tiles) + __mw zMin = _mmw_min_ps(pVtxZ[0], _mmw_min_ps(pVtxZ[1], pVtxZ[2])); + __mw zMax = _mmw_max_ps(pVtxZ[0], _mmw_max_ps(pVtxZ[1], pVtxZ[2])); + + ////////////////////////////////////////////////////////////////////////////// + // Sort vertices (v0 has lowest Y, and the rest is in winding order) and + // compute edges. Also find the middle vertex and compute tile + ////////////////////////////////////////////////////////////////////////////// + +#if PRECISE_COVERAGE != 0 + + // Rotate the triangle in the winding order until v0 is the vertex with lowest Y value + SortVertices(ipVtxX, ipVtxY); + + // Compute edges + __mwi edgeX[3] = { _mmw_sub_epi32(ipVtxX[1], ipVtxX[0]), _mmw_sub_epi32(ipVtxX[2], ipVtxX[1]), _mmw_sub_epi32(ipVtxX[2], ipVtxX[0]) }; + __mwi edgeY[3] = { _mmw_sub_epi32(ipVtxY[1], ipVtxY[0]), _mmw_sub_epi32(ipVtxY[2], ipVtxY[1]), _mmw_sub_epi32(ipVtxY[2], ipVtxY[0]) }; + + // Classify if the middle vertex is on the left or right and compute its position + int midVtxRight = ~_mmw_movemask_ps(simd_cast<__mw>(edgeY[1])); + __mwi midPixelX = _mmw_blendv_epi32(ipVtxX[1], ipVtxX[2], edgeY[1]); + __mwi midPixelY = _mmw_blendv_epi32(ipVtxY[1], ipVtxY[2], edgeY[1]); + __mwi midTileY = _mmw_srai_epi32(_mmw_max_epi32(midPixelY, SIMD_BITS_ZERO), TILE_HEIGHT_SHIFT + FP_BITS); + __mwi bbMidTileY = _mmw_max_epi32(bbTileMinY, _mmw_min_epi32(bbTileMaxY, midTileY)); + + // Compute edge events for the bottom of the bounding box, or for the middle tile in case of + // the edge originating from the middle vertex. + __mwi xDiffi[2], yDiffi[2]; + xDiffi[0] = _mmw_sub_epi32(ipVtxX[0], _mmw_slli_epi32(bbPixelMinX, FP_BITS)); + xDiffi[1] = _mmw_sub_epi32(midPixelX, _mmw_slli_epi32(bbPixelMinX, FP_BITS)); + yDiffi[0] = _mmw_sub_epi32(ipVtxY[0], _mmw_slli_epi32(bbPixelMinY, FP_BITS)); + yDiffi[1] = _mmw_sub_epi32(midPixelY, _mmw_slli_epi32(bbMidTileY, FP_BITS + TILE_HEIGHT_SHIFT)); + + ////////////////////////////////////////////////////////////////////////////// + // Edge slope setup - Note we do not conform to DX/GL rasterization rules + ////////////////////////////////////////////////////////////////////////////// + + // Potentially flip edge to ensure that all edges have positive Y slope. 
+ edgeX[1] = _mmw_blendv_epi32(edgeX[1], _mmw_neg_epi32(edgeX[1]), edgeY[1]); + edgeY[1] = _mmw_abs_epi32(edgeY[1]); + + // Compute floating point slopes + __mw slope[3]; + slope[0] = _mmw_div_ps(_mmw_cvtepi32_ps(edgeX[0]), _mmw_cvtepi32_ps(edgeY[0])); + slope[1] = _mmw_div_ps(_mmw_cvtepi32_ps(edgeX[1]), _mmw_cvtepi32_ps(edgeY[1])); + slope[2] = _mmw_div_ps(_mmw_cvtepi32_ps(edgeX[2]), _mmw_cvtepi32_ps(edgeY[2])); + + // Modify slope of horizontal edges to make sure they mask out pixels above/below the edge. The slope is set to screen + // width to mask out all pixels above or below the horizontal edge. We must also add a small bias to acount for that + // vertices may end up off screen due to clipping. We're assuming that the round off error is no bigger than 1.0 + __mw horizontalSlopeDelta = _mmw_set1_ps(2.0f * ((float)mWidth + 2.0f*(GUARD_BAND_PIXEL_SIZE + 1.0f))); + __mwi horizontalSlope0 = _mmw_cmpeq_epi32(edgeY[0], _mmw_setzero_epi32()); + __mwi horizontalSlope1 = _mmw_cmpeq_epi32(edgeY[1], _mmw_setzero_epi32()); + slope[0] = _mmw_blendv_ps(slope[0], horizontalSlopeDelta, simd_cast<__mw>(horizontalSlope0)); + slope[1] = _mmw_blendv_ps(slope[1], _mmw_neg_ps(horizontalSlopeDelta), simd_cast<__mw>(horizontalSlope1)); + + __mwi vy[3] = { yDiffi[0], yDiffi[1], yDiffi[0] }; + __mwi offset0 = _mmw_and_epi32(_mmw_add_epi32(yDiffi[0], _mmw_set1_epi32(FP_HALF_PIXEL - 1)), _mmw_set1_epi32((int)((~0u) << FP_BITS))); + __mwi offset1 = _mmw_and_epi32(_mmw_add_epi32(yDiffi[1], _mmw_set1_epi32(FP_HALF_PIXEL - 1)), _mmw_set1_epi32((int)((~0u) << FP_BITS))); + vy[0] = _mmw_blendv_epi32(yDiffi[0], offset0, horizontalSlope0); + vy[1] = _mmw_blendv_epi32(yDiffi[1], offset1, horizontalSlope1); + + // Compute edge events for the bottom of the bounding box, or for the middle tile in case of + // the edge originating from the middle vertex. + __mwi slopeSign[3], absEdgeX[3]; + __mwi slopeTileDelta[3], eventStartRemainder[3], slopeTileRemainder[3], eventStart[3]; + for (int i = 0; i < 3; i++) + { + // Common, compute slope sign (used to propagate the remainder term when overflowing) is postive or negative x-direction + slopeSign[i] = _mmw_blendv_epi32(_mmw_set1_epi32(1), _mmw_set1_epi32(-1), edgeX[i]); + absEdgeX[i] = _mmw_abs_epi32(edgeX[i]); + + // Delta and error term for one vertical tile step. The exact delta is exactDelta = edgeX / edgeY, due to limited precision we + // repersent the delta as delta = qoutient + remainder / edgeY, where quotient = int(edgeX / edgeY). In this case, since we step + // one tile of scanlines at a time, the slope is computed for a tile-sized step. + slopeTileDelta[i] = _mmw_cvttps_epi32(_mmw_mul_ps(slope[i], _mmw_set1_ps(FP_TILE_HEIGHT))); + slopeTileRemainder[i] = _mmw_sub_epi32(_mmw_slli_epi32(absEdgeX[i], FP_TILE_HEIGHT_SHIFT), _mmw_mullo_epi32(_mmw_abs_epi32(slopeTileDelta[i]), edgeY[i])); + + // Jump to bottom scanline of tile row, this is the bottom of the bounding box, or the middle vertex of the triangle. + // The jump can be in both positive and negative y-direction due to clipping / offscreen vertices. 
+ __mwi tileStartDir = _mmw_blendv_epi32(slopeSign[i], _mmw_neg_epi32(slopeSign[i]), vy[i]); + __mwi tieBreaker = _mmw_blendv_epi32(_mmw_set1_epi32(0), _mmw_set1_epi32(1), tileStartDir); + __mwi tileStartSlope = _mmw_cvttps_epi32(_mmw_mul_ps(slope[i], _mmw_cvtepi32_ps(_mmw_neg_epi32(vy[i])))); + __mwi tileStartRemainder = _mmw_sub_epi32(_mmw_mullo_epi32(absEdgeX[i], _mmw_abs_epi32(vy[i])), _mmw_mullo_epi32(_mmw_abs_epi32(tileStartSlope), edgeY[i])); + + eventStartRemainder[i] = _mmw_sub_epi32(tileStartRemainder, tieBreaker); + __mwi overflow = _mmw_srai_epi32(eventStartRemainder[i], 31); + eventStartRemainder[i] = _mmw_add_epi32(eventStartRemainder[i], _mmw_and_epi32(overflow, edgeY[i])); + eventStartRemainder[i] = _mmw_blendv_epi32(eventStartRemainder[i], _mmw_sub_epi32(_mmw_sub_epi32(edgeY[i], eventStartRemainder[i]), _mmw_set1_epi32(1)), vy[i]); + + //eventStart[i] = xDiffi[i & 1] + tileStartSlope + (overflow & tileStartDir) + _mmw_set1_epi32(FP_HALF_PIXEL - 1) + tieBreaker; + eventStart[i] = _mmw_add_epi32(_mmw_add_epi32(xDiffi[i & 1], tileStartSlope), _mmw_and_epi32(overflow, tileStartDir)); + eventStart[i] = _mmw_add_epi32(_mmw_add_epi32(eventStart[i], _mmw_set1_epi32(FP_HALF_PIXEL - 1)), tieBreaker); + } + +#else // PRECISE_COVERAGE + + SortVertices(pVtxX, pVtxY); + + // Compute edges + __mw edgeX[3] = { _mmw_sub_ps(pVtxX[1], pVtxX[0]), _mmw_sub_ps(pVtxX[2], pVtxX[1]), _mmw_sub_ps(pVtxX[2], pVtxX[0]) }; + __mw edgeY[3] = { _mmw_sub_ps(pVtxY[1], pVtxY[0]), _mmw_sub_ps(pVtxY[2], pVtxY[1]), _mmw_sub_ps(pVtxY[2], pVtxY[0]) }; + + // Classify if the middle vertex is on the left or right and compute its position + int midVtxRight = ~_mmw_movemask_ps(edgeY[1]); + __mw midPixelX = _mmw_blendv_ps(pVtxX[1], pVtxX[2], edgeY[1]); + __mw midPixelY = _mmw_blendv_ps(pVtxY[1], pVtxY[2], edgeY[1]); + __mwi midTileY = _mmw_srai_epi32(_mmw_max_epi32(_mmw_cvttps_epi32(midPixelY), SIMD_BITS_ZERO), TILE_HEIGHT_SHIFT); + __mwi bbMidTileY = _mmw_max_epi32(bbTileMinY, _mmw_min_epi32(bbTileMaxY, midTileY)); + + ////////////////////////////////////////////////////////////////////////////// + // Edge slope setup - Note we do not conform to DX/GL rasterization rules + ////////////////////////////////////////////////////////////////////////////// + + // Compute floating point slopes + __mw slope[3]; + slope[0] = _mmw_div_ps(edgeX[0], edgeY[0]); + slope[1] = _mmw_div_ps(edgeX[1], edgeY[1]); + slope[2] = _mmw_div_ps(edgeX[2], edgeY[2]); + + // Modify slope of horizontal edges to make sure they mask out pixels above/below the edge. The slope is set to screen + // width to mask out all pixels above or below the horizontal edge. We must also add a small bias to acount for that + // vertices may end up off screen due to clipping. We're assuming that the round off error is no bigger than 1.0 + __mw horizontalSlopeDelta = _mmw_set1_ps((float)mWidth + 2.0f*(GUARD_BAND_PIXEL_SIZE + 1.0f)); + slope[0] = _mmw_blendv_ps(slope[0], horizontalSlopeDelta, _mmw_cmpeq_ps(edgeY[0], _mmw_setzero_ps())); + slope[1] = _mmw_blendv_ps(slope[1], _mmw_neg_ps(horizontalSlopeDelta), _mmw_cmpeq_ps(edgeY[1], _mmw_setzero_ps())); + + // Convert floaing point slopes to fixed point + __mwi slopeFP[3]; + slopeFP[0] = _mmw_cvttps_epi32(_mmw_mul_ps(slope[0], _mmw_set1_ps(1 << FP_BITS))); + slopeFP[1] = _mmw_cvttps_epi32(_mmw_mul_ps(slope[1], _mmw_set1_ps(1 << FP_BITS))); + slopeFP[2] = _mmw_cvttps_epi32(_mmw_mul_ps(slope[2], _mmw_set1_ps(1 << FP_BITS))); + + // Fan out edge slopes to avoid (rare) cracks at vertices. 
We increase right facing slopes + // by 1 LSB, which results in overshooting vertices slightly, increasing triangle coverage. + // e0 is always right facing, e1 depends on if the middle vertex is on the left or right + slopeFP[0] = _mmw_add_epi32(slopeFP[0], _mmw_set1_epi32(1)); + slopeFP[1] = _mmw_add_epi32(slopeFP[1], _mmw_srli_epi32(_mmw_not_epi32(simd_cast<__mwi>(edgeY[1])), 31)); + + // Compute slope deltas for an SIMD_LANES scanline step (tile height) + __mwi slopeTileDelta[3]; + slopeTileDelta[0] = _mmw_slli_epi32(slopeFP[0], TILE_HEIGHT_SHIFT); + slopeTileDelta[1] = _mmw_slli_epi32(slopeFP[1], TILE_HEIGHT_SHIFT); + slopeTileDelta[2] = _mmw_slli_epi32(slopeFP[2], TILE_HEIGHT_SHIFT); + + // Compute edge events for the bottom of the bounding box, or for the middle tile in case of + // the edge originating from the middle vertex. + __mwi xDiffi[2], yDiffi[2]; + xDiffi[0] = _mmw_slli_epi32(_mmw_sub_epi32(_mmw_cvttps_epi32(pVtxX[0]), bbPixelMinX), FP_BITS); + xDiffi[1] = _mmw_slli_epi32(_mmw_sub_epi32(_mmw_cvttps_epi32(midPixelX), bbPixelMinX), FP_BITS); + yDiffi[0] = _mmw_sub_epi32(_mmw_cvttps_epi32(pVtxY[0]), bbPixelMinY); + yDiffi[1] = _mmw_sub_epi32(_mmw_cvttps_epi32(midPixelY), _mmw_slli_epi32(bbMidTileY, TILE_HEIGHT_SHIFT)); + + __mwi eventStart[3]; + eventStart[0] = _mmw_sub_epi32(xDiffi[0], _mmw_mullo_epi32(slopeFP[0], yDiffi[0])); + eventStart[1] = _mmw_sub_epi32(xDiffi[1], _mmw_mullo_epi32(slopeFP[1], yDiffi[1])); + eventStart[2] = _mmw_sub_epi32(xDiffi[0], _mmw_mullo_epi32(slopeFP[2], yDiffi[0])); +#endif + + ////////////////////////////////////////////////////////////////////////////// + // Split bounding box into bottom - middle - top region. + ////////////////////////////////////////////////////////////////////////////// + + __mwi bbBottomIdx = _mmw_add_epi32(bbTileMinX, _mmw_mullo_epi32(bbTileMinY, _mmw_set1_epi32(mTilesWidth))); + __mwi bbTopIdx = _mmw_add_epi32(bbTileMinX, _mmw_mullo_epi32(_mmw_add_epi32(bbTileMinY, bbTileSizeY), _mmw_set1_epi32(mTilesWidth))); + __mwi bbMidIdx = _mmw_add_epi32(bbTileMinX, _mmw_mullo_epi32(midTileY, _mmw_set1_epi32(mTilesWidth))); + + ////////////////////////////////////////////////////////////////////////////// + // Loop over non-culled triangle and change SIMD axis to per-pixel + ////////////////////////////////////////////////////////////////////////////// + while (triMask) + { + unsigned int triIdx = find_clear_lsb(&triMask); + int triMidVtxRight = (midVtxRight >> triIdx) & 1; + + // Get Triangle Zmin zMax + __mw zTriMax = _mmw_set1_ps(simd_f32(zMax)[triIdx]); + __mw zTriMin = _mmw_set1_ps(simd_f32(zMin)[triIdx]); + + // Setup Zmin value for first set of 8x4 subtiles + __mw z0 = _mmw_fmadd_ps(_mmw_set1_ps(simd_f32(zPixelDx)[triIdx]), SIMD_SUB_TILE_COL_OFFSET_F, + _mmw_fmadd_ps(_mmw_set1_ps(simd_f32(zPixelDy)[triIdx]), SIMD_SUB_TILE_ROW_OFFSET_F, _mmw_set1_ps(simd_f32(zPlaneOffset)[triIdx]))); + float zx = simd_f32(zTileDx)[triIdx]; + float zy = simd_f32(zTileDy)[triIdx]; + + // Get dimension of bounding box bottom, mid & top segments + int bbWidth = simd_i32(bbTileSizeX)[triIdx]; + int bbHeight = simd_i32(bbTileSizeY)[triIdx]; + int tileRowIdx = simd_i32(bbBottomIdx)[triIdx]; + int tileMidRowIdx = simd_i32(bbMidIdx)[triIdx]; + int tileEndRowIdx = simd_i32(bbTopIdx)[triIdx]; + + if (bbWidth > BIG_TRIANGLE && bbHeight > BIG_TRIANGLE) // For big triangles we use a more expensive but tighter traversal algorithm + { +#if PRECISE_COVERAGE != 0 + if (triMidVtxRight) + cullResult &= RasterizeTriangle(triIdx, bbWidth, tileRowIdx, 
tileMidRowIdx, tileEndRowIdx, eventStart, slope, slopeTileDelta, zTriMin, zTriMax, z0, zx, zy, edgeY, absEdgeX, slopeSign, eventStartRemainder, slopeTileRemainder); + else + cullResult &= RasterizeTriangle(triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slope, slopeTileDelta, zTriMin, zTriMax, z0, zx, zy, edgeY, absEdgeX, slopeSign, eventStartRemainder, slopeTileRemainder); +#else + if (triMidVtxRight) + cullResult &= RasterizeTriangle(triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slopeFP, slopeTileDelta, zTriMin, zTriMax, z0, zx, zy); + else + cullResult &= RasterizeTriangle(triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slopeFP, slopeTileDelta, zTriMin, zTriMax, z0, zx, zy); +#endif + } + else + { +#if PRECISE_COVERAGE != 0 + if (triMidVtxRight) + cullResult &= RasterizeTriangle(triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slope, slopeTileDelta, zTriMin, zTriMax, z0, zx, zy, edgeY, absEdgeX, slopeSign, eventStartRemainder, slopeTileRemainder); + else + cullResult &= RasterizeTriangle(triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slope, slopeTileDelta, zTriMin, zTriMax, z0, zx, zy, edgeY, absEdgeX, slopeSign, eventStartRemainder, slopeTileRemainder); +#else + if (triMidVtxRight) + cullResult &= RasterizeTriangle(triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slopeFP, slopeTileDelta, zTriMin, zTriMax, z0, zx, zy); + else + cullResult &= RasterizeTriangle(triIdx, bbWidth, tileRowIdx, tileMidRowIdx, tileEndRowIdx, eventStart, slopeFP, slopeTileDelta, zTriMin, zTriMax, z0, zx, zy); +#endif + } + + if (TEST_Z && cullResult == CullingResult::VISIBLE) + return CullingResult::VISIBLE; + } + + return cullResult; + } + + template + FORCE_INLINE CullingResult RenderTriangles(const float *inVtx, const unsigned int *inTris, int nTris, const float *modelToClipMatrix, BackfaceWinding bfWinding, ClipPlanes clipPlaneMask, const VertexLayout &vtxLayout) + { + assert(mMaskedHiZBuffer != nullptr); + + if (TEST_Z) + STATS_ADD(mStats.mOccludees.mNumProcessedTriangles, nTris); + else + STATS_ADD(mStats.mOccluders.mNumProcessedTriangles, nTris); + +#if PRECISE_COVERAGE != 0 + int originalRoundingMode = _MM_GET_ROUNDING_MODE(); + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); +#endif + + int clipHead = 0; + int clipTail = 0; + __m128 clipTriBuffer[MAX_CLIPPED * 3]; + int cullResult = CullingResult::VIEW_CULLED; + + const unsigned int *inTrisPtr = inTris; + int numLanes = SIMD_LANES; + int triIndex = 0; + while (triIndex < nTris || clipHead != clipTail) + { + __mw vtxX[3], vtxY[3], vtxW[3]; + unsigned int triMask = SIMD_ALL_LANES_MASK; + + GatherTransformClip( clipHead, clipTail, numLanes, nTris, triIndex, vtxX, vtxY, vtxW, inVtx, inTrisPtr, vtxLayout, modelToClipMatrix, clipTriBuffer, triMask, clipPlaneMask ); + + if (triMask == 0x0) + continue; + + ////////////////////////////////////////////////////////////////////////////// + // Project, transform to screen space and perform backface culling. Note + // that we use z = 1.0 / vtx.w for depth, which means that z = 0 is far and + // z = 1 is near. We must also use a greater than depth test, and in effect + // everything is reversed compared to regular z implementations. 
+ ////////////////////////////////////////////////////////////////////////////// + + __mw pVtxX[3], pVtxY[3], pVtxZ[3]; + +#if PRECISE_COVERAGE != 0 + __mwi ipVtxX[3], ipVtxY[3]; + ProjectVertices(ipVtxX, ipVtxY, pVtxX, pVtxY, pVtxZ, vtxX, vtxY, vtxW); +#else + ProjectVertices(pVtxX, pVtxY, pVtxZ, vtxX, vtxY, vtxW); +#endif + + // Perform backface test. + __mw triArea1 = _mmw_mul_ps(_mmw_sub_ps(pVtxX[1], pVtxX[0]), _mmw_sub_ps(pVtxY[2], pVtxY[0])); + __mw triArea2 = _mmw_mul_ps(_mmw_sub_ps(pVtxX[0], pVtxX[2]), _mmw_sub_ps(pVtxY[0], pVtxY[1])); + __mw triArea = _mmw_sub_ps(triArea1, triArea2); + __mw ccwMask = _mmw_cmpgt_ps(triArea, _mmw_setzero_ps()); + +#if PRECISE_COVERAGE != 0 + triMask &= CullBackfaces(ipVtxX, ipVtxY, pVtxX, pVtxY, pVtxZ, ccwMask, bfWinding); +#else + triMask &= CullBackfaces(pVtxX, pVtxY, pVtxZ, ccwMask, bfWinding); +#endif + + if (triMask == 0x0) + continue; + + ////////////////////////////////////////////////////////////////////////////// + // Setup and rasterize a SIMD batch of triangles + ////////////////////////////////////////////////////////////////////////////// +#if PRECISE_COVERAGE != 0 + cullResult &= RasterizeTriangleBatch(ipVtxX, ipVtxY, pVtxX, pVtxY, pVtxZ, triMask, &mFullscreenScissor); +#else + cullResult &= RasterizeTriangleBatch(pVtxX, pVtxY, pVtxZ, triMask, &mFullscreenScissor); +#endif + + if (TEST_Z && cullResult == CullingResult::VISIBLE) { +#if PRECISE_COVERAGE != 0 + _MM_SET_ROUNDING_MODE(originalRoundingMode); +#endif + return CullingResult::VISIBLE; + } + } + +#if PRECISE_COVERAGE != 0 + _MM_SET_ROUNDING_MODE(originalRoundingMode); +#endif + return (CullingResult)cullResult; + } + + CullingResult RenderTriangles(const float *inVtx, const unsigned int *inTris, int nTris, const float *modelToClipMatrix, BackfaceWinding bfWinding, ClipPlanes clipPlaneMask, const VertexLayout &vtxLayout) override + { + CullingResult retVal; + + if (vtxLayout.mStride == 16 && vtxLayout.mOffsetY == 4 && vtxLayout.mOffsetW == 12) + retVal = (CullingResult)RenderTriangles<0, 1>(inVtx, inTris, nTris, modelToClipMatrix, bfWinding, clipPlaneMask, vtxLayout); + else + retVal = (CullingResult)RenderTriangles<0, 0>(inVtx, inTris, nTris, modelToClipMatrix, bfWinding, clipPlaneMask, vtxLayout); + +#if MOC_RECORDER_ENABLE + RecordRenderTriangles( inVtx, inTris, nTris, modelToClipMatrix, clipPlaneMask, bfWinding, vtxLayout, retVal ); +#endif + return retVal; + } + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Occlusion query functions + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + CullingResult TestTriangles(const float *inVtx, const unsigned int *inTris, int nTris, const float *modelToClipMatrix, BackfaceWinding bfWinding, ClipPlanes clipPlaneMask, const VertexLayout &vtxLayout) override + { + CullingResult retVal; + + if (vtxLayout.mStride == 16 && vtxLayout.mOffsetY == 4 && vtxLayout.mOffsetW == 12) + retVal = (CullingResult)RenderTriangles<1, 1>(inVtx, inTris, nTris, modelToClipMatrix, bfWinding, clipPlaneMask, vtxLayout); + else + retVal = (CullingResult)RenderTriangles<1, 0>(inVtx, inTris, nTris, modelToClipMatrix, bfWinding, clipPlaneMask, vtxLayout); + +#if MOC_RECORDER_ENABLE + { + std::lock_guard lock( mRecorderMutex ); + if( mRecorder != nullptr ) mRecorder->RecordTestTriangles( retVal, inVtx, inTris, nTris, modelToClipMatrix, clipPlaneMask, bfWinding, 
vtxLayout ); + } +#endif + return retVal; + } + + CullingResult TestRect( float xmin, float ymin, float xmax, float ymax, float wmin ) const override + { + STATS_ADD(mStats.mOccludees.mNumProcessedRectangles, 1); + assert(mMaskedHiZBuffer != nullptr); + + static const __m128i SIMD_TILE_PAD = _mm_setr_epi32(0, TILE_WIDTH, 0, TILE_HEIGHT); + static const __m128i SIMD_TILE_PAD_MASK = _mm_setr_epi32(~(TILE_WIDTH - 1), ~(TILE_WIDTH - 1), ~(TILE_HEIGHT - 1), ~(TILE_HEIGHT - 1)); + static const __m128i SIMD_SUB_TILE_PAD = _mm_setr_epi32(0, SUB_TILE_WIDTH, 0, SUB_TILE_HEIGHT); + static const __m128i SIMD_SUB_TILE_PAD_MASK = _mm_setr_epi32(~(SUB_TILE_WIDTH - 1), ~(SUB_TILE_WIDTH - 1), ~(SUB_TILE_HEIGHT - 1), ~(SUB_TILE_HEIGHT - 1)); + + ////////////////////////////////////////////////////////////////////////////// + // Compute screen space bounding box and guard for out of bounds + ////////////////////////////////////////////////////////////////////////////// +#if USE_D3D != 0 + __m128 pixelBBox = _mmx_fmadd_ps(_mm_setr_ps(xmin, xmax, ymax, ymin), mIHalfSize, mICenter); +#else + __m128 pixelBBox = _mmx_fmadd_ps(_mm_setr_ps(xmin, xmax, ymin, ymax), mIHalfSize, mICenter); +#endif + __m128i pixelBBoxi = _mm_cvttps_epi32(pixelBBox); + pixelBBoxi = _mmx_max_epi32(_mm_setzero_si128(), _mmx_min_epi32(mIScreenSize, pixelBBoxi)); + + ////////////////////////////////////////////////////////////////////////////// + // Pad bounding box to (32xN) tiles. Tile BB is used for looping / traversal + ////////////////////////////////////////////////////////////////////////////// + __m128i tileBBoxi = _mm_and_si128(_mm_add_epi32(pixelBBoxi, SIMD_TILE_PAD), SIMD_TILE_PAD_MASK); + int txMin = simd_i32(tileBBoxi)[0] >> TILE_WIDTH_SHIFT; + int txMax = simd_i32(tileBBoxi)[1] >> TILE_WIDTH_SHIFT; + int tileRowIdx = (simd_i32(tileBBoxi)[2] >> TILE_HEIGHT_SHIFT)*mTilesWidth; + int tileRowIdxEnd = (simd_i32(tileBBoxi)[3] >> TILE_HEIGHT_SHIFT)*mTilesWidth; + + if (simd_i32(tileBBoxi)[0] == simd_i32(tileBBoxi)[1] || simd_i32(tileBBoxi)[2] == simd_i32(tileBBoxi)[3]) + { +#if MOC_RECORDER_ENABLE + { + std::lock_guard lock( mRecorderMutex ); + if( mRecorder != nullptr ) mRecorder->RecordTestRect( CullingResult::VIEW_CULLED, xmin, ymin, xmax, ymax, wmin ); + } +#endif + return CullingResult::VIEW_CULLED; + } + + /////////////////////////////////////////////////////////////////////////////// + // Pad bounding box to (8x4) subtiles. Skip SIMD lanes outside the subtile BB + /////////////////////////////////////////////////////////////////////////////// + __m128i subTileBBoxi = _mm_and_si128(_mm_add_epi32(pixelBBoxi, SIMD_SUB_TILE_PAD), SIMD_SUB_TILE_PAD_MASK); + __mwi stxmin = _mmw_set1_epi32(simd_i32(subTileBBoxi)[0] - 1); // - 1 to be able to use GT test + __mwi stymin = _mmw_set1_epi32(simd_i32(subTileBBoxi)[2] - 1); // - 1 to be able to use GT test + __mwi stxmax = _mmw_set1_epi32(simd_i32(subTileBBoxi)[1]); + __mwi stymax = _mmw_set1_epi32(simd_i32(subTileBBoxi)[3]); + + // Setup pixel coordinates used to discard lanes outside subtile BB + __mwi startPixelX = _mmw_add_epi32(SIMD_SUB_TILE_COL_OFFSET, _mmw_set1_epi32(simd_i32(tileBBoxi)[0])); + __mwi pixelY = _mmw_add_epi32(SIMD_SUB_TILE_ROW_OFFSET, _mmw_set1_epi32(simd_i32(tileBBoxi)[2])); + + ////////////////////////////////////////////////////////////////////////////// + // Compute z from w. Note that z is reversed order, 0 = far, 1 = near, which + // means we use a greater than test, so zMax is used to test for visibility. 
+ ////////////////////////////////////////////////////////////////////////////// + __mw zMax = _mmw_div_ps(_mmw_set1_ps(1.0f), _mmw_set1_ps(wmin)); + + for (;;) + { + __mwi pixelX = startPixelX; + for (int tx = txMin;;) + { + STATS_ADD(mStats.mOccludees.mNumTilesTraversed, 1); + + int tileIdx = tileRowIdx + tx; + assert(tileIdx >= 0 && tileIdx < mTilesWidth*mTilesHeight); + + // Fetch zMin from masked hierarchical Z buffer +#if QUICK_MASK != 0 + __mw zBuf = mMaskedHiZBuffer[tileIdx].mZMin[0]; +#else + __mwi mask = mMaskedHiZBuffer[tileIdx].mMask; + __mw zMin0 = _mmw_blendv_ps(mMaskedHiZBuffer[tileIdx].mZMin[0], mMaskedHiZBuffer[tileIdx].mZMin[1], simd_cast<__mw>(_mmw_cmpeq_epi32(mask, _mmw_set1_epi32(~0)))); + __mw zMin1 = _mmw_blendv_ps(mMaskedHiZBuffer[tileIdx].mZMin[1], mMaskedHiZBuffer[tileIdx].mZMin[0], simd_cast<__mw>(_mmw_cmpeq_epi32(mask, _mmw_setzero_epi32()))); + __mw zBuf = _mmw_min_ps(zMin0, zMin1); +#endif + // Perform conservative greater than test against hierarchical Z buffer (zMax >= zBuf means the subtile is visible) + __mwi zPass = simd_cast<__mwi>(_mmw_cmpge_ps(zMax, zBuf)); //zPass = zMax >= zBuf ? ~0 : 0 + + // Mask out lanes corresponding to subtiles outside the bounding box + __mwi bboxTestMin = _mmw_and_epi32(_mmw_cmpgt_epi32(pixelX, stxmin), _mmw_cmpgt_epi32(pixelY, stymin)); + __mwi bboxTestMax = _mmw_and_epi32(_mmw_cmpgt_epi32(stxmax, pixelX), _mmw_cmpgt_epi32(stymax, pixelY)); + __mwi boxMask = _mmw_and_epi32(bboxTestMin, bboxTestMax); + zPass = _mmw_and_epi32(zPass, boxMask); + + // If not all tiles failed the conservative z test we can immediately terminate the test + if (!_mmw_testz_epi32(zPass, zPass)) + { +#if MOC_RECORDER_ENABLE + { + std::lock_guard lock( mRecorderMutex ); + if( mRecorder != nullptr ) mRecorder->RecordTestRect( CullingResult::VISIBLE, xmin, ymin, xmax, ymax, wmin ); + } +#endif + return CullingResult::VISIBLE; + } + + if (++tx >= txMax) + break; + pixelX = _mmw_add_epi32(pixelX, _mmw_set1_epi32(TILE_WIDTH)); + } + + tileRowIdx += mTilesWidth; + if (tileRowIdx >= tileRowIdxEnd) + break; + pixelY = _mmw_add_epi32(pixelY, _mmw_set1_epi32(TILE_HEIGHT)); + } +#if MOC_RECORDER_ENABLE + { + std::lock_guard lock( mRecorderMutex ); + if( mRecorder != nullptr ) mRecorder->RecordTestRect( CullingResult::OCCLUDED, xmin, ymin, xmax, ymax, wmin ); + } +#endif + return CullingResult::OCCLUDED; + } + + template + FORCE_INLINE void BinTriangles(const float *inVtx, const unsigned int *inTris, int nTris, TriList *triLists, unsigned int nBinsW, unsigned int nBinsH, const float *modelToClipMatrix, BackfaceWinding bfWinding, ClipPlanes clipPlaneMask, const VertexLayout &vtxLayout) + { + assert(mMaskedHiZBuffer != nullptr); + +#if PRECISE_COVERAGE != 0 + int originalRoundingMode = _MM_GET_ROUNDING_MODE(); + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); +#endif + + STATS_ADD(mStats.mOccluders.mNumProcessedTriangles, nTris); + + int clipHead = 0; + int clipTail = 0; + __m128 clipTriBuffer[MAX_CLIPPED * 3]; + + const unsigned int *inTrisPtr = inTris; + int numLanes = SIMD_LANES; + int triIndex = 0; + while (triIndex < nTris || clipHead != clipTail) + { + unsigned int triMask = SIMD_ALL_LANES_MASK; + __mw vtxX[3], vtxY[3], vtxW[3]; + + GatherTransformClip( clipHead, clipTail, numLanes, nTris, triIndex, vtxX, vtxY, vtxW, inVtx, inTrisPtr, vtxLayout, modelToClipMatrix, clipTriBuffer, triMask, clipPlaneMask ); + + if (triMask == 0x0) + continue; + + ////////////////////////////////////////////////////////////////////////////// + // Project, transform to screen space and 
perform backface culling. Note + // that we use z = 1.0 / vtx.w for depth, which means that z = 0 is far and + // z = 1 is near. We must also use a greater than depth test, and in effect + // everything is reversed compared to regular z implementations. + ////////////////////////////////////////////////////////////////////////////// + + __mw pVtxX[3], pVtxY[3], pVtxZ[3]; + +#if PRECISE_COVERAGE != 0 + __mwi ipVtxX[3], ipVtxY[3]; + ProjectVertices(ipVtxX, ipVtxY, pVtxX, pVtxY, pVtxZ, vtxX, vtxY, vtxW); +#else + ProjectVertices(pVtxX, pVtxY, pVtxZ, vtxX, vtxY, vtxW); +#endif + + // Perform backface test. + __mw triArea1 = _mmw_mul_ps(_mmw_sub_ps(pVtxX[1], pVtxX[0]), _mmw_sub_ps(pVtxY[2], pVtxY[0])); + __mw triArea2 = _mmw_mul_ps(_mmw_sub_ps(pVtxX[0], pVtxX[2]), _mmw_sub_ps(pVtxY[0], pVtxY[1])); + __mw triArea = _mmw_sub_ps(triArea1, triArea2); + __mw ccwMask = _mmw_cmpgt_ps(triArea, _mmw_setzero_ps()); + +#if PRECISE_COVERAGE != 0 + triMask &= CullBackfaces(ipVtxX, ipVtxY, pVtxX, pVtxY, pVtxZ, ccwMask, bfWinding); +#else + triMask &= CullBackfaces(pVtxX, pVtxY, pVtxZ, ccwMask, bfWinding); +#endif + + if (triMask == 0x0) + continue; + + ////////////////////////////////////////////////////////////////////////////// + // Bin triangles + ////////////////////////////////////////////////////////////////////////////// + + unsigned int binWidth; + unsigned int binHeight; + ComputeBinWidthHeight(nBinsW, nBinsH, binWidth, binHeight); + + // Compute pixel bounding box + __mwi bbPixelMinX, bbPixelMinY, bbPixelMaxX, bbPixelMaxY; + ComputeBoundingBox(bbPixelMinX, bbPixelMinY, bbPixelMaxX, bbPixelMaxY, pVtxX, pVtxY, &mFullscreenScissor); + + while (triMask) + { + unsigned int triIdx = find_clear_lsb(&triMask); + + // Clamp bounding box to bins + int startX = min(nBinsW-1, simd_i32(bbPixelMinX)[triIdx] / binWidth); + int startY = min(nBinsH-1, simd_i32(bbPixelMinY)[triIdx] / binHeight); + int endX = min(nBinsW, (simd_i32(bbPixelMaxX)[triIdx] + binWidth - 1) / binWidth); + int endY = min(nBinsH, (simd_i32(bbPixelMaxY)[triIdx] + binHeight - 1) / binHeight); + + for (int y = startY; y < endY; ++y) + { + for (int x = startX; x < endX; ++x) + { + int binIdx = x + y * nBinsW; + unsigned int writeTriIdx = triLists[binIdx].mTriIdx; + for (int i = 0; i < 3; ++i) + { +#if PRECISE_COVERAGE != 0 + ((int*)triLists[binIdx].mPtr)[i * 3 + writeTriIdx * 9 + 0] = simd_i32(ipVtxX[i])[triIdx]; + ((int*)triLists[binIdx].mPtr)[i * 3 + writeTriIdx * 9 + 1] = simd_i32(ipVtxY[i])[triIdx]; +#else + triLists[binIdx].mPtr[i * 3 + writeTriIdx * 9 + 0] = simd_f32(pVtxX[i])[triIdx]; + triLists[binIdx].mPtr[i * 3 + writeTriIdx * 9 + 1] = simd_f32(pVtxY[i])[triIdx]; +#endif + triLists[binIdx].mPtr[i * 3 + writeTriIdx * 9 + 2] = simd_f32(pVtxZ[i])[triIdx]; + } + triLists[binIdx].mTriIdx++; + } + } + } + } +#if PRECISE_COVERAGE != 0 + _MM_SET_ROUNDING_MODE(originalRoundingMode); +#endif + } + + void BinTriangles(const float *inVtx, const unsigned int *inTris, int nTris, TriList *triLists, unsigned int nBinsW, unsigned int nBinsH, const float *modelToClipMatrix, BackfaceWinding bfWinding, ClipPlanes clipPlaneMask, const VertexLayout &vtxLayout) override + { + if (vtxLayout.mStride == 16 && vtxLayout.mOffsetY == 4 && vtxLayout.mOffsetW == 12) + BinTriangles(inVtx, inTris, nTris, triLists, nBinsW, nBinsH, modelToClipMatrix, bfWinding, clipPlaneMask, vtxLayout); + else + BinTriangles(inVtx, inTris, nTris, triLists, nBinsW, nBinsH, modelToClipMatrix, bfWinding, clipPlaneMask, vtxLayout); + } + + template + void GatherTransformClip( int & 
clipHead, int & clipTail, int & numLanes, int nTris, int & triIndex, __mw * vtxX, __mw * vtxY, __mw * vtxW, const float * inVtx, const unsigned int * &inTrisPtr, const VertexLayout & vtxLayout, const float * modelToClipMatrix, __m128 * clipTriBuffer, unsigned int &triMask, ClipPlanes clipPlaneMask ) + { + ////////////////////////////////////////////////////////////////////////////// + // Assemble triangles from the index list + ////////////////////////////////////////////////////////////////////////////// + unsigned int triClipMask = SIMD_ALL_LANES_MASK; + + if( clipHead != clipTail ) + { + int clippedTris = clipHead > clipTail ? clipHead - clipTail : MAX_CLIPPED + clipHead - clipTail; + clippedTris = min( clippedTris, SIMD_LANES ); + +#if CLIPPING_PRESERVES_ORDER != 0 + // if preserving order, don't mix clipped and new triangles, handle the clip buffer fully + // and then continue gathering; this is not as efficient - ideally we want to gather + // at the end (if clip buffer has less than SIMD_LANES triangles) but that requires + // more modifications below - something to do in the future. + numLanes = 0; +#else + // Fill out SIMD registers by fetching more triangles. + numLanes = max( 0, min( SIMD_LANES - clippedTris, nTris - triIndex ) ); +#endif + + if( numLanes > 0 ) { + if( FAST_GATHER ) + GatherVerticesFast( vtxX, vtxY, vtxW, inVtx, inTrisPtr, numLanes ); + else + GatherVertices( vtxX, vtxY, vtxW, inVtx, inTrisPtr, numLanes, vtxLayout ); + + TransformVerts( vtxX, vtxY, vtxW, modelToClipMatrix ); + } + + for( int clipTri = numLanes; clipTri < numLanes + clippedTris; clipTri++ ) + { + int triIdx = clipTail * 3; + for( int i = 0; i < 3; i++ ) + { + simd_f32( vtxX[i] )[clipTri] = simd_f32( clipTriBuffer[triIdx + i] )[0]; + simd_f32( vtxY[i] )[clipTri] = simd_f32( clipTriBuffer[triIdx + i] )[1]; + simd_f32( vtxW[i] )[clipTri] = simd_f32( clipTriBuffer[triIdx + i] )[2]; + } + clipTail = ( clipTail + 1 ) & ( MAX_CLIPPED - 1 ); + } + + triIndex += numLanes; + inTrisPtr += numLanes * 3; + + triMask = ( 1U << ( clippedTris + numLanes ) ) - 1; + triClipMask = ( 1U << numLanes ) - 1; // Don't re-clip already clipped triangles + } + else + { + numLanes = min( SIMD_LANES, nTris - triIndex ); + triMask = ( 1U << numLanes ) - 1; + triClipMask = triMask; + + if( FAST_GATHER ) + GatherVerticesFast( vtxX, vtxY, vtxW, inVtx, inTrisPtr, numLanes ); + else + GatherVertices( vtxX, vtxY, vtxW, inVtx, inTrisPtr, numLanes, vtxLayout ); + + TransformVerts( vtxX, vtxY, vtxW, modelToClipMatrix ); + + triIndex += SIMD_LANES; + inTrisPtr += SIMD_LANES * 3; + } + + ////////////////////////////////////////////////////////////////////////////// + // Clip transformed triangles + ////////////////////////////////////////////////////////////////////////////// + + if( clipPlaneMask != ClipPlanes::CLIP_PLANE_NONE ) + ClipTriangleAndAddToBuffer( vtxX, vtxY, vtxW, clipTriBuffer, clipHead, triMask, triClipMask, clipPlaneMask ); + } + + void RenderTrilist(const TriList &triList, const ScissorRect *scissor) override + { + assert(mMaskedHiZBuffer != nullptr); + + // Setup fullscreen scissor rect as default + scissor = scissor == nullptr ? 
&mFullscreenScissor : scissor; + + for (unsigned int i = 0; i < triList.mTriIdx; i += SIMD_LANES) + { + ////////////////////////////////////////////////////////////////////////////// + // Fetch triangle vertices + ////////////////////////////////////////////////////////////////////////////// + + unsigned int numLanes = min((unsigned int)SIMD_LANES, triList.mTriIdx - i); + unsigned int triMask = (1U << numLanes) - 1; + + __mw pVtxX[3], pVtxY[3], pVtxZ[3]; +#if PRECISE_COVERAGE != 0 + __mwi ipVtxX[3], ipVtxY[3]; + for (unsigned int l = 0; l < numLanes; ++l) + { + unsigned int triIdx = i + l; + for (int v = 0; v < 3; ++v) + { + simd_i32(ipVtxX[v])[l] = ((int*)triList.mPtr)[v * 3 + triIdx * 9 + 0]; + simd_i32(ipVtxY[v])[l] = ((int*)triList.mPtr)[v * 3 + triIdx * 9 + 1]; + simd_f32(pVtxZ[v])[l] = triList.mPtr[v * 3 + triIdx * 9 + 2]; + } + } + + for (int v = 0; v < 3; ++v) + { + pVtxX[v] = _mmw_mul_ps(_mmw_cvtepi32_ps(ipVtxX[v]), _mmw_set1_ps(FP_INV)); + pVtxY[v] = _mmw_mul_ps(_mmw_cvtepi32_ps(ipVtxY[v]), _mmw_set1_ps(FP_INV)); + } + + ////////////////////////////////////////////////////////////////////////////// + // Setup and rasterize a SIMD batch of triangles + ////////////////////////////////////////////////////////////////////////////// + + RasterizeTriangleBatch(ipVtxX, ipVtxY, pVtxX, pVtxY, pVtxZ, triMask, scissor); +#else + for (unsigned int l = 0; l < numLanes; ++l) + { + unsigned int triIdx = i + l; + for (int v = 0; v < 3; ++v) + { + simd_f32(pVtxX[v])[l] = triList.mPtr[v * 3 + triIdx * 9 + 0]; + simd_f32(pVtxY[v])[l] = triList.mPtr[v * 3 + triIdx * 9 + 1]; + simd_f32(pVtxZ[v])[l] = triList.mPtr[v * 3 + triIdx * 9 + 2]; + } + } + + ////////////////////////////////////////////////////////////////////////////// + // Setup and rasterize a SIMD batch of triangles + ////////////////////////////////////////////////////////////////////////////// + + RasterizeTriangleBatch(pVtxX, pVtxY, pVtxZ, triMask, scissor); +#endif + + } + } + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Debugging and statistics + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + MaskedOcclusionCulling::Implementation GetImplementation() override + { + return gInstructionSet; + } + + void ComputePixelDepthBuffer(float *depthData, bool flipY) override + { + assert(mMaskedHiZBuffer != nullptr); + for (int y = 0; y < mHeight; y++) + { + for (int x = 0; x < mWidth; x++) + { + // Compute 32xN tile index (SIMD value offset) + int tx = x / TILE_WIDTH; + int ty = y / TILE_HEIGHT; + int tileIdx = ty * mTilesWidth + tx; + + // Compute 8x4 subtile index (SIMD lane offset) + int stx = (x % TILE_WIDTH) / SUB_TILE_WIDTH; + int sty = (y % TILE_HEIGHT) / SUB_TILE_HEIGHT; + int subTileIdx = sty * 4 + stx; + + // Compute pixel index in subtile (bit index in 32-bit word) + int px = (x % SUB_TILE_WIDTH); + int py = (y % SUB_TILE_HEIGHT); + int bitIdx = py * 8 + px; + + int pixelLayer = (simd_i32(mMaskedHiZBuffer[tileIdx].mMask)[subTileIdx] >> bitIdx) & 1; + float pixelDepth = simd_f32(mMaskedHiZBuffer[tileIdx].mZMin[pixelLayer])[subTileIdx]; + + if( flipY ) + depthData[( mHeight - y - 1 ) * mWidth + x] = pixelDepth; + else + depthData[y * mWidth + x] = pixelDepth; + } + } + } + + OcclusionCullingStatistics GetStatistics() override + { + return mStats; + } + +}; diff --git a/neo/libs/moc/README.md b/neo/libs/moc/README.md new file mode 
100644 index 00000000..47bc4dc6 --- /dev/null +++ b/neo/libs/moc/README.md @@ -0,0 +1,450 @@ +# MaskedOcclusionCulling + +This code accompanies the research paper ["Masked Software Occlusion Culling"](https://software.intel.com/en-us/articles/masked-software-occlusion-culling), +and implements an efficient alternative to the hierarchical depth buffer algorithm. Our algorithm decouples depth values and coverage, and operates directly +on the hierarchical depth buffer. It lets us efficiently parallelize both coverage computations and hierarchical depth buffer updates. + +## Update May 2018 + +Added the ability to merge two depth buffers. This allows both an alternative method for parallelizing buffer creation and a way to reduce silhouette bleed when input data cannot be roughly sorted from front to back, for example when rendering large terrain patches with foreground occluders in an open world game engine. + +## Requirements + +This code is mainly optimized for AVX capable CPUs. However, we also provide SSE 4.1 and SSE 2 implementations for backwards compatibility. The appropriate +implementation will be chosen during run-time based on the CPU's capabilities. + +## Notes on build time + +The code is optimized for runtime performance and may require a long time to compile due to heavy code inlining. This can be worked around by compiling +the code into a library file. An alternative solution is to disable *whole program optimizations* for the `MaskedOcclusionCulling.cpp`, +`MaskedOcclusionCullingAVX2.cpp` and `MaskedOcclusionCullingAVX512.cpp` files. It does not impact runtime performance, but greatly reduces program link times. + +## Notes on coordinate systems and winding + +Most inputs are given as clip space (x,y,w) coordinates assuming the same right handed coordinate system as used by DirectX and OpenGL (x positive right, y +positive up and w positive in the view direction). Note that we use the clip space w coordinate for depth and disregard the z coordinate. Internally our +masked hierarchical depth buffer stores *depth = 1 / w*. + +The `TestRect()` function is an exception and instead accepts normalized device coordinates (NDC), *(x' = x/w, y' = y/w)*, where the visible screen region +maps to the range [-1,1] for *x'* and *y'* (x positive right and y positive up). Again, this is consistent with both DirectX and OpenGL behavior. + +By default, the screen space coordinate system used internally to access our hierarchical depth buffer follows DirectX conventions (y positive down), which is +**not** consistent with OpenGL (y positive up). This can be configured by changing the `USE_D3D` define. The screen space coordinate system affects the layout +of the buffer returned by the `ComputePixelDepthBuffer()` function, scissor rectangles (which are specified in screen space coordinates), and rasterization +tie-breaker rules if `PRECISE_COVERAGE` is enabled. + +## API / Tutorial + +We have made an effort to keep the API as simple and minimal as possible. The rendering functions are quite similar to submitting DirectX or OpenGL drawcalls +and we hope they will feel natural to anyone with graphics programming experience. In the following we will use the example project as a tutorial to showcase +the API. Please refer to the documentation in the header file for further details. + +### Setup + +We begin by creating a new instance of the occlusion culling object. 
The object is created using the static `Create()` function rather than a standard +constructor, and can be destroyed using the `Destroy()` function. The reason for using the factory `Create()`/`Destroy()` design pattern is that we want to +support custom (aligned) memory allocators, and that the library chooses either the AVX-512, AVX2, or SSE implementation based on the CPU's capabilities. + +```C++ +MaskedOcclusionCulling *moc = MaskedOcclusionCulling::Create(); + +... + +MaskedOcclusionCulling::Destroy(moc); +``` + +The created object is empty and has no hierarchical depth buffer attached, so we must first allocate a buffer using the `SetResolution()` function. This function can +also be used later to resize the hierarchical depth buffer, causing it to be re-allocated. Note that the resolution width must be a multiple of 8, and the height +a multiple of 4. This is a limitation of the occlusion culling algorithm. + +```C++ +int width = 1920; +int height = 1080; +moc.SetResolution(width, height); // Set full HD resolution +``` +After setting the resolution we can start rendering occluders and performing occlusion queries. We must first clear the hierarchical depth buffer. + +```C++ +// Clear hierarchical depth buffer to far depth +moc.ClearDepthBuffer(); +``` + +**Optional** The `SetNearClipPlane()` function can be used to configure the distance to the near clipping plane to make the occlusion culling renderer match your DX/GL +renderer. The default value for the near plane is 0, which should work as expected unless your application relies on having onscreen geometry clipped by +the near plane. + +```C++ +float nearClipDist = 1.0f; +moc.SetNearClipPlane(nearClipDist); // Set near clipping dist (optional) +``` + +### Occluder rendering + +The `RenderTriangles()` function renders triangle meshes to the hierarchical depth buffer. Similar to DirectX/OpenGL, meshes are constructed from a vertex array +and a triangle index array. By default, the vertices are given as *(x,y,z,w)* floating point clip space coordinates, but the *z*-coordinate is ignored and +instead we use *depth = 1 / w*. We expose a `TransformVertices()` utility function to transform vertices from *(x,y,z,1)* model/world space to *(x,y,z,w)* clip +space, but you can use your own transform code as well. For more information on the `TransformVertices()` function, please refer to the documentation in the +header file. + +The triangle index array is identical to a DirectX or OpenGL triangle list and connects vertices to form triangles. Every three indices in the array form a new +triangle, so the size of the array must be a multiple of 3. Note that we only support triangle lists, and we currently have no plans on supporting other primitives +such as strips or fans. + +```C++ +struct ClipspaceVertex { float x, y, z, w; }; + +// Create an example triangle. The z component of each vertex is not used by the +// occlusion culling system. +ClipspaceVertex triVerts[] = { { 5, 0, 0, 10 }, { 30, 0, 0, 20 }, { 10, 50, 0, 40 } }; +unsigned int triIndices[] = { 0, 1, 2 }; +unsigned int nTris = 1; + +// Render an example triangle +moc.RenderTriangles((float*)triVerts, triIndices, nTris); +```
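+ +If you use your own transform code instead of `TransformVertices()` (the library leaves this open, as noted above), a minimal sketch could look like the following. The row-major matrix storage and column-vector convention are assumptions made for this example; adapt them to your math library. + +```C++ +// Transform object-space positions (x, y, z) to clip space with a user-supplied 4x4 +// model-to-clip matrix (row-major storage, column-vector convention: clip = M * pos) +void ToClipSpace(const float m[4][4], const float *xyz, int nVerts, ClipspaceVertex *out) +{ + for (int i = 0; i < nVerts; ++i) + { + float x = xyz[i * 3 + 0], y = xyz[i * 3 + 1], z = xyz[i * 3 + 2]; + out[i].x = m[0][0] * x + m[0][1] * y + m[0][2] * z + m[0][3]; + out[i].y = m[1][0] * x + m[1][1] * y + m[1][2] * z + m[1][3]; + out[i].z = 0.0f; // ignored by the occlusion culling code + out[i].w = m[3][0] * x + m[3][1] * y + m[3][2] * z + m[3][3]; + } +} + +// ... fill a ClipspaceVertex array this way and render it as shown above: +// moc.RenderTriangles((float*)clipVerts, triIndices, nTris); +```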
+ +**Transform** It is possible to include a transform when calling `RenderTriangles()` by passing the modelToClipSpace parameter. This is equivalent to calling `TransformVertices()`, followed +by `RenderTriangles()`, but performing the transform as shown in the example below typically +leads to better performance. + +```C++ +// Example matrix swapping the x and y coordinates +float swapxyMatrix[4][4] = { + {0,1,0,0}, + {1,0,0,0}, + {0,0,1,0}, + {0,0,0,1}}; + +// Render triangle with transform. +moc.RenderTriangles(triVerts, triIndices, nTris, swapxyMatrix); +``` + +**Backface Culling** By default, clockwise winded triangles are considered backfacing and are culled when rasterizing occluders. However, you can +configure the `RenderTriangles()` function to backface cull either clockwise or counter-clockwise winded triangles, or to disable backface culling +for two-sided rendering. + +```C++ +// A clockwise winded (normally backfacing) triangle +ClipspaceVertex cwTriVerts[] = { { 7, -7, 0, 20 },{ 7.5, -7, 0, 20 },{ 7, -7.5, 0, 20 } }; +unsigned int cwTriIndices[] = { 0, 1, 2 }; + +// Render with counter-clockwise backface culling; the triangle is drawn +moc->RenderTriangles((float*)cwTriVerts, cwTriIndices, 1, nullptr, BACKFACE_CCW); +``` + +The rasterization code only handles counter-clockwise winded triangles, so configurable backface culling is implemented by re-winding clockwise winded triangles +on the fly. Therefore, culling modes other than `BACKFACE_CW` may decrease performance slightly. + +**Clip Flags** `RenderTriangles()` accepts an additional parameter to optimize polygon clipping. The calling application may disable any clipping plane if it can +guarantee that the mesh does not intersect said clipping plane. In the example below we have a quad which is entirely on screen, and we can disable +all clipping planes. **Warning** it is unsafe to incorrectly disable clipping planes, and doing so may cause the program to crash or perform out of bounds +memory accesses. Consider this a power user feature (use `CLIP_PLANE_ALL` to clip against the full frustum when in doubt). + +```C++ +// Create a quad completely within the view frustum +ClipspaceVertex quadVerts[] + = { { -150, -150, 0, 200 },{ -10, -65, 0, 75 },{ 0, 0, 0, 20 },{ -40, 10, 0, 50 } }; +unsigned int quadIndices[] = { 0, 1, 2, 0, 2, 3 }; +unsigned int nTris = 2; + +// Render the quad. As an optimization, indicate that clipping is not required +moc.RenderTriangles((float*)quadVerts, quadIndices, nTris, nullptr, BACKFACE_CW, CLIP_PLANE_NONE); +``` + +**Vertex Storage Layout** Finally, the `RenderTriangles()` function supports a configurable vertex storage layout. The code so far has used an array of structs (AoS) layout based +on the `ClipspaceVertex` struct, and this is the default behaviour. You may use the `VertexLayout` struct to configure the memory layout of the vertex data. Note that +the vertex pointer passed to `RenderTriangles()` should point at the *x* coordinate of the first vertex, so there is no x coordinate offset specified in the struct. + +```C++ +struct VertexLayout +{ + int mStride; // Stride between vertices + int mOffsetY; // Offset to vertex y coordinate + int mOffsetW; // Offset to vertex w coordinate +}; +``` + +For example, you can configure a struct of arrays (SoA) layout as follows: + +```C++ +// A triangle specified in struct of arrays (SoA) form +float SoAVerts[] = { + 10, 10, 7, // x-coordinates + -10, -7, -10, // y-coordinates + 10, 10, 10 // w-coordinates +}; + +// Set vertex layout (stride, y offset, w offset) +VertexLayout SoAVertexLayout(sizeof(float), 3 * sizeof(float), 6 * sizeof(float)); + +// Render triangle with SoA layout +moc.RenderTriangles((float*)SoAVerts, triIndices, 1, nullptr, BACKFACE_CW, CLIP_PLANE_ALL, SoAVertexLayout); +```
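+ +As another illustration, an interleaved, engine-style vertex format can also be described directly through `VertexLayout`. The struct below is hypothetical and made up for this example; note also the packing advice in the next paragraph: + +```C++ +// Hypothetical interleaved vertex: clip-space position followed by a normal +struct EngineVertex { float x, y, z, w; float nx, ny, nz; }; + +// Describe the layout: stride and the byte offsets of the y and w coordinates +VertexLayout engineLayout(sizeof(EngineVertex), 1 * sizeof(float), 3 * sizeof(float)); + +// Three interleaved vertices forming one triangle +EngineVertex engineVerts[] = { { 5, 0, 0, 10, 0, 0, 1 }, { 30, 0, 0, 20, 0, 0, 1 }, { 10, 50, 0, 40, 0, 0, 1 } }; +unsigned int engineIndices[] = { 0, 1, 2 }; + +// Render, pointing at the x coordinate of the first vertex +moc.RenderTriangles((float*)engineVerts, engineIndices, 1, nullptr, BACKFACE_CW, CLIP_PLANE_ALL, engineLayout); +```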
+ +Vertex layout may affect performance. We have seen no large performance impact when using either SoA or AoS layout, but generally speaking the +vertex position data should be packed as compactly into memory as possible to minimize the number of cache misses. It is, for example, not advisable to bundle vertex +position data together with normals, texture coordinates, etc., and use a large stride. + +### Occlusion queries + +After rendering a few occluder meshes you can begin to perform occlusion queries. There are two functions for occlusion queries, called `TestTriangles()` and +`TestRect()`. The `TestTriangles()` function is identical to `RenderTriangles()` with the exception that it performs an occlusion query and does not +update the hierarchical depth buffer. The result of the occlusion query is returned as an enum, which indicates if the triangles are `VISIBLE`, `OCCLUDED`, or were +`VIEW_CULLED`. Here, `VIEW_CULLED` means that all triangles were either frustum or backface culled, so no occlusion culling test had to be performed. + +```C++ +// A triangle that is partly, but not completely, overlapped by the quad rendered before +ClipspaceVertex oqTriVerts[] = { { 0, 50, 0, 200 },{ -60, -60, 0, 200 },{ 20, -40, 0, 200 } }; +unsigned int oqTriIndices[] = { 0, 1, 2 }; +unsigned int nTris = 1; + +// Perform an occlusion query. The triangle is visible and the query should return VISIBLE +CullingResult result = moc.TestTriangles((float*)oqTriVerts, oqTriIndices, nTris); +``` + +The `TestRect()` function performs an occlusion query for a rectangular screen space region with a given depth. It can be used to, for example, quickly test +the projected bounding box of an object to determine if the entire object is visible or not. The function is considerably faster than `TestTriangles()` because +it does not require input assembly, clipping, or triangle rasterization. The queries are typically less accurate as screen space bounding rectangles tend to +grow quite large, but we've personally seen the best overall performance using this type of culling. + +```C++ +// Perform an occlusion query testing if a rectangle is visible. The rectangle is completely +// behind the previously drawn quad, so the query should indicate that it's occluded +result = moc.TestRect(-0.6f, -0.6f, -0.4f, -0.4f, 100); +``` + +Unlike the other functions, the input to `TestRect()` is given in normalized device coordinates (NDC). Normalized device coordinates are projected clip space coordinates +*(x' = x/w, y' = y/w)* and the visible screen maps to the range [-1,1] for both the *x'* and *y'* coordinate. The w coordinate is still given in clip space, +however. It is up to the application to compute projected bounding rectangles from the object's bounding shapes.
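+ +For example, given the eight corners of a bounding box that have already been transformed to clip space, a conservative query rectangle can be computed as in the sketch below. The helper is not part of the library, and it assumes the box is entirely in front of the near plane (all corners have *w* > 0); otherwise simply treat the object as visible: + +```C++ +// Conservative occlusion query for a bounding box given as 8 clip-space (x,y,z,w) corners +CullingResult TestClipSpaceBox(MaskedOcclusionCulling &moc, const ClipspaceVertex corners[8]) +{ + float xmin = corners[0].x / corners[0].w, xmax = xmin; + float ymin = corners[0].y / corners[0].w, ymax = ymin; + float wmin = corners[0].w; + for (int i = 1; i < 8; ++i) + { + float ndcX = corners[i].x / corners[i].w; // assumes w > 0 for every corner + float ndcY = corners[i].y / corners[i].w; + xmin = ndcX < xmin ? ndcX : xmin; xmax = ndcX > xmax ? ndcX : xmax; + ymin = ndcY < ymin ? ndcY : ymin; ymax = ndcY > ymax ? ndcY : ymax; + wmin = corners[i].w < wmin ? corners[i].w : wmin; // nearest corner sets the test depth + } + return moc.TestRect(xmin, ymin, xmax, ymax, wmin); +} +```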
+ +### Debugging and visualization + +We expose a utility function, `ComputePixelDepthBuffer()`, that can be used to visualize the hierarchical depth buffer used internally by the occlusion culling +system. The function fills in a complete per-pixel depth buffer, but the internal representation is hierarchical with just two depth values and a mask stored per +tile. It is not reasonable to expect the image to completely match the exact depth buffer, and you may notice some areas where background objects leak through +the foreground. Leakage is part of the algorithm (and one reason for the high performance), and we have +not found it to be problematic. However, if you experience issues due to leakage you may want to disable the `QUICK_MASK` define, described in more detail in the +section on [hierarchical depth buffer updates](#update). + +```C++ +// Compute a per pixel depth buffer from the hierarchical depth buffer, used for visualization. +float *perPixelZBuffer = new float[width * height]; +moc.ComputePixelDepthBuffer(perPixelZBuffer); +``` + +We also support basic instrumentation to help with profiling and debugging. By defining `ENABLE_STATS` in the header file, the occlusion culling code will +gather statistics about the number of occluders rendered and occlusion queries performed. For more details about the statistics, see the +`OcclusionCullingStatistics` struct. The statistics can be queried using the `GetStatistics()` function, which will simply return a zeroed struct if `ENABLE_STATS` +is not defined. Note that instrumentation reduces performance somewhat and should generally be disabled in release builds. + +```C++ +OcclusionCullingStatistics stats = moc.GetStatistics(); +``` + +### Memory management + +As shown in the example below, you may optionally provide callback functions for allocating and freeing memory when creating a +`MaskedOcclusionCulling` object. The functions must support aligned allocations. + +```C++ +void *alignedAllocCallback(size_t alignment, size_t size) +{ + ... +} + +void alignedFreeCallback(void *ptr) +{ + ... +} + +MaskedOcclusionCulling *moc = MaskedOcclusionCulling::Create(alignedAllocCallback, alignedFreeCallback); +``` + +## Hierarchical depth buffer update algorithm and render order + +The library contains two update algorithms / heuristics for the hierarchical depth buffer, one focused on speed and one focused on accuracy. The +active algorithm can be configured using the `QUICK_MASK` define. Setting the define (the default) enables the algorithm described in the research paper +["Masked Software Occlusion Culling"](https://software.intel.com/en-us/articles/masked-software-occlusion-culling), which has a good balance between low +leakage and good performance. Not defining `QUICK_MASK` enables the merging heuristic used in the paper +["Masked Depth Culling for Graphics Hardware"](http://dl.acm.org/citation.cfm?id=2818138). It is more accurate, with less leakage, but also has lower performance. + +If you experience problems due to leakage you may want to use the more accurate update algorithm. However, rendering order can also affect the quality +of the hierarchical depth buffer, with the best order being to render objects front-to-back. We perform early depth culling tests during occluder +rendering, so rendering in front-to-back order will not only improve quality, but also greatly improve the performance of occluder rendering. If your scene +is stored in a hierarchical data structure, it is often possible to modify the traversal algorithm to traverse nodes in approximate front-to-back order; +see the research paper ["Masked Software Occlusion Culling"](https://software.intel.com/en-us/articles/masked-software-occlusion-culling) for an example. + +## Interleaving occluder rendering and occlusion queries + +The library supports *lightweight* switching between occluder rendering and occlusion queries. While it is still possible to do occlusion culling +as a standard two pass algorithm (first render all occluders, then perform all queries), it is typically beneficial to interleave occluder rendering with +queries. + +This is especially powerful when rendering objects in front-to-back order. After drawing the first few occluder triangles, you can start performing +occlusion queries, and if the occlusion query indicates that an object is occluded there is no need to draw the occluder mesh for that object. This +can greatly improve the performance of the occlusion culling pass itself. As described in further detail in the research paper +["Masked Software Occlusion Culling"](https://software.intel.com/en-us/articles/masked-software-occlusion-culling), this may be used to perform early exits in +BVH traversal code.
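+ +A sketch of such an interleaved culling loop is shown below. The `Object` structure and its conservative bounding rectangle are assumptions made for this example, not part of the library: + +```C++ +// Hypothetical per-object data, roughly sorted front-to-back by the application +struct Object +{ + float ndcXmin, ndcYmin, ndcXmax, ndcYmax, wmin; // conservative bounds, see TestRect() + const float *occluderVerts; // clip-space (x,y,z,w) occluder mesh, may be nullptr + const unsigned int *occluderIndices; + int numOccluderTris; + bool visible; +}; + +void CullScene(MaskedOcclusionCulling &moc, Object *objects, int numObjects) +{ + for (int i = 0; i < numObjects; ++i) + { + Object &obj = objects[i]; + obj.visible = moc.TestRect(obj.ndcXmin, obj.ndcYmin, obj.ndcXmax, obj.ndcYmax, obj.wmin) == MaskedOcclusionCulling::VISIBLE; + + // Occluded objects are skipped entirely; visible objects contribute their occluder mesh + if (obj.visible && obj.occluderVerts != nullptr) + moc.RenderTriangles(obj.occluderVerts, obj.occluderIndices, obj.numOccluderTris); + } +} +```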
+ +## Rasterization precision + +The library supports high precision rasterization through Bresenham interpolation, and this may be enabled by changing the `PRECISE_COVERAGE` define in +the header file. The high precision rasterizer is somewhat slower (5-15%) than the default rasterizer, but is compliant with DirectX 11 and OpenGL +rasterization rules. We have empirically verified it on a large set of randomly generated on-screen triangles. While there still may be differences to GPU +rasterization due to clipping or vertex transform precision differences, we have not noticed any differences in rasterized coverage in our test scenes. Note +that tie breaker rules and vertex rounding behave differently between DirectX and OpenGL due to the direction of the screen space Y axis. The `USE_D3D` define +(enabled by default) can be used to toggle between DirectX and OpenGL behaviour. + +## Multi-threading and binned rendering + +Multi-threading is supported through a binning rasterizer. The `MaskedOcclusionCulling` class exposes two functions, `BinTriangles()` and `RenderTrilist()`, +that may be used to perform binning and to render all triangles assigned to a bin. Using binned rasterization makes it simple to guarantee that no two threads are +accessing the same part of the framebuffer, as rendering is limited to a particular bin, or region of the screen. + +Binned rendering starts by performing geometry processing (primitive assembly, vertex transform, clipping, and projection) followed by a binning step, where +triangles are written to all bins they overlap. This is performed using the `BinTriangles()` function, which is very similar to the `RenderTriangles()` +function, but provides some additional parameters for specifying the number of bins the screen is split into. The calling application also needs to pass a +pointer to an array of `TriList` objects, with one instance per bin. Each `TriList` object points to a "scratchpad" memory buffer, and all triangles overlapping +that bin will be written to the buffer. + +```C++ +const int binsW = 4; +const int binsH = 4; + +float *dataBuffer = new float[binsW*binsH*1024*3*3]; // Allocate storage for 1k triangles in each trilist +TriList *triLists = new TriList[binsW*binsH]; // Allocate trilists for 4x4 = 16 bins +for (int i = 0; i < binsW*binsH; ++i) +{ + triLists[i].mNumTriangles = 1024; // triangle list capacity + triLists[i].mTriIdx = 0; // Set triangle write pointer to first element + triLists[i].mPtr = dataBuffer + i*1024*3*3; // Point each trilist into the shared buffer (9 floats per triangle) +} + +// Perform geometry processing and write triangles to the triLists of all bins they overlap. +moc.BinTriangles(triVerts, triIndices, nTris, triLists, binsW, binsH); +``` + +After generating the triangle lists for each bin, the triangles may be rendered using the `RenderTrilist()` function, and the rendering region should be +limited using a scissor rectangle. 
The `BinTriangles()` function makes assumptions about the size of the bins, and the calling
application must therefore always compute the scissor region of each bin using the `ComputeBinWidthHeight()` utility function, as shown in the
example below. Note that the scissor rectangle is specified in screen space coordinates, which depend on the `USE_D3D` define.

```C++
unsigned int binWidth, binHeight;
moc.ComputeBinWidthHeight(binsW, binsH, binWidth, binHeight);

for (int by = 0; by < binsH; ++by)
{
    for (int bx = 0; bx < binsW; ++bx)
    {
        // Compute scissor rectangle that matches the one assumed by BinTriangles()
        // note that the ScissorRect is specified in pixel coordinates, with (0,0)
        // being the bottom left corner
        ScissorRect binRect;
        binRect.minX = bx*binWidth;
        binRect.maxX = bx + 1 == binsW ? screenWidth : (bx + 1) * binWidth;
        binRect.minY = by*binHeight;
        binRect.maxY = by + 1 == binsH ? screenHeight : (by + 1) * binHeight;

        // Render all triangles overlapping the current bin.
        moc.RenderTrilist(triLists[bx + by*binsW], &binRect);
    }
}
```

### Multi-threading example

This library includes a multi-threading example in the `CullingThreadpool` class. The class interface is similar to that of `MaskedOcclusionCulling`, but occluder
rendering is performed asynchronously. Calling the `CullingThreadpool::RenderTriangles()` function adds a render job to a command queue and immediately returns
to the calling thread, rather than immediately performing the rendering work. Internally, the class uses the `BinTriangles()` and `RenderTrilist()` functions to
bin all triangles of the `CullingThreadpool::RenderTriangles()` call and distribute the resulting tile rendering jobs. At any time there may be a number of
unprocessed binning jobs and tile rendering jobs, and the scheduler picks the most urgent job and processes it first. If a thread runs out of available jobs,
task stealing is used as a means of improving load-balancing.

The occlusion query functions `CullingThreadpool::TestTriangles()` and `CullingThreadpool::TestRect()` immediately return the result of the query. However, since the
query depends on the contents of the hierarchical depth buffer, you may need to wait for the worker threads to finish to make sure the query is performed on the
most up to date version of the buffer; this can be accomplished by calling `CullingThreadpool::Flush()`. It is not always necessary to work with the most up to
date version of the hierarchical depth buffer for a query. While the result may be incorrect, it is still always conservative in that occluded objects may be
classified as visible, but not the other way around. Since `CullingThreadpool::Flush()` causes a wait, it may be beneficial to work against a slightly out of
date version of the hierarchical depth buffer if your application would otherwise cause a lot of flushes. We found this particularly true when implementing
threading in our interleaved BVH traversal algorithm (see the ["Masked Software Occlusion Culling"](https://software.intel.com/en-us/articles/masked-software-occlusion-culling)
paper) where each BVH traversal step is based on the outcome of an occlusion query interleaved with occluder rendering for the BVH leaves.

The `CullingThreadpool` class was written as an example rather than as the definitive threading approach. In some cases we believe it is possible to improve performance
further by threading occlusion queries, or by threading the entire occlusion culling system, including scene graph traversal. However, it does provide a simple means
of enabling multi-threading in a traditional single-threaded application, as its API is very similar to that of the `MaskedOcclusionCulling` class and it may be called
from a single-threaded application. As previously mentioned, we integrated this implementation in our interleaved BVH traversal algorithm (see the ["Masked Software Occlusion Culling"](https://software.intel.com/en-us/articles/masked-software-occlusion-culling)
paper) and noted a speedup of roughly *3x*, running on four threads, compared to our previous single-threaded implementation.

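As a rough, untested usage sketch (not taken from the library's samples): the constructor arguments shown (worker thread count, number of bins in each
dimension, and a maximum number of queued jobs) and the `objects` / `numObjects` scene data are assumptions for illustration, and the exact member
function signatures should be verified against `CullingThreadpool.h`.

```C++
// Drive the CullingThreadpool instead of calling moc directly. Assumes moc has already
// been created and had its resolution set as shown earlier, and that objects[] holds
// the same hypothetical per-object data as in the earlier interleaving sketch.
CullingThreadpool threadpool(4, 2, 2, 32); // worker threads, binsW, binsH, max queued jobs (verify against CullingThreadpool.h)
threadpool.SetBuffer(&moc);                // all occluder rendering and queries target this buffer
threadpool.WakeThreads();

// Per frame
threadpool.ClearBuffer();
for (int i = 0; i < numObjects; ++i)
{
    Object &obj = objects[i];              // sorted approximately front-to-back

    // The query runs against the buffer as the workers have built it so far; call
    // threadpool.Flush() first if it must see every occluder submitted up to this point
    obj.visible = threadpool.TestRect(obj.xmin, obj.ymin, obj.xmax, obj.ymax, obj.wmin) == MaskedOcclusionCulling::VISIBLE;

    if (obj.visible && obj.numOccluderTris > 0)
        threadpool.RenderTriangles(obj.occluderVerts, obj.occluderIndices, obj.numOccluderTris); // queued, returns immediately
}
threadpool.SuspendThreads();               // idle the workers until the next frame
```
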
## Compiling

The code has been reworked to support more platforms and compilers, such as [Intel C++ Compiler](https://software.intel.com/en-us/intel-compilers), [G++](https://gcc.gnu.org/)
and [LLVM/Clang](http://releases.llvm.org/download.html). The original Visual Studio 2015 projects remain and work with both ICC and Microsoft's compilers. Other compilers
are supported through [CMake](https://cmake.org/). See the `CMakeLists.txt` files in the `Example` and `FillrateTest` folders. You can use CMake to generate a
Visual Studio project for Clang on Windows:

```
md \Example\Clang
cd \Example\Clang
cmake -G"Visual Studio 14 2015 Win64" -T"LLVM-vs2014" ..
```

or build the library with G++/Clang on Linux systems (the `D3DValidate` sample only works on Windows as it relies on Direct3D):

```
mkdir /Example/Release
cd /Example/Release
cmake -DCMAKE_BUILD_TYPE=Release ..
make
```

Note that AVX-512 support is only experimental at the moment, and has only been verified through [Intel SDE](https://software.intel.com/en-us/articles/pre-release-license-agreement-for-intel-software-development-emulator-accept-end-user-license-agreement-and-download).
If using the original Visual Studio project, you need to "opt in" to AVX-512 support by setting `#define USE_AVX512 1`. When building with CMake you can
enable AVX-512 support using the `-DUSE_AVX512=ON` option:

```
cmake -DUSE_AVX512=ON -G"Visual Studio 14 2015 Win64" -T"LLVM-vs2014" ..
```

## Version History

* Version 1.4:
  * Added support for merging 2 depth buffers, as detailed in the GDC 2018 presentation.
  * Fixed profiling counters to be thread safe, removing a race condition when running the CullingThreadpool class.
* Version 1.3:
  * **Experimental**: Added support for AVX-512 capable CPUs. Currently only verified through [emulator](https://software.intel.com/en-us/articles/intel-software-development-emulator).
  * Added multiplatform support. Code now compiles on Visual C++ Compiler, Intel C++ Compiler, GCC, and Clang.
  * Added configurable backface culling, to support two-sided occluder rendering.
* Version 1.2:
  * Added support for threading, through a binning rasterizer. The `CullingThreadpool` class implements an example multi-threaded task system with a very similar
    API to the `MaskedOcclusionCulling` class.
  * Added support for higher precision rasterization, with DirectX and OpenGL compliant rasterization rules.
  * **Note:** The default screen space coordinate system has been changed from OpenGL to DirectX conventions. If you upgrade from an older version of the library
    this will flip the y coordinate of scissor boxes and the images returned by `ComputePixelDepthBuffer()`. Disabling the `USE_D3D` define changes back to OpenGL conventions.
* Version 1.1:
  * Added support for SSE4.1 and SSE2 capable CPUs for backwards compatibility.
The SSE versions must emulate some operations using + simpler instructions, and are therefore less efficient, with the SSE2 version having the lowest performance. +* Version 1.0: + * Initial revision, only support for AVX2 capable CPUs + +## Differences to the research paper + +This code does not exactly match implementation used in +["Masked Software Occlusion Culling"](https://software.intel.com/en-us/articles/masked-software-occlusion-culling), and performance may vary slightly +from what is presented in the research paper. We aimed for making the API as simple as possible and have removed many limitations, in particular +requirements on input data being aligned to SIMD boundaries. This affects performance slightly in both directions. Unaligned loads and +gathers are more costly, but unaligned data may be packed more efficiently in memory leading to fewer cache misses. + +## License agreement + +See the Apache 2.0 license.txt for full license agreement details. + +Disclaimer: + +This software is subject to the U.S. Export Administration Regulations and other U.S. +law, and may not be exported or re-exported to certain countries (Cuba, Iran, North +Korea, Sudan, and Syria) or to persons or entities prohibited from receiving U.S. +exports (including Denied Parties, Specially Designated Nationals, and entities on the +Bureau of Export Administration Entity List or involved with missile technology or +nuclear, chemical or biological weapons).. diff --git a/neo/libs/moc/license.txt b/neo/libs/moc/license.txt new file mode 100644 index 00000000..e71036ef --- /dev/null +++ b/neo/libs/moc/license.txt @@ -0,0 +1,181 @@ + +Apache License + Version 2.0, January 2004 + + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, and +distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by the copyright +owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all other entities +that control, are controlled by, or are under common control with that entity. +For the purposes of this definition, "control" means (i) the power, direct or +indirect, to cause the direction or management of such entity, whether by +contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the +outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity exercising +permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, including +but not limited to software source code, documentation source, and configuration +files. + +"Object" form shall mean any form resulting from mechanical transformation or +translation of a Source form, including but not limited to compiled object code, +generated documentation, and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or Object form, made +available under the License, as indicated by a copyright notice that is included +in or attached to the work (an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object form, that +is based on (or derived from) the Work and for which the editorial revisions, +annotations, elaborations, or other modifications represent, as a whole, an +original work of authorship. 
For the purposes of this License, Derivative Works +shall not include works that remain separable from, or merely link (or bind by +name) to the interfaces of, the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including the original version +of the Work and any modifications or additions to that Work or Derivative Works +thereof, that is intentionally submitted to Licensor for inclusion in the Work +by the copyright owner or by an individual or Legal Entity authorized to submit +on behalf of the copyright owner. For the purposes of this definition, +"submitted" means any form of electronic, verbal, or written communication sent +to the Licensor or its representatives, including but not limited to +communication on electronic mailing lists, source code control systems, and +issue tracking systems that are managed by, or on behalf of, the Licensor for +the purpose of discussing and improving the Work, but excluding communication +that is conspicuously marked or otherwise designated in writing by the copyright +owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity on behalf +of whom a Contribution has been received by Licensor and subsequently +incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of this +License, each Contributor hereby grants to You a perpetual, worldwide, +non-exclusive, no-charge, royalty-free, irrevocable copyright license to +reproduce, prepare Derivative Works of, publicly display, publicly perform, +sublicense, and distribute the Work and such Derivative Works in Source or +Object form. + +3. Grant of Patent License. Subject to the terms and conditions of this License, +each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, +no-charge, royalty-free, irrevocable (except as stated in this section) patent +license to make, have made, use, offer to sell, sell, import, and otherwise +transfer the Work, where such license applies only to those patent claims +licensable by such Contributor that are necessarily infringed by their +Contribution(s) alone or by combination of their Contribution(s) with the Work +to which such Contribution(s) was submitted. If You institute patent litigation +against any entity (including a cross-claim or counterclaim in a lawsuit) +alleging that the Work or a Contribution incorporated within the Work +constitutes direct or contributory patent infringement, then any patent licenses +granted to You under this License for that Work shall terminate as of the date +such litigation is filed. + +4. Redistribution. 
You may reproduce and distribute copies of the Work or +Derivative Works thereof in any medium, with or without modifications, and in +Source or Object form, provided that You meet the following conditions: + You must give any other recipients of the Work or Derivative Works a copy of + this License; and + + + You must cause any modified files to carry prominent notices stating that You + changed the files; and + + + You must retain, in the Source form of any Derivative Works that You + distribute, all copyright, patent, trademark, and attribution notices from the + Source form of the Work, excluding those notices that do not pertain to any + part of the Derivative Works; and + + + If the Work includes a "NOTICE" text file as part of its distribution, then + any Derivative Works that You distribute must include a readable copy of the + attribution notices contained within such NOTICE file, excluding those notices + that do not pertain to any part of the Derivative Works, in at least one of + the following places: within a NOTICE text file distributed as part of the + Derivative Works; within the Source form or documentation, if provided along + with the Derivative Works; or, within a display generated by the Derivative + Works, if and wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and do not modify the + License. You may add Your own attribution notices within Derivative Works that + You distribute, alongside or as an addendum to the NOTICE text from the Work, + provided that such additional attribution notices cannot be construed as + modifying the License. +You may add Your own copyright statement to Your modifications and may provide +additional or different license terms and conditions for use, reproduction, or +distribution of Your modifications, or for any such Derivative Works as a whole, +provided Your use, reproduction, and distribution of the Work otherwise complies +with the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, any +Contribution intentionally submitted for inclusion in the Work by You to the +Licensor shall be under the terms and conditions of this License, without any +additional terms or conditions. Notwithstanding the above, nothing herein shall +supersede or modify the terms of any separate license agreement you may have +executed with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade names, +trademarks, service marks, or product names of the Licensor, except as required +for reasonable and customary use in describing the origin of the Work and +reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or agreed to in +writing, Licensor provides the Work (and each Contributor provides its +Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied, including, without limitation, any warranties +or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A +PARTICULAR PURPOSE. You are solely responsible for determining the +appropriateness of using or redistributing the Work and assume any risks +associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, whether in +tort (including negligence), contract, or otherwise, unless required by +applicable law (such as deliberate and grossly negligent acts) or agreed to in +writing, shall any Contributor be liable to You for damages, including any +direct, indirect, special, incidental, or consequential damages of any character +arising as a result of this License or out of the use or inability to use the +Work (including but not limited to damages for loss of goodwill, work stoppage, +computer failure or malfunction, or any and all other commercial damages or +losses), even if such Contributor has been advised of the possibility of such +damages. + +9. Accepting Warranty or Additional Liability. While redistributing the Work or +Derivative Works thereof, You may choose to offer, and charge a fee for, +acceptance of support, warranty, indemnity, or other liability obligations +and/or rights consistent with this License. However, in accepting such +obligations, You may act only on Your own behalf and on Your sole +responsibility, not on behalf of any other Contributor, and only if You agree to +indemnify, defend, and hold each Contributor harmless for any liability incurred +by, or claims asserted against, such Contributor by reason of your accepting any +such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work + +To apply the Apache License to your work, attach the following boilerplate +notice, with the fields enclosed by brackets "[]" replaced with your own +identifying information. (Don't include the brackets!) The text should be +enclosed in the appropriate comment syntax for the file format. We also +recommend that a file or class name and description of purpose be included on +the same "printed page" as the copyright notice for easier identification within +third-party archives. + +Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, +Version 2.0 (the "License"); you may not use this file except in compliance with +the License. You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or +agreed to in writing, software distributed under the License is distributed on +an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +or implied. See the License for the specific language governing permissions and +limitations under the License. diff --git a/neo/renderer/RenderCommon.h b/neo/renderer/RenderCommon.h index 1428e296..af4a1375 100644 --- a/neo/renderer/RenderCommon.h +++ b/neo/renderer/RenderCommon.h @@ -1546,6 +1546,16 @@ void R_LinkDrawSurfToView( drawSurf_t* drawSurf, viewDef_t* viewDef ); void R_AddModels(); +/* +============================================================ + +TR_FRONTEND_MASKED_OCCLUSION_CULLING + +============================================================ +*/ + +void R_FillMaskedOcclusionBufferWithModels(); + /* ============================================================= diff --git a/neo/renderer/tr_frontend_addmodels.cpp b/neo/renderer/tr_frontend_addmodels.cpp index 00d17fda..ea0d8e17 100644 --- a/neo/renderer/tr_frontend_addmodels.cpp +++ b/neo/renderer/tr_frontend_addmodels.cpp @@ -3,7 +3,7 @@ Doom 3 BFG Edition GPL Source Code Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company. 
-Copyright (C) 2014-2016 Robert Beckebans +Copyright (C) 2014-2024 Robert Beckebans Copyright (C) 2014-2016 Kot in Action Creative Artel This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code"). @@ -46,64 +46,7 @@ idCVar r_lodMaterialDistance( "r_lodMaterialDistance", "500", CVAR_RENDERER | CV static const float CHECK_BOUNDS_EPSILON = 1.0f; -/* -================== -R_SortViewEntities -================== -*/ -viewEntity_t* R_SortViewEntities( viewEntity_t* vEntities ) -{ - SCOPED_PROFILE_EVENT( "R_SortViewEntities" ); - // We want to avoid having a single AddModel for something complex be - // the last thing processed and hurt the parallel occupancy, so - // sort dynamic models first, _area models second, then everything else. - viewEntity_t* dynamics = NULL; - viewEntity_t* areas = NULL; - viewEntity_t* others = NULL; - for( viewEntity_t* vEntity = vEntities; vEntity != NULL; ) - { - viewEntity_t* next = vEntity->next; - const idRenderModel* model = vEntity->entityDef->parms.hModel; - if( model->IsDynamicModel() != DM_STATIC ) - { - vEntity->next = dynamics; - dynamics = vEntity; - } - else if( model->IsStaticWorldModel() ) - { - vEntity->next = areas; - areas = vEntity; - } - else - { - vEntity->next = others; - others = vEntity; - } - vEntity = next; - } - - // concatenate the lists - viewEntity_t* all = others; - - for( viewEntity_t* vEntity = areas; vEntity != NULL; ) - { - viewEntity_t* next = vEntity->next; - vEntity->next = all; - all = vEntity; - vEntity = next; - } - - for( viewEntity_t* vEntity = dynamics; vEntity != NULL; ) - { - viewEntity_t* next = vEntity->next; - vEntity->next = all; - all = vEntity; - vEntity = next; - } - - return all; -} /* ================== @@ -1115,7 +1058,8 @@ void R_AddModels() { SCOPED_PROFILE_EVENT( "R_AddModels" ); - tr.viewDef->viewEntitys = R_SortViewEntities( tr.viewDef->viewEntitys ); + // RB: already done in R_FillMaskedOcclusionBufferWithModels + // tr.viewDef->viewEntitys = R_SortViewEntities( tr.viewDef->viewEntitys ); //------------------------------------------------- // Go through each view entity that is either visible to the view, or to diff --git a/neo/renderer/tr_frontend_main.cpp b/neo/renderer/tr_frontend_main.cpp index cc7e98af..4e86102e 100644 --- a/neo/renderer/tr_frontend_main.cpp +++ b/neo/renderer/tr_frontend_main.cpp @@ -645,6 +645,9 @@ void R_RenderView( viewDef_t* parms ) // wait for any shadow volume jobs from the previous frame to finish tr.frontEndJobList->Wait(); + // RB: render worldspawn geometry to the software culling buffer + R_FillMaskedOcclusionBufferWithModels(); + // make sure that interactions exist for all light / entity combinations that are visible // add any pre-generated light shadows, and calculate the light shader values R_AddLights(); diff --git a/neo/renderer/tr_frontend_masked_occlusion_culling.cpp b/neo/renderer/tr_frontend_masked_occlusion_culling.cpp new file mode 100644 index 00000000..93d9c484 --- /dev/null +++ b/neo/renderer/tr_frontend_masked_occlusion_culling.cpp @@ -0,0 +1,557 @@ +/* +=========================================================================== + +Doom 3 BFG Edition GPL Source Code +Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company. +Copyright (C) 2024 Robert Beckebans + +This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code"). 
+ +Doom 3 BFG Edition Source Code is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +Doom 3 BFG Edition Source Code is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with Doom 3 BFG Edition Source Code. If not, see . + +In addition, the Doom 3 BFG Edition Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 BFG Edition Source Code. If not, please request a copy in writing from id Software at the address below. + +If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA. + +=========================================================================== +*/ +#include "precompiled.h" +#pragma hdrstop + +#include "../libs/moc/MaskedOcclusionCulling.h" + +#include "RenderCommon.h" +#include "Model_local.h" + +static const float CHECK_BOUNDS_EPSILON = 1.0f; + +/* +================== +R_SortViewEntities +================== +*/ +viewEntity_t* R_SortViewEntities( viewEntity_t* vEntities ) +{ + SCOPED_PROFILE_EVENT( "R_SortViewEntities" ); + + // We want to avoid having a single AddModel for something complex be + // the last thing processed and hurt the parallel occupancy, so + // sort dynamic models first, _area models second, then everything else. + viewEntity_t* dynamics = NULL; + viewEntity_t* areas = NULL; + viewEntity_t* others = NULL; + for( viewEntity_t* vEntity = vEntities; vEntity != NULL; ) + { + viewEntity_t* next = vEntity->next; + const idRenderModel* model = vEntity->entityDef->parms.hModel; + if( model->IsDynamicModel() != DM_STATIC ) + { + vEntity->next = dynamics; + dynamics = vEntity; + } + else if( model->IsStaticWorldModel() ) + { + vEntity->next = areas; + areas = vEntity; + } + else + { + vEntity->next = others; + others = vEntity; + } + vEntity = next; + } + + // concatenate the lists + viewEntity_t* all = others; + + for( viewEntity_t* vEntity = areas; vEntity != NULL; ) + { + viewEntity_t* next = vEntity->next; + vEntity->next = all; + all = vEntity; + vEntity = next; + } + + for( viewEntity_t* vEntity = dynamics; vEntity != NULL; ) + { + viewEntity_t* next = vEntity->next; + vEntity->next = all; + all = vEntity; + vEntity = next; + } + + return all; +} + + +/* +=================== +R_RenderSingleModel + +May be run in parallel. + +Here is where dynamic models actually get instantiated, and necessary +interaction surfaces get created. This is all done on a sort-by-model +basis to keep source data in cache (most likely L2) as any interactions +and shadows are generated, since dynamic models will typically be lit by +two or more lights. +=================== +*/ +void R_RenderSingleModel( viewEntity_t* vEntity ) +{ + // we will add all interaction surfs here, to be chained to the lights in later serial code + vEntity->drawSurfs = NULL; + + // RB + vEntity->useLightGrid = false; + + // globals we really should pass in... 
+ const viewDef_t* viewDef = tr.viewDef; + + idRenderEntityLocal* entityDef = vEntity->entityDef; + const renderEntity_t* renderEntity = &entityDef->parms; + const idRenderWorldLocal* world = entityDef->world; + + if( viewDef->isXraySubview && entityDef->parms.xrayIndex == 1 ) + { + return; + } + else if( !viewDef->isXraySubview && entityDef->parms.xrayIndex == 2 ) + { + return; + } + + SCOPED_PROFILE_EVENT( renderEntity->hModel == NULL ? "Unknown Model" : renderEntity->hModel->Name() ); + + // calculate the znear for testing whether or not the view is inside a shadow projection + const float znear = ( viewDef->renderView.cramZNear ) ? ( r_znear.GetFloat() * 0.25f ) : r_znear.GetFloat(); + + // if the entity wasn't seen through a portal chain, it was added just for light shadows + const bool modelIsVisible = !vEntity->scissorRect.IsEmpty(); + const bool addInteractions = modelIsVisible && ( !viewDef->isXraySubview || entityDef->parms.xrayIndex == 2 ); + const int entityIndex = entityDef->index; + + extern idCVar r_lodMaterialDistance; + + //--------------------------- + // Find which of the visible lights contact this entity + // + // If the entity doesn't accept light or cast shadows from any surface, + // this can be skipped. + // + // OPTIMIZE: world areas can assume all referenced lights are used + //--------------------------- + int numContactedLights = 0; + static const int MAX_CONTACTED_LIGHTS = 128; + viewLight_t* contactedLights[MAX_CONTACTED_LIGHTS]; + idInteraction* staticInteractions[MAX_CONTACTED_LIGHTS]; + + if( renderEntity->hModel == NULL || + renderEntity->hModel->ModelHasInteractingSurfaces() || + renderEntity->hModel->ModelHasShadowCastingSurfaces() ) + { + SCOPED_PROFILE_EVENT( "Find lights" ); + for( viewLight_t* vLight = viewDef->viewLights; vLight != NULL; vLight = vLight->next ) + { + if( vLight->scissorRect.IsEmpty() ) + { + continue; + } + if( vLight->entityInteractionState != NULL ) + { + // new code path, everything was done in AddLight + if( vLight->entityInteractionState[entityIndex] == viewLight_t::INTERACTION_YES ) + { + contactedLights[numContactedLights] = vLight; + staticInteractions[numContactedLights] = world->interactionTable[vLight->lightDef->index * world->interactionTableWidth + entityIndex]; + if( ++numContactedLights == MAX_CONTACTED_LIGHTS ) + { + break; + } + } + continue; + } + + const idRenderLightLocal* lightDef = vLight->lightDef; + + if( !lightDef->globalLightBounds.IntersectsBounds( entityDef->globalReferenceBounds ) ) + { + continue; + } + + if( R_CullModelBoundsToLight( lightDef, entityDef->localReferenceBounds, entityDef->modelRenderMatrix ) ) + { + continue; + } + + if( !modelIsVisible ) + { + // some lights have their center of projection outside the world + if( lightDef->areaNum != -1 ) + { + // if no part of the model is in an area that is connected to + // the light center (it is behind a solid, closed door), we can ignore it + bool areasConnected = false; + for( areaReference_t* ref = entityDef->entityRefs; ref != NULL; ref = ref->ownerNext ) + { + if( world->AreasAreConnected( lightDef->areaNum, ref->area->areaNum, PS_BLOCK_VIEW ) ) + { + areasConnected = true; + break; + } + } + if( areasConnected == false ) + { + // can't possibly be seen or shadowed + continue; + } + } + + // check more precisely for shadow visibility + idBounds shadowBounds; + R_ShadowBounds( entityDef->globalReferenceBounds, lightDef->globalLightBounds, lightDef->globalLightOrigin, shadowBounds ); + + // this doesn't say that the shadow can't effect 
anything, only that it can't + // effect anything in the view + if( idRenderMatrix::CullBoundsToMVP( viewDef->worldSpace.mvp, shadowBounds ) ) + { + continue; + } + } + contactedLights[numContactedLights] = vLight; + staticInteractions[numContactedLights] = world->interactionTable[vLight->lightDef->index * world->interactionTableWidth + entityIndex]; + if( ++numContactedLights == MAX_CONTACTED_LIGHTS ) + { + break; + } + } + } + + // if we aren't visible and none of the shadows stretch into the view, + // we don't need to do anything else + if( !modelIsVisible && numContactedLights == 0 ) + { + return; + } + + //--------------------------- + // create a dynamic model if the geometry isn't static + //--------------------------- + idRenderModel* model = R_EntityDefDynamicModel( entityDef ); + if( model == NULL || model->NumSurfaces() <= 0 ) + { + return; + } + + //--------------------------- + // copy matrix related stuff for back-end use + // and setup a render matrix for faster culling + //--------------------------- + vEntity->modelDepthHack = renderEntity->modelDepthHack; + vEntity->weaponDepthHack = renderEntity->weaponDepthHack; + vEntity->skipMotionBlur = renderEntity->skipMotionBlur; + + memcpy( vEntity->modelMatrix, entityDef->modelMatrix, sizeof( vEntity->modelMatrix ) ); + R_MatrixMultiply( entityDef->modelMatrix, viewDef->worldSpace.modelViewMatrix, vEntity->modelViewMatrix ); + + idRenderMatrix viewMat; + idRenderMatrix::Transpose( *( idRenderMatrix* )vEntity->modelViewMatrix, viewMat ); + idRenderMatrix::Multiply( viewDef->projectionRenderMatrix, viewMat, vEntity->mvp ); + if( renderEntity->weaponDepthHack ) + { + idRenderMatrix::ApplyDepthHack( vEntity->mvp ); + } + if( renderEntity->modelDepthHack != 0.0f ) + { + idRenderMatrix::ApplyModelDepthHack( vEntity->mvp, renderEntity->modelDepthHack ); + } + + // local light and view origins are used to determine if the view is definitely outside + // an extruded shadow volume, which means we can skip drawing the end caps + idVec3 localViewOrigin; + R_GlobalPointToLocal( vEntity->modelMatrix, viewDef->renderView.vieworg, localViewOrigin ); + + //--------------------------- + // add all the model surfaces + //--------------------------- + for( int surfaceNum = 0; surfaceNum < model->NumSurfaces(); surfaceNum++ ) + { + const modelSurface_t* surf = model->Surface( surfaceNum ); + + // for debugging, only show a single surface at a time + if( r_singleSurface.GetInteger() >= 0 && surfaceNum != r_singleSurface.GetInteger() ) + { + continue; + } + + srfTriangles_t* tri = surf->geometry; + if( tri == NULL ) + { + continue; + } + if( tri->numIndexes == 0 ) + { + continue; // happens for particles + } + const idMaterial* shader = surf->shader; + if( shader == NULL ) + { + continue; + } + + // motorsep 11-24-2014; checking for LOD surface for LOD1 iteration + if( shader->IsLOD() ) + { + // foresthale 2014-11-24: calculate the bounds and get the distance from camera to bounds + idBounds& localBounds = tri->bounds; + if( tri->staticModelWithJoints ) + { + // skeletal models have difficult to compute bounds for surfaces, so use the whole entity + localBounds = vEntity->entityDef->localReferenceBounds; + } + const float* bounds = localBounds.ToFloatPtr(); + idVec3 nearestPointOnBounds = localViewOrigin; + nearestPointOnBounds.x = Max( nearestPointOnBounds.x, bounds[0] ); + nearestPointOnBounds.x = Min( nearestPointOnBounds.x, bounds[3] ); + nearestPointOnBounds.y = Max( nearestPointOnBounds.y, bounds[1] ); + nearestPointOnBounds.y = Min( 
nearestPointOnBounds.y, bounds[4] ); + nearestPointOnBounds.z = Max( nearestPointOnBounds.z, bounds[2] ); + nearestPointOnBounds.z = Min( nearestPointOnBounds.z, bounds[5] ); + idVec3 delta = nearestPointOnBounds - localViewOrigin; + float distance = delta.LengthFast(); + + if( !shader->IsLODVisibleForDistance( distance, r_lodMaterialDistance.GetFloat() ) ) + { + continue; + } + } + + // foresthale 2014-09-01: don't skip surfaces that use the "forceShadows" flag + if( !shader->IsDrawn() && !shader->SurfaceCastsShadow() ) + { + continue; // collision hulls, etc + } + + // RemapShaderBySkin + if( entityDef->parms.customShader != NULL ) + { + // this is sort of a hack, but causes deformed surfaces to map to empty surfaces, + // so the item highlight overlay doesn't highlight the autosprite surface + if( shader->Deform() ) + { + continue; + } + shader = entityDef->parms.customShader; + } + else if( entityDef->parms.customSkin ) + { + shader = entityDef->parms.customSkin->RemapShaderBySkin( shader ); + if( shader == NULL ) + { + continue; + } + // foresthale 2014-09-01: don't skip surfaces that use the "forceShadows" flag + if( !shader->IsDrawn() && !shader->SurfaceCastsShadow() ) + { + continue; + } + } + + // optionally override with the renderView->globalMaterial + if( tr.primaryRenderView.globalMaterial != NULL ) + { + shader = tr.primaryRenderView.globalMaterial; + } + + SCOPED_PROFILE_EVENT( shader->GetName() ); + + // debugging tool to make sure we have the correct pre-calculated bounds + if( r_checkBounds.GetBool() ) + { + for( int j = 0; j < tri->numVerts; j++ ) + { + int k; + for( k = 0; k < 3; k++ ) + { + if( tri->verts[j].xyz[k] > tri->bounds[1][k] + CHECK_BOUNDS_EPSILON + || tri->verts[j].xyz[k] < tri->bounds[0][k] - CHECK_BOUNDS_EPSILON ) + { + common->Printf( "bad tri->bounds on %s:%s\n", entityDef->parms.hModel->Name(), shader->GetName() ); + break; + } + if( tri->verts[j].xyz[k] > entityDef->localReferenceBounds[1][k] + CHECK_BOUNDS_EPSILON + || tri->verts[j].xyz[k] < entityDef->localReferenceBounds[0][k] - CHECK_BOUNDS_EPSILON ) + { + common->Printf( "bad referenceBounds on %s:%s\n", entityDef->parms.hModel->Name(), shader->GetName() ); + break; + } + } + if( k != 3 ) + { + break; + } + } + } + + // view frustum culling for the precise surface bounds, which is tighter + // than the entire entity reference bounds + // If the entire model wasn't visible, there is no need to check the + // individual surfaces. 
+ const bool surfaceDirectlyVisible = modelIsVisible && !idRenderMatrix::CullBoundsToMVP( vEntity->mvp, tri->bounds ); + + // RB: added check wether GPU skinning is available at all + const bool gpuSkinned = ( tri->staticModelWithJoints != NULL && r_useGPUSkinning.GetBool() ); + // RB end + + //-------------------------- + // base drawing surface + //-------------------------- + const float* shaderRegisters = NULL; + drawSurf_t* baseDrawSurf = NULL; + if( surfaceDirectlyVisible && shader->IsDrawn() ) + { + // TODO render to masked occlusion buffer + + /* + // make sure we have an ambient cache and all necessary normals / tangents + if( !vertexCache.CacheIsCurrent( tri->indexCache ) ) + { + tri->indexCache = vertexCache.AllocIndex( tri->indexes, tri->numIndexes ); + } + + if( !vertexCache.CacheIsCurrent( tri->ambientCache ) ) + { + // we are going to use it for drawing, so make sure we have the tangents and normals + if( shader->ReceivesLighting() && !tri->tangentsCalculated ) + { + assert( tri->staticModelWithJoints == NULL ); + R_DeriveTangents( tri ); + + // RB: this was hit by parametric particle models .. + //assert( false ); // this should no longer be hit + // RB end + } + tri->ambientCache = vertexCache.AllocVertex( tri->verts, tri->numVerts ); + } + */ + + /* + // add the surface for drawing + // we can re-use some of the values for light interaction surfaces + baseDrawSurf = ( drawSurf_t* )R_FrameAlloc( sizeof( *baseDrawSurf ), FRAME_ALLOC_DRAW_SURFACE ); + baseDrawSurf->frontEndGeo = tri; + baseDrawSurf->space = vEntity; + baseDrawSurf->scissorRect = vEntity->scissorRect; + baseDrawSurf->extraGLState = 0; + + R_SetupDrawSurfShader( baseDrawSurf, shader, renderEntity ); + + shaderRegisters = baseDrawSurf->shaderRegisters; + + // Check for deformations (eyeballs, flares, etc) + const deform_t shaderDeform = shader->Deform(); + if( shaderDeform != DFRM_NONE ) + { + drawSurf_t* deformDrawSurf = R_DeformDrawSurf( baseDrawSurf ); + if( deformDrawSurf != NULL ) + { + // any deforms may have created multiple draw surfaces + for( drawSurf_t* surf = deformDrawSurf, * next = NULL; surf != NULL; surf = next ) + { + next = surf->nextOnLight; + + surf->linkChain = NULL; + surf->nextOnLight = vEntity->drawSurfs; + vEntity->drawSurfs = surf; + } + } + } + + // Most deform source surfaces do not need to be rendered. + // However, particles are rendered in conjunction with the source surface. 
+ if( shaderDeform == DFRM_NONE || shaderDeform == DFRM_PARTICLE || shaderDeform == DFRM_PARTICLE2 ) + { + // copy verts and indexes to this frame's hardware memory if they aren't already there + if( !vertexCache.CacheIsCurrent( tri->ambientCache ) ) + { + tri->ambientCache = vertexCache.AllocVertex( tri->verts, tri->numVerts ); + } + if( !vertexCache.CacheIsCurrent( tri->indexCache ) ) + { + tri->indexCache = vertexCache.AllocIndex( tri->indexes, tri->numIndexes ); + } + + R_SetupDrawSurfJoints( baseDrawSurf, tri, shader ); + + baseDrawSurf->numIndexes = tri->numIndexes; + baseDrawSurf->ambientCache = tri->ambientCache; + baseDrawSurf->indexCache = tri->indexCache; + + baseDrawSurf->linkChain = NULL; // link to the view + baseDrawSurf->nextOnLight = vEntity->drawSurfs; + vEntity->drawSurfs = baseDrawSurf; + } + */ + } + } +} + +//REGISTER_PARALLEL_JOB( R_AddSingleModel, "R_AddSingleModel" ); + + + +/* +=================== +R_FillMaskedOcclusionBufferWithModels +=================== +*/ +void R_FillMaskedOcclusionBufferWithModels() +{ + SCOPED_PROFILE_EVENT( "R_FillMaskedOcclusionBufferWithModels" ); + + tr.viewDef->viewEntitys = R_SortViewEntities( tr.viewDef->viewEntitys ); + + //------------------------------------------------- + // Go through each view entity that is either visible to the view, or to + // any light that intersects the view (for shadows). + //------------------------------------------------- + + /* + if( r_useParallelAddModels.GetBool() ) + { + for( viewEntity_t* vEntity = tr.viewDef->viewEntitys; vEntity != NULL; vEntity = vEntity->next ) + { + tr.frontEndJobList->AddJob( ( jobRun_t )R_AddSingleModel, vEntity ); + } + tr.frontEndJobList->Submit(); + tr.frontEndJobList->Wait(); + } + else + */ + { + for( viewEntity_t* vEntity = tr.viewDef->viewEntitys; vEntity != NULL; vEntity = vEntity->next ) + { + const idRenderModel* model = vEntity->entityDef->parms.hModel; + + // skip after rendering BSP area models + if( !model->IsStaticWorldModel() ) + { + break; + } + + R_RenderSingleModel( vEntity ); + } + } +}