From f0c61a3f552fd4655aff8202bd92b0ab80396101 Mon Sep 17 00:00:00 2001 From: Stephen Saunders Date: Tue, 28 Feb 2023 18:02:45 -0500 Subject: [PATCH] Use command queue sync method vs. device sync for higher performance / frame rate --- neo/cmake-xcode-debug.sh | 7 +-- neo/idlib/precompiled.h | 3 +- neo/renderer/NVRHI/RenderBackend_NVRHI.cpp | 10 ++-- neo/sys/DeviceManager.h | 1 - neo/sys/DeviceManager_DX12.cpp | 11 +++++ neo/sys/DeviceManager_VK.cpp | 54 ++++++---------------- 6 files changed, 37 insertions(+), 49 deletions(-) diff --git a/neo/cmake-xcode-debug.sh b/neo/cmake-xcode-debug.sh index 66d77134..a2ee3f06 100755 --- a/neo/cmake-xcode-debug.sh +++ b/neo/cmake-xcode-debug.sh @@ -5,6 +5,7 @@ cd xcode-debug # note 1: remove or set -DCMAKE_SUPPRESS_REGENERATION=OFF to reenable ZERO_CHECK target which checks for CMakeLists.txt changes and re-runs CMake before builds # however, if ZERO_CHECK is reenabled **must** add VULKAN_SDK location to Xcode Custom Paths (under Prefs/Locations) otherwise build failures may occur # note 2: policy CMAKE_POLICY_DEFAULT_CMP0142=NEW suppresses non-existant per-config suffixes on Xcode library search paths, works for cmake version 3.25 and later -#note 3: env variable MVK_CONFIG_FULL_IMAGE_VIEW_SWIZZLE=1 enables MoltenVK's image view swizzle which may be required on older macOS versions or hardware (see vulkaninfo) -# note 4: env variable MVK_CONFIG_USE_METAL_ARGUMENT_BUFFERS=2 enables MoltenVK's use of Metal argument buffers only if VK_EXT_descriptor_indexing is enabled -cmake -G Xcode -DCMAKE_BUILD_TYPE=Debug -DCMAKE_XCODE_GENERATE_SCHEME=ON -DCMAKE_XCODE_SCHEME_ENVIRONMENT="MVK_CONFIG_FULL_IMAGE_VIEW_SWIZZLE=1;MVK_CONFIG_USE_METAL_ARGUMENT_BUFFERS=2" -DCMAKE_SUPPRESS_REGENERATION=ON -DOPENAL_LIBRARY=/usr/local/opt/openal-soft/lib/libopenal.dylib -DOPENAL_INCLUDE_DIR=/usr/local/opt/openal-soft/include ../neo -DCMAKE_POLICY_DEFAULT_CMP0142=NEW -Wno-dev +# note 3: env variable MVK_CONFIG_FULL_IMAGE_VIEW_SWIZZLE=1 enables MoltenVK's image view swizzle which may be required on older macOS versions or hardware (see vulkaninfo) +# note 4: env variable MVK_CONFIG_SYNCHRONOUS_QUEUE_SUBMITS=1 enforces synchronous queue submits which is required for the synchronization method used by the game +# note 5: env variable MVK_CONFIG_USE_METAL_ARGUMENT_BUFFERS=2 enables MoltenVK's use of Metal argument buffers only if VK_EXT_descriptor_indexing is enabled +cmake -G Xcode -DCMAKE_BUILD_TYPE=Debug -DCMAKE_XCODE_GENERATE_SCHEME=ON -DCMAKE_XCODE_SCHEME_ENVIRONMENT="MVK_CONFIG_FULL_IMAGE_VIEW_SWIZZLE=1;MVK_CONFIG_SYNCHRONOUS_QUEUE_SUBMITS=1;MVK_CONFIG_USE_METAL_ARGUMENT_BUFFERS=2" -DCMAKE_SUPPRESS_REGENERATION=ON -DOPENAL_LIBRARY=/usr/local/opt/openal-soft/lib/libopenal.dylib -DOPENAL_INCLUDE_DIR=/usr/local/opt/openal-soft/include ../neo -DCMAKE_POLICY_DEFAULT_CMP0142=NEW -Wno-dev diff --git a/neo/idlib/precompiled.h b/neo/idlib/precompiled.h index 90dfe8d3..3ac48bd2 100644 --- a/neo/idlib/precompiled.h +++ b/neo/idlib/precompiled.h @@ -90,8 +90,9 @@ const int MAX_EXPRESSION_REGISTERS = 4096; // everything that is needed by the backend needs // to be double buffered to allow it to run in // parallel on a dual cpu machine -#if defined(__APPLE__) && ( defined( USE_VULKAN ) || defined( USE_NVRHI ) ) +#if ( defined(__APPLE__) && defined( USE_VULKAN ) ) || defined( USE_NVRHI ) // SRS - macOS MoltenVK/Metal needs triple buffering for full screen to work properly + // SRS - use triple buffering for NVRHI with command queue event query sync method const uint32 NUM_FRAME_DATA = 3; #else const uint32 NUM_FRAME_DATA = 2; diff --git a/neo/renderer/NVRHI/RenderBackend_NVRHI.cpp b/neo/renderer/NVRHI/RenderBackend_NVRHI.cpp index f985b942..b1acac89 100644 --- a/neo/renderer/NVRHI/RenderBackend_NVRHI.cpp +++ b/neo/renderer/NVRHI/RenderBackend_NVRHI.cpp @@ -1597,14 +1597,16 @@ We want to exit this with the GPU idle, right at vsync void idRenderBackend::GL_BlockingSwapBuffers() { // Make sure that all frames have finished rendering - deviceManager->GetDevice()->waitForIdle(); - - // Release all in-flight references to the render targets - deviceManager->GetDevice()->runGarbageCollection(); + // SRS - device-level sync kills perf by serializing command queue processing (CPU) and rendering (GPU) + // - instead, use alternative sync method (based on command queue event queries) inside Present() + //deviceManager->GetDevice()->waitForIdle(); // Present to the swap chain. deviceManager->Present(); + // Release all in-flight references to the render targets + deviceManager->GetDevice()->runGarbageCollection(); + renderLog.EndFrame(); if( deviceManager->GetGraphicsAPI() == nvrhi::GraphicsAPI::VULKAN ) diff --git a/neo/sys/DeviceManager.h b/neo/sys/DeviceManager.h index 60e1e90b..02d5e6a0 100644 --- a/neo/sys/DeviceManager.h +++ b/neo/sys/DeviceManager.h @@ -61,7 +61,6 @@ struct DeviceCreationParameters nvrhi::Format swapChainFormat = nvrhi::Format::RGBA8_UNORM; // RB: don't do the sRGB gamma ramp with the swapchain uint32_t swapChainSampleCount = 1; uint32_t swapChainSampleQuality = 0; - uint32_t maxFramesInFlight = 2; bool enableDebugRuntime = false; bool enableNvrhiValidationLayer = false; bool vsyncEnabled = false; diff --git a/neo/sys/DeviceManager_DX12.cpp b/neo/sys/DeviceManager_DX12.cpp index 0be4f182..e357fed3 100644 --- a/neo/sys/DeviceManager_DX12.cpp +++ b/neo/sys/DeviceManager_DX12.cpp @@ -61,6 +61,7 @@ class DeviceManager_DX12 : public DeviceManager std::vector m_RhiSwapChainBuffers; RefCountPtr m_FrameFence; std::vector m_FrameFenceEvents; + nvrhi::EventQueryHandle m_FrameWaitQuery; UINT64 m_FrameCount = 1; @@ -447,6 +448,9 @@ bool DeviceManager_DX12::CreateDeviceAndSwapChain() m_FrameFenceEvents.push_back( CreateEvent( nullptr, false, true, NULL ) ); } + m_FrameWaitQuery = nvrhiDevice->createEventQuery(); + nvrhiDevice->setEventQuery( m_FrameWaitQuery, nvrhi::CommandQueue::Graphics ); + return true; } @@ -459,6 +463,8 @@ void DeviceManager_DX12::DestroyDeviceAndSwapChain() nvrhiDevice = nullptr; + m_FrameWaitQuery = nullptr; + for( auto fenceEvent : m_FrameFenceEvents ) { WaitForSingleObject( fenceEvent, INFINITE ); @@ -648,6 +654,11 @@ void DeviceManager_DX12::Present() // SRS - Don't change deviceParms.vsyncEnabled here, simply test for vsync mode 2 to set DXGI SyncInterval m_SwapChain->Present( deviceParms.vsyncEnabled && r_swapInterval.GetInteger() == 2 ? 1 : 0, presentFlags ); + // SRS - Sync on previous frame's command queue completion vs. waitForIdle() on whole device + nvrhiDevice->waitEventQuery( m_FrameWaitQuery ); + nvrhiDevice->resetEventQuery( m_FrameWaitQuery ); + nvrhiDevice->setEventQuery( m_FrameWaitQuery, nvrhi::CommandQueue::Graphics ); + m_FrameFence->SetEventOnCompletion( m_FrameCount, m_FrameFenceEvents[bufferIndex] ); m_GraphicsQueue->Signal( m_FrameFence, m_FrameCount ); m_FrameCount++; diff --git a/neo/sys/DeviceManager_VK.cpp b/neo/sys/DeviceManager_VK.cpp index 9b760356..d2314abd 100644 --- a/neo/sys/DeviceManager_VK.cpp +++ b/neo/sys/DeviceManager_VK.cpp @@ -272,8 +272,7 @@ private: nvrhi::CommandListHandle m_BarrierCommandList; vk::Semaphore m_PresentSemaphore; - std::queue m_FramesInFlight; - std::vector m_QueryPool; + nvrhi::EventQueryHandle m_FrameWaitQuery; // SRS - flag indicating support for eFifoRelaxed surface presentation (r_swapInterval = 1) mode bool enablePModeFifoRelaxed = false; @@ -1124,6 +1123,10 @@ bool DeviceManager_VK::CreateDeviceAndSwapChain() vkGetMoltenVKConfigurationMVK( m_VulkanInstance, &pConfig, &pConfigSize ); + // SRS - Enforce synchronous queue submission for vkQueueSubmit() & vkQueuePresentKHR() + pConfig.synchronousQueueSubmits = VK_TRUE; + vkSetMoltenVKConfigurationMVK( m_VulkanInstance, &pConfig, &pConfigSize ); + // SRS - If we don't have native image view swizzle, enable MoltenVK's image view swizzle feature if( portabilityFeatures.imageViewFormatSwizzle == VK_FALSE ) { @@ -1182,6 +1185,9 @@ bool DeviceManager_VK::CreateDeviceAndSwapChain() m_PresentSemaphore = m_VulkanDevice.createSemaphore( vk::SemaphoreCreateInfo() ); + m_FrameWaitQuery = m_NvrhiDevice->createEventQuery(); + m_NvrhiDevice->setEventQuery( m_FrameWaitQuery, nvrhi::CommandQueue::Graphics ); + #undef CHECK return true; @@ -1191,25 +1197,13 @@ void DeviceManager_VK::DestroyDeviceAndSwapChain() { destroySwapChain(); + m_FrameWaitQuery = nullptr; + m_VulkanDevice.destroySemaphore( m_PresentSemaphore ); m_PresentSemaphore = vk::Semaphore(); m_BarrierCommandList = nullptr; - while( m_FramesInFlight.size() > 0 ) - { - auto query = m_FramesInFlight.front(); - m_FramesInFlight.pop(); - query = nullptr; - } - - if( !m_QueryPool.empty() ) - { - auto query = m_QueryPool.back(); - m_QueryPool.pop_back(); - query = nullptr; - } - m_NvrhiDevice = nullptr; m_ValidationLayer = nullptr; m_RendererString.clear(); @@ -1294,37 +1288,17 @@ void DeviceManager_VK::Present() } else { -#ifndef _WIN32 if( deviceParms.vsyncEnabled ) { m_PresentQueue.waitIdle(); } -#endif - - while( m_FramesInFlight.size() > deviceParms.maxFramesInFlight ) - { - auto query = m_FramesInFlight.front(); - m_FramesInFlight.pop(); - - m_NvrhiDevice->waitEventQuery( query ); - - m_QueryPool.push_back( query ); - } - - nvrhi::EventQueryHandle query; - if( !m_QueryPool.empty() ) - { - query = m_QueryPool.back(); - m_QueryPool.pop_back(); - } + // SRS - Sync on previous frame's command queue completion vs. waitForIdle() on whole device else { - query = m_NvrhiDevice->createEventQuery(); + m_NvrhiDevice->waitEventQuery( m_FrameWaitQuery ); + m_NvrhiDevice->resetEventQuery( m_FrameWaitQuery ); + m_NvrhiDevice->setEventQuery( m_FrameWaitQuery, nvrhi::CommandQueue::Graphics ); } - - m_NvrhiDevice->resetEventQuery( query ); - m_NvrhiDevice->setEventQuery( query, nvrhi::CommandQueue::Graphics ); - m_FramesInFlight.push( query ); } }