Upload using the copy queue in Vulkan, which should utilize PCIe DMA transfers. Unfortunately, it doesn't seem to be faster.

This commit is contained in:
Magnus Norddahl 2019-11-23 18:19:22 +01:00
parent bff22bbd81
commit 70842720d2
7 changed files with 160 additions and 23 deletions

View file

@ -309,7 +309,7 @@ public:
void addImage(VulkanImage *image, VkImageLayout oldLayout, VkImageLayout newLayout, VkAccessFlags srcAccessMask, VkAccessFlags dstAccessMask, VkImageAspectFlags aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, int baseMipLevel = 0, int levelCount = 1);
void addImage(VkImage image, VkImageLayout oldLayout, VkImageLayout newLayout, VkAccessFlags srcAccessMask, VkAccessFlags dstAccessMask, VkImageAspectFlags aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, int baseMipLevel = 0, int levelCount = 1);
void addQueueTransfer(int srcFamily, int dstFamily, VulkanBuffer *buffer, VkAccessFlags srcAccessMask, VkAccessFlags dstAccessMask);
void addQueueTransfer(int srcFamily, int dstFamily, VulkanImage *image, VkImageLayout layout, VkImageAspectFlags aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, int baseMipLevel = 0, int levelCount = 1);
void addQueueTransfer(int srcFamily, int dstFamily, VulkanImage *image, VkImageLayout oldlayout, VkImageLayout newlayout, VkImageAspectFlags aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, int baseMipLevel = 0, int levelCount = 1);
void execute(VulkanCommandBuffer *commandBuffer, VkPipelineStageFlags srcStageMask, VkPipelineStageFlags dstStageMask, VkDependencyFlags dependencyFlags = 0);
@ -1234,12 +1234,12 @@ inline void PipelineBarrier::addQueueTransfer(int srcFamily, int dstFamily, Vulk
bufferMemoryBarriers.push_back(barrier);
}
inline void PipelineBarrier::addQueueTransfer(int srcFamily, int dstFamily, VulkanImage *image, VkImageLayout layout, VkImageAspectFlags aspectMask, int baseMipLevel, int levelCount)
inline void PipelineBarrier::addQueueTransfer(int srcFamily, int dstFamily, VulkanImage *image, VkImageLayout oldlayout, VkImageLayout newlayout, VkImageAspectFlags aspectMask, int baseMipLevel, int levelCount)
{
VkImageMemoryBarrier barrier = { };
barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
barrier.oldLayout = layout;
barrier.newLayout = layout;
barrier.oldLayout = oldlayout;
barrier.newLayout = newlayout;
barrier.srcQueueFamilyIndex = srcFamily;
barrier.dstQueueFamilyIndex = dstFamily;
barrier.image = image->image;

View file

@ -163,6 +163,18 @@ void VulkanDevice::SelectPhysicalDevice()
}
}
// Search for a transfer family made specifically for uploading. For nvidia this allows us to upload using DMA transfers via PCIe.
// To identify it, we look for a transfer family that must not have graphics or compute capabilities.
for (int i = 0; i < (int)info.QueueFamilies.size(); i++)
{
const auto& queueFamily = info.QueueFamilies[i];
if (queueFamily.queueCount > 0 && (queueFamily.queueFlags & (VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT)) == 0 && (queueFamily.queueFlags & VK_QUEUE_TRANSFER_BIT))
{
dev.copyQueueTransferFamily = i;
break;
}
}
if (dev.graphicsFamily != -1 && dev.presentFamily != -1)
{
SupportedDevices.push_back(dev);
@ -206,6 +218,7 @@ void VulkanDevice::SelectPhysicalDevice()
PhysicalDevice = *SupportedDevices[selected].device;
graphicsFamily = SupportedDevices[selected].graphicsFamily;
presentFamily = SupportedDevices[selected].presentFamily;
copyQueueTransferFamily = SupportedDevices[selected].copyQueueTransferFamily;
graphicsTimeQueries = SupportedDevices[selected].graphicsTimeQueries;
}
@ -234,6 +247,8 @@ void VulkanDevice::CreateDevice()
std::set<int> neededFamilies;
neededFamilies.insert(graphicsFamily);
neededFamilies.insert(presentFamily);
if (copyQueueTransferFamily != -1)
neededFamilies.insert(copyQueueTransferFamily);
for (int index : neededFamilies)
{
@ -261,6 +276,9 @@ void VulkanDevice::CreateDevice()
vkGetDeviceQueue(device, graphicsFamily, 0, &graphicsQueue);
vkGetDeviceQueue(device, presentFamily, 0, &presentQueue);
if (copyQueueTransferFamily != -1)
vkGetDeviceQueue(device, copyQueueTransferFamily, 0, &copyQueue);
}
void VulkanDevice::CreateSurface()

View file

@ -30,6 +30,7 @@ public:
VulkanPhysicalDevice *device = nullptr;
int graphicsFamily = -1;
int presentFamily = -1;
int copyQueueTransferFamily = -1;
bool graphicsTimeQueries = false;
};
@ -72,11 +73,13 @@ public:
VkDevice device = VK_NULL_HANDLE;
VmaAllocator allocator = VK_NULL_HANDLE;
VkQueue copyQueue = VK_NULL_HANDLE;
VkQueue graphicsQueue = VK_NULL_HANDLE;
VkQueue presentQueue = VK_NULL_HANDLE;
int graphicsFamily = -1;
int presentFamily = -1;
int copyQueueTransferFamily = -1;
bool graphicsTimeQueries = false;
private:

View file

@ -146,6 +146,8 @@ void VulkanFrameBuffer::InitializeState()
maxuniformblock = device->PhysicalDevice.Properties.limits.maxUniformBufferRange;
mCommandPool.reset(new VulkanCommandPool(device, device->graphicsFamily));
if (device->copyQueueTransferFamily != -1)
mCopyQueueCommandPool.reset(new VulkanCommandPool(device, device->copyQueueTransferFamily));
mScreenBuffers.reset(new VkRenderBuffers());
mSaveBuffers.reset(new VkRenderBuffers());
@ -218,7 +220,7 @@ void VulkanFrameBuffer::DeleteFrameObjects()
FrameDeleteList.CommandBuffers.clear();
}
void VulkanFrameBuffer::FlushCommands(VulkanCommandBuffer **commands, size_t count, bool finish, bool lastsubmit)
void VulkanFrameBuffer::FlushCommands(VkQueue queue, VulkanCommandBuffer **commands, size_t count, bool finish, bool lastsubmit)
{
int currentIndex = mNextSubmit % maxConcurrentSubmitCount;
@ -245,7 +247,7 @@ void VulkanFrameBuffer::FlushCommands(VulkanCommandBuffer **commands, size_t cou
if (!lastsubmit)
submit.addSignal(mSubmitSemaphore[currentIndex].get());
submit.execute(device, device->graphicsQueue, mSubmitFence[currentIndex].get());
submit.execute(device, queue, mSubmitFence[currentIndex].get());
mNextSubmit++;
}
@ -253,6 +255,15 @@ void VulkanFrameBuffer::FlushCommands(bool finish, bool lastsubmit)
{
mRenderState->EndRenderPass();
if (mCopyQueueCommands)
{
mCopyQueueCommands->end();
VulkanCommandBuffer* command = mCopyQueueCommands.get();
FrameDeleteList.CommandBuffers.push_back(std::move(mCopyQueueCommands));
FlushCommands(device->copyQueue, &command, 1, false, false);
}
if (mDrawCommands || mTransferCommands)
{
VulkanCommandBuffer *commands[2];
@ -272,7 +283,7 @@ void VulkanFrameBuffer::FlushCommands(bool finish, bool lastsubmit)
FrameDeleteList.CommandBuffers.push_back(std::move(mDrawCommands));
}
FlushCommands(commands, count, finish, lastsubmit);
FlushCommands(device->graphicsQueue, commands, count, finish, lastsubmit);
current_rendered_commandbuffers += (int)count;
}
@ -906,6 +917,17 @@ void VulkanFrameBuffer::Draw2D()
::Draw2D(&m2DDrawer, *mRenderState);
}
// Returns the command buffer currently being recorded for the copy queue,
// creating it from mCopyQueueCommandPool and beginning recording on first use.
// The returned pointer is non-owning; mCopyQueueCommands keeps ownership.
// NOTE(review): mCopyQueueCommandPool is dereferenced without a null check, so this
// presumably must only be called when the copy queue command pool exists - confirm at call sites.
VulkanCommandBuffer *VulkanFrameBuffer::GetCopyQueueCommands()
{
    // A buffer is already open for recording: hand it back unchanged.
    if (mCopyQueueCommands)
        return mCopyQueueCommands.get();

    // No buffer yet: allocate one, tag it for debugging and start recording.
    mCopyQueueCommands = mCopyQueueCommandPool->createBuffer();
    mCopyQueueCommands->SetDebugName("VulkanFrameBuffer.mCopyQueueCommands");
    mCopyQueueCommands->begin();
    return mCopyQueueCommands.get();
}
VulkanCommandBuffer *VulkanFrameBuffer::GetTransferCommands()
{
if (!mTransferCommands)

View file

@ -27,6 +27,7 @@ public:
uint32_t presentImageIndex = 0xffffffff;
bool cur_vsync;
VulkanCommandBuffer *GetCopyQueueCommands();
VulkanCommandBuffer *GetTransferCommands();
VulkanCommandBuffer *GetDrawCommands();
VkShaderManager *GetShaderManager() { return mShaderManager.get(); }
@ -118,7 +119,7 @@ private:
void CopyScreenToBuffer(int w, int h, void *data);
void UpdateShadowMap();
void DeleteFrameObjects();
void FlushCommands(VulkanCommandBuffer **commands, size_t count, bool finish, bool lastsubmit);
void FlushCommands(VkQueue queue, VulkanCommandBuffer **commands, size_t count, bool finish, bool lastsubmit);
std::unique_ptr<VkShaderManager> mShaderManager;
std::unique_ptr<VkSamplerManager> mSamplerManager;
@ -127,6 +128,8 @@ private:
std::unique_ptr<VkPostprocess> mPostprocess;
std::unique_ptr<VkRenderPassManager> mRenderPassManager;
std::unique_ptr<VulkanCommandPool> mCommandPool;
std::unique_ptr<VulkanCommandPool> mCopyQueueCommandPool;
std::unique_ptr<VulkanCommandBuffer> mCopyQueueCommands;
std::unique_ptr<VulkanCommandBuffer> mTransferCommands;
std::unique_ptr<VkRenderState> mRenderState;

View file

@ -71,13 +71,18 @@ void VkHardwareTexture::Reset()
if (mappedSWFB)
{
mImage.Image->Unmap();
if (mTransferBuffer)
mTransferBuffer->Unmap();
else
mImage.Image->Unmap();
mappedSWFB = nullptr;
}
auto &deleteList = fb->FrameDeleteList;
if (mImage.Image) deleteList.Images.push_back(std::move(mImage.Image));
if (mImage.View) deleteList.ImageViews.push_back(std::move(mImage.View));
if (mTransferImage) deleteList.Images.push_back(std::move(mTransferImage));
if (mTransferBuffer) deleteList.Buffers.push_back(std::move(mTransferBuffer));
for (auto &it : mImage.RSFramebuffers) deleteList.Framebuffers.push_back(std::move(it.second));
if (mDepthStencil.Image) deleteList.Images.push_back(std::move(mDepthStencil.Image));
if (mDepthStencil.View) deleteList.ImageViews.push_back(std::move(mDepthStencil.View));
@ -324,17 +329,52 @@ void VkHardwareTexture::AllocateBuffer(int w, int h, int texelsize)
VkFormat format = texelsize == 4 ? VK_FORMAT_B8G8R8A8_UNORM : VK_FORMAT_R8_UNORM;
ImageBuilder imgbuilder;
VkDeviceSize allocatedBytes = 0;
imgbuilder.setFormat(format);
imgbuilder.setSize(w, h);
imgbuilder.setLinearTiling();
imgbuilder.setUsage(VK_IMAGE_USAGE_SAMPLED_BIT, VMA_MEMORY_USAGE_UNKNOWN, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT);
imgbuilder.setMemoryType(
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
mImage.Image = imgbuilder.create(fb->device, &allocatedBytes);
mImage.Image->SetDebugName("VkHardwareTexture.mImage");
if (fb->device->copyQueueTransferFamily != -1)
{
// Use DMA transfer to get the image to the GPU
BufferBuilder bufbuilder;
bufbuilder.setUsage(VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VMA_MEMORY_USAGE_CPU_ONLY, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT);
bufbuilder.setSize(w * h * texelsize);
mTransferBuffer = bufbuilder.create(fb->device);
mTransferBuffer->SetDebugName("VkHardwareTexture.mTransferBuffer");
ImageBuilder imgbuilder0;
imgbuilder0.setFormat(format);
imgbuilder0.setSize(w, h);
imgbuilder0.setUsage(VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT);
mTransferImage = imgbuilder0.create(fb->device);
mTransferImage->SetDebugName("VkHardwareTexture.mTransferImage");
ImageBuilder imgbuilder1;
imgbuilder1.setFormat(format);
imgbuilder1.setSize(w, h);
imgbuilder1.setUsage(VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT);
mImage.Image = imgbuilder1.create(fb->device);
mImage.Image->SetDebugName("VkHardwareTexture.mImage");
bufferpitch = w;
}
else
{
// Memory map the image directly for GPUs where we have no transfer queue (i.e. Intel embedded GPUs)
ImageBuilder imgbuilder;
imgbuilder.setFormat(format);
imgbuilder.setSize(w, h);
imgbuilder.setLinearTiling();
imgbuilder.setUsage(VK_IMAGE_USAGE_SAMPLED_BIT, VMA_MEMORY_USAGE_UNKNOWN, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT);
imgbuilder.setMemoryType(
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
VkDeviceSize allocatedBytes = 0;
mImage.Image = imgbuilder.create(fb->device, &allocatedBytes);
mImage.Image->SetDebugName("VkHardwareTexture.mImage");
bufferpitch = int(allocatedBytes / h / texelsize);
}
mTexelsize = texelsize;
ImageViewBuilder viewbuilder;
@ -347,20 +387,68 @@ void VkHardwareTexture::AllocateBuffer(int w, int h, int texelsize)
VkImageTransition imageTransition;
imageTransition.addImage(&mImage, VK_IMAGE_LAYOUT_GENERAL, true);
imageTransition.execute(cmdbuffer);
bufferpitch = int(allocatedBytes / h / texelsize);
}
}
uint8_t *VkHardwareTexture::MapBuffer()
{
if (!mappedSWFB)
mappedSWFB = (uint8_t*)mImage.Image->Map(0, mImage.Image->width * mImage.Image->height * mTexelsize);
{
if (mTransferBuffer)
mappedSWFB = (uint8_t*)mTransferBuffer->Map(0, mImage.Image->width * mImage.Image->height * mTexelsize);
else
mappedSWFB = (uint8_t*)mImage.Image->Map(0, mImage.Image->width * mImage.Image->height * mTexelsize);
}
return mappedSWFB;
}
unsigned int VkHardwareTexture::CreateTexture(unsigned char * buffer, int w, int h, int texunit, bool mipmap, int translation, const char *name)
{
if (mTransferBuffer)
{
auto fb = GetVulkanFrameBuffer();
auto copyqueue = fb->GetCopyQueueCommands();
// Acquire image, transfer buffer via copy queue (PCIe DMA), release image
PipelineBarrier barrier0;
barrier0.addQueueTransfer(fb->device->graphicsFamily, fb->device->copyQueueTransferFamily, mTransferImage.get(), VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
barrier0.execute(copyqueue, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT);
VkBufferImageCopy region = {};
region.imageExtent.width = mTransferImage->width;
region.imageExtent.height = mTransferImage->height;
region.imageExtent.depth = 1;
region.imageSubresource.mipLevel = 0;
region.imageSubresource.layerCount = 1;
region.imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
copyqueue->copyBufferToImage(mTransferBuffer->buffer, mTransferImage->image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &region);
PipelineBarrier barrier1;
barrier1.addQueueTransfer(fb->device->copyQueueTransferFamily, fb->device->graphicsFamily, mTransferImage.get(), VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
barrier1.execute(copyqueue, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT);
// Acquire image on graphics queue, make a copy of it (on the GPU), then release the image again back to the copy queue
auto gfxqueue = fb->GetTransferCommands();
PipelineBarrier barrier2;
barrier2.addImage(mImage.Image.get(), VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, VK_ACCESS_SHADER_READ_BIT, VK_ACCESS_TRANSFER_WRITE_BIT);
barrier2.addQueueTransfer(fb->device->copyQueueTransferFamily, fb->device->graphicsFamily, mTransferImage.get(), VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
barrier2.execute(gfxqueue, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT);
VkImageCopy imgregion = {};
imgregion.extent = region.imageExtent;
imgregion.srcSubresource = region.imageSubresource;
imgregion.dstSubresource = region.imageSubresource;
gfxqueue->copyImage(mTransferImage->image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, mImage.Image->image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &imgregion);
PipelineBarrier barrier3;
barrier3.addImage(mImage.Image.get(), VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, mImage.Layout, VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
barrier3.addQueueTransfer(fb->device->graphicsFamily, fb->device->copyQueueTransferFamily, mTransferImage.get(), VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
barrier3.execute(gfxqueue, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT);
}
return 0;
}

View file

@ -78,5 +78,8 @@ private:
VkTextureImage mDepthStencil;
std::unique_ptr<VulkanBuffer> mTransferBuffer;
std::unique_ptr<VulkanImage> mTransferImage;
uint8_t* mappedSWFB = nullptr;
};