From b5a431d2f51ed06f0d6aa7976f628545e8369f7f Mon Sep 17 00:00:00 2001
From: Wojtek Figat
Date: Sat, 9 Aug 2025 23:57:43 +0200
Subject: [PATCH] Add explicit GPU resource transitions, memory and compute barriers

---
 Source/Engine/Graphics/GPUContext.h           |  27 +++++
 Source/Engine/Graphics/GPUPass.h              |  68 +++++++++++
 Source/Engine/Graphics/GPUResourceAccess.h    |  29 +++++
 .../DirectX/DX11/GPUContextDX11.cpp           |   4 +-
 .../DirectX/DX12/GPUContextDX12.cpp           |  59 +++++++++-
 .../DirectX/DX12/GPUContextDX12.h             |   4 +-
 .../GraphicsDevice/Vulkan/GPUBufferVulkan.cpp |   4 +-
 .../Vulkan/GPUContextVulkan.cpp               | 109 ++++++++++++------
 .../GraphicsDevice/Vulkan/GPUContextVulkan.h  |  12 +-
 .../Vulkan/RenderToolsVulkan.cpp              |  74 ++++++++++++
 .../GraphicsDevice/Vulkan/RenderToolsVulkan.h |   8 ++
 Source/Engine/Renderer/Utils/BitonicSort.cpp  |   2 +-
 12 files changed, 352 insertions(+), 48 deletions(-)
 create mode 100644 Source/Engine/Graphics/GPUPass.h
 create mode 100644 Source/Engine/Graphics/GPUResourceAccess.h

diff --git a/Source/Engine/Graphics/GPUContext.h b/Source/Engine/Graphics/GPUContext.h
index a042e3f83..4f1306567 100644
--- a/Source/Engine/Graphics/GPUContext.h
+++ b/Source/Engine/Graphics/GPUContext.h
@@ -9,6 +9,11 @@
 #include "PixelFormat.h"
 #include "Config.h"
 
+#if PLATFORM_WIN32
+// Undefine the Win32 MemoryBarrier macro that clashes with GPUContext::MemoryBarrier
+#undef MemoryBarrier
+#endif
+
 class GPUConstantBuffer;
 class GPUShaderProgramCS;
 class GPUBuffer;
@@ -21,6 +26,8 @@ class GPUResourceView;
 class GPUTextureView;
 class GPUBufferView;
 class GPUVertexLayout;
+struct GPUPass;
+enum class GPUResourceAccess;
 
 // Gets the GPU texture view. Checks if pointer is not null and texture has one or more mip levels loaded.
 #define GET_TEXTURE_VIEW_SAFE(t) (t && t->ResidentMipLevels() > 0 ? t->View() : nullptr)
@@ -632,4 +639,24 @@ public:
     /// Forces graphics backend to rebind descriptors after command list was used by external graphics library.
     /// </summary>
     virtual void ForceRebindDescriptors();
+
+protected:
+    friend GPUPass;
+    int32 _pass = 0;
+
+public:
+    // Performs a resource state transition into the specified access (mask).
+    virtual void Transition(GPUResource* resource, GPUResourceAccess access)
+    {
+    }
+
+    // Inserts a global memory barrier that synchronizes data copies between resources.
+    virtual void MemoryBarrier()
+    {
+    }
+
+    // Begins or ends an unordered-access overlap region that allows different compute shader dispatches to run simultaneously.
+    virtual void OverlapUA(bool end)
+    {
+    }
 };
diff --git a/Source/Engine/Graphics/GPUPass.h b/Source/Engine/Graphics/GPUPass.h
new file mode 100644
index 000000000..59f8608e2
--- /dev/null
+++ b/Source/Engine/Graphics/GPUPass.h
@@ -0,0 +1,68 @@
+// Copyright (c) Wojciech Figat. All rights reserved.
+
+#pragma once
+
+#include "GPUContext.h"
+#include "Engine/Graphics/GPUResourceAccess.h"
+
+/// <summary>
+/// Base class for GPU rendering passes that control low-level memory access and GPU resource states in order to optimize rendering.
+/// </summary>
+struct FLAXENGINE_API GPUPass
+{
+    NON_COPYABLE(GPUPass);
+
+    GPUContext* Context;
+
+    GPUPass(GPUContext* context)
+        : Context(context)
+    {
+        Context->_pass++;
+    }
+
+    ~GPUPass()
+    {
+        Context->_pass--;
+    }
+
+    // Performs a resource state transition into the specified access (mask). Can be done preemptively in the prologue of the pass to execute more efficient barriers.
+    void Transition(GPUResource* resource, GPUResourceAccess access)
+    {
+        Context->Transition(resource, access);
+    }
+};
+
+/// <summary>
+/// GPU pass that manually controls memory barriers and cache flushes when performing batched copy/upload operations with the GPU context. Can be used to optimize GPU buffer usage by running different copy operations simultaneously.
+/// </summary>
+struct FLAXENGINE_API GPUMemoryPass : GPUPass
+{
+    GPUMemoryPass(GPUContext* context)
+        : GPUPass(context)
+    {
+    }
+
+    ~GPUMemoryPass()
+    {
+        Context->MemoryBarrier();
+    }
+};
+
+/// <summary>
+/// GPU pass that controls memory barriers when performing batched compute shader dispatches with the GPU context. Can be used to optimize GPU utilization by running different dispatches simultaneously (by overlapping work).
+/// </summary>
+struct FLAXENGINE_API GPUComputePass : GPUPass
+{
+    GPUComputePass(GPUContext* context)
+        : GPUPass(context)
+    {
+        Context->OverlapUA(false);
+    }
+
+    ~GPUComputePass()
+    {
+        Context->OverlapUA(true);
+    }
+};
+
+// TODO: add GPUDrawPass for render targets and depth/stencil setup with optimized clear for faster drawing on tiled-GPUs (mobile)
diff --git a/Source/Engine/Graphics/GPUResourceAccess.h b/Source/Engine/Graphics/GPUResourceAccess.h
new file mode 100644
index 000000000..360da2dbb
--- /dev/null
+++ b/Source/Engine/Graphics/GPUResourceAccess.h
@@ -0,0 +1,29 @@
+// Copyright (c) Wojciech Figat. All rights reserved.
+
+#pragma once
+
+#include "Engine/Core/Types/BaseTypes.h"
+
+// GPU resource access flags. Used to describe how a resource can be accessed, which allows the GPU to optimize data layout and memory access.
+enum class GPUResourceAccess
+{
+    None = 0,
+    CopyRead = 1 << 0,
+    CopyWrite = 1 << 1,
+    CpuRead = 1 << 2,
+    CpuWrite = 1 << 3,
+    DepthRead = 1 << 4,
+    DepthWrite = 1 << 5,
+    DepthBuffer = DepthRead | DepthWrite,
+    RenderTarget = 1 << 6,
+    UnorderedAccess = 1 << 7,
+    IndirectArgs = 1 << 8,
+    ShaderReadCompute = 1 << 9,
+    ShaderReadPixel = 1 << 10,
+    ShaderReadNonPixel = 1 << 11,
+    ShaderReadGraphics = ShaderReadPixel | ShaderReadNonPixel,
+    Last,
+    All = (Last << 1) - 1,
+};
+
+DECLARE_ENUM_OPERATORS(GPUResourceAccess);
diff --git a/Source/Engine/GraphicsDevice/DirectX/DX11/GPUContextDX11.cpp b/Source/Engine/GraphicsDevice/DirectX/DX11/GPUContextDX11.cpp
index 62f9afd3a..3d94cdd96 100644
--- a/Source/Engine/GraphicsDevice/DirectX/DX11/GPUContextDX11.cpp
+++ b/Source/Engine/GraphicsDevice/DirectX/DX11/GPUContextDX11.cpp
@@ -297,7 +297,7 @@ void GPUContextDX11::SetRenderTarget(GPUTextureView* depthBuffer, const Span<GPUTextureView*>& rts)
     ID3D11RenderTargetView* rtvs[GPU_MAX_RT_BINDED];
     for (int32 i = 0; i < rts.Length(); i++)
     {
-        auto rtDX11 = reinterpret_cast<GPUTextureViewDX11*>(rts[i]);
+        auto rtDX11 = reinterpret_cast<GPUTextureViewDX11*>(rts.Get()[i]);
         rtvs[i] = rtDX11 ? rtDX11->RTV() : nullptr;
     }
     int32 rtvsSize = sizeof(ID3D11RenderTargetView*) * rts.Length();
@@ -431,7 +431,7 @@ void GPUContextDX11::BindVB(const Span<GPUBuffer*>& vertexBuffers, const uint32*
     bool vbEdited = false;
     for (int32 i = 0; i < vertexBuffers.Length(); i++)
     {
-        const auto vbDX11 = static_cast<GPUBufferDX11*>(vertexBuffers[i]);
+        const auto vbDX11 = static_cast<GPUBufferDX11*>(vertexBuffers.Get()[i]);
         const auto vb = vbDX11 ? vbDX11->GetBuffer() : nullptr;
         vbEdited |= vb != _vbHandles[i];
         _vbHandles[i] = vb;
diff --git a/Source/Engine/GraphicsDevice/DirectX/DX12/GPUContextDX12.cpp b/Source/Engine/GraphicsDevice/DirectX/DX12/GPUContextDX12.cpp
index 6d06231ee..88afc5cfb 100644
--- a/Source/Engine/GraphicsDevice/DirectX/DX12/GPUContextDX12.cpp
+++ b/Source/Engine/GraphicsDevice/DirectX/DX12/GPUContextDX12.cpp
@@ -35,6 +35,7 @@
 #include "GPUShaderProgramDX12.h"
 #include "CommandSignatureDX12.h"
 #include "Engine/Profiler/RenderStats.h"
+#include "Engine/Graphics/GPUResourceAccess.h"
 #include "Engine/Graphics/Shaders/GPUShader.h"
 #include "Engine/Threading/Threading.h"
 
@@ -51,6 +52,47 @@ inline bool operator!=(const D3D12_INDEX_BUFFER_VIEW& l, const D3D12_INDEX_BUFFE
     return l.SizeInBytes != r.SizeInBytes || l.Format != r.Format || l.BufferLocation != r.BufferLocation;
 }
 
+FORCE_INLINE D3D12_RESOURCE_STATES GetResourceState(GPUResourceAccess access)
+{
+    switch (access)
+    {
+    case GPUResourceAccess::None:
+        return D3D12_RESOURCE_STATE_COMMON;
+    case GPUResourceAccess::CopyRead:
+        return D3D12_RESOURCE_STATE_COPY_SOURCE;
+    case GPUResourceAccess::CopyWrite:
+        return D3D12_RESOURCE_STATE_COPY_DEST;
+    case GPUResourceAccess::CpuRead:
+        return D3D12_RESOURCE_STATE_GENERIC_READ;
+    case GPUResourceAccess::CpuWrite:
+        return D3D12_RESOURCE_STATE_COMMON;
+    case GPUResourceAccess::DepthRead:
+        return D3D12_RESOURCE_STATE_DEPTH_READ;
+    case GPUResourceAccess::DepthWrite:
+        return D3D12_RESOURCE_STATE_DEPTH_WRITE;
+    case GPUResourceAccess::DepthBuffer:
+        return D3D12_RESOURCE_STATE_DEPTH_READ | D3D12_RESOURCE_STATE_DEPTH_WRITE;
+    case GPUResourceAccess::RenderTarget:
+        return D3D12_RESOURCE_STATE_RENDER_TARGET;
+    case GPUResourceAccess::UnorderedAccess:
+        return D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
+    case GPUResourceAccess::IndirectArgs:
+        return D3D12_RESOURCE_STATE_INDIRECT_ARGUMENT;
+    case GPUResourceAccess::ShaderReadPixel:
+        //return D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE; // TODO: optimize SRV states in flushSRVs to be based on current binding usage slots
+    case GPUResourceAccess::ShaderReadCompute:
+    case GPUResourceAccess::ShaderReadNonPixel:
+        //return D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE; // TODO: optimize SRV states in flushSRVs to be based on current binding usage slots
+    case GPUResourceAccess::ShaderReadGraphics:
+        return D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE | D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE;
+#if !BUILD_RELEASE
+    default:
+        LOG(Error, "Unsupported GPU Resource Access: {}", (uint32)access);
+#endif
+    }
+    return D3D12_RESOURCE_STATE_COMMON;
+}
+
 // Ensure to match the indirect commands arguments layout
 static_assert(sizeof(GPUDispatchIndirectArgs) == sizeof(D3D12_DISPATCH_ARGUMENTS), "Wrong size of GPUDrawIndirectArgs.");
 static_assert(OFFSET_OF(GPUDispatchIndirectArgs, ThreadGroupCountX) == OFFSET_OF(D3D12_DISPATCH_ARGUMENTS, ThreadGroupCountX), "Wrong offset for GPUDrawIndirectArgs::ThreadGroupCountX");
@@ -1124,7 +1166,8 @@ void GPUContextDX12::Dispatch(GPUShaderProgramCS* shader, uint32 threadGroupCoun
     _psDirtyFlag = true;
 
     // Insert UAV barrier to ensure proper memory access for multiple sequential dispatches
-    AddUAVBarrier();
+    if (_pass == 0)
+        AddUAVBarrier();
 }
 
 void GPUContextDX12::DispatchIndirect(GPUShaderProgramCS* shader, GPUBuffer* bufferForArgs, uint32 offsetForArgs)
@@ -1158,7 +1201,8 @@ void GPUContextDX12::DispatchIndirect(GPUShaderProgramCS* shader, GPUBuffer* buf
     _psDirtyFlag = true;
 
     // Insert UAV barrier to ensure proper memory access for multiple sequential dispatches
-    AddUAVBarrier();
+    if (_pass == 0)
+        AddUAVBarrier();
 }
 
 void GPUContextDX12::ResolveMultisample(GPUTexture* sourceMultisampleTexture, GPUTexture* destTexture, int32 sourceSubResource, int32 destSubResource, PixelFormat format)
@@ -1549,4 +1593,15 @@ void GPUContextDX12::ForceRebindDescriptors()
     _commandList->SetDescriptorHeaps(ARRAY_COUNT(ppHeaps), ppHeaps);
 }
 
+void GPUContextDX12::Transition(GPUResource* resource, GPUResourceAccess access)
+{
+    SetResourceState(dynamic_cast<ResourceOwnerDX12*>(resource), GetResourceState(access));
+}
+
+void GPUContextDX12::OverlapUA(bool end)
+{
+    if (end)
+        AddUAVBarrier();
+}
+
 #endif
diff --git a/Source/Engine/GraphicsDevice/DirectX/DX12/GPUContextDX12.h b/Source/Engine/GraphicsDevice/DirectX/DX12/GPUContextDX12.h
index 917b68165..4bd1b54a1 100644
--- a/Source/Engine/GraphicsDevice/DirectX/DX12/GPUContextDX12.h
+++ b/Source/Engine/GraphicsDevice/DirectX/DX12/GPUContextDX12.h
@@ -21,7 +21,7 @@ class GPUVertexLayoutDX12;
 /// <summary>
 /// Size of the resource barriers buffer size (will be flushed on overflow)
 /// </summary>
-#define DX12_RB_BUFFER_SIZE 16
+#define DX12_RB_BUFFER_SIZE 64
 
 /// <summary>
 /// GPU Commands Context implementation for DirectX 12
@@ -214,6 +214,8 @@ public:
     void CopySubresource(GPUResource* dstResource, uint32 dstSubresource, GPUResource* srcResource, uint32 srcSubresource) override;
     void SetResourceState(GPUResource* resource, uint64 state, int32 subresource) override;
     void ForceRebindDescriptors() override;
+    void Transition(GPUResource* resource, GPUResourceAccess access) override;
+    void OverlapUA(bool end) override;
 };
 
 #endif
diff --git a/Source/Engine/GraphicsDevice/Vulkan/GPUBufferVulkan.cpp b/Source/Engine/GraphicsDevice/Vulkan/GPUBufferVulkan.cpp
index e20b2f89e..a1c3d71fb 100644
--- a/Source/Engine/GraphicsDevice/Vulkan/GPUBufferVulkan.cpp
+++ b/Source/Engine/GraphicsDevice/Vulkan/GPUBufferVulkan.cpp
@@ -19,7 +19,7 @@ void GPUBufferViewVulkan::Init(GPUDeviceVulkan* device, GPUBufferVulkan* owner,
     Buffer = buffer;
     Size = size;
 
-    if ((owner->IsShaderResource() && !(owner->GetDescription().Flags & GPUBufferFlags::Structured)) || (usage & VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT) == VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT)
+    if ((EnumHasAnyFlags(owner->GetDescription().Flags, GPUBufferFlags::ShaderResource | GPUBufferFlags::UnorderedAccess) && !(owner->GetDescription().Flags & GPUBufferFlags::Structured)) || (usage & VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT) == VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT)
     {
         VkBufferViewCreateInfo viewInfo;
         RenderToolsVulkan::ZeroStruct(viewInfo, VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO);
@@ -103,7 +103,7 @@ bool GPUBufferVulkan::OnInit()
         bufferInfo.usage |= VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT;
     if (useUAV || EnumHasAnyFlags(_desc.Flags, GPUBufferFlags::RawBuffer | GPUBufferFlags::Structured))
         bufferInfo.usage |= VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
-    if (useUAV && useSRV)
+    if (useUAV)
         bufferInfo.usage |= VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT;
     if (EnumHasAnyFlags(_desc.Flags, GPUBufferFlags::Argument))
         bufferInfo.usage |= VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT;
diff --git a/Source/Engine/GraphicsDevice/Vulkan/GPUContextVulkan.cpp b/Source/Engine/GraphicsDevice/Vulkan/GPUContextVulkan.cpp
index c36d1acee..430ce5b70 100644
--- a/Source/Engine/GraphicsDevice/Vulkan/GPUContextVulkan.cpp
+++ b/Source/Engine/GraphicsDevice/Vulkan/GPUContextVulkan.cpp
@@ -78,13 +78,14 @@ const Char* ToString(VkImageLayout layout)
 void PipelineBarrierVulkan::Execute(const CmdBufferVulkan* cmdBuffer)
 {
     ASSERT(cmdBuffer->IsOutsideRenderPass());
-    vkCmdPipelineBarrier(cmdBuffer->GetHandle(), SourceStage, DestStage, 0, 0, nullptr, BufferBarriers.Count(), BufferBarriers.Get(), ImageBarriers.Count(), ImageBarriers.Get());
+    vkCmdPipelineBarrier(cmdBuffer->GetHandle(), SourceStage, DestStage, 0, MemoryBarriers.Count(), MemoryBarriers.Get(), BufferBarriers.Count(), BufferBarriers.Get(), ImageBarriers.Count(), ImageBarriers.Get());
 
     // Reset
     SourceStage = 0;
     DestStage = 0;
     ImageBarriers.Clear();
     BufferBarriers.Clear();
+    MemoryBarriers.Clear();
 #if VK_ENABLE_BARRIERS_DEBUG
     ImageBarriersDebug.Clear();
 #endif
@@ -153,12 +154,7 @@ void GPUContextVulkan::AddImageBarrier(VkImage image, VkImageLayout srcLayout, V
 #if VK_ENABLE_BARRIERS_BATCHING
     // Auto-flush on overflow
     if (_barriers.IsFull())
-    {
-        const auto cmdBuffer = _cmdBufferManager->GetCmdBuffer();
-        if (cmdBuffer->IsInsideRenderPass())
-            EndRenderPass();
-        _barriers.Execute(cmdBuffer);
-    }
+        FlushBarriers();
 #endif
 
     // Insert barrier
@@ -190,10 +186,7 @@ void GPUContextVulkan::AddImageBarrier(VkImage image, VkImageLayout srcLayout, V
 
 #if !VK_ENABLE_BARRIERS_BATCHING
     // Auto-flush without batching
-    const auto cmdBuffer = _cmdBufferManager->GetCmdBuffer();
-    if (cmdBuffer->IsInsideRenderPass())
-        EndRenderPass();
-    _barriers.Execute(cmdBuffer);
+    FlushBarriers();
 #endif
 }
 
@@ -315,12 +308,7 @@ void GPUContextVulkan::AddBufferBarrier(GPUBufferVulkan* buffer, VkAccessFlags d
 #if VK_ENABLE_BARRIERS_BATCHING
     // Auto-flush on overflow
     if (_barriers.IsFull())
-    {
-        const auto cmdBuffer = _cmdBufferManager->GetCmdBuffer();
-        if (cmdBuffer->IsInsideRenderPass())
-            EndRenderPass();
-        _barriers.Execute(cmdBuffer);
-    }
+        FlushBarriers();
 #endif
 
     // Insert barrier
@@ -339,13 +327,38 @@ void GPUContextVulkan::AddBufferBarrier(GPUBufferVulkan* buffer, VkAccessFlags d
 
 #if !VK_ENABLE_BARRIERS_BATCHING
     // Auto-flush without batching
-    const auto cmdBuffer = _cmdBufferManager->GetCmdBuffer();
-    if (cmdBuffer->IsInsideRenderPass())
-        EndRenderPass();
-    _barriers.Execute(cmdBuffer);
+    FlushBarriers();
 #endif
 }
 
+void GPUContextVulkan::AddMemoryBarrier()
+{
+#if VK_ENABLE_BARRIERS_BATCHING
+    // Auto-flush on overflow
+    if (_barriers.IsFull())
+        FlushBarriers();
+#endif
+
+    // Insert barrier
+    VkMemoryBarrier& memoryBarrier = _barriers.MemoryBarriers.AddOne();
+    RenderToolsVulkan::ZeroStruct(memoryBarrier, VK_STRUCTURE_TYPE_MEMORY_BARRIER);
+    memoryBarrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
+    memoryBarrier.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT;
+    _barriers.SourceStage |= VK_PIPELINE_STAGE_TRANSFER_BIT;
+    _barriers.DestStage |= VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;
+
+#if !VK_ENABLE_BARRIERS_BATCHING
+    // Auto-flush without batching
+    FlushBarriers();
+#endif
+}
+
+void GPUContextVulkan::AddUABarrier()
+{
+    _barriers.SourceStage |= VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT;
+    _barriers.DestStage |= VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
+}
+
 void GPUContextVulkan::FlushBarriers()
 {
 #if VK_ENABLE_BARRIERS_BATCHING
@@ -475,7 +488,7 @@ void GPUContextVulkan::EndRenderPass()
     cmdBuffer->EndRenderPass();
     _renderPass = nullptr;
 
-    // Place a barrier between RenderPasses, so that color / depth outputs can be read in subsequent passes
+    // Place a barrier between RenderPasses, so that color/depth outputs can be read in subsequent passes
     // TODO: remove it in future and use proper barriers without whole pipeline stalls
     vkCmdPipelineBarrier(cmdBuffer->GetHandle(), VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, nullptr, 0, nullptr);
 }
@@ -1155,8 +1168,8 @@ void GPUContextVulkan::Dispatch(GPUShaderProgramCS* shader, uint32 threadGroupCo
     RENDER_STAT_DISPATCH_CALL();
 
     // Place a barrier between dispatches, so that UAVs can be read+write in subsequent passes
-    // TODO: optimize it by moving inputs/outputs into higher-layer so eg. Global SDF can manually optimize it
-    vkCmdPipelineBarrier(cmdBuffer->GetHandle(), VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, nullptr, 0, nullptr);
+    if (_pass == 0)
+        AddUABarrier();
 
 #if VK_ENABLE_BARRIERS_DEBUG
     LOG(Warning, "Dispatch");
@@ -1191,8 +1204,8 @@ void GPUContextVulkan::DispatchIndirect(GPUShaderProgramCS* shader, GPUBuffer* b
     RENDER_STAT_DISPATCH_CALL();
 
     // Place a barrier between dispatches, so that UAVs can be read+write in subsequent passes
-    // TODO: optimize it by moving inputs/outputs into higher-layer so eg. Global SDF can manually optimize it
-    vkCmdPipelineBarrier(cmdBuffer->GetHandle(), VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, nullptr, 0, nullptr);
+    if (_pass == 0)
+        AddUABarrier();
 
 #if VK_ENABLE_BARRIERS_DEBUG
     LOG(Warning, "DispatchIndirect");
@@ -1351,18 +1364,14 @@ void GPUContextVulkan::UpdateBuffer(GPUBuffer* buffer, const void* data, uint32
 
     const auto bufferVulkan = static_cast<GPUBufferVulkan*>(buffer);
 
-    // Memory transfer barrier
-    // TODO: batch pipeline barriers
-    const VkMemoryBarrier barrierBefore = { VK_STRUCTURE_TYPE_MEMORY_BARRIER, nullptr, VK_ACCESS_MEMORY_WRITE_BIT, VK_ACCESS_MEMORY_READ_BIT };
-    vkCmdPipelineBarrier(cmdBuffer->GetHandle(), VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 1, &barrierBefore, 0, nullptr, 0, nullptr);
+    // Transition resource
+    AddBufferBarrier(bufferVulkan, VK_ACCESS_TRANSFER_WRITE_BIT);
+    FlushBarriers();
 
     // Use direct update for small buffers
     const uint32 alignedSize = Math::AlignUp(size, 4);
     if (size <= 4 * 1024 && alignedSize <= buffer->GetSize())
     {
-        //AddBufferBarrier(bufferVulkan, VK_ACCESS_TRANSFER_WRITE_BIT);
-        //FlushBarriers();
-
         vkCmdUpdateBuffer(cmdBuffer->GetHandle(), bufferVulkan->GetHandle(), offset, alignedSize, data);
     }
     else
@@ -1379,10 +1388,9 @@ void GPUContextVulkan::UpdateBuffer(GPUBuffer* buffer, const void* data, uint32
         _device->StagingManager.ReleaseBuffer(cmdBuffer, staging);
     }
 
-    // Memory transfer barrier
-    // TODO: batch pipeline barriers
-    const VkMemoryBarrier barrierAfter = { VK_STRUCTURE_TYPE_MEMORY_BARRIER, nullptr, VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT };
-    vkCmdPipelineBarrier(cmdBuffer->GetHandle(), VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, 1, &barrierAfter, 0, nullptr, 0, nullptr);
+    // Memory transfer barrier to ensure the buffer is ready to read (e.g. by Draw or Dispatch)
+    if (_pass == 0)
+        AddMemoryBarrier();
 }
 
 void GPUContextVulkan::CopyBuffer(GPUBuffer* dstBuffer, GPUBuffer* srcBuffer, uint32 size, uint32 dstOffset, uint32 srcOffset)
@@ -1407,6 +1415,10 @@ void GPUContextVulkan::CopyBuffer(GPUBuffer* dstBuffer, GPUBuffer* srcBuffer, ui
     bufferCopy.dstOffset = dstOffset;
     bufferCopy.size = size;
     vkCmdCopyBuffer(cmdBuffer->GetHandle(), srcBufferVulkan->GetHandle(), dstBufferVulkan->GetHandle(), 1, &bufferCopy);
+
+    // Memory transfer barrier to ensure the buffer is ready to read (e.g. by Draw or Dispatch)
+    if (_pass == 0)
+        AddMemoryBarrier();
 }
 
 void GPUContextVulkan::UpdateTexture(GPUTexture* texture, int32 arrayIndex, int32 mipIndex, const void* data, uint32 rowPitch, uint32 slicePitch)
@@ -1816,4 +1828,27 @@ void GPUContextVulkan::CopySubresource(GPUResource* dstResource, uint32 dstSubre
     }
 }
 
+void GPUContextVulkan::Transition(GPUResource* resource, GPUResourceAccess access)
+{
+    if (auto buffer = dynamic_cast<GPUBufferVulkan*>(resource))
+    {
+        AddBufferBarrier(buffer, RenderToolsVulkan::GetAccess(access));
+    }
+    else if (auto texture = dynamic_cast<GPUTextureVulkan*>(resource))
+    {
+        AddImageBarrier(texture, RenderToolsVulkan::GetImageLayout(access));
+    }
+}
+
+void GPUContextVulkan::MemoryBarrier()
+{
+    AddMemoryBarrier();
+}
+
+void GPUContextVulkan::OverlapUA(bool end)
+{
+    if (end)
+        AddUABarrier();
+}
+
 #endif
diff --git a/Source/Engine/GraphicsDevice/Vulkan/GPUContextVulkan.h b/Source/Engine/GraphicsDevice/Vulkan/GPUContextVulkan.h
index 73aa5a52f..d3dd1c528 100644
--- a/Source/Engine/GraphicsDevice/Vulkan/GPUContextVulkan.h
+++ b/Source/Engine/GraphicsDevice/Vulkan/GPUContextVulkan.h
@@ -34,7 +34,7 @@ class DescriptorSetLayoutVulkan;
 /// <summary>
 /// Size of the pipeline barriers buffer size (will be auto-flushed on overflow).
 /// </summary>
-#define VK_BARRIER_BUFFER_SIZE 16
+#define VK_BARRIER_BUFFER_SIZE 64
 
 /// <summary>
 /// The Vulkan pipeline resources layout barrier batching structure.
@@ -45,18 +45,19 @@ struct PipelineBarrierVulkan
     VkPipelineStageFlags DestStage = 0;
     Array<VkImageMemoryBarrier, FixedAllocation<VK_BARRIER_BUFFER_SIZE>> ImageBarriers;
     Array<VkBufferMemoryBarrier, FixedAllocation<VK_BARRIER_BUFFER_SIZE>> BufferBarriers;
+    Array<VkMemoryBarrier, FixedAllocation<4>> MemoryBarriers;
 #if VK_ENABLE_BARRIERS_DEBUG
     Array<GPUTextureViewVulkan*, FixedAllocation<VK_BARRIER_BUFFER_SIZE>> ImageBarriersDebug;
 #endif
 
     FORCE_INLINE bool IsFull() const
     {
-        return ImageBarriers.Count() == VK_BARRIER_BUFFER_SIZE || BufferBarriers.Count() == VK_BARRIER_BUFFER_SIZE;
+        return ImageBarriers.Count() == VK_BARRIER_BUFFER_SIZE || BufferBarriers.Count() == VK_BARRIER_BUFFER_SIZE || MemoryBarriers.Count() == 4;
     }
 
     FORCE_INLINE bool HasBarrier() const
     {
-        return ImageBarriers.Count() + BufferBarriers.Count() != 0;
+        return ImageBarriers.Count() + BufferBarriers.Count() + MemoryBarriers.Count() != 0;
     }
 
     void Execute(const CmdBufferVulkan* cmdBuffer);
@@ -130,6 +131,8 @@ public:
     void AddImageBarrier(GPUTextureVulkan* texture, int32 mipSlice, int32 arraySlice, VkImageLayout dstLayout);
     void AddImageBarrier(GPUTextureVulkan* texture, VkImageLayout dstLayout);
     void AddBufferBarrier(GPUBufferVulkan* buffer, VkAccessFlags dstAccess);
+    void AddMemoryBarrier();
+    void AddUABarrier();
 
     void FlushBarriers();
 
@@ -199,6 +202,9 @@ public:
     void CopyCounter(GPUBuffer* dstBuffer, uint32 dstOffset, GPUBuffer* srcBuffer) override;
     void CopyResource(GPUResource* dstResource, GPUResource* srcResource) override;
     void CopySubresource(GPUResource* dstResource, uint32 dstSubresource, GPUResource* srcResource, uint32 srcSubresource) override;
+    void Transition(GPUResource* resource, GPUResourceAccess access) override;
+    void MemoryBarrier() override;
+    void OverlapUA(bool end) override;
 };
 
 #endif
diff --git a/Source/Engine/GraphicsDevice/Vulkan/RenderToolsVulkan.cpp b/Source/Engine/GraphicsDevice/Vulkan/RenderToolsVulkan.cpp
index 604b8a612..4a8d138ed 100644
--- a/Source/Engine/GraphicsDevice/Vulkan/RenderToolsVulkan.cpp
+++ b/Source/Engine/GraphicsDevice/Vulkan/RenderToolsVulkan.cpp
@@ -5,6 +5,7 @@
 #include "RenderToolsVulkan.h"
 #include "Engine/Core/Types/StringBuilder.h"
 #include "Engine/Core/Log.h"
+#include "Engine/Graphics/GPUResourceAccess.h"
 
 // @formatter:off
 
@@ -258,6 +259,79 @@ void RenderToolsVulkan::LogVkResult(VkResult result, const char* file, uint32 li
 #endif
 }
 
+VkAccessFlags RenderToolsVulkan::GetAccess(GPUResourceAccess access)
+{
+    switch (access)
+    {
+    case GPUResourceAccess::None:
+        return VK_ACCESS_NONE;
+    case GPUResourceAccess::CopyRead:
+        return VK_ACCESS_TRANSFER_READ_BIT;
+    case GPUResourceAccess::CopyWrite:
+        return VK_ACCESS_TRANSFER_WRITE_BIT;
+    case GPUResourceAccess::CpuRead:
+        return VK_ACCESS_HOST_READ_BIT;
+    case GPUResourceAccess::CpuWrite:
+        return VK_ACCESS_HOST_WRITE_BIT;
+    case GPUResourceAccess::DepthRead:
+        return VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT;
+    case GPUResourceAccess::DepthWrite:
+        return VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
+    case GPUResourceAccess::DepthBuffer:
+        return VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
+    case GPUResourceAccess::RenderTarget:
+        return VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
+    case GPUResourceAccess::UnorderedAccess:
+        return VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT;
+    case GPUResourceAccess::IndirectArgs:
+        return VK_ACCESS_INDIRECT_COMMAND_READ_BIT;
+    case GPUResourceAccess::ShaderReadCompute:
+    case GPUResourceAccess::ShaderReadPixel:
+    case GPUResourceAccess::ShaderReadNonPixel:
+    case GPUResourceAccess::ShaderReadGraphics:
+        return VK_ACCESS_SHADER_READ_BIT;
+#if !BUILD_RELEASE
+    default:
+        LOG(Error, "Unsupported GPU Resource Access: {}", (uint32)access);
+#endif
+    }
+    return VK_ACCESS_NONE;
+}
+
+VkImageLayout RenderToolsVulkan::GetImageLayout(GPUResourceAccess access)
+{
+    switch (access)
+    {
+    case GPUResourceAccess::None:
+        return VK_IMAGE_LAYOUT_UNDEFINED;
+    case GPUResourceAccess::CopyRead:
+        return VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
+    case GPUResourceAccess::CopyWrite:
+        return VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
+    case GPUResourceAccess::CpuRead:
+    case GPUResourceAccess::CpuWrite:
+        return VK_IMAGE_LAYOUT_GENERAL;
+    case GPUResourceAccess::DepthRead:
+        return VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_OPTIMAL;
+    case GPUResourceAccess::DepthWrite:
+    case GPUResourceAccess::DepthBuffer:
+        return VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_OPTIMAL;
+    case GPUResourceAccess::RenderTarget:
+        return VK_IMAGE_LAYOUT_ATTACHMENT_OPTIMAL;
+    case GPUResourceAccess::UnorderedAccess:
+    case GPUResourceAccess::ShaderReadCompute:
+    case GPUResourceAccess::ShaderReadPixel:
+    case GPUResourceAccess::ShaderReadNonPixel:
+    case GPUResourceAccess::ShaderReadGraphics:
+        return VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
+#if !BUILD_RELEASE
+    default:
+        LOG(Error, "Unsupported GPU Resource Access: {}", (uint32)access);
+#endif
+    }
+    return VK_IMAGE_LAYOUT_UNDEFINED;
+}
+
 bool RenderToolsVulkan::HasExtension(const Array<const char*>& extensions, const char* name)
 {
     for (int32 i = 0; i < extensions.Count(); i++)
diff --git a/Source/Engine/GraphicsDevice/Vulkan/RenderToolsVulkan.h b/Source/Engine/GraphicsDevice/Vulkan/RenderToolsVulkan.h
index d2d1bca79..82167fd6c 100644
--- a/Source/Engine/GraphicsDevice/Vulkan/RenderToolsVulkan.h
+++ b/Source/Engine/GraphicsDevice/Vulkan/RenderToolsVulkan.h
@@ -20,6 +20,8 @@
 #define VK_SET_DEBUG_NAME(device, handle, type, name)
 #endif
 
+enum class GPUResourceAccess;
+
 /// <summary>
 /// Set of utilities for rendering on Vulkan platform.
 /// </summary>
@@ -40,6 +42,9 @@ public:
     static String GetVkErrorString(VkResult result);
     static void LogVkResult(VkResult result, const char* file = nullptr, uint32 line = 0, bool fatal = false);
 
+    static VkAccessFlags GetAccess(GPUResourceAccess access);
+    static VkImageLayout GetImageLayout(GPUResourceAccess access);
+
     static inline VkPipelineStageFlags GetBufferBarrierFlags(VkAccessFlags accessFlags)
     {
         VkPipelineStageFlags stageFlags = (VkPipelineStageFlags)0;
@@ -67,6 +72,9 @@ public:
         case VK_ACCESS_SHADER_WRITE_BIT:
            stageFlags = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
            break;
+        case VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT:
+            stageFlags = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
+            break;
         case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT:
         case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
             stageFlags = VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT;
diff --git a/Source/Engine/Renderer/Utils/BitonicSort.cpp b/Source/Engine/Renderer/Utils/BitonicSort.cpp
index cd0f627f5..a031b0e9d 100644
--- a/Source/Engine/Renderer/Utils/BitonicSort.cpp
+++ b/Source/Engine/Renderer/Utils/BitonicSort.cpp
@@ -82,7 +82,7 @@ void BitonicSort::Sort(GPUContext* context, GPUBuffer* indicesBuffer, GPUBuffer*
     if (checkIfSkipPass())
         return;
     PROFILE_GPU_CPU("Bitonic Sort");
-    uint32 maxNumElements = indicesBuffer->GetElementsCount();
+    int32 maxNumElements = (int32)indicesBuffer->GetElementsCount();
     if (maxElements > 0 && maxElements < maxNumElements)
         maxNumElements = maxElements;
     const uint32 alignedMaxNumElements = Math::RoundUpToPowerOf2(maxNumElements);
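
Usage sketch (illustrative only, not part of the patch): the new pass types are plain RAII scopes over a GPUContext, so a renderer that issues several independent compute dispatches can defer the UAV barrier to the end of the batch. The buffer and shader names below are hypothetical, and BindUA/Dispatch/View are used as in the existing GPUContext API, so treat the exact signatures as assumptions.

    #include "Engine/Graphics/GPUContext.h"
    #include "Engine/Graphics/GPUPass.h"

    // Hypothetical helper: run two independent compute jobs and let their execution overlap.
    void RunIndependentJobs(GPUContext* context, GPUShaderProgramCS* csA, GPUShaderProgramCS* csB, GPUBuffer* bufferA, GPUBuffer* bufferB)
    {
        GPUComputePass pass(context); // While the pass is alive, _pass != 0 and the per-dispatch UAV barrier is skipped

        // Transition both UAVs up front so the backend can batch the state changes
        pass.Transition(bufferA, GPUResourceAccess::UnorderedAccess);
        pass.Transition(bufferB, GPUResourceAccess::UnorderedAccess);

        context->BindUA(0, bufferA->View());
        context->Dispatch(csA, 64, 1, 1);
        context->BindUA(0, bufferB->View());
        context->Dispatch(csB, 64, 1, 1);
    } // ~GPUComputePass() calls OverlapUA(true), which inserts a single UAV barrier for the whole batch

GPUMemoryPass works the same way for batched uploads and copies: on the Vulkan backend in this patch, UpdateBuffer and CopyBuffer skip their per-call memory barrier while a pass is active, and the pass destructor issues one MemoryBarrier() for the batch.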