Add explicit GPU resource transitions, memory and compute barriers

Author: Wojtek Figat
Date: 2025-08-09 23:57:43 +02:00
Parent: 3907bc4957
Commit: b5a431d2f5
12 changed files with 353 additions and 48 deletions

View File

@@ -9,6 +9,11 @@
#include "PixelFormat.h" #include "PixelFormat.h"
#include "Config.h" #include "Config.h"
#if PLATFORM_WIN32
// Fix nasty Win32 define garbage
#undef MemoryBarrier
#endif
class GPUConstantBuffer;
class GPUShaderProgramCS;
class GPUBuffer;
@@ -21,6 +26,8 @@ class GPUResourceView;
class GPUTextureView;
class GPUBufferView;
class GPUVertexLayout;
struct GPUPass;
enum class GPUResourceAccess;
// Gets the GPU texture view. Checks if pointer is not null and texture has one or more mip levels loaded.
#define GET_TEXTURE_VIEW_SAFE(t) (t && t->ResidentMipLevels() > 0 ? t->View() : nullptr)
@@ -632,4 +639,24 @@ public:
/// Forces graphics backend to rebind descriptors after command list was used by external graphics library.
/// </summary>
virtual void ForceRebindDescriptors();
protected:
friend GPUPass;
int32 _pass = 0;
public:
// Performs resource state transition into a specific access (mask).
virtual void Transition(GPUResource* resource, GPUResourceAccess access)
{
}
// Inserts a global memory barrier on data copies between resources.
virtual void MemoryBarrier()
{
}
// Begins or ends unordered access resource overlap region that allows running different compute shader dispatches simultaneously.
virtual void OverlapUA(bool end)
{
}
};
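A usage sketch (illustration only, not part of this commit) of driving the new virtuals directly; context, shadowMap and argsBuffer are placeholder names for valid engine objects, and backends that do not override these methods treat the calls as no-ops:

// Hypothetical example: request explicit access states up front, then make copy results visible.
context->Transition(shadowMap, GPUResourceAccess::DepthBuffer);         // render depth first
context->Transition(argsBuffer, GPUResourceAccess::CopyWrite);          // fill indirect args via copies
// ... record copy and draw commands ...
context->Transition(shadowMap, GPUResourceAccess::ShaderReadGraphics);  // sampled by later draws
context->Transition(argsBuffer, GPUResourceAccess::IndirectArgs);       // consumed by indirect draw/dispatch
context->MemoryBarrier();                                               // flush pending copy writes before reads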

View File

@@ -0,0 +1,68 @@
// Copyright (c) Wojciech Figat. All rights reserved.
#pragma once
#include "GPUContext.h"
#include "Engine/Graphics/GPUResourceAccess.h"
/// <summary>
/// Base for GPU rendering passes that control low-level memory access and GPU resource states in order to optimize rendering.
/// </summary>
struct FLAXENGINE_API GPUPass
{
NON_COPYABLE(GPUPass);
GPUContext* Context;
GPUPass(GPUContext* context)
: Context(context)
{
Context->_pass++;
}
~GPUPass()
{
Context->_pass--;
}
// Performs resource state transition into a specific access (mask). Can be done preemptively in the prologue of the pass to execute more efficient barriers.
void Transition(GPUResource* resource, GPUResourceAccess access)
{
Context->Transition(resource, access);
}
};
/// <summary>
/// GPU pass that manually controls memory barriers and cache flushes when performing batched copy/upload operations with GPU context. Can be used to optimize GPU buffers usage by running different copy operations simultaneously.
/// </summary>
struct FLAXENGINE_API GPUMemoryPass : GPUPass
{
GPUMemoryPass(GPUContext* context)
: GPUPass(context)
{
}
~GPUMemoryPass()
{
Context->MemoryBarrier();
}
};
/// <summary>
/// GPU pass that controls memory barriers when performing batched Compute shader dispatches with GPU context. Can be used to optimize GPU utilization by running different dispatches simultaneously (by overlapping work).
/// </summary>
struct FLAXENGINE_API GPUComputePass : GPUPass
{
GPUComputePass(GPUContext* context)
: GPUPass(context)
{
Context->OverlapUA(false);
}
~GPUComputePass()
{
Context->OverlapUA(true);
}
};
// TODO: add GPUDrawPass for render targets and depth/stencil setup with optimized clear for faster drawing on tiled-GPUs (mobile)
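A usage sketch (illustration only, not part of this commit) showing how these scoped passes might wrap batched work; context, csA, csB, outputA, outputB, bufferA, bufferB and the data/size variables are placeholder names:

// Hypothetical example: overlap independent compute dispatches inside one pass.
{
    GPUComputePass pass(context);
    pass.Transition(outputA, GPUResourceAccess::UnorderedAccess);
    pass.Transition(outputB, GPUResourceAccess::UnorderedAccess);
    context->Dispatch(csA, 64, 1, 1); // per-dispatch UAV barriers are skipped while a pass is active
    context->Dispatch(csB, 64, 1, 1);
} // ~GPUComputePass() calls OverlapUA(true) -> single barrier for the whole batch

// Hypothetical example: batch small uploads and pay for one memory barrier at the end.
{
    GPUMemoryPass pass(context);
    context->UpdateBuffer(bufferA, dataA, sizeA, 0);
    context->UpdateBuffer(bufferB, dataB, sizeB, 0);
} // ~GPUMemoryPass() calls MemoryBarrier() once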

View File

@@ -0,0 +1,29 @@
// Copyright (c) Wojciech Figat. All rights reserved.
#pragma once
#include "Engine/Core/Types/BaseTypes.h"
// GPU resource access flags. Used to describe how a resource can be accessed, which allows the GPU to optimize data layout and memory access.
enum class GPUResourceAccess
{
None = 0,
CopyRead = 1 << 0,
CopyWrite = 1 << 1,
CpuRead = 1 << 2,
CpuWrite = 1 << 3,
DepthRead = 1 << 4,
DepthWrite = 1 << 5,
DepthBuffer = DepthRead | DepthWrite,
RenderTarget = 1 << 6,
UnorderedAccess = 1 << 7,
IndirectArgs = 1 << 8,
ShaderReadCompute = 1 << 9,
ShaderReadPixel = 1 << 10,
ShaderReadNonPixel = 1 << 11,
ShaderReadGraphics = ShaderReadPixel | ShaderReadNonPixel,
Last,
All = (Last << 1) - 1,
};
DECLARE_ENUM_OPERATORS(GPUResourceAccess);
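A short sketch (illustration only, not part of this commit) of working with the access mask; DECLARE_ENUM_OPERATORS supplies the bitwise operators, EnumHasAnyFlags is the engine's flag-test helper, and context/depthBuffer are placeholder objects:

// Hypothetical example: composite flags behave like regular bit masks.
GPUResourceAccess access = GPUResourceAccess::DepthBuffer;                        // == DepthRead | DepthWrite
const bool writesDepth = EnumHasAnyFlags(access, GPUResourceAccess::DepthWrite);  // true
context->Transition(depthBuffer, GPUResourceAccess::DepthBuffer);                 // bind as read+write depth target
context->Transition(depthBuffer, GPUResourceAccess::ShaderReadGraphics);          // later: sample it in pixel/vertex shaders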

View File

@@ -297,7 +297,7 @@ void GPUContextDX11::SetRenderTarget(GPUTextureView* depthBuffer, const Span<GPU
__declspec(align(16)) ID3D11RenderTargetView* rtvs[GPU_MAX_RT_BINDED];
for (int32 i = 0; i < rts.Length(); i++)
{
-auto rtDX11 = reinterpret_cast<GPUTextureViewDX11*>(rts[i]);
+auto rtDX11 = reinterpret_cast<GPUTextureViewDX11*>(rts.Get()[i]);
rtvs[i] = rtDX11 ? rtDX11->RTV() : nullptr;
}
int32 rtvsSize = sizeof(ID3D11RenderTargetView*) * rts.Length();
@@ -431,7 +431,7 @@ void GPUContextDX11::BindVB(const Span<GPUBuffer*>& vertexBuffers, const uint32*
bool vbEdited = false;
for (int32 i = 0; i < vertexBuffers.Length(); i++)
{
-const auto vbDX11 = static_cast<GPUBufferDX11*>(vertexBuffers[i]);
+const auto vbDX11 = static_cast<GPUBufferDX11*>(vertexBuffers.Get()[i]);
const auto vb = vbDX11 ? vbDX11->GetBuffer() : nullptr;
vbEdited |= vb != _vbHandles[i];
_vbHandles[i] = vb;

View File

@@ -35,6 +35,7 @@
#include "GPUShaderProgramDX12.h" #include "GPUShaderProgramDX12.h"
#include "CommandSignatureDX12.h" #include "CommandSignatureDX12.h"
#include "Engine/Profiler/RenderStats.h" #include "Engine/Profiler/RenderStats.h"
#include "Engine/Graphics/GPUResourceAccess.h"
#include "Engine/Graphics/Shaders/GPUShader.h" #include "Engine/Graphics/Shaders/GPUShader.h"
#include "Engine/Threading/Threading.h" #include "Engine/Threading/Threading.h"
@@ -51,6 +52,47 @@ inline bool operator!=(const D3D12_INDEX_BUFFER_VIEW& l, const D3D12_INDEX_BUFFE
return l.SizeInBytes != r.SizeInBytes || l.Format != r.Format || l.BufferLocation != r.BufferLocation;
}
FORCE_INLINE D3D12_RESOURCE_STATES GetResourceState(GPUResourceAccess access)
{
switch (access)
{
case GPUResourceAccess::None:
return D3D12_RESOURCE_STATE_COMMON;
case GPUResourceAccess::CopyRead:
return D3D12_RESOURCE_STATE_COPY_SOURCE;
case GPUResourceAccess::CopyWrite:
return D3D12_RESOURCE_STATE_COPY_DEST;
case GPUResourceAccess::CpuRead:
return D3D12_RESOURCE_STATE_GENERIC_READ;
case GPUResourceAccess::CpuWrite:
return D3D12_RESOURCE_STATE_COMMON;
case GPUResourceAccess::DepthRead:
return D3D12_RESOURCE_STATE_DEPTH_READ;
case GPUResourceAccess::DepthWrite:
return D3D12_RESOURCE_STATE_DEPTH_WRITE;
case GPUResourceAccess::DepthBuffer:
return D3D12_RESOURCE_STATE_DEPTH_READ | D3D12_RESOURCE_STATE_DEPTH_WRITE;
case GPUResourceAccess::RenderTarget:
return D3D12_RESOURCE_STATE_RENDER_TARGET;
case GPUResourceAccess::UnorderedAccess:
return D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
case GPUResourceAccess::IndirectArgs:
return D3D12_RESOURCE_STATE_INDIRECT_ARGUMENT;
case GPUResourceAccess::ShaderReadPixel:
//return D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE; // TODO: optimize SRV states in flushSRVs to be based on current binding usage slots
case GPUResourceAccess::ShaderReadCompute:
case GPUResourceAccess::ShaderReadNonPixel:
//return D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE; // TODO: optimize SRV states in flushSRVs to be based on current binding usage slots
case GPUResourceAccess::ShaderReadGraphics:
return D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE | D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE;
#if !BUILD_RELEASE
default:
LOG(Error, "Unsupported GPU Resource Access: {}", (uint32)access);
#endif
}
return D3D12_RESOURCE_STATE_COMMON;
}
// Ensure to match the indirect commands arguments layout
static_assert(sizeof(GPUDispatchIndirectArgs) == sizeof(D3D12_DISPATCH_ARGUMENTS), "Wrong size of GPUDrawIndirectArgs.");
static_assert(OFFSET_OF(GPUDispatchIndirectArgs, ThreadGroupCountX) == OFFSET_OF(D3D12_DISPATCH_ARGUMENTS, ThreadGroupCountX), "Wrong offset for GPUDrawIndirectArgs::ThreadGroupCountX");
@@ -1124,7 +1166,8 @@ void GPUContextDX12::Dispatch(GPUShaderProgramCS* shader, uint32 threadGroupCoun
_psDirtyFlag = true;
// Insert UAV barrier to ensure proper memory access for multiple sequential dispatches
-AddUAVBarrier();
+if (_pass == 0)
+    AddUAVBarrier();
}
void GPUContextDX12::DispatchIndirect(GPUShaderProgramCS* shader, GPUBuffer* bufferForArgs, uint32 offsetForArgs)
@@ -1158,7 +1201,8 @@ void GPUContextDX12::DispatchIndirect(GPUShaderProgramCS* shader, GPUBuffer* buf
_psDirtyFlag = true;
// Insert UAV barrier to ensure proper memory access for multiple sequential dispatches
-AddUAVBarrier();
+if (_pass == 0)
+    AddUAVBarrier();
}
void GPUContextDX12::ResolveMultisample(GPUTexture* sourceMultisampleTexture, GPUTexture* destTexture, int32 sourceSubResource, int32 destSubResource, PixelFormat format)
@@ -1549,4 +1593,15 @@ void GPUContextDX12::ForceRebindDescriptors()
_commandList->SetDescriptorHeaps(ARRAY_COUNT(ppHeaps), ppHeaps);
}
void GPUContextDX12::Transition(GPUResource* resource, GPUResourceAccess access)
{
SetResourceState(dynamic_cast<ResourceOwnerDX12*>(resource), GetResourceState(access));
}
void GPUContextDX12::OverlapUA(bool end)
{
if (end)
AddUAVBarrier();
}
#endif

View File

@@ -21,7 +21,7 @@ class GPUVertexLayoutDX12;
/// <summary>
/// Size of the resource barriers buffer size (will be flushed on overflow)
/// </summary>
-#define DX12_RB_BUFFER_SIZE 16
+#define DX12_RB_BUFFER_SIZE 64
/// <summary>
/// GPU Commands Context implementation for DirectX 12
@@ -214,6 +214,8 @@ public:
void CopySubresource(GPUResource* dstResource, uint32 dstSubresource, GPUResource* srcResource, uint32 srcSubresource) override;
void SetResourceState(GPUResource* resource, uint64 state, int32 subresource) override;
void ForceRebindDescriptors() override;
void Transition(GPUResource* resource, GPUResourceAccess access) override;
void OverlapUA(bool end) override;
};
#endif

View File

@@ -19,7 +19,7 @@ void GPUBufferViewVulkan::Init(GPUDeviceVulkan* device, GPUBufferVulkan* owner,
Buffer = buffer;
Size = size;
-if ((owner->IsShaderResource() && !(owner->GetDescription().Flags & GPUBufferFlags::Structured)) || (usage & VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT) == VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT)
+if ((EnumHasAnyFlags(owner->GetDescription().Flags, GPUBufferFlags::ShaderResource | GPUBufferFlags::UnorderedAccess) && !(owner->GetDescription().Flags & GPUBufferFlags::Structured)) || (usage & VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT) == VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT)
{
VkBufferViewCreateInfo viewInfo;
RenderToolsVulkan::ZeroStruct(viewInfo, VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO);
@@ -103,7 +103,7 @@ bool GPUBufferVulkan::OnInit()
bufferInfo.usage |= VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT;
if (useUAV || EnumHasAnyFlags(_desc.Flags, GPUBufferFlags::RawBuffer | GPUBufferFlags::Structured))
bufferInfo.usage |= VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
-if (useUAV && useSRV)
+if (useUAV)
bufferInfo.usage |= VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT;
if (EnumHasAnyFlags(_desc.Flags, GPUBufferFlags::Argument))
bufferInfo.usage |= VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT;

View File

@@ -78,13 +78,14 @@ const Char* ToString(VkImageLayout layout)
void PipelineBarrierVulkan::Execute(const CmdBufferVulkan* cmdBuffer)
{
ASSERT(cmdBuffer->IsOutsideRenderPass());
-vkCmdPipelineBarrier(cmdBuffer->GetHandle(), SourceStage, DestStage, 0, 0, nullptr, BufferBarriers.Count(), BufferBarriers.Get(), ImageBarriers.Count(), ImageBarriers.Get());
+vkCmdPipelineBarrier(cmdBuffer->GetHandle(), SourceStage, DestStage, 0, MemoryBarriers.Count(), MemoryBarriers.Get(), BufferBarriers.Count(), BufferBarriers.Get(), ImageBarriers.Count(), ImageBarriers.Get());
// Reset
SourceStage = 0;
DestStage = 0;
ImageBarriers.Clear();
BufferBarriers.Clear();
MemoryBarriers.Clear();
#if VK_ENABLE_BARRIERS_DEBUG
ImageBarriersDebug.Clear();
#endif
@@ -153,12 +154,7 @@ void GPUContextVulkan::AddImageBarrier(VkImage image, VkImageLayout srcLayout, V
#if VK_ENABLE_BARRIERS_BATCHING
// Auto-flush on overflow
if (_barriers.IsFull())
-{
-const auto cmdBuffer = _cmdBufferManager->GetCmdBuffer();
-if (cmdBuffer->IsInsideRenderPass())
-    EndRenderPass();
-_barriers.Execute(cmdBuffer);
-}
+    FlushBarriers();
#endif
// Insert barrier
@@ -190,10 +186,7 @@ void GPUContextVulkan::AddImageBarrier(VkImage image, VkImageLayout srcLayout, V
#if !VK_ENABLE_BARRIERS_BATCHING
// Auto-flush without batching
-const auto cmdBuffer = _cmdBufferManager->GetCmdBuffer();
-if (cmdBuffer->IsInsideRenderPass())
-    EndRenderPass();
-_barriers.Execute(cmdBuffer);
+FlushBarriers();
#endif
}
@@ -315,12 +308,7 @@ void GPUContextVulkan::AddBufferBarrier(GPUBufferVulkan* buffer, VkAccessFlags d
#if VK_ENABLE_BARRIERS_BATCHING
// Auto-flush on overflow
if (_barriers.IsFull())
-{
-const auto cmdBuffer = _cmdBufferManager->GetCmdBuffer();
-if (cmdBuffer->IsInsideRenderPass())
-    EndRenderPass();
-_barriers.Execute(cmdBuffer);
-}
+    FlushBarriers();
#endif
// Insert barrier
@@ -339,13 +327,38 @@ void GPUContextVulkan::AddBufferBarrier(GPUBufferVulkan* buffer, VkAccessFlags d
#if !VK_ENABLE_BARRIERS_BATCHING
// Auto-flush without batching
-const auto cmdBuffer = _cmdBufferManager->GetCmdBuffer();
-if (cmdBuffer->IsInsideRenderPass())
-    EndRenderPass();
-_barriers.Execute(cmdBuffer);
+FlushBarriers();
#endif
}
void GPUContextVulkan::AddMemoryBarrier()
{
#if VK_ENABLE_BARRIERS_BATCHING
// Auto-flush on overflow
if (_barriers.IsFull())
FlushBarriers();
#endif
// Insert barrier
VkMemoryBarrier& memoryBarrier = _barriers.MemoryBarriers.AddOne();
RenderToolsVulkan::ZeroStruct(memoryBarrier, VK_STRUCTURE_TYPE_MEMORY_BARRIER);
memoryBarrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
memoryBarrier.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT;
_barriers.SourceStage |= VK_PIPELINE_STAGE_TRANSFER_BIT;
_barriers.DestStage |= VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;
#if !VK_ENABLE_BARRIERS_BATCHING
// Auto-flush without batching
FlushBarriers();
#endif
}
void GPUContextVulkan::AddUABarrier()
{
_barriers.SourceStage |= VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT;
_barriers.DestStage |= VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
}
void GPUContextVulkan::FlushBarriers()
{
#if VK_ENABLE_BARRIERS_BATCHING
@@ -475,7 +488,7 @@ void GPUContextVulkan::EndRenderPass()
cmdBuffer->EndRenderPass();
_renderPass = nullptr;
-// Place a barrier between RenderPasses, so that color / depth outputs can be read in subsequent passes
+// Place a barrier between RenderPasses, so that color/depth outputs can be read in subsequent passes
// TODO: remove it in future and use proper barriers without whole pipeline stalls
vkCmdPipelineBarrier(cmdBuffer->GetHandle(), VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, nullptr, 0, nullptr);
}
@@ -1155,8 +1168,8 @@ void GPUContextVulkan::Dispatch(GPUShaderProgramCS* shader, uint32 threadGroupCo
RENDER_STAT_DISPATCH_CALL();
// Place a barrier between dispatches, so that UAVs can be read+write in subsequent passes
-// TODO: optimize it by moving inputs/outputs into higher-layer so eg. Global SDF can manually optimize it
-vkCmdPipelineBarrier(cmdBuffer->GetHandle(), VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, nullptr, 0, nullptr);
+if (_pass == 0)
+    AddUABarrier();
#if VK_ENABLE_BARRIERS_DEBUG
LOG(Warning, "Dispatch");
@@ -1191,8 +1204,8 @@ void GPUContextVulkan::DispatchIndirect(GPUShaderProgramCS* shader, GPUBuffer* b
RENDER_STAT_DISPATCH_CALL();
// Place a barrier between dispatches, so that UAVs can be read+write in subsequent passes
-// TODO: optimize it by moving inputs/outputs into higher-layer so eg. Global SDF can manually optimize it
-vkCmdPipelineBarrier(cmdBuffer->GetHandle(), VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, nullptr, 0, nullptr);
+if (_pass == 0)
+    AddUABarrier();
#if VK_ENABLE_BARRIERS_DEBUG
LOG(Warning, "DispatchIndirect");
@@ -1351,18 +1364,14 @@ void GPUContextVulkan::UpdateBuffer(GPUBuffer* buffer, const void* data, uint32
const auto bufferVulkan = static_cast<GPUBufferVulkan*>(buffer);
-// Memory transfer barrier
-// TODO: batch pipeline barriers
-const VkMemoryBarrier barrierBefore = { VK_STRUCTURE_TYPE_MEMORY_BARRIER, nullptr, VK_ACCESS_MEMORY_WRITE_BIT, VK_ACCESS_MEMORY_READ_BIT };
-vkCmdPipelineBarrier(cmdBuffer->GetHandle(), VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 1, &barrierBefore, 0, nullptr, 0, nullptr);
+// Transition resource
+AddBufferBarrier(bufferVulkan, VK_ACCESS_TRANSFER_WRITE_BIT);
+FlushBarriers();
// Use direct update for small buffers
const uint32 alignedSize = Math::AlignUp<uint32>(size, 4);
if (size <= 4 * 1024 && alignedSize <= buffer->GetSize())
{
-//AddBufferBarrier(bufferVulkan, VK_ACCESS_TRANSFER_WRITE_BIT);
-//FlushBarriers();
vkCmdUpdateBuffer(cmdBuffer->GetHandle(), bufferVulkan->GetHandle(), offset, alignedSize, data);
}
else
@@ -1379,10 +1388,9 @@ void GPUContextVulkan::UpdateBuffer(GPUBuffer* buffer, const void* data, uint32
_device->StagingManager.ReleaseBuffer(cmdBuffer, staging);
}
-// Memory transfer barrier
-// TODO: batch pipeline barriers
-const VkMemoryBarrier barrierAfter = { VK_STRUCTURE_TYPE_MEMORY_BARRIER, nullptr, VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT };
-vkCmdPipelineBarrier(cmdBuffer->GetHandle(), VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, 1, &barrierAfter, 0, nullptr, 0, nullptr);
+// Memory transfer barrier to ensure buffer is ready to read (eg. by Draw or Dispatch)
+if (_pass == 0)
+    AddMemoryBarrier();
}
void GPUContextVulkan::CopyBuffer(GPUBuffer* dstBuffer, GPUBuffer* srcBuffer, uint32 size, uint32 dstOffset, uint32 srcOffset)
@@ -1407,6 +1415,10 @@ void GPUContextVulkan::CopyBuffer(GPUBuffer* dstBuffer, GPUBuffer* srcBuffer, ui
bufferCopy.dstOffset = dstOffset;
bufferCopy.size = size;
vkCmdCopyBuffer(cmdBuffer->GetHandle(), srcBufferVulkan->GetHandle(), dstBufferVulkan->GetHandle(), 1, &bufferCopy);
// Memory transfer barrier to ensure buffer is ready to read (eg. by Draw or Dispatch)
if (_pass == 0)
AddMemoryBarrier();
}
void GPUContextVulkan::UpdateTexture(GPUTexture* texture, int32 arrayIndex, int32 mipIndex, const void* data, uint32 rowPitch, uint32 slicePitch)
@@ -1816,4 +1828,27 @@ void GPUContextVulkan::CopySubresource(GPUResource* dstResource, uint32 dstSubre
}
}
void GPUContextVulkan::Transition(GPUResource* resource, GPUResourceAccess access)
{
if (auto buffer = dynamic_cast<GPUBufferVulkan*>(resource))
{
AddBufferBarrier(buffer, RenderToolsVulkan::GetAccess(access));
}
else if (auto texture = dynamic_cast<GPUTextureVulkan*>(resource))
{
AddImageBarrier(texture, RenderToolsVulkan::GetImageLayout(access));
}
}
void GPUContextVulkan::MemoryBarrier()
{
AddMemoryBarrier();
}
void GPUContextVulkan::OverlapUA(bool end)
{
if (end)
AddUABarrier();
}
#endif

View File

@@ -34,7 +34,7 @@ class DescriptorSetLayoutVulkan;
/// <summary>
/// Size of the pipeline barriers buffer size (will be auto-flushed on overflow).
/// </summary>
-#define VK_BARRIER_BUFFER_SIZE 16
+#define VK_BARRIER_BUFFER_SIZE 64
/// <summary>
/// The Vulkan pipeline resources layout barrier batching structure.
@@ -45,18 +45,19 @@ struct PipelineBarrierVulkan
VkPipelineStageFlags DestStage = 0;
Array<VkImageMemoryBarrier, FixedAllocation<VK_BARRIER_BUFFER_SIZE>> ImageBarriers;
Array<VkBufferMemoryBarrier, FixedAllocation<VK_BARRIER_BUFFER_SIZE>> BufferBarriers;
Array<VkMemoryBarrier, FixedAllocation<4>> MemoryBarriers;
#if VK_ENABLE_BARRIERS_DEBUG
Array<GPUTextureViewVulkan*, FixedAllocation<VK_BARRIER_BUFFER_SIZE>> ImageBarriersDebug;
#endif
FORCE_INLINE bool IsFull() const
{
-return ImageBarriers.Count() == VK_BARRIER_BUFFER_SIZE || BufferBarriers.Count() == VK_BARRIER_BUFFER_SIZE;
+return ImageBarriers.Count() == VK_BARRIER_BUFFER_SIZE || BufferBarriers.Count() == VK_BARRIER_BUFFER_SIZE || MemoryBarriers.Count() == 4;
}
FORCE_INLINE bool HasBarrier() const
{
-return ImageBarriers.Count() + BufferBarriers.Count() != 0;
+return ImageBarriers.Count() + BufferBarriers.Count() + MemoryBarriers.Count() != 0;
}
void Execute(const CmdBufferVulkan* cmdBuffer);
@@ -130,6 +131,8 @@ public:
void AddImageBarrier(GPUTextureVulkan* texture, int32 mipSlice, int32 arraySlice, VkImageLayout dstLayout);
void AddImageBarrier(GPUTextureVulkan* texture, VkImageLayout dstLayout);
void AddBufferBarrier(GPUBufferVulkan* buffer, VkAccessFlags dstAccess);
void AddMemoryBarrier();
void AddUABarrier();
void FlushBarriers();
@@ -199,6 +202,9 @@ public:
void CopyCounter(GPUBuffer* dstBuffer, uint32 dstOffset, GPUBuffer* srcBuffer) override;
void CopyResource(GPUResource* dstResource, GPUResource* srcResource) override;
void CopySubresource(GPUResource* dstResource, uint32 dstSubresource, GPUResource* srcResource, uint32 srcSubresource) override;
void Transition(GPUResource* resource, GPUResourceAccess access) override;
void MemoryBarrier() override;
void OverlapUA(bool end) override;
};
#endif

View File

@@ -5,6 +5,7 @@
#include "RenderToolsVulkan.h" #include "RenderToolsVulkan.h"
#include "Engine/Core/Types/StringBuilder.h" #include "Engine/Core/Types/StringBuilder.h"
#include "Engine/Core/Log.h" #include "Engine/Core/Log.h"
#include "Engine/Graphics/GPUResourceAccess.h"
// @formatter:off
@@ -258,6 +259,80 @@ void RenderToolsVulkan::LogVkResult(VkResult result, const char* file, uint32 li
#endif
}
VkAccessFlags RenderToolsVulkan::GetAccess(GPUResourceAccess access)
{
switch (access)
{
case GPUResourceAccess::None:
return VK_ACCESS_NONE;
case GPUResourceAccess::CopyRead:
return VK_ACCESS_TRANSFER_READ_BIT;
case GPUResourceAccess::CopyWrite:
return VK_ACCESS_TRANSFER_WRITE_BIT;
case GPUResourceAccess::CpuRead:
return VK_ACCESS_HOST_READ_BIT;
case GPUResourceAccess::CpuWrite:
return VK_ACCESS_HOST_WRITE_BIT;
case GPUResourceAccess::DepthRead:
return VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT;
case GPUResourceAccess::DepthWrite:
return VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
case GPUResourceAccess::DepthBuffer:
return VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
case GPUResourceAccess::RenderTarget:
return VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
case GPUResourceAccess::UnorderedAccess:
return VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT;
case GPUResourceAccess::IndirectArgs:
return VK_ACCESS_INDIRECT_COMMAND_READ_BIT;
case GPUResourceAccess::ShaderReadCompute:
case GPUResourceAccess::ShaderReadPixel:
case GPUResourceAccess::ShaderReadNonPixel:
case GPUResourceAccess::ShaderReadGraphics:
return VK_ACCESS_SHADER_READ_BIT;
#if !BUILD_RELEASE
default:
LOG(Error, "Unsupported GPU Resource Access: {}", (uint32)access);
#endif
}
return VK_ACCESS_NONE;
}
VkImageLayout RenderToolsVulkan::GetImageLayout(GPUResourceAccess access)
{
switch (access)
{
case GPUResourceAccess::None:
return VK_IMAGE_LAYOUT_UNDEFINED;
case GPUResourceAccess::CopyRead:
return VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
case GPUResourceAccess::CopyWrite:
return VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
case GPUResourceAccess::CpuRead:
case GPUResourceAccess::CpuWrite:
return VK_IMAGE_LAYOUT_GENERAL;
case GPUResourceAccess::DepthRead:
return VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_OPTIMAL;
case GPUResourceAccess::DepthWrite:
case GPUResourceAccess::DepthBuffer:
return VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_OPTIMAL;
case GPUResourceAccess::RenderTarget:
return VK_IMAGE_LAYOUT_ATTACHMENT_OPTIMAL;
case GPUResourceAccess::UnorderedAccess:
case GPUResourceAccess::ShaderReadCompute:
case GPUResourceAccess::ShaderReadPixel:
case GPUResourceAccess::ShaderReadNonPixel:
case GPUResourceAccess::ShaderReadGraphics:
return VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
#if !BUILD_RELEASE
default:
LOG(Error, "Unsupported GPU Resource Access: {}", (uint32)access);
#endif
}
return VK_IMAGE_LAYOUT_UNDEFINED;
}
bool RenderToolsVulkan::HasExtension(const Array<const char*>& extensions, const char* name)
{
for (int32 i = 0; i < extensions.Count(); i++)

View File

@@ -20,6 +20,8 @@
#define VK_SET_DEBUG_NAME(device, handle, type, name)
#endif
enum class GPUResourceAccess;
/// <summary>
/// Set of utilities for rendering on Vulkan platform.
/// </summary>
@@ -40,6 +42,9 @@ public:
static String GetVkErrorString(VkResult result);
static void LogVkResult(VkResult result, const char* file = nullptr, uint32 line = 0, bool fatal = false);
static VkAccessFlags GetAccess(GPUResourceAccess access);
static VkImageLayout GetImageLayout(GPUResourceAccess access);
static inline VkPipelineStageFlags GetBufferBarrierFlags(VkAccessFlags accessFlags)
{
VkPipelineStageFlags stageFlags = (VkPipelineStageFlags)0;
@@ -67,6 +72,9 @@ public:
case VK_ACCESS_SHADER_WRITE_BIT:
stageFlags = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
break;
case VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT:
stageFlags = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
break;
case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT:
case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
stageFlags = VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT;

View File

@@ -82,7 +82,7 @@ void BitonicSort::Sort(GPUContext* context, GPUBuffer* indicesBuffer, GPUBuffer*
if (checkIfSkipPass())
return;
PROFILE_GPU_CPU("Bitonic Sort");
-uint32 maxNumElements = indicesBuffer->GetElementsCount();
+int32 maxNumElements = (int32)indicesBuffer->GetElementsCount();
if (maxElements > 0 && maxElements < maxNumElements)
maxNumElements = maxElements;
const uint32 alignedMaxNumElements = Math::RoundUpToPowerOf2(maxNumElements);