Add explicit GPU resource transitions, memory and compute barriers
@@ -9,6 +9,11 @@
#include "PixelFormat.h"
#include "Config.h"

#if PLATFORM_WIN32
// Fix nasty Win32 define garbage
#undef MemoryBarrier
#endif

class GPUConstantBuffer;
class GPUShaderProgramCS;
class GPUBuffer;
@@ -21,6 +26,8 @@ class GPUResourceView;
class GPUTextureView;
class GPUBufferView;
class GPUVertexLayout;
struct GPUPass;
enum class GPUResourceAccess;

// Gets the GPU texture view. Checks if pointer is not null and texture has one or more mip levels loaded.
#define GET_TEXTURE_VIEW_SAFE(t) (t && t->ResidentMipLevels() > 0 ? t->View() : nullptr)
@@ -632,4 +639,24 @@ public:
    /// Forces graphics backend to rebind descriptors after command list was used by external graphics library.
    /// </summary>
    virtual void ForceRebindDescriptors();

protected:
    friend GPUPass;
    int32 _pass = 0;

public:
    // Performs resource state transition into a specific access (mask).
    virtual void Transition(GPUResource* resource, GPUResourceAccess access)
    {
    }

    // Inserts a global memory barrier on data copies between resources.
    virtual void MemoryBarrier()
    {
    }

    // Begins or ends unordered access resource overlap region that allows running different compute shader dispatches simultaneously.
    virtual void OverlapUA(bool end)
    {
    }
};

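A minimal sketch of how rendering code could drive these new virtuals directly (not part of this commit; the light buffer is a placeholder and the BindSR/DrawFullscreenTriangle calls are assumed from the existing GPUContext API, only Transition/MemoryBarrier/OverlapUA are introduced here):

// Hypothetical usage: request the target state up-front; DX12/Vulkan record the matching
// barrier, while backends that do not override Transition() keep the no-op base behavior.
context->Transition(lightBuffer, GPUResourceAccess::ShaderReadPixel);
context->BindSR(0, lightBuffer->View());
context->DrawFullscreenTriangle();
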
Source/Engine/Graphics/GPUPass.h (new file, 68 lines)
@@ -0,0 +1,68 @@
// Copyright (c) Wojciech Figat. All rights reserved.

#pragma once

#include "GPUContext.h"
#include "Engine/Graphics/GPUResourceAccess.h"

/// <summary>
/// Base for GPU rendering passes that control low-level memory access and GPU resource states to optimize rendering.
/// </summary>
struct FLAXENGINE_API GPUPass
{
    NON_COPYABLE(GPUPass);

    GPUContext* Context;

    GPUPass(GPUContext* context)
        : Context(context)
    {
        Context->_pass++;
    }

    ~GPUPass()
    {
        Context->_pass--;
    }

    // Performs resource state transition into a specific access (mask). Can be done preemptively in the prologue of the pass to execute more efficient barriers.
    void Transition(GPUResource* resource, GPUResourceAccess access)
    {
        Context->Transition(resource, access);
    }
};

/// <summary>
/// GPU pass that manually controls memory barriers and cache flushes when performing batched copy/upload operations with the GPU context. Can be used to optimize GPU buffer usage by running different copy operations simultaneously.
/// </summary>
struct FLAXENGINE_API GPUMemoryPass : GPUPass
{
    GPUMemoryPass(GPUContext* context)
        : GPUPass(context)
    {
    }

    ~GPUMemoryPass()
    {
        Context->MemoryBarrier();
    }
};

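A minimal usage sketch of GPUMemoryPass (not part of this commit; the function, buffers, and spans are placeholders, and the UpdateBuffer call shape follows the GPUContextVulkan::UpdateBuffer signature shown later in this diff). While a pass is active (_pass != 0) the backend skips its per-copy memory barrier and relies on the single MemoryBarrier() from the pass destructor; CopyBuffer gets the same treatment:

// Hypothetical usage: batch several uploads under one memory barrier.
void UploadSpriteBuffers(GPUContext* context, GPUBuffer* vertexBuffer, GPUBuffer* indexBuffer, Span<const byte> vertices, Span<const byte> indices)
{
    GPUMemoryPass pass(context); // constructor increments GPUContext::_pass
    context->UpdateBuffer(vertexBuffer, vertices.Get(), vertices.Length());
    context->UpdateBuffer(indexBuffer, indices.Get(), indices.Length());
} // destructor calls Context->MemoryBarrier() once for the whole batch
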
/// <summary>
/// GPU pass that controls memory barriers when performing batched Compute shader dispatches with the GPU context. Can be used to optimize GPU utilization by running different dispatches simultaneously (by overlapping work).
/// </summary>
struct FLAXENGINE_API GPUComputePass : GPUPass
{
    GPUComputePass(GPUContext* context)
        : GPUPass(context)
    {
        Context->OverlapUA(false);
    }

    ~GPUComputePass()
    {
        Context->OverlapUA(true);
    }
};

// TODO: add GPUDrawPass for render targets and depth/stencil setup with optimized clear for faster drawing on tiled-GPUs (mobile)

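A similar sketch for GPUComputePass (not part of this commit; the shaders, buffers, and BindUA slot are placeholders assuming the existing GPUContext::BindUA/Dispatch API). The per-dispatch UAV barrier is skipped while the pass is active, and OverlapUA(true) emits it once when the pass ends:

// Hypothetical usage: let two unrelated dispatches overlap, then synchronize once.
void RunIndependentDispatches(GPUContext* context, GPUShaderProgramCS* csA, GPUShaderProgramCS* csB, GPUBuffer* outputA, GPUBuffer* outputB)
{
    GPUComputePass pass(context); // constructor calls Context->OverlapUA(false)
    pass.Transition(outputA, GPUResourceAccess::UnorderedAccess); // preemptive transitions allow batched barriers
    pass.Transition(outputB, GPUResourceAccess::UnorderedAccess);
    context->BindUA(0, outputA->View());
    context->Dispatch(csA, 64, 1, 1);
    context->BindUA(0, outputB->View());
    context->Dispatch(csB, 64, 1, 1);
} // destructor calls Context->OverlapUA(true): a single UAV barrier after both dispatches
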
Source/Engine/Graphics/GPUResourceAccess.h (new file, 29 lines)
@@ -0,0 +1,29 @@
// Copyright (c) Wojciech Figat. All rights reserved.

#pragma once

#include "Engine/Core/Types/BaseTypes.h"

// GPU resource access flags. Used to describe how a resource can be accessed, which allows the GPU to optimize data layout and memory access.
enum class GPUResourceAccess
{
    None = 0,
    CopyRead = 1 << 0,
    CopyWrite = 1 << 1,
    CpuRead = 1 << 2,
    CpuWrite = 1 << 3,
    DepthRead = 1 << 4,
    DepthWrite = 1 << 5,
    DepthBuffer = DepthRead | DepthWrite,
    RenderTarget = 1 << 6,
    UnorderedAccess = 1 << 7,
    IndirectArgs = 1 << 8,
    ShaderReadCompute = 1 << 9,
    ShaderReadPixel = 1 << 10,
    ShaderReadNonPixel = 1 << 11,
    ShaderReadGraphics = ShaderReadPixel | ShaderReadNonPixel,
    Last,
    All = (Last << 1) - 1,
};

DECLARE_ENUM_OPERATORS(GPUResourceAccess);

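DECLARE_ENUM_OPERATORS turns the enum into a bit mask, so values can be combined and queried; DepthBuffer and ShaderReadGraphics are the two predefined combinations that the DX12 and Vulkan mappings later in this diff handle explicitly. A small illustration (not part of this commit; resource names are placeholders):

// Illustration only: predefined combined masks and a flags query.
context->Transition(sceneDepth, GPUResourceAccess::DepthBuffer);        // DepthRead | DepthWrite
context->Transition(shadowMap, GPUResourceAccess::ShaderReadGraphics);  // ShaderReadPixel | ShaderReadNonPixel
const GPUResourceAccess access = GPUResourceAccess::CopyWrite | GPUResourceAccess::ShaderReadCompute;
const bool readInCompute = EnumHasAnyFlags(access, GPUResourceAccess::ShaderReadCompute);
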
@@ -297,7 +297,7 @@ void GPUContextDX11::SetRenderTarget(GPUTextureView* depthBuffer, const Span<GPU
    __declspec(align(16)) ID3D11RenderTargetView* rtvs[GPU_MAX_RT_BINDED];
    for (int32 i = 0; i < rts.Length(); i++)
    {
        auto rtDX11 = reinterpret_cast<GPUTextureViewDX11*>(rts[i]);
        auto rtDX11 = reinterpret_cast<GPUTextureViewDX11*>(rts.Get()[i]);
        rtvs[i] = rtDX11 ? rtDX11->RTV() : nullptr;
    }
    int32 rtvsSize = sizeof(ID3D11RenderTargetView*) * rts.Length();
@@ -431,7 +431,7 @@ void GPUContextDX11::BindVB(const Span<GPUBuffer*>& vertexBuffers, const uint32*
    bool vbEdited = false;
    for (int32 i = 0; i < vertexBuffers.Length(); i++)
    {
        const auto vbDX11 = static_cast<GPUBufferDX11*>(vertexBuffers[i]);
        const auto vbDX11 = static_cast<GPUBufferDX11*>(vertexBuffers.Get()[i]);
        const auto vb = vbDX11 ? vbDX11->GetBuffer() : nullptr;
        vbEdited |= vb != _vbHandles[i];
        _vbHandles[i] = vb;

@@ -35,6 +35,7 @@
#include "GPUShaderProgramDX12.h"
#include "CommandSignatureDX12.h"
#include "Engine/Profiler/RenderStats.h"
#include "Engine/Graphics/GPUResourceAccess.h"
#include "Engine/Graphics/Shaders/GPUShader.h"
#include "Engine/Threading/Threading.h"

@@ -51,6 +52,47 @@ inline bool operator!=(const D3D12_INDEX_BUFFER_VIEW& l, const D3D12_INDEX_BUFFE
    return l.SizeInBytes != r.SizeInBytes || l.Format != r.Format || l.BufferLocation != r.BufferLocation;
}

FORCE_INLINE D3D12_RESOURCE_STATES GetResourceState(GPUResourceAccess access)
{
    switch (access)
    {
    case GPUResourceAccess::None:
        return D3D12_RESOURCE_STATE_COMMON;
    case GPUResourceAccess::CopyRead:
        return D3D12_RESOURCE_STATE_COPY_SOURCE;
    case GPUResourceAccess::CopyWrite:
        return D3D12_RESOURCE_STATE_COPY_DEST;
    case GPUResourceAccess::CpuRead:
        return D3D12_RESOURCE_STATE_GENERIC_READ;
    case GPUResourceAccess::CpuWrite:
        return D3D12_RESOURCE_STATE_COMMON;
    case GPUResourceAccess::DepthRead:
        return D3D12_RESOURCE_STATE_DEPTH_READ;
    case GPUResourceAccess::DepthWrite:
        return D3D12_RESOURCE_STATE_DEPTH_WRITE;
    case GPUResourceAccess::DepthBuffer:
        return D3D12_RESOURCE_STATE_DEPTH_READ | D3D12_RESOURCE_STATE_DEPTH_WRITE;
    case GPUResourceAccess::RenderTarget:
        return D3D12_RESOURCE_STATE_RENDER_TARGET;
    case GPUResourceAccess::UnorderedAccess:
        return D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
    case GPUResourceAccess::IndirectArgs:
        return D3D12_RESOURCE_STATE_INDIRECT_ARGUMENT;
    case GPUResourceAccess::ShaderReadPixel:
        //return D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE; // TODO: optimize SRV states in flushSRVs to be based on current binding usage slots
    case GPUResourceAccess::ShaderReadCompute:
    case GPUResourceAccess::ShaderReadNonPixel:
        //return D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE; // TODO: optimize SRV states in flushSRVs to be based on current binding usage slots
    case GPUResourceAccess::ShaderReadGraphics:
        return D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE | D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE;
#if !BUILD_RELEASE
    default:
        LOG(Error, "Unsupported GPU Resource Access: {}", (uint32)access);
#endif
    }
    return D3D12_RESOURCE_STATE_COMMON;
}

// Ensure to match the indirect commands arguments layout
static_assert(sizeof(GPUDispatchIndirectArgs) == sizeof(D3D12_DISPATCH_ARGUMENTS), "Wrong size of GPUDrawIndirectArgs.");
static_assert(OFFSET_OF(GPUDispatchIndirectArgs, ThreadGroupCountX) == OFFSET_OF(D3D12_DISPATCH_ARGUMENTS, ThreadGroupCountX), "Wrong offset for GPUDrawIndirectArgs::ThreadGroupCountX");
@@ -1124,7 +1166,8 @@ void GPUContextDX12::Dispatch(GPUShaderProgramCS* shader, uint32 threadGroupCoun
    _psDirtyFlag = true;

    // Insert UAV barrier to ensure proper memory access for multiple sequential dispatches
    AddUAVBarrier();
    if (_pass == 0)
        AddUAVBarrier();
}

void GPUContextDX12::DispatchIndirect(GPUShaderProgramCS* shader, GPUBuffer* bufferForArgs, uint32 offsetForArgs)
@@ -1158,7 +1201,8 @@ void GPUContextDX12::DispatchIndirect(GPUShaderProgramCS* shader, GPUBuffer* buf
    _psDirtyFlag = true;

    // Insert UAV barrier to ensure proper memory access for multiple sequential dispatches
    AddUAVBarrier();
    if (_pass == 0)
        AddUAVBarrier();
}

void GPUContextDX12::ResolveMultisample(GPUTexture* sourceMultisampleTexture, GPUTexture* destTexture, int32 sourceSubResource, int32 destSubResource, PixelFormat format)
@@ -1549,4 +1593,15 @@ void GPUContextDX12::ForceRebindDescriptors()
    _commandList->SetDescriptorHeaps(ARRAY_COUNT(ppHeaps), ppHeaps);
}

void GPUContextDX12::Transition(GPUResource* resource, GPUResourceAccess access)
{
    SetResourceState(dynamic_cast<ResourceOwnerDX12*>(resource), GetResourceState(access));
}

void GPUContextDX12::OverlapUA(bool end)
{
    if (end)
        AddUAVBarrier();
}

#endif

@@ -21,7 +21,7 @@ class GPUVertexLayoutDX12;
/// <summary>
/// Size of the resource barriers buffer size (will be flushed on overflow)
/// </summary>
#define DX12_RB_BUFFER_SIZE 16
#define DX12_RB_BUFFER_SIZE 64

/// <summary>
/// GPU Commands Context implementation for DirectX 12
@@ -214,6 +214,8 @@ public:
    void CopySubresource(GPUResource* dstResource, uint32 dstSubresource, GPUResource* srcResource, uint32 srcSubresource) override;
    void SetResourceState(GPUResource* resource, uint64 state, int32 subresource) override;
    void ForceRebindDescriptors() override;
    void Transition(GPUResource* resource, GPUResourceAccess access) override;
    void OverlapUA(bool end) override;
};

#endif

@@ -19,7 +19,7 @@ void GPUBufferViewVulkan::Init(GPUDeviceVulkan* device, GPUBufferVulkan* owner,
    Buffer = buffer;
    Size = size;

    if ((owner->IsShaderResource() && !(owner->GetDescription().Flags & GPUBufferFlags::Structured)) || (usage & VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT) == VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT)
    if ((EnumHasAnyFlags(owner->GetDescription().Flags, GPUBufferFlags::ShaderResource | GPUBufferFlags::UnorderedAccess) && !(owner->GetDescription().Flags & GPUBufferFlags::Structured)) || (usage & VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT) == VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT)
    {
        VkBufferViewCreateInfo viewInfo;
        RenderToolsVulkan::ZeroStruct(viewInfo, VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO);
@@ -103,7 +103,7 @@ bool GPUBufferVulkan::OnInit()
        bufferInfo.usage |= VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT;
    if (useUAV || EnumHasAnyFlags(_desc.Flags, GPUBufferFlags::RawBuffer | GPUBufferFlags::Structured))
        bufferInfo.usage |= VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
    if (useUAV && useSRV)
    if (useUAV)
        bufferInfo.usage |= VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT;
    if (EnumHasAnyFlags(_desc.Flags, GPUBufferFlags::Argument))
        bufferInfo.usage |= VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT;

@@ -78,13 +78,14 @@ const Char* ToString(VkImageLayout layout)
void PipelineBarrierVulkan::Execute(const CmdBufferVulkan* cmdBuffer)
{
    ASSERT(cmdBuffer->IsOutsideRenderPass());
    vkCmdPipelineBarrier(cmdBuffer->GetHandle(), SourceStage, DestStage, 0, 0, nullptr, BufferBarriers.Count(), BufferBarriers.Get(), ImageBarriers.Count(), ImageBarriers.Get());
    vkCmdPipelineBarrier(cmdBuffer->GetHandle(), SourceStage, DestStage, 0, MemoryBarriers.Count(), MemoryBarriers.Get(), BufferBarriers.Count(), BufferBarriers.Get(), ImageBarriers.Count(), ImageBarriers.Get());

    // Reset
    SourceStage = 0;
    DestStage = 0;
    ImageBarriers.Clear();
    BufferBarriers.Clear();
    MemoryBarriers.Clear();
#if VK_ENABLE_BARRIERS_DEBUG
    ImageBarriersDebug.Clear();
#endif
@@ -153,12 +154,7 @@ void GPUContextVulkan::AddImageBarrier(VkImage image, VkImageLayout srcLayout, V
#if VK_ENABLE_BARRIERS_BATCHING
    // Auto-flush on overflow
    if (_barriers.IsFull())
    {
        const auto cmdBuffer = _cmdBufferManager->GetCmdBuffer();
        if (cmdBuffer->IsInsideRenderPass())
            EndRenderPass();
        _barriers.Execute(cmdBuffer);
    }
        FlushBarriers();
#endif

    // Insert barrier
@@ -190,10 +186,7 @@ void GPUContextVulkan::AddImageBarrier(VkImage image, VkImageLayout srcLayout, V

#if !VK_ENABLE_BARRIERS_BATCHING
    // Auto-flush without batching
    const auto cmdBuffer = _cmdBufferManager->GetCmdBuffer();
    if (cmdBuffer->IsInsideRenderPass())
        EndRenderPass();
    _barriers.Execute(cmdBuffer);
    FlushBarriers();
#endif
}

@@ -315,12 +308,7 @@ void GPUContextVulkan::AddBufferBarrier(GPUBufferVulkan* buffer, VkAccessFlags d
#if VK_ENABLE_BARRIERS_BATCHING
    // Auto-flush on overflow
    if (_barriers.IsFull())
    {
        const auto cmdBuffer = _cmdBufferManager->GetCmdBuffer();
        if (cmdBuffer->IsInsideRenderPass())
            EndRenderPass();
        _barriers.Execute(cmdBuffer);
    }
        FlushBarriers();
#endif

    // Insert barrier
@@ -339,13 +327,38 @@

#if !VK_ENABLE_BARRIERS_BATCHING
    // Auto-flush without batching
    const auto cmdBuffer = _cmdBufferManager->GetCmdBuffer();
    if (cmdBuffer->IsInsideRenderPass())
        EndRenderPass();
    _barriers.Execute(cmdBuffer);
    FlushBarriers();
#endif
}

void GPUContextVulkan::AddMemoryBarrier()
{
#if VK_ENABLE_BARRIERS_BATCHING
    // Auto-flush on overflow
    if (_barriers.IsFull())
        FlushBarriers();
#endif

    // Insert barrier
    VkMemoryBarrier& memoryBarrier = _barriers.MemoryBarriers.AddOne();
    RenderToolsVulkan::ZeroStruct(memoryBarrier, VK_STRUCTURE_TYPE_MEMORY_BARRIER);
    memoryBarrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
    memoryBarrier.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT;
    _barriers.SourceStage |= VK_PIPELINE_STAGE_TRANSFER_BIT;
    _barriers.DestStage |= VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;

#if !VK_ENABLE_BARRIERS_BATCHING
    // Auto-flush without batching
    FlushBarriers();
#endif
}

void GPUContextVulkan::AddUABarrier()
{
    _barriers.SourceStage |= VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT;
    _barriers.DestStage |= VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
}

void GPUContextVulkan::FlushBarriers()
{
#if VK_ENABLE_BARRIERS_BATCHING
@@ -475,7 +488,7 @@ void GPUContextVulkan::EndRenderPass()
    cmdBuffer->EndRenderPass();
    _renderPass = nullptr;

    // Place a barrier between RenderPasses, so that color / depth outputs can be read in subsequent passes
    // Place a barrier between RenderPasses, so that color/depth outputs can be read in subsequent passes
    // TODO: remove it in future and use proper barriers without whole pipeline stalls
    vkCmdPipelineBarrier(cmdBuffer->GetHandle(), VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, nullptr, 0, nullptr);
}
@@ -1155,8 +1168,8 @@ void GPUContextVulkan::Dispatch(GPUShaderProgramCS* shader, uint32 threadGroupCo
    RENDER_STAT_DISPATCH_CALL();

    // Place a barrier between dispatches, so that UAVs can be read+write in subsequent passes
    // TODO: optimize it by moving inputs/outputs into higher-layer so eg. Global SDF can manually optimize it
    vkCmdPipelineBarrier(cmdBuffer->GetHandle(), VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, nullptr, 0, nullptr);
    if (_pass == 0)
        AddUABarrier();

#if VK_ENABLE_BARRIERS_DEBUG
    LOG(Warning, "Dispatch");
@@ -1191,8 +1204,8 @@ void GPUContextVulkan::DispatchIndirect(GPUShaderProgramCS* shader, GPUBuffer* b
    RENDER_STAT_DISPATCH_CALL();

    // Place a barrier between dispatches, so that UAVs can be read+write in subsequent passes
    // TODO: optimize it by moving inputs/outputs into higher-layer so eg. Global SDF can manually optimize it
    vkCmdPipelineBarrier(cmdBuffer->GetHandle(), VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, nullptr, 0, nullptr);
    if (_pass == 0)
        AddUABarrier();

#if VK_ENABLE_BARRIERS_DEBUG
    LOG(Warning, "DispatchIndirect");
@@ -1351,18 +1364,14 @@ void GPUContextVulkan::UpdateBuffer(GPUBuffer* buffer, const void* data, uint32

    const auto bufferVulkan = static_cast<GPUBufferVulkan*>(buffer);

    // Memory transfer barrier
    // TODO: batch pipeline barriers
    const VkMemoryBarrier barrierBefore = { VK_STRUCTURE_TYPE_MEMORY_BARRIER, nullptr, VK_ACCESS_MEMORY_WRITE_BIT, VK_ACCESS_MEMORY_READ_BIT };
    vkCmdPipelineBarrier(cmdBuffer->GetHandle(), VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 1, &barrierBefore, 0, nullptr, 0, nullptr);
    // Transition resource
    AddBufferBarrier(bufferVulkan, VK_ACCESS_TRANSFER_WRITE_BIT);
    FlushBarriers();

    // Use direct update for small buffers
    const uint32 alignedSize = Math::AlignUp<uint32>(size, 4);
    if (size <= 4 * 1024 && alignedSize <= buffer->GetSize())
    {
        //AddBufferBarrier(bufferVulkan, VK_ACCESS_TRANSFER_WRITE_BIT);
        //FlushBarriers();

        vkCmdUpdateBuffer(cmdBuffer->GetHandle(), bufferVulkan->GetHandle(), offset, alignedSize, data);
    }
    else
@@ -1379,10 +1388,9 @@
        _device->StagingManager.ReleaseBuffer(cmdBuffer, staging);
    }

    // Memory transfer barrier
    // TODO: batch pipeline barriers
    const VkMemoryBarrier barrierAfter = { VK_STRUCTURE_TYPE_MEMORY_BARRIER, nullptr, VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT };
    vkCmdPipelineBarrier(cmdBuffer->GetHandle(), VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, 1, &barrierAfter, 0, nullptr, 0, nullptr);
    // Memory transfer barrier to ensure buffer is ready to read (eg. by Draw or Dispatch)
    if (_pass == 0)
        AddMemoryBarrier();
}

void GPUContextVulkan::CopyBuffer(GPUBuffer* dstBuffer, GPUBuffer* srcBuffer, uint32 size, uint32 dstOffset, uint32 srcOffset)
@@ -1407,6 +1415,10 @@ void GPUContextVulkan::CopyBuffer(GPUBuffer* dstBuffer, GPUBuffer* srcBuffer, ui
    bufferCopy.dstOffset = dstOffset;
    bufferCopy.size = size;
    vkCmdCopyBuffer(cmdBuffer->GetHandle(), srcBufferVulkan->GetHandle(), dstBufferVulkan->GetHandle(), 1, &bufferCopy);

    // Memory transfer barrier to ensure buffer is ready to read (eg. by Draw or Dispatch)
    if (_pass == 0)
        AddMemoryBarrier();
}

void GPUContextVulkan::UpdateTexture(GPUTexture* texture, int32 arrayIndex, int32 mipIndex, const void* data, uint32 rowPitch, uint32 slicePitch)
@@ -1816,4 +1828,27 @@ void GPUContextVulkan::CopySubresource(GPUResource* dstResource, uint32 dstSubre
    }
}

void GPUContextVulkan::Transition(GPUResource* resource, GPUResourceAccess access)
{
    if (auto buffer = dynamic_cast<GPUBufferVulkan*>(resource))
    {
        AddBufferBarrier(buffer, RenderToolsVulkan::GetAccess(access));
    }
    else if (auto texture = dynamic_cast<GPUTextureVulkan*>(resource))
    {
        AddImageBarrier(texture, RenderToolsVulkan::GetImageLayout(access));
    }
}

void GPUContextVulkan::MemoryBarrier()
{
    AddMemoryBarrier();
}

void GPUContextVulkan::OverlapUA(bool end)
{
    if (end)
        AddUABarrier();
}

#endif

@@ -34,7 +34,7 @@ class DescriptorSetLayoutVulkan;
/// <summary>
/// Size of the pipeline barriers buffer size (will be auto-flushed on overflow).
/// </summary>
#define VK_BARRIER_BUFFER_SIZE 16
#define VK_BARRIER_BUFFER_SIZE 64

/// <summary>
/// The Vulkan pipeline resources layout barrier batching structure.
@@ -45,18 +45,19 @@ struct PipelineBarrierVulkan
    VkPipelineStageFlags DestStage = 0;
    Array<VkImageMemoryBarrier, FixedAllocation<VK_BARRIER_BUFFER_SIZE>> ImageBarriers;
    Array<VkBufferMemoryBarrier, FixedAllocation<VK_BARRIER_BUFFER_SIZE>> BufferBarriers;
    Array<VkMemoryBarrier, FixedAllocation<4>> MemoryBarriers;
#if VK_ENABLE_BARRIERS_DEBUG
    Array<GPUTextureViewVulkan*, FixedAllocation<VK_BARRIER_BUFFER_SIZE>> ImageBarriersDebug;
#endif

    FORCE_INLINE bool IsFull() const
    {
        return ImageBarriers.Count() == VK_BARRIER_BUFFER_SIZE || BufferBarriers.Count() == VK_BARRIER_BUFFER_SIZE;
        return ImageBarriers.Count() == VK_BARRIER_BUFFER_SIZE || BufferBarriers.Count() == VK_BARRIER_BUFFER_SIZE || MemoryBarriers.Count() == 4;
    }

    FORCE_INLINE bool HasBarrier() const
    {
        return ImageBarriers.Count() + BufferBarriers.Count() != 0;
        return ImageBarriers.Count() + BufferBarriers.Count() + MemoryBarriers.Count() != 0;
    }

    void Execute(const CmdBufferVulkan* cmdBuffer);
@@ -130,6 +131,8 @@ public:
    void AddImageBarrier(GPUTextureVulkan* texture, int32 mipSlice, int32 arraySlice, VkImageLayout dstLayout);
    void AddImageBarrier(GPUTextureVulkan* texture, VkImageLayout dstLayout);
    void AddBufferBarrier(GPUBufferVulkan* buffer, VkAccessFlags dstAccess);
    void AddMemoryBarrier();
    void AddUABarrier();

    void FlushBarriers();

@@ -199,6 +202,9 @@ public:
    void CopyCounter(GPUBuffer* dstBuffer, uint32 dstOffset, GPUBuffer* srcBuffer) override;
    void CopyResource(GPUResource* dstResource, GPUResource* srcResource) override;
    void CopySubresource(GPUResource* dstResource, uint32 dstSubresource, GPUResource* srcResource, uint32 srcSubresource) override;
    void Transition(GPUResource* resource, GPUResourceAccess access) override;
    void MemoryBarrier() override;
    void OverlapUA(bool end) override;
};

#endif

@@ -5,6 +5,7 @@
#include "RenderToolsVulkan.h"
#include "Engine/Core/Types/StringBuilder.h"
#include "Engine/Core/Log.h"
#include "Engine/Graphics/GPUResourceAccess.h"

// @formatter:off

@@ -258,6 +259,80 @@ void RenderToolsVulkan::LogVkResult(VkResult result, const char* file, uint32 li
#endif
}

VkAccessFlags RenderToolsVulkan::GetAccess(GPUResourceAccess access)
{
    switch (access)
    {
    case GPUResourceAccess::None:
        return VK_ACCESS_NONE;
    case GPUResourceAccess::CopyRead:
        return VK_ACCESS_TRANSFER_READ_BIT;
    case GPUResourceAccess::CopyWrite:
        return VK_ACCESS_TRANSFER_WRITE_BIT;
    case GPUResourceAccess::CpuRead:
        return VK_ACCESS_HOST_READ_BIT;
    case GPUResourceAccess::CpuWrite:
        return VK_ACCESS_HOST_WRITE_BIT;
    case GPUResourceAccess::DepthRead:
        return VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT;
    case GPUResourceAccess::DepthWrite:
        return VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
    case GPUResourceAccess::DepthBuffer:
        return VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
    case GPUResourceAccess::RenderTarget:
        return VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
    case GPUResourceAccess::UnorderedAccess:
        return VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT;
    case GPUResourceAccess::IndirectArgs:
        return VK_ACCESS_INDIRECT_COMMAND_READ_BIT;
    case GPUResourceAccess::ShaderReadCompute:
    case GPUResourceAccess::ShaderReadPixel:
    case GPUResourceAccess::ShaderReadNonPixel:
    case GPUResourceAccess::ShaderReadGraphics:
        return VK_ACCESS_SHADER_READ_BIT;
#if !BUILD_RELEASE
    default:
        LOG(Error, "Unsupported GPU Resource Access: {}", (uint32)access);
#endif
    }
    return VK_ACCESS_NONE;
}

VkImageLayout RenderToolsVulkan::GetImageLayout(GPUResourceAccess access)
{
    switch (access)
    {
    case GPUResourceAccess::None:
        return VK_IMAGE_LAYOUT_UNDEFINED;
    case GPUResourceAccess::CopyRead:
        return VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
    case GPUResourceAccess::CopyWrite:
        return VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
    case GPUResourceAccess::CpuRead:
    case GPUResourceAccess::CpuWrite:
        return VK_IMAGE_LAYOUT_GENERAL;
    case GPUResourceAccess::DepthRead:
        return VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_OPTIMAL;
    case GPUResourceAccess::DepthWrite:
    case GPUResourceAccess::DepthBuffer:
        return VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_OPTIMAL;
    case GPUResourceAccess::RenderTarget:
        return VK_IMAGE_LAYOUT_ATTACHMENT_OPTIMAL;
    case GPUResourceAccess::UnorderedAccess:
    case GPUResourceAccess::ShaderReadCompute:
    case GPUResourceAccess::ShaderReadPixel:
    case GPUResourceAccess::ShaderReadNonPixel:
    case GPUResourceAccess::ShaderReadGraphics:
        return VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
#if !BUILD_RELEASE
    default:
        LOG(Error, "Unsupported GPU Resource Access: {}", (uint32)access);
#endif
    }
    return VK_IMAGE_LAYOUT_UNDEFINED;
}

bool RenderToolsVulkan::HasExtension(const Array<const char*>& extensions, const char* name)
{
    for (int32 i = 0; i < extensions.Count(); i++)

@@ -20,6 +20,8 @@
#define VK_SET_DEBUG_NAME(device, handle, type, name)
#endif

enum class GPUResourceAccess;

/// <summary>
/// Set of utilities for rendering on Vulkan platform.
/// </summary>
@@ -40,6 +42,9 @@ public:
    static String GetVkErrorString(VkResult result);
    static void LogVkResult(VkResult result, const char* file = nullptr, uint32 line = 0, bool fatal = false);

    static VkAccessFlags GetAccess(GPUResourceAccess access);
    static VkImageLayout GetImageLayout(GPUResourceAccess access);

    static inline VkPipelineStageFlags GetBufferBarrierFlags(VkAccessFlags accessFlags)
    {
        VkPipelineStageFlags stageFlags = (VkPipelineStageFlags)0;
@@ -67,6 +72,9 @@ public:
        case VK_ACCESS_SHADER_WRITE_BIT:
            stageFlags = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
            break;
        case VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT:
            stageFlags = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
            break;
        case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT:
        case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
            stageFlags = VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT;

@@ -82,7 +82,7 @@ void BitonicSort::Sort(GPUContext* context, GPUBuffer* indicesBuffer, GPUBuffer*
    if (checkIfSkipPass())
        return;
    PROFILE_GPU_CPU("Bitonic Sort");
    uint32 maxNumElements = indicesBuffer->GetElementsCount();
    int32 maxNumElements = (int32)indicesBuffer->GetElementsCount();
    if (maxElements > 0 && maxElements < maxNumElements)
        maxNumElements = maxElements;
    const uint32 alignedMaxNumElements = Math::RoundUpToPowerOf2(maxNumElements);