From cf9c20385560f82275ff468d34edb5babbfc35cd Mon Sep 17 00:00:00 2001 From: Wojtek Figat Date: Wed, 6 Aug 2025 18:48:18 +0200 Subject: [PATCH] Various optimizations --- Content/Shaders/BitonicSort.flax | 4 ++-- Source/Engine/Core/Types/Variant.cpp | 2 ++ Source/Engine/Graphics/GPUBufferDescription.h | 5 +++-- Source/Engine/Graphics/GPUContext.cpp | 4 +++- .../GraphicsDevice/DirectX/DX12/GPUDeviceDX12.cpp | 3 ++- .../Engine/GraphicsDevice/Vulkan/GPUContextVulkan.cpp | 2 +- Source/Engine/Particles/Graph/GPU/GPUParticles.cpp | 3 +++ Source/Engine/Particles/Particles.cpp | 1 - Source/Engine/Renderer/RenderList.cpp | 1 + Source/Engine/Renderer/Utils/BitonicSort.cpp | 10 ++++------ Source/Shaders/BitonicSort.shader | 1 + Source/Shaders/GPUParticlesSorting.shader | 2 -- 12 files changed, 22 insertions(+), 16 deletions(-) diff --git a/Content/Shaders/BitonicSort.flax b/Content/Shaders/BitonicSort.flax index ee7db3c74..1c01ad7bc 100644 --- a/Content/Shaders/BitonicSort.flax +++ b/Content/Shaders/BitonicSort.flax @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:924884da1dfef7a802b7190fd148eebbeece50d6fa4d69295c38238dd96331e6 -size 6538 +oid sha256:4a7cb98a1cbfe00c7d8d9dabe713f213537eaf13d4061243c32ca29ba06f3403 +size 6546 diff --git a/Source/Engine/Core/Types/Variant.cpp b/Source/Engine/Core/Types/Variant.cpp index 0ce2d8387..4ab8552d3 100644 --- a/Source/Engine/Core/Types/Variant.cpp +++ b/Source/Engine/Core/Types/Variant.cpp @@ -632,6 +632,7 @@ Variant::Variant(ScriptingObject* v) AsObject = v; if (v) { + // TODO: optimize VariantType to support statically linked typename of ScriptingType (via 1 bit flag within Types enum, only in game as editor might hot-reload types) Type.SetTypeName(v->GetType().Fullname); v->Deleted.Bind(this); } @@ -643,6 +644,7 @@ Variant::Variant(Asset* v) AsAsset = v; if (v) { + // TODO: optimize VariantType to support statically linked typename of ScriptingType (via 1 bit flag within Types enum, only in game as editor might 
hot-reload types) Type.SetTypeName(v->GetType().Fullname); v->AddReference(); v->OnUnloaded.Bind(this); diff --git a/Source/Engine/Graphics/GPUBufferDescription.h b/Source/Engine/Graphics/GPUBufferDescription.h index f0f192954..6303ae089 100644 --- a/Source/Engine/Graphics/GPUBufferDescription.h +++ b/Source/Engine/Graphics/GPUBufferDescription.h @@ -334,11 +334,12 @@ public: /// Creates argument buffer description. /// /// The size (in bytes). + /// The additional bindings (for example, to use as UAV, pass GPUBufferFlags::UnorderedAccess). /// The usage. /// The buffer description. - static GPUBufferDescription Argument(int32 size, GPUResourceUsage usage = GPUResourceUsage::Default) + static GPUBufferDescription Argument(int32 size, GPUResourceUsage usage = GPUResourceUsage::Default, GPUBufferFlags additionalFlags = GPUBufferFlags::None) { - return Buffer(size, GPUBufferFlags::Argument, PixelFormat::Unknown, nullptr, 0, usage); + return Buffer(size, GPUBufferFlags::Argument | additionalFlags, PixelFormat::R32_UInt, nullptr, sizeof(uint32), usage); } /// diff --git a/Source/Engine/Graphics/GPUContext.cpp b/Source/Engine/Graphics/GPUContext.cpp index fdeca122b..55f87ba3b 100644 --- a/Source/Engine/Graphics/GPUContext.cpp +++ b/Source/Engine/Graphics/GPUContext.cpp @@ -20,6 +20,7 @@ void GPUContext::LogInvalidResourceUsage(int32 slot, const GPUResourceView* view GPUResource* resource = view ? 
view->GetParent() : nullptr; const Char* resourceType = TEXT("resource"); const Char* flagType = TEXT("flags"); + StringView resourceName; if (resource) { switch (resource->GetResourceType()) @@ -36,6 +37,7 @@ void GPUContext::LogInvalidResourceUsage(int32 slot, const GPUResourceView* view flagType = TEXT("GPUBufferFlags"); break; } + resourceName = resource->GetName(); } const Char* usage = TEXT("-"); switch (bindPoint) @@ -53,7 +55,7 @@ void GPUContext::LogInvalidResourceUsage(int32 slot, const GPUResourceView* view usage = TEXT("render target"); break; } - LOG(Error, "Incorrect {} bind at slot {} as {} (ensure to setup correct {} when creating that resource)", resourceType, slot, usage, flagType); + LOG(Error, "Incorrect {} '{}' bind at slot {} as {} (ensure to setup correct {} when creating that resource)", resourceType, resourceName, slot, usage, flagType); } #endif diff --git a/Source/Engine/GraphicsDevice/DirectX/DX12/GPUDeviceDX12.cpp b/Source/Engine/GraphicsDevice/DirectX/DX12/GPUDeviceDX12.cpp index 40e081175..20a26f2f5 100644 --- a/Source/Engine/GraphicsDevice/DirectX/DX12/GPUDeviceDX12.cpp +++ b/Source/Engine/GraphicsDevice/DirectX/DX12/GPUDeviceDX12.cpp @@ -359,7 +359,8 @@ bool GPUDeviceDX12::Init() // Debug Layer #if GPU_ENABLE_DIAGNOSTICS ComPtr infoQueue; - VALIDATE_DIRECTX_CALL(_device->QueryInterface(IID_PPV_ARGS(&infoQueue))); + HRESULT result = _device->QueryInterface(IID_PPV_ARGS(&infoQueue)); + LOG_DIRECTX_RESULT(result); if (infoQueue) { D3D12_INFO_QUEUE_FILTER filter; diff --git a/Source/Engine/GraphicsDevice/Vulkan/GPUContextVulkan.cpp b/Source/Engine/GraphicsDevice/Vulkan/GPUContextVulkan.cpp index 1a8739acc..c36d1acee 100644 --- a/Source/Engine/GraphicsDevice/Vulkan/GPUContextVulkan.cpp +++ b/Source/Engine/GraphicsDevice/Vulkan/GPUContextVulkan.cpp @@ -1358,7 +1358,7 @@ void GPUContextVulkan::UpdateBuffer(GPUBuffer* buffer, const void* data, uint32 // Use direct update for small buffers const uint32 alignedSize = Math::AlignUp(size, 4); - 
if (size <= 16 * 1024 && alignedSize <= buffer->GetSize()) + if (size <= 4 * 1024 && alignedSize <= buffer->GetSize()) { //AddBufferBarrier(bufferVulkan, VK_ACCESS_TRANSFER_WRITE_BIT); //FlushBarriers(); diff --git a/Source/Engine/Particles/Graph/GPU/GPUParticles.cpp b/Source/Engine/Particles/Graph/GPU/GPUParticles.cpp index 2c570a741..94886136f 100644 --- a/Source/Engine/Particles/Graph/GPU/GPUParticles.cpp +++ b/Source/Engine/Particles/Graph/GPU/GPUParticles.cpp @@ -12,6 +12,7 @@ #include "Engine/Graphics/GPUContext.h" #include "Engine/Graphics/Shaders/GPUShader.h" #include "Engine/Graphics/Shaders/GPUConstantBuffer.h" +#include "Engine/Profiler/Profiler.h" GPU_CB_STRUCT(GPUParticlesData { Matrix ViewProjectionMatrix; @@ -131,6 +132,8 @@ void GPUParticles::CopyParticlesCount(GPUContext* context, ParticleEmitter* emit void GPUParticles::Execute(GPUContext* context, ParticleEmitter* emitter, ParticleEffect* effect, int32 emitterIndex, ParticleEmitterInstance& data) { + PROFILE_CPU_ASSET(emitter); + PROFILE_GPU("GPUParticles"); ASSERT(emitter->Graph.Version == data.Version); ASSERT(emitter->Graph.Version == data.Buffer->Version); uint32 counterDefaultValue = 0; diff --git a/Source/Engine/Particles/Particles.cpp b/Source/Engine/Particles/Particles.cpp index 60f0e6978..71b0f2c30 100644 --- a/Source/Engine/Particles/Particles.cpp +++ b/Source/Engine/Particles/Particles.cpp @@ -770,7 +770,6 @@ void DrawEmitterGPU(RenderContext& renderContext, ParticleBuffer* buffer, DrawCa context->BindCB(0, GPUParticlesSortingCB); context->BindSR(0, buffer->GPU.Buffer->View()); context->BindUA(0, buffer->GPU.SortingKeysBuffer->View()); - // TODO: optimize it by using DispatchIndirect with shared invoke args generated after particles update const int32 threadGroupSize = 1024; context->Dispatch(GPUParticlesSortingCS[permutationIndex], Math::DivideAndRoundUp(buffer->GPU.ParticlesCountMax, threadGroupSize), 1, 1); diff --git a/Source/Engine/Renderer/RenderList.cpp 
b/Source/Engine/Renderer/RenderList.cpp index 05f72f83f..fa0eb8d61 100644 --- a/Source/Engine/Renderer/RenderList.cpp +++ b/Source/Engine/Renderer/RenderList.cpp @@ -264,6 +264,7 @@ void RenderList::AddDelayedDraw(DelayedDraw&& func) void RenderList::DrainDelayedDraws(RenderContext& renderContext) { + PROFILE_GPU_CPU_NAMED("DelayedDraws"); for (DelayedDraw& e : _delayedDraws) e(renderContext); _delayedDraws.SetCapacity(0); diff --git a/Source/Engine/Renderer/Utils/BitonicSort.cpp b/Source/Engine/Renderer/Utils/BitonicSort.cpp index be5f38be4..0834588ba 100644 --- a/Source/Engine/Renderer/Utils/BitonicSort.cpp +++ b/Source/Engine/Renderer/Utils/BitonicSort.cpp @@ -6,8 +6,6 @@ #include "Engine/Graphics/GPUContext.h" #include "Engine/Graphics/GPULimits.h" -#define INDIRECT_ARGS_STRIDE 12 - // The sorting keys buffer item structure template. Matches the shader type. struct Item { @@ -39,7 +37,7 @@ bool BitonicSort::Init() // Create indirect dispatch arguments buffer _dispatchArgsBuffer = GPUDevice::Instance->CreateBuffer(TEXT("BitonicSortDispatchArgs")); - if (_dispatchArgsBuffer->Init(GPUBufferDescription::Raw(22 * 23 / 2 * INDIRECT_ARGS_STRIDE, GPUBufferFlags::Argument | GPUBufferFlags::UnorderedAccess))) + if (_dispatchArgsBuffer->Init(GPUBufferDescription::Raw(22 * 23 / 2 * sizeof(GPUDispatchIndirectArgs), GPUBufferFlags::Argument | GPUBufferFlags::UnorderedAccess))) return true; // Load asset @@ -122,7 +120,7 @@ void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuf // We have already pre-sorted up through k = 2048 when first writing our list, so we continue sorting with k = 4096 // For really large values of k, these indirect dispatches will be skipped over with thread counts of 0 - uint32 indirectArgsOffset = INDIRECT_ARGS_STRIDE; + uint32 indirectArgsOffset = sizeof(GPUDispatchIndirectArgs); for (uint32 k = 4096; k <= alignedMaxNumElements; k *= 2) { for (uint32 j = k / 2; j >= 2048; j /= 2) @@ -133,11 +131,11 @@ void 
BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuf context->BindCB(0, _cb); context->DispatchIndirect(_outerSortCS, _dispatchArgsBuffer, indirectArgsOffset); - indirectArgsOffset += INDIRECT_ARGS_STRIDE; + indirectArgsOffset += sizeof(GPUDispatchIndirectArgs); } context->DispatchIndirect(_innerSortCS, _dispatchArgsBuffer, indirectArgsOffset); - indirectArgsOffset += INDIRECT_ARGS_STRIDE; + indirectArgsOffset += sizeof(GPUDispatchIndirectArgs); } context->ResetUA(); diff --git a/Source/Shaders/BitonicSort.shader b/Source/Shaders/BitonicSort.shader index c4a275862..a2f7d215b 100644 --- a/Source/Shaders/BitonicSort.shader +++ b/Source/Shaders/BitonicSort.shader @@ -68,6 +68,7 @@ void CS_IndirectArgs(uint groupIndex : SV_GroupIndex) uint offset = 12 * prevDispatches; // Generate outer sort dispatch arguments + UNROLL for (uint j = k / 2; j > 1024; j /= 2) { // All of the groups of size 2j that are full diff --git a/Source/Shaders/GPUParticlesSorting.shader b/Source/Shaders/GPUParticlesSorting.shader index b0e0063ec..395172327 100644 --- a/Source/Shaders/GPUParticlesSorting.shader +++ b/Source/Shaders/GPUParticlesSorting.shader @@ -51,8 +51,6 @@ void CS_Sort(uint3 dispatchThreadId : SV_DispatchThreadID) if (index >= particlesCount) return; - // TODO: maybe process more than 1 particle at once and pre-sort them? - #if SORT_MODE == 0 // Sort particles by depth to the view's near plane