Various optimizations

2025-08-06 18:48:18 +02:00
parent a5838f739d
commit cf9c203855
12 changed files with 22 additions and 16 deletions
--- a/Content/Shaders/BitonicSort.flax
+++ b/Content/Shaders/BitonicSort.flax
--- a/Source/Engine/Core/Types/Variant.cpp
+++ b/Source/Engine/Core/Types/Variant.cpp
@@ -632,6 +632,7 @@ Variant::Variant(ScriptingObject* v)
    AsObject = v;
    if (v)
    {
+        // TODO: optimize VariantType to support a statically linked type name of a ScriptingType (via a 1-bit flag within the Types enum; game builds only, as the editor might hot-reload types)
        Type.SetTypeName(v->GetType().Fullname);
        v->Deleted.Bind<Variant, &Variant::OnObjectDeleted>(this);
    }
@@ -643,6 +644,7 @@ Variant::Variant(Asset* v)
    AsAsset = v;
    if (v)
    {
+        // TODO: optimize VariantType to support a statically linked type name of a ScriptingType (via a 1-bit flag within the Types enum; game builds only, as the editor might hot-reload types)
        Type.SetTypeName(v->GetType().Fullname);
        v->AddReference();
        v->OnUnloaded.Bind<Variant, &Variant::OnAssetUnloaded>(this);
--- a/Source/Engine/Graphics/GPUBufferDescription.h
+++ b/Source/Engine/Graphics/GPUBufferDescription.h
@@ -334,11 +334,12 @@ public:
    /// Creates argument buffer description.
    /// </summary>
    /// <param name="size">The size (in bytes).</param>
    /// <param name="usage">The usage.</param>
+    /// <param name="additionalFlags">The additional bindings (for example, to use as UAV, pass <see cref="GPUBufferFlags::UnorderedAccess" />).</param>
    /// <returns>The buffer description.</returns>
-    static GPUBufferDescription Argument(int32 size, GPUResourceUsage usage = GPUResourceUsage::Default)
+    static GPUBufferDescription Argument(int32 size, GPUResourceUsage usage = GPUResourceUsage::Default, GPUBufferFlags additionalFlags = GPUBufferFlags::None)
    {
-        return Buffer(size, GPUBufferFlags::Argument, PixelFormat::Unknown, nullptr, 0, usage);
+        return Buffer(size, GPUBufferFlags::Argument | additionalFlags, PixelFormat::R32_UInt, nullptr, sizeof(uint32), usage);
    }

    /// <summary>
--- a/Source/Engine/Graphics/GPUContext.cpp
+++ b/Source/Engine/Graphics/GPUContext.cpp
@@ -20,6 +20,7 @@ void GPUContext::LogInvalidResourceUsage(int32 slot, const GPUResourceView* view
    GPUResource* resource = view ? view->GetParent() : nullptr;
    const Char* resourceType = TEXT("resource");
    const Char* flagType = TEXT("flags");
+    StringView resourceName;
    if (resource)
    {
        switch (resource->GetResourceType())
@@ -36,6 +37,7 @@ void GPUContext::LogInvalidResourceUsage(int32 slot, const GPUResourceView* view
            flagType = TEXT("GPUBufferFlags");
            break;
        }
+        resourceName = resource->GetName();
    }
    const Char* usage = TEXT("-");
    switch (bindPoint)
@@ -53,7 +55,7 @@ void GPUContext::LogInvalidResourceUsage(int32 slot, const GPUResourceView* view
        usage = TEXT("render target");
        break;
    }
-    LOG(Error, "Incorrect {} bind at slot {} as {} (ensure to setup correct {} when creating that resource)", resourceType, slot, usage, flagType);
+    LOG(Error, "Incorrect {} '{}' bind at slot {} as {} (ensure to setup correct {} when creating that resource)", resourceType, resourceName, slot, usage, flagType);
 }

 #endif
--- a/Source/Engine/GraphicsDevice/DirectX/DX12/GPUDeviceDX12.cpp
+++ b/Source/Engine/GraphicsDevice/DirectX/DX12/GPUDeviceDX12.cpp
@@ -359,7 +359,8 @@ bool GPUDeviceDX12::Init()
    // Debug Layer
 #if GPU_ENABLE_DIAGNOSTICS
    ComPtr<ID3D12InfoQueue> infoQueue;
-    VALIDATE_DIRECTX_CALL(_device->QueryInterface(IID_PPV_ARGS(&infoQueue)));
+    HRESULT result = _device->QueryInterface(IID_PPV_ARGS(&infoQueue));
+    LOG_DIRECTX_RESULT(result);
    if (infoQueue)
    {
        D3D12_INFO_QUEUE_FILTER filter;
--- a/Source/Engine/GraphicsDevice/Vulkan/GPUContextVulkan.cpp
+++ b/Source/Engine/GraphicsDevice/Vulkan/GPUContextVulkan.cpp
@@ -1358,7 +1358,7 @@ void GPUContextVulkan::UpdateBuffer(GPUBuffer* buffer, const void* data, uint32

    // Use direct update for small buffers
    const uint32 alignedSize = Math::AlignUp<uint32>(size, 4);
-    if (size <= 16 * 1024 && alignedSize <= buffer->GetSize())
+    if (size <= 4 * 1024 && alignedSize <= buffer->GetSize())
    {
        //AddBufferBarrier(bufferVulkan, VK_ACCESS_TRANSFER_WRITE_BIT);
        //FlushBarriers();
--- a/Source/Engine/Particles/Graph/GPU/GPUParticles.cpp
+++ b/Source/Engine/Particles/Graph/GPU/GPUParticles.cpp
@@ -12,6 +12,7 @@
 #include "Engine/Graphics/GPUContext.h"
 #include "Engine/Graphics/Shaders/GPUShader.h"
 #include "Engine/Graphics/Shaders/GPUConstantBuffer.h"
+#include "Engine/Profiler/Profiler.h"

 GPU_CB_STRUCT(GPUParticlesData {
    Matrix ViewProjectionMatrix;
@@ -131,6 +132,8 @@ void GPUParticles::CopyParticlesCount(GPUContext* context, ParticleEmitter* emit

 void GPUParticles::Execute(GPUContext* context, ParticleEmitter* emitter, ParticleEffect* effect, int32 emitterIndex, ParticleEmitterInstance& data)
 {
+    PROFILE_CPU_ASSET(emitter);
+    PROFILE_GPU("GPUParticles");
    ASSERT(emitter->Graph.Version == data.Version);
    ASSERT(emitter->Graph.Version == data.Buffer->Version);
    uint32 counterDefaultValue = 0;
--- a/Source/Engine/Particles/Particles.cpp
+++ b/Source/Engine/Particles/Particles.cpp
@@ -770,7 +770,6 @@ void DrawEmitterGPU(RenderContext& renderContext, ParticleBuffer* buffer, DrawCa
            context->BindCB(0, GPUParticlesSortingCB);
            context->BindSR(0, buffer->GPU.Buffer->View());
            context->BindUA(0, buffer->GPU.SortingKeysBuffer->View());
-            // TODO: optimize it by using DispatchIndirect with shared invoke args generated after particles update
            const int32 threadGroupSize = 1024;
            context->Dispatch(GPUParticlesSortingCS[permutationIndex], Math::DivideAndRoundUp(buffer->GPU.ParticlesCountMax, threadGroupSize), 1, 1);

--- a/Source/Engine/Renderer/RenderList.cpp
+++ b/Source/Engine/Renderer/RenderList.cpp
@@ -264,6 +264,7 @@ void RenderList::AddDelayedDraw(DelayedDraw&& func)

 void RenderList::DrainDelayedDraws(RenderContext& renderContext)
 {
+    PROFILE_GPU_CPU_NAMED("DelayedDraws");
    for (DelayedDraw& e : _delayedDraws)
        e(renderContext);
    _delayedDraws.SetCapacity(0);
--- a/Source/Engine/Renderer/Utils/BitonicSort.cpp
+++ b/Source/Engine/Renderer/Utils/BitonicSort.cpp
@@ -6,8 +6,6 @@
 #include "Engine/Graphics/GPUContext.h"
 #include "Engine/Graphics/GPULimits.h"

-#define INDIRECT_ARGS_STRIDE 12
-
 // The sorting keys buffer item structure template. Matches the shader type.
 struct Item
 {
@@ -39,7 +37,7 @@ bool BitonicSort::Init()

    // Create indirect dispatch arguments buffer
    _dispatchArgsBuffer = GPUDevice::Instance->CreateBuffer(TEXT("BitonicSortDispatchArgs"));
-    if (_dispatchArgsBuffer->Init(GPUBufferDescription::Raw(22 * 23 / 2 * INDIRECT_ARGS_STRIDE, GPUBufferFlags::Argument | GPUBufferFlags::UnorderedAccess)))
+    if (_dispatchArgsBuffer->Init(GPUBufferDescription::Raw(22 * 23 / 2 * sizeof(GPUDispatchIndirectArgs), GPUBufferFlags::Argument | GPUBufferFlags::UnorderedAccess)))
        return true;

    // Load asset
@@ -122,7 +120,7 @@ void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuf

    // We have already pre-sorted up through k = 2048 when first writing our list, so we continue sorting with k = 4096
    // For really large values of k, these indirect dispatches will be skipped over with thread counts of 0
-    uint32 indirectArgsOffset = INDIRECT_ARGS_STRIDE;
+    uint32 indirectArgsOffset = sizeof(GPUDispatchIndirectArgs);
    for (uint32 k = 4096; k <= alignedMaxNumElements; k *= 2)
    {
        for (uint32 j = k / 2; j >= 2048; j /= 2)
@@ -133,11 +131,11 @@ void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuf
            context->BindCB(0, _cb);

            context->DispatchIndirect(_outerSortCS, _dispatchArgsBuffer, indirectArgsOffset);
-            indirectArgsOffset += INDIRECT_ARGS_STRIDE;
+            indirectArgsOffset += sizeof(GPUDispatchIndirectArgs);
        }

        context->DispatchIndirect(_innerSortCS, _dispatchArgsBuffer, indirectArgsOffset);
-        indirectArgsOffset += INDIRECT_ARGS_STRIDE;
+        indirectArgsOffset += sizeof(GPUDispatchIndirectArgs);
    }

    context->ResetUA();
--- a/Source/Shaders/BitonicSort.shader
+++ b/Source/Shaders/BitonicSort.shader
@@ -68,6 +68,7 @@ void CS_IndirectArgs(uint groupIndex : SV_GroupIndex)
 	uint offset = 12 * prevDispatches;

 	// Generate outer sort dispatch arguments
+	UNROLL
 	for (uint j = k / 2; j > 1024; j /= 2)
 	{
 		// All of the groups of size 2j that are full
--- a/Source/Shaders/GPUParticlesSorting.shader
+++ b/Source/Shaders/GPUParticlesSorting.shader
@@ -51,8 +51,6 @@ void CS_Sort(uint3 dispatchThreadId : SV_DispatchThreadID)
 	if (index >= particlesCount)
 		return;

-	// TODO: maybe process more than 1 particle at once and pre-sort them?
-
 #if SORT_MODE == 0

 	// Sort particles by depth to the view's near plane