Optimize Animated Model bone buffer flushing by using a delayed draw action to reduce lock contention

#3917 #3827
2026-02-06 13:27:53 +01:00
parent 73c19b278f
commit 4afd9fd8df
5 changed files with 22 additions and 21 deletions
--- a/Source/Engine/Level/Actors/AnimatedModel.cpp
+++ b/Source/Engine/Level/Actors/AnimatedModel.cpp
@@ -20,6 +20,7 @@
 #include "Engine/Level/Scene/Scene.h"
 #include "Engine/Level/SceneObjectsFactory.h"
 #include "Engine/Profiler/ProfilerMemory.h"
+#include "Engine/Renderer/RenderList.h"
 #include "Engine/Serialization/Serialization.h"

 AnimatedModel::AnimatedModel(const SpawnParams& params)
@@ -1012,9 +1013,10 @@ void AnimatedModel::Draw(RenderContext& renderContext)
        // Flush skinning data with GPU
        if (_skinningData.IsDirty())
        {
-            RenderContext::GPULocker.Lock();
-            GPUDevice::Instance->GetMainContext()->UpdateBuffer(_skinningData.BoneMatrices, _skinningData.Data.Get(), _skinningData.Data.Count());
-            RenderContext::GPULocker.Unlock();
+            renderContext.List->AddDelayedDraw([this](GPUContext* context, RenderContextBatch& renderContextBatch, int32 renderContextIndex)
+            {
+                context->UpdateBuffer(_skinningData.BoneMatrices, _skinningData.Data.Get(), _skinningData.Data.Count());
+            });
            _skinningData.OnFlush();
        }

@@ -1057,9 +1059,10 @@ void AnimatedModel::Draw(RenderContextBatch& renderContextBatch)
        // Flush skinning data with GPU
        if (_skinningData.IsDirty())
        {
-            RenderContext::GPULocker.Lock();
-            GPUDevice::Instance->GetMainContext()->UpdateBuffer(_skinningData.BoneMatrices, _skinningData.Data.Get(), _skinningData.Data.Count());
-            RenderContext::GPULocker.Unlock();
+            renderContext.List->AddDelayedDraw([this](GPUContext* context, RenderContextBatch& renderContextBatch, int32 renderContextIndex)
+            {
+                context->UpdateBuffer(_skinningData.BoneMatrices, _skinningData.Data.Get(), _skinningData.Data.Count());
+            });
            _skinningData.OnFlush();
        }

--- a/Source/Engine/Particles/Particles.cpp
+++ b/Source/Engine/Particles/Particles.cpp
@@ -677,11 +677,10 @@ void CleanupGPUParticlesSorting()
    SAFE_DELETE_GPU_RESOURCE(GPUIndirectArgsBuffer);
 }

-void DrawEmittersGPU(RenderContextBatch& renderContextBatch)
+void DrawEmittersGPU(GPUContext* context, RenderContextBatch& renderContextBatch)
 {
    PROFILE_GPU_CPU_NAMED("DrawEmittersGPU");
    ScopeReadLock systemScope(Particles::SystemLocker);
-    GPUContext* context = GPUDevice::Instance->GetMainContext();

    // Count draws and sorting passes needed for resources allocation
    uint32 indirectArgsSize = 0;
@@ -1124,9 +1123,9 @@ void DrawEmitterGPU(RenderContextBatch& renderContextBatch, ParticleBuffer* buff
    if (GPUEmitterDraws.Count() == 0)
    {
        // The first emitter schedules the drawing of all batched draws
-        renderContextBatch.GetMainContext().List->AddDelayedDraw([](RenderContextBatch& renderContextBatch, int32 contextIndex)
+        renderContextBatch.GetMainContext().List->AddDelayedDraw([](GPUContext* context, RenderContextBatch& renderContextBatch, int32 renderContextIndex)
        {
-            DrawEmittersGPU(renderContextBatch);
+            DrawEmittersGPU(context, renderContextBatch);
        });
    }
    GPUEmitterDraws.Add({ buffer, drawCall, drawModes, staticFlags, bounds, renderModulesIndices, indirectArgsSize, sortOrder, sorting });
--- a/Source/Engine/Renderer/RenderList.cpp
+++ b/Source/Engine/Renderer/RenderList.cpp
@@ -258,18 +258,17 @@ void RenderList::AddSettingsBlend(IPostFxSettingsProvider* provider, float weigh

 void RenderList::AddDelayedDraw(DelayedDraw&& func)
 {
-    MemPoolLocker.Lock(); // TODO: convert _delayedDraws into RenderListBuffer with usage of arena Memory for fast alloc
    _delayedDraws.Add(MoveTemp(func));
-    MemPoolLocker.Unlock();
 }

-void RenderList::DrainDelayedDraws(RenderContextBatch& renderContextBatch, int32 contextIndex)
+void RenderList::DrainDelayedDraws(GPUContext* context, RenderContextBatch& renderContextBatch, int32 renderContextIndex)
 {
-    if (_delayedDraws.IsEmpty())
+    if (_delayedDraws.Count() == 0)
        return;
+    PROFILE_CPU();
    for (DelayedDraw& e : _delayedDraws)
-        e(renderContextBatch, contextIndex);
-    _delayedDraws.SetCapacity(0);
+        e(context, renderContextBatch, renderContextIndex);
+    _delayedDraws.Clear();
 }

 void RenderList::BlendSettings()
@@ -495,7 +494,6 @@ RenderList::RenderList(const SpawnParams& params)
    , ObjectBuffer(0, PixelFormat::R32G32B32A32_Float, false, TEXT("Object Buffer"))
    , TempObjectBuffer(0, PixelFormat::R32G32B32A32_Float, false, TEXT("Object Buffer"))
    , _instanceBuffer(0, sizeof(ShaderObjectDrawInstanceData), TEXT("Instance Buffer"), GPUVertexLayout::Get({ { VertexElement::Types::Attribute0, 3, 0, 1, PixelFormat::R32_UInt } }))
-    , _delayedDraws(&Memory)
 {
 }

--- a/Source/Engine/Renderer/RenderList.h
+++ b/Source/Engine/Renderer/RenderList.h
@@ -459,13 +459,14 @@ public:
    /// </summary>
    DynamicTypedBuffer TempObjectBuffer;

-    typedef Function<void(RenderContextBatch& renderContextBatch, int32 contextIndex)> DelayedDraw;
+    typedef Function<void(GPUContext* context, RenderContextBatch& renderContextBatch, int32 renderContextIndex)> DelayedDraw;
    void AddDelayedDraw(DelayedDraw&& func);
-    void DrainDelayedDraws(RenderContextBatch& renderContextBatch, int32 contextIndex);
+    void DrainDelayedDraws(GPUContext* context, RenderContextBatch& renderContextBatch, int32 renderContextIndex);

    /// <summary>
    /// Adds custom callback (eg. lambda) to invoke after scene draw calls are collected on a main thread (some async draw tasks might be active). Allows for safe usage of GPUContext for draw preparations or to perform GPU-driven drawing.
    /// </summary>
+    /// <remarks>Can be called asynchronously during scene rendering (thread-safe internally). The lambda is allocated by the concurrent arena allocator owned by the RenderList.</remarks>
    template<typename T>
    FORCE_INLINE void AddDelayedDraw(const T& lambda)
    {
@@ -476,7 +477,7 @@ public:

 private:
    DynamicVertexBuffer _instanceBuffer;
-    Array<DelayedDraw, ConcurrentArenaAllocation> _delayedDraws;
+    RenderListBuffer<DelayedDraw> _delayedDraws;

 public:
    /// <summary>
--- a/Source/Engine/Renderer/Renderer.cpp
+++ b/Source/Engine/Renderer/Renderer.cpp
@@ -461,7 +461,7 @@ void RenderInner(SceneRenderTask* task, RenderContext& renderContext, RenderCont

        // Perform custom post-scene drawing (eg. GPU dispatches used by VFX)
        for (int32 i = 0; i < renderContextBatch.Contexts.Count(); i++)
-            renderContextBatch.Contexts[i].List->DrainDelayedDraws(renderContextBatch, i);
+            renderContextBatch.Contexts[i].List->DrainDelayedDraws(context, renderContextBatch, i);

 #if USE_EDITOR
        GBufferPass::Instance()->OverrideDrawCalls(renderContext);