Optimize Animated Model bones buffer flushing with delayed draw action to reduce lock contention

#3917 #3827
This commit is contained in:
Wojtek Figat
2026-02-06 13:27:53 +01:00
parent 73c19b278f
commit 4afd9fd8df
5 changed files with 22 additions and 21 deletions

View File

@@ -20,6 +20,7 @@
#include "Engine/Level/Scene/Scene.h"
#include "Engine/Level/SceneObjectsFactory.h"
#include "Engine/Profiler/ProfilerMemory.h"
#include "Engine/Renderer/RenderList.h"
#include "Engine/Serialization/Serialization.h"
AnimatedModel::AnimatedModel(const SpawnParams& params)
@@ -1012,9 +1013,10 @@ void AnimatedModel::Draw(RenderContext& renderContext)
// Flush skinning data to the GPU
if (_skinningData.IsDirty())
{
RenderContext::GPULocker.Lock();
GPUDevice::Instance->GetMainContext()->UpdateBuffer(_skinningData.BoneMatrices, _skinningData.Data.Get(), _skinningData.Data.Count());
RenderContext::GPULocker.Unlock();
renderContext.List->AddDelayedDraw([this](GPUContext* context, RenderContextBatch& renderContextBatch, int32 renderContextIndex)
{
context->UpdateBuffer(_skinningData.BoneMatrices, _skinningData.Data.Get(), _skinningData.Data.Count());
});
_skinningData.OnFlush();
}
@@ -1057,9 +1059,10 @@ void AnimatedModel::Draw(RenderContextBatch& renderContextBatch)
// Flush skinning data to the GPU
if (_skinningData.IsDirty())
{
RenderContext::GPULocker.Lock();
GPUDevice::Instance->GetMainContext()->UpdateBuffer(_skinningData.BoneMatrices, _skinningData.Data.Get(), _skinningData.Data.Count());
RenderContext::GPULocker.Unlock();
renderContext.List->AddDelayedDraw([this](GPUContext* context, RenderContextBatch& renderContextBatch, int32 renderContextIndex)
{
context->UpdateBuffer(_skinningData.BoneMatrices, _skinningData.Data.Get(), _skinningData.Data.Count());
});
_skinningData.OnFlush();
}

View File

@@ -677,11 +677,10 @@ void CleanupGPUParticlesSorting()
SAFE_DELETE_GPU_RESOURCE(GPUIndirectArgsBuffer);
}
void DrawEmittersGPU(RenderContextBatch& renderContextBatch)
void DrawEmittersGPU(GPUContext* context, RenderContextBatch& renderContextBatch)
{
PROFILE_GPU_CPU_NAMED("DrawEmittersGPU");
ScopeReadLock systemScope(Particles::SystemLocker);
GPUContext* context = GPUDevice::Instance->GetMainContext();
// Count draws and sorting passes needed for resources allocation
uint32 indirectArgsSize = 0;
@@ -1124,9 +1123,9 @@ void DrawEmitterGPU(RenderContextBatch& renderContextBatch, ParticleBuffer* buff
if (GPUEmitterDraws.Count() == 0)
{
// The first emitter schedules the drawing of all batched draws
renderContextBatch.GetMainContext().List->AddDelayedDraw([](RenderContextBatch& renderContextBatch, int32 contextIndex)
renderContextBatch.GetMainContext().List->AddDelayedDraw([](GPUContext* context, RenderContextBatch& renderContextBatch, int32 renderContextIndex)
{
DrawEmittersGPU(renderContextBatch);
DrawEmittersGPU(context, renderContextBatch);
});
}
GPUEmitterDraws.Add({ buffer, drawCall, drawModes, staticFlags, bounds, renderModulesIndices, indirectArgsSize, sortOrder, sorting });

View File

@@ -258,18 +258,17 @@ void RenderList::AddSettingsBlend(IPostFxSettingsProvider* provider, float weigh
void RenderList::AddDelayedDraw(DelayedDraw&& func)
{
MemPoolLocker.Lock(); // TODO: convert _delayedDraws into RenderListBuffer with usage of arena Memory for fast alloc
_delayedDraws.Add(MoveTemp(func));
MemPoolLocker.Unlock();
}
void RenderList::DrainDelayedDraws(RenderContextBatch& renderContextBatch, int32 contextIndex)
void RenderList::DrainDelayedDraws(GPUContext* context, RenderContextBatch& renderContextBatch, int32 renderContextIndex)
{
if (_delayedDraws.IsEmpty())
if (_delayedDraws.Count() == 0)
return;
PROFILE_CPU();
for (DelayedDraw& e : _delayedDraws)
e(renderContextBatch, contextIndex);
_delayedDraws.SetCapacity(0);
e(context, renderContextBatch, renderContextIndex);
_delayedDraws.Clear();
}
void RenderList::BlendSettings()
@@ -495,7 +494,6 @@ RenderList::RenderList(const SpawnParams& params)
, ObjectBuffer(0, PixelFormat::R32G32B32A32_Float, false, TEXT("Object Buffer"))
, TempObjectBuffer(0, PixelFormat::R32G32B32A32_Float, false, TEXT("Object Buffer"))
, _instanceBuffer(0, sizeof(ShaderObjectDrawInstanceData), TEXT("Instance Buffer"), GPUVertexLayout::Get({ { VertexElement::Types::Attribute0, 3, 0, 1, PixelFormat::R32_UInt } }))
, _delayedDraws(&Memory)
{
}

View File

@@ -459,13 +459,14 @@ public:
/// </summary>
DynamicTypedBuffer TempObjectBuffer;
typedef Function<void(RenderContextBatch& renderContextBatch, int32 contextIndex)> DelayedDraw;
typedef Function<void(GPUContext* context, RenderContextBatch& renderContextBatch, int32 renderContextIndex)> DelayedDraw;
void AddDelayedDraw(DelayedDraw&& func);
void DrainDelayedDraws(RenderContextBatch& renderContextBatch, int32 contextIndex);
void DrainDelayedDraws(GPUContext* context, RenderContextBatch& renderContextBatch, int32 renderContextIndex);
/// <summary>
/// Adds a custom callback (e.g. a lambda) to invoke after scene draw calls are collected on the main thread (some async draw tasks might still be active). Allows for safe usage of GPUContext for draw preparations or to perform GPU-driven drawing.
/// </summary>
/// <remarks>Can be called asynchronously during scene rendering (thread-safe internally). The lambda is allocated by the concurrent arena allocator owned by the RenderList.</remarks>
template<typename T>
FORCE_INLINE void AddDelayedDraw(const T& lambda)
{
@@ -476,7 +477,7 @@ public:
private:
DynamicVertexBuffer _instanceBuffer;
Array<DelayedDraw, ConcurrentArenaAllocation> _delayedDraws;
RenderListBuffer<DelayedDraw> _delayedDraws;
public:
/// <summary>

View File

@@ -461,7 +461,7 @@ void RenderInner(SceneRenderTask* task, RenderContext& renderContext, RenderCont
// Perform custom post-scene drawing (eg. GPU dispatches used by VFX)
for (int32 i = 0; i < renderContextBatch.Contexts.Count(); i++)
renderContextBatch.Contexts[i].List->DrainDelayedDraws(renderContextBatch, i);
renderContextBatch.Contexts[i].List->DrainDelayedDraws(context, renderContextBatch, i);
#if USE_EDITOR
GBufferPass::Instance()->OverrideDrawCalls(renderContext);