From ff3d78548362cbeac57de955a5c26a3a0c70bb62 Mon Sep 17 00:00:00 2001 From: Wojtek Figat Date: Sat, 9 Aug 2025 23:59:52 +0200 Subject: [PATCH] Optimize GPU particles simulation, sorting and drawing with better resource transition barriers --- Source/Engine/Particles/Particles.cpp | 72 +++++++++++++++++++++++++-- 1 file changed, 68 insertions(+), 4 deletions(-) diff --git a/Source/Engine/Particles/Particles.cpp b/Source/Engine/Particles/Particles.cpp index 0ab1e0792..98fc230c6 100644 --- a/Source/Engine/Particles/Particles.cpp +++ b/Source/Engine/Particles/Particles.cpp @@ -12,6 +12,7 @@ #include "Engine/Graphics/RenderTask.h" #include "Engine/Graphics/DynamicBuffer.h" #include "Engine/Graphics/GPUContext.h" +#include "Engine/Graphics/GPUPass.h" #include "Engine/Graphics/RenderTools.h" #include "Engine/Graphics/Shaders/GPUVertexLayout.h" #include "Engine/Profiler/ProfilerCPU.h" @@ -748,6 +749,11 @@ void DrawEmittersGPU(RenderContextBatch& renderContextBatch) { PROFILE_GPU_CPU_NAMED("Init Indirect Args"); + GPUMemoryPass pass(context); + pass.Transition(GPUIndirectArgsBuffer, GPUResourceAccess::CopyWrite); + for (GPUEmitterDraw& draw : GPUEmitterDraws) + pass.Transition(draw.Buffer->GPU.Buffer, GPUResourceAccess::CopyRead); + // Init default arguments byte* indirectArgsMemory = (byte*)renderContextBatch.GetMainContext().List->Memory.Allocate(indirectArgsSize, GPU_SHADER_DATA_ALIGNMENT); for (GPUEmitterDraw& draw : GPUEmitterDraws) @@ -872,6 +878,18 @@ void DrawEmittersGPU(RenderContextBatch& renderContextBatch) // Generate sort keys for each particle { PROFILE_GPU("Gen Sort Keys"); + + GPUComputePass pass(context); + for (const GPUEmitterDraw& draw : GPUEmitterDraws) + { + if (draw.Sorting) + { + pass.Transition(draw.Buffer->GPU.Buffer, GPUResourceAccess::ShaderReadCompute); + pass.Transition(draw.Buffer->GPU.SortedIndices, GPUResourceAccess::UnorderedAccess); + pass.Transition(draw.Buffer->GPU.SortingKeys, GPUResourceAccess::UnorderedAccess); + } + } + for (const GPUEmitterDraw& draw : GPUEmitterDraws) { if (!draw.Sorting) @@ -935,12 +953,29 @@ void DrawEmittersGPU(RenderContextBatch& renderContextBatch) } // Run sorting + constexpr int32 inplaceSortSizeLimit = 2048; + { + // Small emitters can be sorted in-place with a single independent dispatch (simultaneously) + GPUComputePass pass(context); + for (const GPUEmitterDraw& draw : GPUEmitterDraws) + { + if (!draw.Sorting || draw.Buffer->GPU.ParticlesCountMax > inplaceSortSizeLimit) + continue; + ParticleEmitter* emitter = draw.Buffer->Emitter; + for (int32 moduleIndex = 0; moduleIndex < emitter->Graph.SortModules.Count(); moduleIndex++) + { + auto module = emitter->Graph.SortModules[moduleIndex]; + // TODO: add support for module->SortedIndicesOffset (multiple sort modules) + const auto sortMode = (ParticleSortMode)module->Values[2].AsInt; + bool sortAscending = sortMode == ParticleSortMode::CustomAscending; + BitonicSort::Instance()->Sort(context, draw.Buffer->GPU.SortedIndices, draw.Buffer->GPU.SortingKeys, draw.Buffer->GPU.Buffer, draw.Buffer->GPU.ParticleCounterOffset, sortAscending, draw.Buffer->GPU.ParticlesCountMax); + } + } + } for (const GPUEmitterDraw& draw : GPUEmitterDraws) { - if (!draw.Sorting) + if (!draw.Sorting || draw.Buffer->GPU.ParticlesCountMax <= inplaceSortSizeLimit) continue; - - // Execute all sorting modules ParticleEmitter* emitter = draw.Buffer->Emitter; for (int32 moduleIndex = 0; moduleIndex < emitter->Graph.SortModules.Count(); moduleIndex++) { @@ -950,11 +985,12 @@ void DrawEmittersGPU(RenderContextBatch& renderContextBatch) bool sortAscending = sortMode == ParticleSortMode::CustomAscending; BitonicSort::Instance()->Sort(context, draw.Buffer->GPU.SortedIndices, draw.Buffer->GPU.SortingKeys, draw.Buffer->GPU.Buffer, draw.Buffer->GPU.ParticleCounterOffset, sortAscending, draw.Buffer->GPU.ParticlesCountMax); // TODO: use args buffer from GPUIndirectArgsBuffer instead of internal from BitonicSort to get rid of UAV barrier (all sorting in parallel) - // TODO: run small emitters sorting (less than 2k particles) sorting in separate loop as pass without UAV barriers (all sorting in parallel) } } } + // TODO: transition here SortedIndices into ShaderReadNonPixel and Buffer into ShaderReadGraphics to reduce barriers during particles rendering + // Submit draw calls for (GPUEmitterDraw& draw : GPUEmitterDraws) { @@ -1326,6 +1362,15 @@ void UpdateGPU(RenderTask* task, GPUContext* context) // Pre-pass with buffers setup { PROFILE_CPU_NAMED("PreSim"); + + GPUMemoryPass pass(context); + for (GPUSim& sim : sims) + { + if (sim.Data.Buffer->GPU.PendingClear) + pass.Transition(sim.Data.Buffer->GPU.Buffer, GPUResourceAccess::CopyWrite); + pass.Transition(sim.Data.Buffer->GPU.BufferSecondary, GPUResourceAccess::CopyWrite); + } + for (GPUSim& sim : sims) { sim.Emitter->GPU.PreSim(context, sim.Emitter, sim.Effect, sim.EmitterIndex, sim.Data); @@ -1335,6 +1380,14 @@ void UpdateGPU(RenderTask* task, GPUContext* context) // Pre-pass with buffers setup { PROFILE_GPU_CPU_NAMED("Sim"); + + GPUComputePass pass(context); + for (GPUSim& sim : sims) + { + pass.Transition(sim.Data.Buffer->GPU.Buffer, GPUResourceAccess::ShaderReadCompute); + pass.Transition(sim.Data.Buffer->GPU.BufferSecondary, GPUResourceAccess::UnorderedAccess); + } + for (GPUSim& sim : sims) { sim.Emitter->GPU.Sim(context, sim.Emitter, sim.Effect, sim.EmitterIndex, sim.Data); @@ -1344,6 +1397,17 @@ void UpdateGPU(RenderTask* task, GPUContext* context) // Post-pass with buffers setup { PROFILE_CPU_NAMED("PostSim"); + + GPUMemoryPass pass(context); + for (GPUSim& sim : sims) + { + if (sim.Data.CustomData.HasItems()) + { + pass.Transition(sim.Data.Buffer->GPU.BufferSecondary, GPUResourceAccess::CopyRead); + pass.Transition(sim.Data.Buffer->GPU.Buffer, GPUResourceAccess::CopyWrite); + } + } + for (GPUSim& sim : sims) { sim.Emitter->GPU.PostSim(context, sim.Emitter, sim.Effect, sim.EmitterIndex, sim.Data);