Optimize GPU particles Bitonic sort to use separate buffers for indices and keys to avoid additional buffer copy

This commit is contained in:
Wojtek Figat
2025-08-08 18:24:44 +02:00
parent 519a9c0a14
commit 854f3acd4c
9 changed files with 115 additions and 169 deletions

View File

@@ -848,81 +848,68 @@ void DrawEmittersGPU(RenderContextBatch& renderContextBatch)
context->BindCB(0, GPUParticlesSortingCB);
// Generate sort keys for each particle
for (const GPUEmitterDraw& draw : GPUEmitterDraws)
{
if (!draw.Sorting)
continue;
ASSERT(draw.Buffer->GPU.SortingKeysBuffer);
// Generate sort keys for particles
ParticleEmitter* emitter = draw.Buffer->Emitter;
for (int32 moduleIndex = 0; moduleIndex < emitter->Graph.SortModules.Count(); moduleIndex++)
PROFILE_GPU("Gen Sort Keys");
for (const GPUEmitterDraw& draw : GPUEmitterDraws)
{
auto module = emitter->Graph.SortModules[moduleIndex];
const auto sortMode = (ParticleSortMode)module->Values[2].AsInt;
// Generate sorting keys based on sorting mode
GPUParticlesSortingData data;
data.ParticleCounterOffset = draw.Buffer->GPU.ParticleCounterOffset;
data.ParticleStride = draw.Buffer->Stride;
data.ParticleCapacity = draw.Buffer->Capacity;
int32 permutationIndex;
switch (sortMode)
if (!draw.Sorting)
continue;
ASSERT(draw.Buffer->GPU.SortingKeys);
ParticleEmitter* emitter = draw.Buffer->Emitter;
for (int32 moduleIndex = 0; moduleIndex < emitter->Graph.SortModules.Count(); moduleIndex++)
{
case ParticleSortMode::ViewDepth:
{
permutationIndex = 0;
data.PositionOffset = emitter->Graph.GetPositionAttributeOffset();
const Matrix viewProjection = renderContextBatch.GetMainContext().View.ViewProjection();
if (emitter->SimulationSpace == ParticlesSimulationSpace::Local)
auto module = emitter->Graph.SortModules[moduleIndex];
// TODO: add support for module->SortedIndicesOffset (multiple sort modules)
const auto sortMode = (ParticleSortMode)module->Values[2].AsInt;
GPUParticlesSortingData data;
data.ParticleCounterOffset = draw.Buffer->GPU.ParticleCounterOffset;
data.ParticleStride = draw.Buffer->Stride;
data.ParticleCapacity = draw.Buffer->Capacity;
int32 permutationIndex;
switch (sortMode)
{
Matrix matrix;
Matrix::Multiply(draw.DrawCall.World, viewProjection, matrix);
Matrix::Transpose(matrix, data.PositionTransform);
}
else
case ParticleSortMode::ViewDepth:
{
Matrix::Transpose(viewProjection, data.PositionTransform);
}
break;
}
case ParticleSortMode::ViewDistance:
{
permutationIndex = 1;
data.PositionOffset = emitter->Graph.GetPositionAttributeOffset();
data.ViewPosition = renderContextBatch.GetMainContext().View.Position;
if (emitter->SimulationSpace == ParticlesSimulationSpace::Local)
{
Matrix::Transpose(draw.DrawCall.World, data.PositionTransform);
}
else
{
Matrix::Transpose(Matrix::Identity, data.PositionTransform);
}
break;
}
case ParticleSortMode::CustomAscending:
case ParticleSortMode::CustomDescending:
{
permutationIndex = 2;
int32 attributeIdx = module->Attributes[0];
if (attributeIdx == -1)
permutationIndex = 0;
data.PositionOffset = emitter->Graph.GetPositionAttributeOffset();
const Matrix viewProjection = renderContextBatch.GetMainContext().View.ViewProjection();
if (emitter->SimulationSpace == ParticlesSimulationSpace::Local)
Matrix::Transpose(draw.DrawCall.World * viewProjection, data.PositionTransform);
else
Matrix::Transpose(viewProjection, data.PositionTransform);
break;
data.CustomOffset = emitter->Graph.Layout.Attributes[attributeIdx].Offset;
break;
}
case ParticleSortMode::ViewDistance:
{
permutationIndex = 1;
data.PositionOffset = emitter->Graph.GetPositionAttributeOffset();
data.ViewPosition = renderContextBatch.GetMainContext().View.Position;
if (emitter->SimulationSpace == ParticlesSimulationSpace::Local)
Matrix::Transpose(draw.DrawCall.World, data.PositionTransform);
else
Matrix::Transpose(Matrix::Identity, data.PositionTransform);
break;
}
case ParticleSortMode::CustomAscending:
case ParticleSortMode::CustomDescending:
{
permutationIndex = 2;
int32 attributeIdx = module->Attributes[0];
if (attributeIdx == -1)
break;
data.CustomOffset = emitter->Graph.Layout.Attributes[attributeIdx].Offset;
break;
}
}
context->UpdateCB(GPUParticlesSortingCB, &data);
context->BindSR(0, draw.Buffer->GPU.Buffer->View());
context->BindUA(0, draw.Buffer->GPU.SortedIndices->View());
context->BindUA(1, draw.Buffer->GPU.SortingKeys->View());
const int32 threadGroupSize = 1024;
context->Dispatch(GPUParticlesSortingCS[permutationIndex], Math::DivideAndRoundUp(draw.Buffer->GPU.ParticlesCountMax, threadGroupSize), 1, 1);
}
#if !BUILD_RELEASE
default:
CRASH;
return;
#endif
}
context->UpdateCB(GPUParticlesSortingCB, &data);
context->BindSR(0, draw.Buffer->GPU.Buffer->View());
context->BindUA(0, draw.Buffer->GPU.SortingKeysBuffer->View());
const int32 threadGroupSize = 1024;
context->Dispatch(GPUParticlesSortingCS[permutationIndex], Math::DivideAndRoundUp(draw.Buffer->GPU.ParticlesCountMax, threadGroupSize), 1, 1);
}
context->ResetUA();
}
// Run sorting
@@ -930,17 +917,18 @@ void DrawEmittersGPU(RenderContextBatch& renderContextBatch)
{
if (!draw.Sorting)
continue;
ASSERT(draw.Buffer->GPU.SortingKeysBuffer);
// Execute all sorting modules
ParticleEmitter* emitter = draw.Buffer->Emitter;
for (int32 moduleIndex = 0; moduleIndex < emitter->Graph.SortModules.Count(); moduleIndex++)
{
auto module = emitter->Graph.SortModules[moduleIndex];
// TODO: add support for module->SortedIndicesOffset (multiple sort modules)
const auto sortMode = (ParticleSortMode)module->Values[2].AsInt;
bool sortAscending = sortMode == ParticleSortMode::CustomAscending;
BitonicSort::Instance()->Sort(context, draw.Buffer->GPU.SortingKeysBuffer, draw.Buffer->GPU.Buffer, draw.Buffer->GPU.ParticleCounterOffset, sortAscending, draw.Buffer->GPU.SortedIndices, draw.Buffer->GPU.ParticlesCountMax);
// TODO: use args buffer from GPUIndirectArgsBuffer instead of internal from BitonicSort to get rid of UAV barrier (run all sorting in parallel)
BitonicSort::Instance()->Sort(context, draw.Buffer->GPU.SortedIndices, draw.Buffer->GPU.SortingKeys, draw.Buffer->GPU.Buffer, draw.Buffer->GPU.ParticleCounterOffset, sortAscending, draw.Buffer->GPU.ParticlesCountMax);
// TODO: use args buffer from GPUIndirectArgsBuffer instead of internal from BitonicSort to get rid of UAV barrier (all sorting in parallel)
// TODO: run small emitters sorting (less than 2k particles) sorting in separate loop as pass without UAV barriers (all sorting in parallel)
}
}
}

View File

@@ -98,7 +98,7 @@ ParticleBuffer::~ParticleBuffer()
{
SAFE_DELETE_GPU_RESOURCE(GPU.Buffer);
SAFE_DELETE_GPU_RESOURCE(GPU.BufferSecondary);
SAFE_DELETE_GPU_RESOURCE(GPU.SortingKeysBuffer);
SAFE_DELETE_GPU_RESOURCE(GPU.SortingKeys);
SAFE_DELETE_GPU_RESOURCE(GPU.SortedIndices);
SAFE_DELETE(GPU.RibbonIndexBufferDynamic);
SAFE_DELETE(GPU.RibbonVertexBufferDynamic);
@@ -161,7 +161,7 @@ bool ParticleBuffer::Init(ParticleEmitter* emitter)
bool ParticleBuffer::AllocateSortBuffer()
{
ASSERT(Emitter && GPU.SortedIndices == nullptr && GPU.SortingKeysBuffer == nullptr);
ASSERT(Emitter && GPU.SortedIndices == nullptr && GPU.SortingKeys == nullptr);
if (Emitter->Graph.SortModules.IsEmpty())
return false;
@@ -170,7 +170,7 @@ bool ParticleBuffer::AllocateSortBuffer()
case ParticlesSimulationMode::CPU:
{
const int32 sortedIndicesSize = Capacity * sizeof(uint32) * Emitter->Graph.SortModules.Count();
GPU.SortedIndices = GPUDevice::Instance->CreateBuffer(TEXT("SortedIndices"));
GPU.SortedIndices = GPUDevice::Instance->CreateBuffer(TEXT("ParticleSortedIndices"));
if (GPU.SortedIndices->Init(GPUBufferDescription::Buffer(sortedIndicesSize, GPUBufferFlags::ShaderResource, PixelFormat::R32_UInt, nullptr, sizeof(uint32), GPUResourceUsage::Dynamic)))
return true;
break;
@@ -178,12 +178,12 @@ bool ParticleBuffer::AllocateSortBuffer()
#if COMPILE_WITH_GPU_PARTICLES
case ParticlesSimulationMode::GPU:
{
const int32 sortedIndicesSize = Capacity * sizeof(uint32) * Emitter->Graph.SortModules.Count();
GPU.SortingKeysBuffer = GPUDevice::Instance->CreateBuffer(TEXT("ParticleSortingKeysBuffer"));
if (GPU.SortingKeysBuffer->Init(GPUBufferDescription::Structured(Capacity, sizeof(float) + sizeof(uint32), true)))
const int32 sortedIndicesCount = Capacity * Emitter->Graph.SortModules.Count();
GPU.SortingKeys = GPUDevice::Instance->CreateBuffer(TEXT("ParticleSortingKeys"));
if (GPU.SortingKeys->Init(GPUBufferDescription::Buffer(sortedIndicesCount * sizeof(float), GPUBufferFlags::UnorderedAccess, PixelFormat::R32_Float, nullptr, sizeof(float))))
return true;
GPU.SortedIndices = GPUDevice::Instance->CreateBuffer(TEXT("SortedIndices"));
if (GPU.SortedIndices->Init(GPUBufferDescription::Buffer(sortedIndicesSize, GPUBufferFlags::ShaderResource | GPUBufferFlags::UnorderedAccess, PixelFormat::R32_UInt, nullptr, sizeof(uint32))))
GPU.SortedIndices = GPUDevice::Instance->CreateBuffer(TEXT("ParticleSortedIndices"));
if (GPU.SortedIndices->Init(GPUBufferDescription::Buffer(sortedIndicesCount * sizeof(uint32), GPUBufferFlags::ShaderResource | GPUBufferFlags::UnorderedAccess, PixelFormat::R32_UInt, nullptr, sizeof(uint32))))
return true;
break;
}

View File

@@ -206,7 +206,7 @@ public:
/// <summary>
/// The GPU particles sorting buffer. Contains structure of particle index and the sorting key for every particle. Used to sort particles.
/// </summary>
GPUBuffer* SortingKeysBuffer = nullptr;
GPUBuffer* SortingKeys = nullptr;
/// <summary>
/// The particles indices buffer (GPU side).