Optimize sorted particle indices format to 16-bit for small emitters

This commit is contained in:
Wojtek Figat
2025-08-08 18:46:10 +02:00
parent 854f3acd4c
commit 3907bc4957
2 changed files with 45 additions and 16 deletions

View File

@@ -208,26 +208,27 @@ void DrawEmitterCPU(RenderContextBatch& renderContextBatch, ParticleBuffer* buff
const auto sortMode = static_cast<ParticleSortMode>(module->Values[2].AsInt);
const int32 stride = buffer->Stride;
const int32 listSize = buffer->CPU.Count;
const int32 indicesByteSize = listSize * buffer->GPU.SortedIndices->GetStride();
Array<uint32, RendererAllocation> sortingKeysList[4];
Array<int32, RendererAllocation> sortingIndicesList[2];
Array<byte, RendererAllocation> sortingIndicesList[2];
uint32* sortingKeys[2];
int32* sortingIndices[2];
void* sortingIndices[2];
if (listSize < 500)
{
// Use fast stack allocator from RenderList
auto& memory = renderContextBatch.GetMainContext().List->Memory;
sortingKeys[0] = memory.Allocate<uint32>(listSize);
sortingKeys[1] = memory.Allocate<uint32>(listSize);
sortingIndices[0] = memory.Allocate<int32>(listSize);
sortingIndices[1] = memory.Allocate<int32>(listSize);
sortingIndices[0] = memory.Allocate(indicesByteSize, GPU_SHADER_DATA_ALIGNMENT);
sortingIndices[1] = memory.Allocate(indicesByteSize, GPU_SHADER_DATA_ALIGNMENT);
}
else
{
// Use shared pooled memory from RendererAllocation
sortingKeysList[0].Resize(listSize);
sortingKeysList[1].Resize(listSize);
sortingIndicesList[0].Resize(listSize);
sortingIndicesList[1].Resize(listSize);
sortingIndicesList[0].Resize(indicesByteSize);
sortingIndicesList[1].Resize(indicesByteSize);
sortingKeys[0] = sortingKeysList[0].Get();
sortingKeys[1] = sortingKeysList[1].Get();
sortingIndices[0] = sortingIndicesList[0].Get();
@@ -314,21 +315,42 @@ void DrawEmitterCPU(RenderContextBatch& renderContextBatch, ParticleBuffer* buff
}
// Generate sorting indices
int32* sortedIndices = sortingIndices[0];
void* sortedIndices = sortingIndices[0];
switch (buffer->GPU.SortedIndices->GetFormat())
{
case PixelFormat::R16_UInt:
for (int32 i = 0; i < listSize; i++)
sortedIndices[i] = i;
((uint16*)sortedIndices)[i] = i;
break;
case PixelFormat::R32_UInt:
for (int32 i = 0; i < listSize; i++)
((uint32*)sortedIndices)[i] = i;
break;
}
// Sort keys with indices
switch (buffer->GPU.SortedIndices->GetFormat())
{
Sorting::RadixSort(sortedKeys, sortedIndices, sortingKeys[1], sortingIndices[1], listSize);
case PixelFormat::R16_UInt:
{
uint16* sortedIndicesTyped = (uint16*)sortedIndices;
Sorting::RadixSort(sortedKeys, sortedIndicesTyped, sortingKeys[1], (uint16*)sortingIndices[1], listSize);
sortedIndices = sortedIndicesTyped;
break;
}
case PixelFormat::R32_UInt:
{
uint32* sortedIndicesTyped = (uint32*)sortedIndices;
Sorting::RadixSort(sortedKeys, sortedIndicesTyped, sortingKeys[1], (uint32*)sortingIndices[1], listSize);
sortedIndices = sortedIndicesTyped;
break;
}
}
// Upload CPU particles indices
{
RenderContext::GPULocker.Lock();
context->UpdateBuffer(buffer->GPU.SortedIndices, sortedIndices, listSize * sizeof(uint32), sortedIndicesOffset);
context->UpdateBuffer(buffer->GPU.SortedIndices, sortedIndices, indicesByteSize, sortedIndicesOffset);
RenderContext::GPULocker.Unlock();
}
}
@@ -1312,7 +1334,7 @@ void UpdateGPU(RenderTask* task, GPUContext* context)
// Pre-pass with buffers setup
{
PROFILE_GPU_CPU("Sim");
PROFILE_GPU_CPU_NAMED("Sim");
for (GPUSim& sim : sims)
{
sim.Emitter->GPU.Sim(context, sim.Emitter, sim.Effect, sim.EmitterIndex, sim.Data);

View File

@@ -164,26 +164,33 @@ bool ParticleBuffer::AllocateSortBuffer()
ASSERT(Emitter && GPU.SortedIndices == nullptr && GPU.SortingKeys == nullptr);
if (Emitter->Graph.SortModules.IsEmpty())
return false;
const int32 sortedIndicesCount = Capacity * Emitter->Graph.SortModules.Count();
uint32 indexSize = sizeof(uint32);
PixelFormat indexFormat = PixelFormat::R32_UInt;
if (Capacity <= MAX_uint16)
{
// 16-bit indices
indexSize = sizeof(uint16);
indexFormat = PixelFormat::R16_UInt;
}
switch (Mode)
{
case ParticlesSimulationMode::CPU:
{
const int32 sortedIndicesSize = Capacity * sizeof(uint32) * Emitter->Graph.SortModules.Count();
GPU.SortedIndices = GPUDevice::Instance->CreateBuffer(TEXT("ParticleSortedIndices"));
if (GPU.SortedIndices->Init(GPUBufferDescription::Buffer(sortedIndicesSize, GPUBufferFlags::ShaderResource, PixelFormat::R32_UInt, nullptr, sizeof(uint32), GPUResourceUsage::Dynamic)))
if (GPU.SortedIndices->Init(GPUBufferDescription::Buffer(sortedIndicesCount * indexSize, GPUBufferFlags::ShaderResource, indexFormat, nullptr, indexSize, GPUResourceUsage::Dynamic)))
return true;
break;
}
#if COMPILE_WITH_GPU_PARTICLES
case ParticlesSimulationMode::GPU:
{
const int32 sortedIndicesCount = Capacity * Emitter->Graph.SortModules.Count();
GPU.SortingKeys = GPUDevice::Instance->CreateBuffer(TEXT("ParticleSortingKeys"));
if (GPU.SortingKeys->Init(GPUBufferDescription::Buffer(sortedIndicesCount * sizeof(float), GPUBufferFlags::UnorderedAccess, PixelFormat::R32_Float, nullptr, sizeof(float))))
if (GPU.SortingKeys->Init(GPUBufferDescription::Buffer(sortedIndicesCount * sizeof(float), GPUBufferFlags::ShaderResource | GPUBufferFlags::UnorderedAccess, PixelFormat::R32_Float, nullptr, sizeof(float))))
return true;
GPU.SortedIndices = GPUDevice::Instance->CreateBuffer(TEXT("ParticleSortedIndices"));
if (GPU.SortedIndices->Init(GPUBufferDescription::Buffer(sortedIndicesCount * sizeof(uint32), GPUBufferFlags::ShaderResource | GPUBufferFlags::UnorderedAccess, PixelFormat::R32_UInt, nullptr, sizeof(uint32))))
if (GPU.SortedIndices->Init(GPUBufferDescription::Buffer(sortedIndicesCount * indexSize, GPUBufferFlags::ShaderResource | GPUBufferFlags::UnorderedAccess, indexFormat, nullptr, indexSize)))
return true;
break;
}