Optimize CPU particles drawing to support async

This commit is contained in:
Wojtek Figat
2025-08-05 09:03:47 +02:00
parent abe496fe12
commit 1a88fefd76
8 changed files with 59 additions and 41 deletions

View File

@@ -125,6 +125,13 @@ public:
// Frees all memory allocations within allocator.
void Free();
// Allocates an uninitialized chunk of memory sized for `count` elements of type T, aligned to alignof(T).
// The returned pointer is only cast to T*; no element constructors are invoked here.
template<class T>
inline T* Allocate(uint64 count)
{
    // Total size in bytes of the requested element range
    const auto sizeInBytes = count * sizeof(T);
    return (T*)Allocate(sizeInBytes, alignof(T));
}
// Creates a new object within the arena allocator.
template<class T, class... Args>
inline T* New(Args&&...args)

View File

@@ -19,6 +19,7 @@ PointLight::PointLight(const SpawnParams& params)
_direction = Float3::Forward;
_sphere = BoundingSphere(Vector3::Zero, _radius);
BoundingBox::FromSphere(_sphere, _box);
_drawCategory = SceneRendering::SceneDrawAsync;
}
float PointLight::ComputeBrightness() const

View File

@@ -27,6 +27,7 @@ SpotLight::SpotLight(const SpawnParams& params)
const float boundsRadius = Math::Sqrt(1.25f * _radius * _radius - _radius * _radius * _cosOuterCone);
_sphere = BoundingSphere(GetPosition() + 0.5f * GetDirection() * _radius, boundsRadius);
BoundingBox::FromSphere(_sphere, _box);
_drawCategory = SceneRendering::SceneDrawAsync;
}
float SpotLight::ComputeBrightness() const

View File

@@ -40,6 +40,7 @@ PACK_STRUCT(struct SpriteParticleVertex
class SpriteParticleRenderer
{
public:
volatile int64 Ready = 0;
GPUBuffer* VB = nullptr;
GPUBuffer* IB = nullptr;
const static int32 VertexCount = 4;
@@ -48,7 +49,10 @@ public:
public:
bool Init()
{
if (VB)
if (Platform::AtomicRead(&Ready))
return false;
ScopeLock lock(RenderContext::GPULocker);
if (Platform::AtomicRead(&Ready))
return false;
VB = GPUDevice::Instance->CreateBuffer(TEXT("SpriteParticleRenderer.VB"));
IB = GPUDevice::Instance->CreateBuffer(TEXT("SpriteParticleRenderer.IB"));
@@ -64,8 +68,10 @@ public:
{ VertexElement::Types::Position, 0, 0, 0, PixelFormat::R32G32_Float },
{ VertexElement::Types::TexCoord, 0, 0, 0, PixelFormat::R32G32_Float },
});
return VB->Init(GPUBufferDescription::Vertex(layout, sizeof(SpriteParticleVertex), VertexCount, vertexBuffer)) ||
bool result = VB->Init(GPUBufferDescription::Vertex(layout, sizeof(SpriteParticleVertex), VertexCount, vertexBuffer)) ||
IB->Init(GPUBufferDescription::Index(sizeof(uint16), IndexCount, indexBuffer));
Platform::AtomicStore(&Ready, 1);
return result;
}
void Dispose()
@@ -133,13 +139,6 @@ float Particles::ParticleBufferRecycleTimeout = 10.0f;
SpriteParticleRenderer SpriteRenderer;
namespace ParticlesDrawCPU
{
Array<uint32> SortingKeys[2];
Array<int32> SortingIndices;
Array<int32> SortedIndices;
}
class ParticleManagerService : public EngineService
{
public:
@@ -190,7 +189,7 @@ void DrawEmitterCPU(RenderContext& renderContext, ParticleBuffer* buffer, DrawCa
auto emitter = buffer->Emitter;
// Check if need to perform any particles sorting
if (emitter->Graph.SortModules.HasItems() && renderContext.View.Pass != DrawPass::Depth)
if (emitter->Graph.SortModules.HasItems() && renderContext.View.Pass != DrawPass::Depth && (buffer->CPU.Count != 0 || buffer->GPU.SortedIndices))
{
// Prepare sorting data
if (!buffer->GPU.SortedIndices)
@@ -204,12 +203,31 @@ void DrawEmitterCPU(RenderContext& renderContext, ParticleBuffer* buffer, DrawCa
const auto sortMode = static_cast<ParticleSortMode>(module->Values[2].AsInt);
const int32 stride = buffer->Stride;
const int32 listSize = buffer->CPU.Count;
#define PREPARE_CACHE(list) (ParticlesDrawCPU::list).Clear(); (ParticlesDrawCPU::list).Resize(listSize)
PREPARE_CACHE(SortingKeys[0]);
PREPARE_CACHE(SortingKeys[1]);
PREPARE_CACHE(SortingIndices);
#undef PREPARE_CACHE
uint32* sortedKeys = ParticlesDrawCPU::SortingKeys[0].Get();
Array<uint32, RendererAllocation> sortingKeysList[4];
Array<int32, RendererAllocation> sortingIndicesList[2];
uint32* sortingKeys[2];
int32* sortingIndices[2];
if (listSize < 500)
{
// Use fast stack allocator from RenderList
sortingKeys[0] = renderContext.List->Memory.Allocate<uint32>(listSize);
sortingKeys[1] = renderContext.List->Memory.Allocate<uint32>(listSize);
sortingIndices[0] = renderContext.List->Memory.Allocate<int32>(listSize);
sortingIndices[1] = renderContext.List->Memory.Allocate<int32>(listSize);
}
else
{
// Use shared pooled memory from RendererAllocation
sortingKeysList[0].Resize(listSize);
sortingKeysList[1].Resize(listSize);
sortingIndicesList[0].Resize(listSize);
sortingIndicesList[1].Resize(listSize);
sortingKeys[0] = sortingKeysList[0].Get();
sortingKeys[1] = sortingKeysList[1].Get();
sortingIndices[0] = sortingIndicesList[0].Get();
sortingIndices[1] = sortingIndicesList[1].Get();
}
uint32* sortedKeys = sortingKeys[0];
const uint32 sortKeyXor = sortMode != ParticleSortMode::CustomAscending ? MAX_uint32 : 0;
switch (sortMode)
{
@@ -290,29 +308,31 @@ void DrawEmitterCPU(RenderContext& renderContext, ParticleBuffer* buffer, DrawCa
}
// Generate sorting indices
int32* sortedIndices;
int32* sortedIndices = sortingIndices[0];
{
ParticlesDrawCPU::SortedIndices.Resize(listSize);
sortedIndices = ParticlesDrawCPU::SortedIndices.Get();
for (int32 i = 0; i < listSize; i++)
sortedIndices[i] = i;
}
// Sort keys with indices
{
Sorting::RadixSort(sortedKeys, sortedIndices, ParticlesDrawCPU::SortingKeys[1].Get(), ParticlesDrawCPU::SortingIndices.Get(), listSize);
Sorting::RadixSort(sortedKeys, sortedIndices, sortingKeys[1], sortingIndices[1], listSize);
}
// Upload CPU particles indices
{
context->UpdateBuffer(buffer->GPU.SortedIndices, sortedIndices, listSize * sizeof(int32), sortedIndicesOffset);
RenderContext::GPULocker.Lock();
context->UpdateBuffer(buffer->GPU.SortedIndices, sortedIndices, listSize * sizeof(uint32), sortedIndicesOffset);
RenderContext::GPULocker.Unlock();
}
}
}
// Upload CPU particles data to GPU
{
RenderContext::GPULocker.Lock();
context->UpdateBuffer(buffer->GPU.Buffer, buffer->CPU.Buffer.Get(), buffer->CPU.Count * buffer->Stride);
RenderContext::GPULocker.Unlock();
}
// Check if need to setup ribbon modules
@@ -443,8 +463,10 @@ void DrawEmitterCPU(RenderContext& renderContext, ParticleBuffer* buffer, DrawCa
if (ribbonModuleIndex != 0)
{
// Upload data to the GPU buffer
RenderContext::GPULocker.Lock();
buffer->GPU.RibbonIndexBufferDynamic->Flush(context);
buffer->GPU.RibbonVertexBufferDynamic->Flush(context);
RenderContext::GPULocker.Unlock();
}
}
@@ -1266,10 +1288,6 @@ void ParticleManagerService::Dispose()
}
CleanupGPUParticlesSorting();
#endif
ParticlesDrawCPU::SortingKeys[0].SetCapacity(0);
ParticlesDrawCPU::SortingKeys[1].SetCapacity(0);
ParticlesDrawCPU::SortingIndices.SetCapacity(0);
ParticlesDrawCPU::SortedIndices.SetCapacity(0);
PoolLocker.Lock();
for (auto i = Pool.Begin(); i.IsNotEnd(); ++i)

View File

@@ -449,8 +449,6 @@ RenderList::RenderList(const SpawnParams& params)
: ScriptingObject(params)
, Memory(4 * 1024 * 1024, RendererAllocation::Allocate, RendererAllocation::Free) // 4MB pages, use page pooling via RendererAllocation
, DirectionalLights(4)
, PointLights(32)
, SpotLights(32)
, SkyLights(4)
, EnvironmentProbes(32)
, Decals(64)

View File

@@ -341,12 +341,12 @@ public:
/// <summary>
/// Light pass members - point lights
/// </summary>
Array<RenderPointLightData> PointLights;
RenderListBuffer<RenderPointLightData> PointLights;
/// <summary>
/// Light pass members - spot lights
/// </summary>
Array<RenderSpotLightData> SpotLights;
RenderListBuffer<RenderSpotLightData> SpotLights;
/// <summary>
/// Light pass members - sky lights
@@ -366,7 +366,7 @@ public:
/// <summary>
/// Local volumetric fog particles registered for the rendering.
/// </summary>
Array<DrawCall> VolumetricFogParticles;
RenderListBuffer<DrawCall> VolumetricFogParticles;
/// <summary>
/// Sky/skybox renderer proxy to use (only one per frame)

View File

@@ -91,16 +91,9 @@ void BitonicSort::Dispose()
void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuffer* countBuffer, uint32 counterOffset, bool sortAscending, GPUBuffer* sortedIndicesBuffer)
{
ASSERT(context && sortingKeysBuffer && countBuffer);
PROFILE_GPU_CPU("Bitonic Sort");
// Check if has missing resources
if (checkIfSkipPass())
{
return;
}
// Prepare
PROFILE_GPU_CPU("Bitonic Sort");
const uint32 elementSizeBytes = sizeof(uint64);
const uint32 maxNumElements = sortingKeysBuffer->GetSize() / elementSizeBytes;
const uint32 alignedMaxNumElements = Math::RoundUpToPowerOf2(maxNumElements);

View File

@@ -384,7 +384,7 @@ void VolumetricFogPass::Render(RenderContext& renderContext)
}
// Render local fog particles
if (renderContext.List->VolumetricFogParticles.HasItems())
if (renderContext.List->VolumetricFogParticles.Count() != 0)
{
PROFILE_GPU_CPU_NAMED("Local Fog");
@@ -404,7 +404,7 @@ void VolumetricFogPass::Render(RenderContext& renderContext)
customData.VolumetricFogMaxDistance = cache.Data.VolumetricFogMaxDistance;
bindParams.CustomData = &customData;
bindParams.BindViewData();
bindParams.DrawCall = &renderContext.List->VolumetricFogParticles.First();
bindParams.DrawCall = renderContext.List->VolumetricFogParticles.begin();
bindParams.BindDrawData();
for (auto& drawCall : renderContext.List->VolumetricFogParticles)