Optimize foliage rendering with concurrent arena allocator

This commit is contained in:
Wojtek Figat
2025-07-08 22:18:00 +02:00
parent a8eb4fc140
commit 3abbf08f1f
6 changed files with 205 additions and 16 deletions

View File

@@ -52,3 +52,81 @@ void* ArenaAllocator::Allocate(uint64 size, uint64 alignment)
return mem;
}
void ConcurrentArenaAllocator::Free()
{
_locker.Lock();
// Free all pages
Page* page = (Page*)_first;
while (page)
{
#if COMPILE_WITH_PROFILER
ProfilerMemory::OnGroupUpdate(ProfilerMemory::Groups::MallocArena, -(int64)page->Size, -1);
#endif
if (_free1)
_free1(page->Memory);
else
_free2(page->Memory, page->Size);
Page* next = page->Next;
if (_free1)
_free1(page);
else
_free2(page, sizeof(Page));
page = next;
}
// Unlink
_first = 0;
_totalBytes = 0;
_locker.Unlock();
}
// Allocates a chunk of memory from the arena.
// Fast path is lock-free: atomically bumps the current page's Offset via compare-exchange.
// Slow path (no page, or page full) takes _locker to link a new page, then retries the fast path.
void* ConcurrentArenaAllocator::Allocate(uint64 size, uint64 alignment)
{
RETRY:
    // Check if the current page has some space left
    Page* page = (Page*)Platform::AtomicRead(&_first);
    if (page)
    {
        int64 offset = Platform::AtomicRead(&page->Offset);
        int64 offsetAligned = Math::AlignUp(offset, (int64)alignment);
        int64 end = offsetAligned + size;
        if (end <= page->Size)
        {
            // Try to bump the page offset atomically (lock-free fast path)
            if (Platform::InterlockedCompareExchange(&page->Offset, end, offset) != offset)
            {
                // Another thread changed the offset concurrently, so retry (the new offset might mismatch the alignment)
                goto RETRY;
            }
            Platform::InterlockedAdd(&_totalBytes, (int64)size);
            return (byte*)page->Memory + offsetAligned;
        }
    }
    // Page allocation is thread-synced
    _locker.Lock();
    // Check if page was unchanged by any other thread (another thread may have already linked a fresh page)
    if ((Page*)Platform::AtomicRead(&_first) == page)
    {
        // Page must fit at least this single allocation
        uint64 pageSize = Math::Max<uint64>(_pageSize, size);
#if COMPILE_WITH_PROFILER
        ProfilerMemory::OnGroupUpdate(ProfilerMemory::Groups::MallocArena, (int64)pageSize, 1);
#endif
        page = (Page*)(_allocate1 ? _allocate1(sizeof(Page), alignof(Page)) : _allocate2(sizeof(Page)));
        // NOTE(review): page memory is allocated with 16-byte alignment, so requests with alignment > 16
        // are only aligned relative to the page base — confirm callers never request larger alignment.
        page->Memory = _allocate1 ? _allocate1(pageSize, 16) : _allocate2(pageSize);
        page->Next = (Page*)_first;
        page->Offset = 0;
        page->Size = (int64)pageSize;
        // Publish the new page as the list head for other threads
        Platform::AtomicStore(&_first, (intptr)page);
    }
    _locker.Unlock();
    // Use a single code path for the actual allocation (retry on the fresh page)
    goto RETRY;
}

View File

@@ -3,6 +3,7 @@
#pragma once
#include "Allocation.h"
#include "Engine/Platform/CriticalSection.h"
/// <summary>
/// Allocator that uses pages for stack-based allocs without freeing memory during its lifetime.
@@ -66,21 +67,94 @@ public:
}
};
/// <summary>
/// Allocator that uses pages for stack-based allocs without freeing memory during its lifetime. Thread-safe to allocate memory from multiple threads at once.
/// </summary>
class ConcurrentArenaAllocator
{
private:
    // Single memory page; pages form a singly-linked list with the newest page at the head (_first).
    struct Page
    {
        void* Memory;
        Page* Next;
        volatile int64 Offset; // Bump-pointer offset within Memory; advanced atomically by Allocate
        int64 Size;
    };

    int32 _pageSize;
    volatile int64 _first = 0; // Head of the pages list (stored as integer for atomic access)
    // Declared unconditionally: GetTotalBytes(), Allocate() and Free() all reference it in every build
    // configuration, so guarding it with #if !BUILD_RELEASE would break release builds.
    volatile int64 _totalBytes = 0;
    void*(*_allocate1)(uint64 size, uint64 alignment) = nullptr;
    void(*_free1)(void* ptr) = nullptr;
    void*(*_allocate2)(uint64 size) = nullptr;
    void(*_free2)(void* ptr, uint64 size) = nullptr;
    CriticalSection _locker; // Guards page creation and Free; in-page allocations are lock-free

public:
    // Initializes the allocator with a custom allocation backend (alignment-aware variant).
    ConcurrentArenaAllocator(int32 pageSizeBytes, void* (*customAllocate)(uint64 size, uint64 alignment), void(*customFree)(void* ptr))
        : _pageSize(pageSizeBytes)
        , _allocate1(customAllocate)
        , _free1(customFree)
    {
    }

    // Initializes the allocator with a custom allocation backend (size-only variant).
    ConcurrentArenaAllocator(int32 pageSizeBytes, void* (*customAllocate)(uint64 size), void(*customFree)(void* ptr, uint64 size))
        : _pageSize(pageSizeBytes)
        , _allocate2(customAllocate)
        , _free2(customFree)
    {
    }

    // Initializes the allocator with the default engine allocator backend.
    ConcurrentArenaAllocator(int32 pageSizeBytes = 1024 * 1024) // 1 MB by default
        : ConcurrentArenaAllocator(pageSizeBytes, Allocator::Allocate, Allocator::Free)
    {
    }

    ~ConcurrentArenaAllocator()
    {
        Free();
    }

    // Gets the total amount of bytes allocated in arena (excluding alignment).
    int64 GetTotalBytes() const
    {
        return Platform::AtomicRead(&_totalBytes);
    }

    // Allocates a chunk of uninitialized memory.
    void* Allocate(uint64 size, uint64 alignment = 1);

    // Frees all memory allocations within allocator.
    void Free();

    // Creates a new object within the arena allocator (constructor is invoked, destructor is never called).
    template<class T, class... Args>
    inline T* New(Args&&...args)
    {
        // Pass the type's alignment so the storage is valid for T (a default alignment of 1 could misalign members)
        T* ptr = (T*)Allocate(sizeof(T), alignof(T));
        new(ptr) T(Forward<Args>(args)...);
        return ptr;
    }
};
/// <summary>
/// The memory allocation policy that uses a part of a shared page allocator. Allocations are performed in a stack manner, and free is a no-op.
/// </summary>
class ArenaAllocation
template<typename ArenaType>
class ArenaAllocationBase
{
public:
enum { HasSwap = true };
typedef ArenaAllocator* Tag;
typedef ArenaType* Tag;
template<typename T>
class Data
{
private:
T* _data = nullptr;
ArenaAllocator* _arena = nullptr;
ArenaType* _arena = nullptr;
public:
FORCE_INLINE Data()
@@ -142,3 +216,13 @@ public:
}
};
};
/// <summary>
/// The memory allocation policy that uses a part of shared page allocator. Allocations are performed in stack-manner, and free is no-op.
/// </summary>
typedef ArenaAllocationBase<ArenaAllocator> ArenaAllocation;
/// <summary>
/// The memory allocation policy that uses a part of shared page allocator. Allocations are performed in stack-manner, and free is no-op.
/// </summary>
typedef ArenaAllocationBase<ConcurrentArenaAllocator> ConcurrentArenaAllocation;

View File

@@ -103,17 +103,17 @@ void Foliage::DrawInstance(RenderContext& renderContext, FoliageInstance& instan
for (int32 meshIndex = 0; meshIndex < meshes.Count(); meshIndex++)
{
auto& drawCall = drawCallsLists[lod][meshIndex];
if (!drawCall.DrawCall.Material)
if (!drawCall.Material)
continue;
DrawKey key;
key.Mat = drawCall.DrawCall.Material;
key.Mat = drawCall.Material;
key.Geo = &meshes.Get()[meshIndex];
key.Lightmap = instance.Lightmap.TextureIndex;
auto* e = result.TryGet(key);
if (!e)
{
e = &result[key];
e = &result.Add(key, BatchedDrawCall(renderContext.List))->Value;
ASSERT_LOW_LAYER(key.Mat);
e->DrawCall.Material = key.Mat;
e->DrawCall.Surface.Lightmap = EnumHasAnyFlags(_staticFlags, StaticFlags::Lightmap) && _scene ? _scene->LightmapsData.GetReadyLightmap(key.Lightmap) : nullptr;
@@ -127,7 +127,7 @@ void Foliage::DrawInstance(RenderContext& renderContext, FoliageInstance& instan
const Float3 translation = transform.Translation - renderContext.View.Origin;
Matrix::Transformation(transform.Scale, transform.Orientation, translation, world);
constexpr float worldDeterminantSign = 1.0f;
instanceData.Store(world, world, instance.Lightmap.UVsArea, drawCall.DrawCall.Surface.GeometrySize, instance.Random, worldDeterminantSign, lodDitherFactor);
instanceData.Store(world, world, instance.Lightmap.UVsArea, drawCall.Surface.GeometrySize, instance.Random, worldDeterminantSign, lodDitherFactor);
}
}
@@ -430,7 +430,7 @@ void Foliage::DrawType(RenderContext& renderContext, const FoliageType& type, Dr
{
const auto& mesh = meshes.Get()[meshIndex];
auto& drawCall = drawCallsList.Get()[meshIndex];
drawCall.DrawCall.Material = nullptr;
drawCall.Material = nullptr; // DrawInstance skips draw calls from meshes with unset material
// Check entry visibility
const auto& entry = type.Entries[mesh.GetMaterialSlotIndex()];
@@ -455,13 +455,13 @@ void Foliage::DrawType(RenderContext& renderContext, const FoliageType& type, Dr
if (drawModes == DrawPass::None)
continue;
drawCall.DrawCall.Material = material;
drawCall.DrawCall.Surface.GeometrySize = mesh.GetBox().GetSize();
drawCall.Material = material;
drawCall.Surface.GeometrySize = mesh.GetBox().GetSize();
}
}
// Draw instances of the foliage type
BatchedDrawCalls result;
BatchedDrawCalls result(&renderContext.List->Memory);
DrawCluster(renderContext, type.Root, type, drawCallsLists, result);
// Submit draw calls with valid instances added

View File

@@ -6,6 +6,7 @@
#include "FoliageInstance.h"
#include "FoliageCluster.h"
#include "FoliageType.h"
#include "Engine/Core/Memory/ArenaAllocation.h"
#include "Engine/Level/Actor.h"
/// <summary>
@@ -178,8 +179,8 @@ private:
}
};
typedef Array<struct BatchedDrawCall, InlinedAllocation<8>> DrawCallsList;
typedef Dictionary<DrawKey, struct BatchedDrawCall, class RendererAllocation> BatchedDrawCalls;
typedef Array<struct DrawCall, InlinedAllocation<8>> DrawCallsList;
typedef Dictionary<DrawKey, struct BatchedDrawCall, ConcurrentArenaAllocation> BatchedDrawCalls;
void DrawInstance(RenderContext& renderContext, FoliageInstance& instance, const FoliageType& type, Model* model, int32 lod, float lodDitherFactor, DrawCallsList* drawCallsLists, BatchedDrawCalls& result) const;
void DrawCluster(RenderContext& renderContext, FoliageCluster* cluster, const FoliageType& type, DrawCallsList* drawCallsLists, BatchedDrawCalls& result) const;
#else

View File

@@ -169,6 +169,7 @@ void RenderEnvironmentProbeData::SetShaderData(ShaderEnvProbeData& data) const
void* RendererAllocation::Allocate(uintptr size)
{
PROFILE_CPU();
void* result = nullptr;
MemPoolLocker.Lock();
for (int32 i = 0; i < MemPool.Count(); i++)
@@ -188,6 +189,7 @@ void* RendererAllocation::Allocate(uintptr size)
void RendererAllocation::Free(void* ptr, uintptr size)
{
PROFILE_CPU();
MemPoolLocker.Lock();
MemPool.Add({ ptr, size });
MemPoolLocker.Unlock();
@@ -418,6 +420,18 @@ bool RenderList::HasAnyPostFx(const RenderContext& renderContext, MaterialPostFx
return false;
}
// Binds the Instances array storage to the render list's per-frame arena,
// so per-instance data is released in bulk when the list's Memory is freed (see RenderList::Clear).
BatchedDrawCall::BatchedDrawCall(RenderList* list)
    : Instances(&list->Memory)
{
}
// Move constructor: copies the plain draw-call fields and steals the arena-backed Instances array.
BatchedDrawCall::BatchedDrawCall(BatchedDrawCall&& other) noexcept
    : DrawCall(other.DrawCall)
    , ObjectsStartIndex(other.ObjectsStartIndex)
    , Instances(MoveTemp(other.Instances))
{
}
void DrawCallsList::Clear()
{
Indices.Clear();
@@ -433,6 +447,7 @@ bool DrawCallsList::IsEmpty() const
RenderList::RenderList(const SpawnParams& params)
: ScriptingObject(params)
, Memory(4 * 1024 * 1024, RendererAllocation::Allocate, RendererAllocation::Free) // 4MB pages, use page pooling via RendererAllocation
, DirectionalLights(4)
, PointLights(32)
, SpotLights(32)
@@ -443,8 +458,8 @@ RenderList::RenderList(const SpawnParams& params)
, AtmosphericFog(nullptr)
, Fog(nullptr)
, Blendable(32)
, ObjectBuffer(0, PixelFormat::R32G32B32A32_Float, false, TEXT("Object Bufffer"))
, TempObjectBuffer(0, PixelFormat::R32G32B32A32_Float, false, TEXT("Object Bufffer"))
, ObjectBuffer(0, PixelFormat::R32G32B32A32_Float, false, TEXT("Object Buffer"))
, TempObjectBuffer(0, PixelFormat::R32G32B32A32_Float, false, TEXT("Object Buffer"))
, _instanceBuffer(0, sizeof(ShaderObjectDrawInstanceData), TEXT("Instance Buffer"), GPUVertexLayout::Get({ { VertexElement::Types::Attribute0, 3, 0, 1, PixelFormat::R32_UInt } }))
{
}
@@ -480,6 +495,7 @@ void RenderList::Clear()
_instanceBuffer.Clear();
ObjectBuffer.Clear();
TempObjectBuffer.Clear();
Memory.Free();
}
struct PackedSortKey

View File

@@ -3,6 +3,7 @@
#pragma once
#include "Engine/Core/Collections/Array.h"
#include "Engine/Core/Memory/ArenaAllocation.h"
#include "Engine/Core/Math/Half.h"
#include "Engine/Graphics/PostProcessSettings.h"
#include "Engine/Graphics/DynamicBuffer.h"
@@ -241,7 +242,11 @@ struct BatchedDrawCall
{
DrawCall DrawCall;
uint16 ObjectsStartIndex = 0; // Index of the instances start in the ObjectsBuffer (set internally).
Array<struct ShaderObjectData, RendererAllocation> Instances;
Array<struct ShaderObjectData, ConcurrentArenaAllocation> Instances;
BatchedDrawCall() { CRASH; } // Don't use it
BatchedDrawCall(RenderList* list);
BatchedDrawCall(BatchedDrawCall&& other) noexcept;
};
/// <summary>
@@ -298,6 +303,11 @@ API_CLASS(Sealed) class FLAXENGINE_API RenderList : public ScriptingObject
static void CleanupCache();
public:
/// <summary>
/// Memory storage with all draw-related data that lives during a single frame rendering time. Thread-safe to allocate memory during rendering jobs.
/// </summary>
ConcurrentArenaAllocator Memory;
/// <summary>
/// All scenes for rendering.
/// </summary>