Optimize foliage rendering with concurrent arena allocator
@@ -52,3 +52,81 @@ void* ArenaAllocator::Allocate(uint64 size, uint64 alignment)
 
     return mem;
 }
+
+void ConcurrentArenaAllocator::Free()
+{
+    _locker.Lock();
+
+    // Free all pages
+    Page* page = (Page*)_first;
+    while (page)
+    {
+#if COMPILE_WITH_PROFILER
+        ProfilerMemory::OnGroupUpdate(ProfilerMemory::Groups::MallocArena, -(int64)page->Size, -1);
+#endif
+        if (_free1)
+            _free1(page->Memory);
+        else
+            _free2(page->Memory, page->Size);
+        Page* next = page->Next;
+        if (_free1)
+            _free1(page);
+        else
+            _free2(page, sizeof(Page));
+        page = next;
+    }
+
+    // Unlink
+    _first = 0;
+    _totalBytes = 0;
+
+    _locker.Unlock();
+}
+
+void* ConcurrentArenaAllocator::Allocate(uint64 size, uint64 alignment)
+{
+RETRY:
+
+    // Check if the current page has some space left
+    Page* page = (Page*)Platform::AtomicRead(&_first);
+    if (page)
+    {
+        int64 offset = Platform::AtomicRead(&page->Offset);
+        int64 offsetAligned = Math::AlignUp(offset, (int64)alignment);
+        int64 end = offsetAligned + size;
+        if (end <= page->Size)
+        {
+            // Try to allocate within a page
+            if (Platform::InterlockedCompareExchange(&page->Offset, end, offset) != offset)
+            {
+                // Someone else allocated from this page so retry (the new offset might mismatch the alignment)
+                goto RETRY;
+            }
+            Platform::InterlockedAdd(&_totalBytes, (int64)size);
+            return (byte*)page->Memory + offsetAligned;
+        }
+    }
+
+    // Page allocation is thread-synced
+    _locker.Lock();
+
+    // Check if page was unchanged by any other thread
+    if ((Page*)Platform::AtomicRead(&_first) == page)
+    {
+        uint64 pageSize = Math::Max<uint64>(_pageSize, size);
+#if COMPILE_WITH_PROFILER
+        ProfilerMemory::OnGroupUpdate(ProfilerMemory::Groups::MallocArena, (int64)pageSize, 1);
+#endif
+        page = (Page*)(_allocate1 ? _allocate1(sizeof(Page), alignof(Page)) : _allocate2(sizeof(Page)));
+        page->Memory = _allocate1 ? _allocate1(pageSize, 16) : _allocate2(pageSize);
+        page->Next = (Page*)_first;
+        page->Offset = 0;
+        page->Size = (int64)pageSize;
+        Platform::AtomicStore(&_first, (intptr)page);
+    }
+
+    _locker.Unlock();
+
+    // Use a single code path for allocation
+    goto RETRY;
+}
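The allocation fast path above is a classic lock-free bump-pointer scheme: read the page offset, align it, and publish the new offset with a compare-and-swap, retrying on contention; only new-page creation falls back to the critical section. A minimal standalone sketch of the same technique, written against std::atomic rather than the engine's Platform layer (all names here are illustrative, not engine API):

#include <atomic>
#include <cstdint>

// Illustrative bump-pointer page; not the engine's Page struct.
struct BumpPage
{
    char* Memory;
    std::atomic<int64_t> Offset;
    int64_t Size;
};

// Round up to a power-of-two alignment.
static int64_t AlignUp(int64_t value, int64_t alignment)
{
    return (value + alignment - 1) & ~(alignment - 1);
}

// Lock-free carve of `size` bytes; returns nullptr when the page is full,
// at which point the caller would lock and push a fresh page.
static void* TryAllocate(BumpPage& page, int64_t size, int64_t alignment)
{
    for (;;)
    {
        int64_t offset = page.Offset.load(std::memory_order_relaxed);
        const int64_t aligned = AlignUp(offset, alignment);
        const int64_t end = aligned + size;
        if (end > page.Size)
            return nullptr; // Page exhausted
        // Publish the new offset; if another thread won the race, retry from
        // the fresh offset (its alignment padding may differ).
        if (page.Offset.compare_exchange_weak(offset, end, std::memory_order_relaxed))
            return page.Memory + aligned;
    }
}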
@@ -3,6 +3,7 @@
 #pragma once
 
 #include "Allocation.h"
+#include "Engine/Platform/CriticalSection.h"
 
 /// <summary>
 /// Allocator that uses pages for stack-based allocs without freeing memory during its lifetime.
@@ -66,21 +67,94 @@ public:
     }
 };
 
+/// <summary>
+/// Allocator that uses pages for stack-based allocs without freeing memory during its lifetime. Thread-safe to allocate memory from multiple threads at once.
+/// </summary>
+class ConcurrentArenaAllocator
+{
+private:
+    struct Page
+    {
+        void* Memory;
+        Page* Next;
+        volatile int64 Offset;
+        int64 Size;
+    };
+
+    int32 _pageSize;
+    volatile int64 _first = 0;
+#if !BUILD_RELEASE
+    volatile int64 _totalBytes = 0;
+#endif
+    void*(*_allocate1)(uint64 size, uint64 alignment) = nullptr;
+    void(*_free1)(void* ptr) = nullptr;
+    void*(*_allocate2)(uint64 size) = nullptr;
+    void(*_free2)(void* ptr, uint64 size) = nullptr;
+    CriticalSection _locker;
+
+public:
+    ConcurrentArenaAllocator(int32 pageSizeBytes, void* (*customAllocate)(uint64 size, uint64 alignment), void(*customFree)(void* ptr))
+        : _pageSize(pageSizeBytes)
+        , _allocate1(customAllocate)
+        , _free1(customFree)
+    {
+    }
+
+    ConcurrentArenaAllocator(int32 pageSizeBytes, void* (*customAllocate)(uint64 size), void(*customFree)(void* ptr, uint64 size))
+        : _pageSize(pageSizeBytes)
+        , _allocate2(customAllocate)
+        , _free2(customFree)
+    {
+    }
+
+    ConcurrentArenaAllocator(int32 pageSizeBytes = 1024 * 1024) // 1 MB by default
+        : ConcurrentArenaAllocator(pageSizeBytes, Allocator::Allocate, Allocator::Free)
+    {
+    }
+
+    ~ConcurrentArenaAllocator()
+    {
+        Free();
+    }
+
+    // Gets the total amount of bytes allocated in arena (excluding alignment).
+    int64 GetTotalBytes() const
+    {
+        return Platform::AtomicRead(&_totalBytes);
+    }
+
+    // Allocates a chunk of uninitialized memory.
+    void* Allocate(uint64 size, uint64 alignment = 1);
+
+    // Frees all memory allocations within allocator.
+    void Free();
+
+    // Creates a new object within the arena allocator.
+    template<class T, class... Args>
+    inline T* New(Args&&...args)
+    {
+        T* ptr = (T*)Allocate(sizeof(T));
+        new(ptr) T(Forward<Args>(args)...);
+        return ptr;
+    }
+};
+
 /// <summary>
 /// The memory allocation policy that uses a part of shared page allocator. Allocations are performed in a stack manner, and free is a no-op.
 /// </summary>
-class ArenaAllocation
+template<typename ArenaType>
+class ArenaAllocationBase
 {
 public:
     enum { HasSwap = true };
-    typedef ArenaAllocator* Tag;
+    typedef ArenaType* Tag;
 
     template<typename T>
     class Data
     {
     private:
         T* _data = nullptr;
-        ArenaAllocator* _arena = nullptr;
+        ArenaType* _arena = nullptr;
 
     public:
         FORCE_INLINE Data()
@@ -142,3 +216,13 @@ public:
         }
     };
 };
+
+/// <summary>
+/// The memory allocation policy that uses a part of shared page allocator. Allocations are performed in a stack manner, and free is a no-op.
+/// </summary>
+typedef ArenaAllocationBase<ArenaAllocator> ArenaAllocation;
+
+/// <summary>
+/// The memory allocation policy that uses a part of shared page allocator (thread-safe variant). Allocations are performed in a stack manner, and free is a no-op.
+/// </summary>
+typedef ArenaAllocationBase<ConcurrentArenaAllocator> ConcurrentArenaAllocation;
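Taken together, the class and the policy typedefs give two ways to use the arena: direct calls (Allocate / New / Free) and container storage bound to a specific arena through the policy's Tag pointer. A hedged usage sketch based on the patterns this commit itself uses later (MyType is a placeholder; the container constructor taking the arena pointer mirrors Instances(&list->Memory) in the RenderList changes below):

ConcurrentArenaAllocator arena(1024 * 1024); // 1 MB pages

// Direct allocations: lock-free in the common case.
void* block = arena.Allocate(256, 16);
MyType* obj = arena.New<MyType>(); // Note: the arena never runs destructors

// Containers parameterized with the policy are constructed with the arena
// that backs their storage:
Array<int32, ConcurrentArenaAllocation> items(&arena);

// Everything is released at once; individual frees are no-ops.
arena.Free();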
@@ -103,17 +103,17 @@ void Foliage::DrawInstance(RenderContext& renderContext, FoliageInstance& instan
     for (int32 meshIndex = 0; meshIndex < meshes.Count(); meshIndex++)
     {
         auto& drawCall = drawCallsLists[lod][meshIndex];
-        if (!drawCall.DrawCall.Material)
+        if (!drawCall.Material)
             continue;
 
         DrawKey key;
-        key.Mat = drawCall.DrawCall.Material;
+        key.Mat = drawCall.Material;
         key.Geo = &meshes.Get()[meshIndex];
         key.Lightmap = instance.Lightmap.TextureIndex;
         auto* e = result.TryGet(key);
         if (!e)
         {
-            e = &result[key];
+            e = &result.Add(key, BatchedDrawCall(renderContext.List))->Value;
             ASSERT_LOW_LAYER(key.Mat);
             e->DrawCall.Material = key.Mat;
             e->DrawCall.Surface.Lightmap = EnumHasAnyFlags(_staticFlags, StaticFlags::Lightmap) && _scene ? _scene->LightmapsData.GetReadyLightmap(key.Lightmap) : nullptr;
@@ -127,7 +127,7 @@ void Foliage::DrawInstance(RenderContext& renderContext, FoliageInstance& instan
         const Float3 translation = transform.Translation - renderContext.View.Origin;
         Matrix::Transformation(transform.Scale, transform.Orientation, translation, world);
         constexpr float worldDeterminantSign = 1.0f;
-        instanceData.Store(world, world, instance.Lightmap.UVsArea, drawCall.DrawCall.Surface.GeometrySize, instance.Random, worldDeterminantSign, lodDitherFactor);
+        instanceData.Store(world, world, instance.Lightmap.UVsArea, drawCall.Surface.GeometrySize, instance.Random, worldDeterminantSign, lodDitherFactor);
     }
 }
 
@@ -430,7 +430,7 @@ void Foliage::DrawType(RenderContext& renderContext, const FoliageType& type, Dr
     {
         const auto& mesh = meshes.Get()[meshIndex];
         auto& drawCall = drawCallsList.Get()[meshIndex];
-        drawCall.DrawCall.Material = nullptr;
+        drawCall.Material = nullptr; // DrawInstance skips draw calls from meshes with unset material
 
         // Check entry visibility
        const auto& entry = type.Entries[mesh.GetMaterialSlotIndex()];
@@ -455,13 +455,13 @@ void Foliage::DrawType(RenderContext& renderContext, const FoliageType& type, Dr
         if (drawModes == DrawPass::None)
             continue;
 
-        drawCall.DrawCall.Material = material;
-        drawCall.DrawCall.Surface.GeometrySize = mesh.GetBox().GetSize();
+        drawCall.Material = material;
+        drawCall.Surface.GeometrySize = mesh.GetBox().GetSize();
         }
     }
 
     // Draw instances of the foliage type
-    BatchedDrawCalls result;
+    BatchedDrawCalls result(&renderContext.List->Memory);
     DrawCluster(renderContext, type.Root, type, drawCallsLists, result);
 
     // Submit draw calls with valid instances added
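DrawInstance above batches foliage by a (material, mesh, lightmap) DrawKey so each unique combination turns into one draw call carrying many instances, and the TryGet/Add pair exists because BatchedDrawCall deliberately has no usable default constructor (its Instances array must bind to the frame arena, as the RenderList.h change below shows). A simplified sketch of the batching pattern using standard containers (key packing and types are illustrative, not the engine's):

#include <cstdint>
#include <unordered_map>
#include <vector>

// Stand-ins for the engine types.
struct InstanceData { float World[16]; float Random; };
struct Batch { void* Material = nullptr; std::vector<InstanceData> Instances; };

// Pack (material, mesh, lightmap) into a single hashable key, mirroring DrawKey's role.
// A real key type would hash the fields properly instead of XOR-folding them.
static uint64_t MakeKey(const void* material, uint32_t meshIndex, uint32_t lightmap)
{
    return reinterpret_cast<uint64_t>(material) ^ (uint64_t(meshIndex) << 48) ^ (uint64_t(lightmap) << 32);
}

static void AddInstance(std::unordered_map<uint64_t, Batch>& batches, void* material,
                        uint32_t meshIndex, uint32_t lightmap, const InstanceData& data)
{
    Batch& batch = batches[MakeKey(material, meshIndex, lightmap)];
    batch.Material = material; // First insert initializes the batch
    batch.Instances.push_back(data); // One entry per visible foliage instance
}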
@@ -6,6 +6,7 @@
 #include "FoliageInstance.h"
 #include "FoliageCluster.h"
 #include "FoliageType.h"
+#include "Engine/Core/Memory/ArenaAllocation.h"
 #include "Engine/Level/Actor.h"
 
 /// <summary>
@@ -178,8 +179,8 @@ private:
     }
 };
 
-    typedef Array<struct BatchedDrawCall, InlinedAllocation<8>> DrawCallsList;
-    typedef Dictionary<DrawKey, struct BatchedDrawCall, class RendererAllocation> BatchedDrawCalls;
+    typedef Array<struct DrawCall, InlinedAllocation<8>> DrawCallsList;
+    typedef Dictionary<DrawKey, struct BatchedDrawCall, ConcurrentArenaAllocation> BatchedDrawCalls;
     void DrawInstance(RenderContext& renderContext, FoliageInstance& instance, const FoliageType& type, Model* model, int32 lod, float lodDitherFactor, DrawCallsList* drawCallsLists, BatchedDrawCalls& result) const;
     void DrawCluster(RenderContext& renderContext, FoliageCluster* cluster, const FoliageType& type, DrawCallsList* drawCallsLists, BatchedDrawCalls& result) const;
 #else

@@ -169,6 +169,7 @@ void RenderEnvironmentProbeData::SetShaderData(ShaderEnvProbeData& data) const
 
 void* RendererAllocation::Allocate(uintptr size)
 {
+    PROFILE_CPU();
     void* result = nullptr;
     MemPoolLocker.Lock();
     for (int32 i = 0; i < MemPool.Count(); i++)
@@ -188,6 +189,7 @@ void* RendererAllocation::Allocate(uintptr size)
 
 void RendererAllocation::Free(void* ptr, uintptr size)
 {
+    PROFILE_CPU();
     MemPoolLocker.Lock();
     MemPool.Add({ ptr, size });
     MemPoolLocker.Unlock();
@@ -418,6 +420,18 @@ bool RenderList::HasAnyPostFx(const RenderContext& renderContext, MaterialPostFx
     return false;
 }
 
+BatchedDrawCall::BatchedDrawCall(RenderList* list)
+    : Instances(&list->Memory)
+{
+}
+
+BatchedDrawCall::BatchedDrawCall(BatchedDrawCall&& other) noexcept
+    : DrawCall(other.DrawCall)
+    , ObjectsStartIndex(other.ObjectsStartIndex)
+    , Instances(MoveTemp(other.Instances))
+{
+}
+
 void DrawCallsList::Clear()
 {
     Indices.Clear();
@@ -433,6 +447,7 @@ bool DrawCallsList::IsEmpty() const
 
 RenderList::RenderList(const SpawnParams& params)
     : ScriptingObject(params)
+    , Memory(4 * 1024 * 1024, RendererAllocation::Allocate, RendererAllocation::Free) // 4MB pages, use page pooling via RendererAllocation
     , DirectionalLights(4)
     , PointLights(32)
     , SpotLights(32)
@@ -443,8 +458,8 @@ RenderList::RenderList(const SpawnParams& params)
     , AtmosphericFog(nullptr)
     , Fog(nullptr)
     , Blendable(32)
-    , ObjectBuffer(0, PixelFormat::R32G32B32A32_Float, false, TEXT("Object Bufffer"))
-    , TempObjectBuffer(0, PixelFormat::R32G32B32A32_Float, false, TEXT("Object Bufffer"))
+    , ObjectBuffer(0, PixelFormat::R32G32B32A32_Float, false, TEXT("Object Buffer"))
+    , TempObjectBuffer(0, PixelFormat::R32G32B32A32_Float, false, TEXT("Object Buffer"))
     , _instanceBuffer(0, sizeof(ShaderObjectDrawInstanceData), TEXT("Instance Buffer"), GPUVertexLayout::Get({ { VertexElement::Types::Attribute0, 3, 0, 1, PixelFormat::R32_UInt } }))
 {
 }
@@ -480,6 +495,7 @@ void RenderList::Clear()
     _instanceBuffer.Clear();
     ObjectBuffer.Clear();
     TempObjectBuffer.Clear();
+    Memory.Free();
 }
 
 struct PackedSortKey
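The RenderList changes wire the arena into the frame lifecycle: Memory is constructed with 4 MB pages whose backing buffers come from the pooled RendererAllocation, worker threads allocate from it freely while rendering, and Clear() returns every page in one bulk Free(). A simplified sketch of that lifecycle (the frame loop itself is illustrative):

// Frame-scoped arena with pooled pages, as in the constructor above (simplified).
ConcurrentArenaAllocator frameMemory(4 * 1024 * 1024, RendererAllocation::Allocate, RendererAllocation::Free);

void RenderOneFrame()
{
    // Rendering jobs allocate draw-call data concurrently; the CAS fast path
    // means no lock is taken unless a new page must be created.
    // ... DrawInstance / DrawCluster / batching run here ...

    // End of frame: one bulk release hands all pages back to the pool.
    frameMemory.Free();
}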
@@ -3,6 +3,7 @@
 #pragma once
 
 #include "Engine/Core/Collections/Array.h"
+#include "Engine/Core/Memory/ArenaAllocation.h"
 #include "Engine/Core/Math/Half.h"
 #include "Engine/Graphics/PostProcessSettings.h"
 #include "Engine/Graphics/DynamicBuffer.h"
@@ -241,7 +242,11 @@ struct BatchedDrawCall
 {
     DrawCall DrawCall;
     uint16 ObjectsStartIndex = 0; // Index of the instances start in the ObjectsBuffer (set internally).
-    Array<struct ShaderObjectData, RendererAllocation> Instances;
+    Array<struct ShaderObjectData, ConcurrentArenaAllocation> Instances;
+
+    BatchedDrawCall() { CRASH; } // Don't use it
+    BatchedDrawCall(RenderList* list);
+    BatchedDrawCall(BatchedDrawCall&& other) noexcept;
 };
 
 /// <summary>
@@ -298,6 +303,11 @@ API_CLASS(Sealed) class FLAXENGINE_API RenderList : public ScriptingObject
     static void CleanupCache();
 
 public:
+    /// <summary>
+    /// Memory storage with all draw-related data that lives during a single frame rendering time. Thread-safe to allocate memory during rendering jobs.
+    /// </summary>
+    ConcurrentArenaAllocator Memory;
+
     /// <summary>
     /// All scenes for rendering.
     /// </summary>