Optimize foliage rendering with concurrent arena allocator
This commit is contained in:
@@ -52,3 +52,81 @@ void* ArenaAllocator::Allocate(uint64 size, uint64 alignment)
|
|||||||
|
|
||||||
return mem;
|
return mem;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void ConcurrentArenaAllocator::Free()
|
||||||
|
{
|
||||||
|
_locker.Lock();
|
||||||
|
|
||||||
|
// Free all pages
|
||||||
|
Page* page = (Page*)_first;
|
||||||
|
while (page)
|
||||||
|
{
|
||||||
|
#if COMPILE_WITH_PROFILER
|
||||||
|
ProfilerMemory::OnGroupUpdate(ProfilerMemory::Groups::MallocArena, -(int64)page->Size, -1);
|
||||||
|
#endif
|
||||||
|
if (_free1)
|
||||||
|
_free1(page->Memory);
|
||||||
|
else
|
||||||
|
_free2(page->Memory, page->Size);
|
||||||
|
Page* next = page->Next;
|
||||||
|
if (_free1)
|
||||||
|
_free1(page);
|
||||||
|
else
|
||||||
|
_free2(page, sizeof(Page));
|
||||||
|
page = next;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Unlink
|
||||||
|
_first = 0;
|
||||||
|
_totalBytes = 0;
|
||||||
|
|
||||||
|
_locker.Unlock();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Allocates a chunk of uninitialized memory from the arena.
// @param size       Amount of bytes to allocate.
// @param alignment  Alignment applied to the offset inside the page (not to the absolute address).
// @return Pointer into the current page memory.
// The fast path bumps the page offset with a compare-exchange and takes no lock; only the creation of a new page is serialized with _locker.
void* ConcurrentArenaAllocator::Allocate(uint64 size, uint64 alignment)
{
    for (;;)
    {
        // Try to carve the block out of the most recent page
        Page* current = (Page*)Platform::AtomicRead(&_first);
        if (current)
        {
            const int64 start = Platform::AtomicRead(&current->Offset);
            const int64 alignedStart = Math::AlignUp(start, (int64)alignment);
            const int64 newOffset = alignedStart + size;
            if (newOffset <= current->Size)
            {
                // Publish the new offset; if another thread moved it first then start over
                // (the aligned position has to be recomputed from the fresh offset)
                if (Platform::InterlockedCompareExchange(&current->Offset, newOffset, start) != start)
                    continue;
                Platform::InterlockedAdd(&_totalBytes, (int64)size);
                return (byte*)current->Memory + alignedStart;
            }
        }

        // No page yet or not enough space left: adding a page is done under the lock
        _locker.Lock();

        // Skip page creation if another thread has already pushed a new page in the meantime
        if ((Page*)Platform::AtomicRead(&_first) == current)
        {
            // Oversized requests get a dedicated page that is large enough to hold them
            const uint64 pageSize = Math::Max<uint64>(_pageSize, size);
#if COMPILE_WITH_PROFILER
            ProfilerMemory::OnGroupUpdate(ProfilerMemory::Groups::MallocArena, (int64)pageSize, 1);
#endif
            Page* fresh = (Page*)(_allocate1 ? _allocate1(sizeof(Page), alignof(Page)) : _allocate2(sizeof(Page)));
            fresh->Memory = _allocate1 ? _allocate1(pageSize, 16) : _allocate2(pageSize);
            fresh->Next = (Page*)_first;
            fresh->Offset = 0;
            fresh->Size = (int64)pageSize;

            // Make the fully initialized page visible to the lock-free fast path
            Platform::AtomicStore(&_first, (intptr)fresh);
        }

        _locker.Unlock();

        // Loop back so the allocation itself always goes through the single code path above
    }
}
|
||||||
|
|||||||
@@ -3,6 +3,7 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include "Allocation.h"
|
#include "Allocation.h"
|
||||||
|
#include "Engine/Platform/CriticalSection.h"
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Allocator that uses pages for stack-based allocs without freeing memory during it's lifetime.
|
/// Allocator that uses pages for stack-based allocs without freeing memory during it's lifetime.
|
||||||
@@ -66,21 +67,94 @@ public:
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Allocator that uses pages for stack-based allocs without freeing memory during it's lifetime. Thread-safe to allocate memory from multiple threads at once.
|
||||||
|
/// </summary>
|
||||||
|
class ConcurrentArenaAllocator
|
||||||
|
{
|
||||||
|
private:
|
||||||
|
struct Page
|
||||||
|
{
|
||||||
|
void* Memory;
|
||||||
|
Page* Next;
|
||||||
|
volatile int64 Offset;
|
||||||
|
int64 Size;
|
||||||
|
};
|
||||||
|
|
||||||
|
int32 _pageSize;
|
||||||
|
volatile int64 _first = 0;
|
||||||
|
#if !BUILD_RELEASE
|
||||||
|
volatile int64 _totalBytes = 0;
|
||||||
|
#endif
|
||||||
|
void*(*_allocate1)(uint64 size, uint64 alignment) = nullptr;
|
||||||
|
void(*_free1)(void* ptr) = nullptr;
|
||||||
|
void*(*_allocate2)(uint64 size) = nullptr;
|
||||||
|
void(*_free2)(void* ptr, uint64 size) = nullptr;
|
||||||
|
CriticalSection _locker;
|
||||||
|
|
||||||
|
public:
|
||||||
|
ConcurrentArenaAllocator(int32 pageSizeBytes, void* (*customAllocate)(uint64 size, uint64 alignment), void(*customFree)(void* ptr))
|
||||||
|
: _pageSize(pageSizeBytes)
|
||||||
|
, _allocate1(customAllocate)
|
||||||
|
, _free1(customFree)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
ConcurrentArenaAllocator(int32 pageSizeBytes, void* (*customAllocate)(uint64 size), void(*customFree)(void* ptr, uint64 size))
|
||||||
|
: _pageSize(pageSizeBytes)
|
||||||
|
, _allocate2(customAllocate)
|
||||||
|
, _free2(customFree)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
ConcurrentArenaAllocator(int32 pageSizeBytes = 1024 * 1024) // 1 MB by default
|
||||||
|
: ConcurrentArenaAllocator(pageSizeBytes, Allocator::Allocate, Allocator::Free)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
~ConcurrentArenaAllocator()
|
||||||
|
{
|
||||||
|
Free();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Gets the total amount of bytes allocated in arena (excluding alignment).
|
||||||
|
int64 GetTotalBytes() const
|
||||||
|
{
|
||||||
|
return Platform::AtomicRead(&_totalBytes);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Allocates a chunk of unitialized memory.
|
||||||
|
void* Allocate(uint64 size, uint64 alignment = 1);
|
||||||
|
|
||||||
|
// Frees all memory allocations within allocator.
|
||||||
|
void Free();
|
||||||
|
|
||||||
|
// Creates a new object within the arena allocator.
|
||||||
|
template<class T, class... Args>
|
||||||
|
inline T* New(Args&&...args)
|
||||||
|
{
|
||||||
|
T* ptr = (T*)Allocate(sizeof(T));
|
||||||
|
new(ptr) T(Forward<Args>(args)...);
|
||||||
|
return ptr;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// The memory allocation policy that uses a part of shared page allocator. Allocations are performed in stack-manner, and free is no-op.
|
/// The memory allocation policy that uses a part of shared page allocator. Allocations are performed in stack-manner, and free is no-op.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
class ArenaAllocation
|
template<typename ArenaType>
|
||||||
|
class ArenaAllocationBase
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
enum { HasSwap = true };
|
enum { HasSwap = true };
|
||||||
typedef ArenaAllocator* Tag;
|
typedef ArenaType* Tag;
|
||||||
|
|
||||||
template<typename T>
|
template<typename T>
|
||||||
class Data
|
class Data
|
||||||
{
|
{
|
||||||
private:
|
private:
|
||||||
T* _data = nullptr;
|
T* _data = nullptr;
|
||||||
ArenaAllocator* _arena = nullptr;
|
ArenaType* _arena = nullptr;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
FORCE_INLINE Data()
|
FORCE_INLINE Data()
|
||||||
@@ -142,3 +216,13 @@ public:
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// The memory allocation policy that uses a part of shared page allocator. Allocations are performed in stack-manner, and free is no-op.
|
||||||
|
/// </summary>
|
||||||
|
typedef ArenaAllocationBase<ArenaAllocator> ArenaAllocation;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// The memory allocation policy that uses a part of shared page allocator (ConcurrentArenaAllocator, which can allocate memory from multiple threads at once). Allocations are performed in stack-manner, and free is no-op.
|
||||||
|
/// </summary>
|
||||||
|
typedef ArenaAllocationBase<ConcurrentArenaAllocator> ConcurrentArenaAllocation;
|
||||||
|
|||||||
@@ -103,17 +103,17 @@ void Foliage::DrawInstance(RenderContext& renderContext, FoliageInstance& instan
|
|||||||
for (int32 meshIndex = 0; meshIndex < meshes.Count(); meshIndex++)
|
for (int32 meshIndex = 0; meshIndex < meshes.Count(); meshIndex++)
|
||||||
{
|
{
|
||||||
auto& drawCall = drawCallsLists[lod][meshIndex];
|
auto& drawCall = drawCallsLists[lod][meshIndex];
|
||||||
if (!drawCall.DrawCall.Material)
|
if (!drawCall.Material)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
DrawKey key;
|
DrawKey key;
|
||||||
key.Mat = drawCall.DrawCall.Material;
|
key.Mat = drawCall.Material;
|
||||||
key.Geo = &meshes.Get()[meshIndex];
|
key.Geo = &meshes.Get()[meshIndex];
|
||||||
key.Lightmap = instance.Lightmap.TextureIndex;
|
key.Lightmap = instance.Lightmap.TextureIndex;
|
||||||
auto* e = result.TryGet(key);
|
auto* e = result.TryGet(key);
|
||||||
if (!e)
|
if (!e)
|
||||||
{
|
{
|
||||||
e = &result[key];
|
e = &result.Add(key, BatchedDrawCall(renderContext.List))->Value;
|
||||||
ASSERT_LOW_LAYER(key.Mat);
|
ASSERT_LOW_LAYER(key.Mat);
|
||||||
e->DrawCall.Material = key.Mat;
|
e->DrawCall.Material = key.Mat;
|
||||||
e->DrawCall.Surface.Lightmap = EnumHasAnyFlags(_staticFlags, StaticFlags::Lightmap) && _scene ? _scene->LightmapsData.GetReadyLightmap(key.Lightmap) : nullptr;
|
e->DrawCall.Surface.Lightmap = EnumHasAnyFlags(_staticFlags, StaticFlags::Lightmap) && _scene ? _scene->LightmapsData.GetReadyLightmap(key.Lightmap) : nullptr;
|
||||||
@@ -127,7 +127,7 @@ void Foliage::DrawInstance(RenderContext& renderContext, FoliageInstance& instan
|
|||||||
const Float3 translation = transform.Translation - renderContext.View.Origin;
|
const Float3 translation = transform.Translation - renderContext.View.Origin;
|
||||||
Matrix::Transformation(transform.Scale, transform.Orientation, translation, world);
|
Matrix::Transformation(transform.Scale, transform.Orientation, translation, world);
|
||||||
constexpr float worldDeterminantSign = 1.0f;
|
constexpr float worldDeterminantSign = 1.0f;
|
||||||
instanceData.Store(world, world, instance.Lightmap.UVsArea, drawCall.DrawCall.Surface.GeometrySize, instance.Random, worldDeterminantSign, lodDitherFactor);
|
instanceData.Store(world, world, instance.Lightmap.UVsArea, drawCall.Surface.GeometrySize, instance.Random, worldDeterminantSign, lodDitherFactor);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -430,7 +430,7 @@ void Foliage::DrawType(RenderContext& renderContext, const FoliageType& type, Dr
|
|||||||
{
|
{
|
||||||
const auto& mesh = meshes.Get()[meshIndex];
|
const auto& mesh = meshes.Get()[meshIndex];
|
||||||
auto& drawCall = drawCallsList.Get()[meshIndex];
|
auto& drawCall = drawCallsList.Get()[meshIndex];
|
||||||
drawCall.DrawCall.Material = nullptr;
|
drawCall.Material = nullptr; // DrawInstance skips draw calls from meshes with unset material
|
||||||
|
|
||||||
// Check entry visibility
|
// Check entry visibility
|
||||||
const auto& entry = type.Entries[mesh.GetMaterialSlotIndex()];
|
const auto& entry = type.Entries[mesh.GetMaterialSlotIndex()];
|
||||||
@@ -455,13 +455,13 @@ void Foliage::DrawType(RenderContext& renderContext, const FoliageType& type, Dr
|
|||||||
if (drawModes == DrawPass::None)
|
if (drawModes == DrawPass::None)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
drawCall.DrawCall.Material = material;
|
drawCall.Material = material;
|
||||||
drawCall.DrawCall.Surface.GeometrySize = mesh.GetBox().GetSize();
|
drawCall.Surface.GeometrySize = mesh.GetBox().GetSize();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Draw instances of the foliage type
|
// Draw instances of the foliage type
|
||||||
BatchedDrawCalls result;
|
BatchedDrawCalls result(&renderContext.List->Memory);
|
||||||
DrawCluster(renderContext, type.Root, type, drawCallsLists, result);
|
DrawCluster(renderContext, type.Root, type, drawCallsLists, result);
|
||||||
|
|
||||||
// Submit draw calls with valid instances added
|
// Submit draw calls with valid instances added
|
||||||
|
|||||||
@@ -6,6 +6,7 @@
|
|||||||
#include "FoliageInstance.h"
|
#include "FoliageInstance.h"
|
||||||
#include "FoliageCluster.h"
|
#include "FoliageCluster.h"
|
||||||
#include "FoliageType.h"
|
#include "FoliageType.h"
|
||||||
|
#include "Engine/Core/Memory/ArenaAllocation.h"
|
||||||
#include "Engine/Level/Actor.h"
|
#include "Engine/Level/Actor.h"
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
@@ -178,8 +179,8 @@ private:
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef Array<struct BatchedDrawCall, InlinedAllocation<8>> DrawCallsList;
|
typedef Array<struct DrawCall, InlinedAllocation<8>> DrawCallsList;
|
||||||
typedef Dictionary<DrawKey, struct BatchedDrawCall, class RendererAllocation> BatchedDrawCalls;
|
typedef Dictionary<DrawKey, struct BatchedDrawCall, ConcurrentArenaAllocation> BatchedDrawCalls;
|
||||||
void DrawInstance(RenderContext& renderContext, FoliageInstance& instance, const FoliageType& type, Model* model, int32 lod, float lodDitherFactor, DrawCallsList* drawCallsLists, BatchedDrawCalls& result) const;
|
void DrawInstance(RenderContext& renderContext, FoliageInstance& instance, const FoliageType& type, Model* model, int32 lod, float lodDitherFactor, DrawCallsList* drawCallsLists, BatchedDrawCalls& result) const;
|
||||||
void DrawCluster(RenderContext& renderContext, FoliageCluster* cluster, const FoliageType& type, DrawCallsList* drawCallsLists, BatchedDrawCalls& result) const;
|
void DrawCluster(RenderContext& renderContext, FoliageCluster* cluster, const FoliageType& type, DrawCallsList* drawCallsLists, BatchedDrawCalls& result) const;
|
||||||
#else
|
#else
|
||||||
|
|||||||
@@ -169,6 +169,7 @@ void RenderEnvironmentProbeData::SetShaderData(ShaderEnvProbeData& data) const
|
|||||||
|
|
||||||
void* RendererAllocation::Allocate(uintptr size)
|
void* RendererAllocation::Allocate(uintptr size)
|
||||||
{
|
{
|
||||||
|
PROFILE_CPU();
|
||||||
void* result = nullptr;
|
void* result = nullptr;
|
||||||
MemPoolLocker.Lock();
|
MemPoolLocker.Lock();
|
||||||
for (int32 i = 0; i < MemPool.Count(); i++)
|
for (int32 i = 0; i < MemPool.Count(); i++)
|
||||||
@@ -188,6 +189,7 @@ void* RendererAllocation::Allocate(uintptr size)
|
|||||||
|
|
||||||
void RendererAllocation::Free(void* ptr, uintptr size)
|
void RendererAllocation::Free(void* ptr, uintptr size)
|
||||||
{
|
{
|
||||||
|
PROFILE_CPU();
|
||||||
MemPoolLocker.Lock();
|
MemPoolLocker.Lock();
|
||||||
MemPool.Add({ ptr, size });
|
MemPool.Add({ ptr, size });
|
||||||
MemPoolLocker.Unlock();
|
MemPoolLocker.Unlock();
|
||||||
@@ -418,6 +420,18 @@ bool RenderList::HasAnyPostFx(const RenderContext& renderContext, MaterialPostFx
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Creates an empty batch whose Instances array is bound to the arena of the given render list (list->Memory).
// The list pointer is dereferenced here, so it has to be valid.
BatchedDrawCall::BatchedDrawCall(RenderList* list)
|
||||||
|
: Instances(&list->Memory)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
// Move constructor: DrawCall and ObjectsStartIndex are copied from the other batch, Instances is taken over via MoveTemp.
BatchedDrawCall::BatchedDrawCall(BatchedDrawCall&& other) noexcept
|
||||||
|
: DrawCall(other.DrawCall)
|
||||||
|
, ObjectsStartIndex(other.ObjectsStartIndex)
|
||||||
|
, Instances(MoveTemp(other.Instances))
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
void DrawCallsList::Clear()
|
void DrawCallsList::Clear()
|
||||||
{
|
{
|
||||||
Indices.Clear();
|
Indices.Clear();
|
||||||
@@ -433,6 +447,7 @@ bool DrawCallsList::IsEmpty() const
|
|||||||
|
|
||||||
RenderList::RenderList(const SpawnParams& params)
|
RenderList::RenderList(const SpawnParams& params)
|
||||||
: ScriptingObject(params)
|
: ScriptingObject(params)
|
||||||
|
, Memory(4 * 1024 * 1024, RendererAllocation::Allocate, RendererAllocation::Free) // 4MB pages, use page pooling via RendererAllocation
|
||||||
, DirectionalLights(4)
|
, DirectionalLights(4)
|
||||||
, PointLights(32)
|
, PointLights(32)
|
||||||
, SpotLights(32)
|
, SpotLights(32)
|
||||||
@@ -443,8 +458,8 @@ RenderList::RenderList(const SpawnParams& params)
|
|||||||
, AtmosphericFog(nullptr)
|
, AtmosphericFog(nullptr)
|
||||||
, Fog(nullptr)
|
, Fog(nullptr)
|
||||||
, Blendable(32)
|
, Blendable(32)
|
||||||
, ObjectBuffer(0, PixelFormat::R32G32B32A32_Float, false, TEXT("Object Bufffer"))
|
, ObjectBuffer(0, PixelFormat::R32G32B32A32_Float, false, TEXT("Object Buffer"))
|
||||||
, TempObjectBuffer(0, PixelFormat::R32G32B32A32_Float, false, TEXT("Object Bufffer"))
|
, TempObjectBuffer(0, PixelFormat::R32G32B32A32_Float, false, TEXT("Object Buffer"))
|
||||||
, _instanceBuffer(0, sizeof(ShaderObjectDrawInstanceData), TEXT("Instance Buffer"), GPUVertexLayout::Get({ { VertexElement::Types::Attribute0, 3, 0, 1, PixelFormat::R32_UInt } }))
|
, _instanceBuffer(0, sizeof(ShaderObjectDrawInstanceData), TEXT("Instance Buffer"), GPUVertexLayout::Get({ { VertexElement::Types::Attribute0, 3, 0, 1, PixelFormat::R32_UInt } }))
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
@@ -480,6 +495,7 @@ void RenderList::Clear()
|
|||||||
_instanceBuffer.Clear();
|
_instanceBuffer.Clear();
|
||||||
ObjectBuffer.Clear();
|
ObjectBuffer.Clear();
|
||||||
TempObjectBuffer.Clear();
|
TempObjectBuffer.Clear();
|
||||||
|
Memory.Free();
|
||||||
}
|
}
|
||||||
|
|
||||||
struct PackedSortKey
|
struct PackedSortKey
|
||||||
|
|||||||
@@ -3,6 +3,7 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include "Engine/Core/Collections/Array.h"
|
#include "Engine/Core/Collections/Array.h"
|
||||||
|
#include "Engine/Core/Memory/ArenaAllocation.h"
|
||||||
#include "Engine/Core/Math/Half.h"
|
#include "Engine/Core/Math/Half.h"
|
||||||
#include "Engine/Graphics/PostProcessSettings.h"
|
#include "Engine/Graphics/PostProcessSettings.h"
|
||||||
#include "Engine/Graphics/DynamicBuffer.h"
|
#include "Engine/Graphics/DynamicBuffer.h"
|
||||||
@@ -241,7 +242,11 @@ struct BatchedDrawCall
|
|||||||
{
|
{
|
||||||
DrawCall DrawCall;
|
DrawCall DrawCall;
|
||||||
uint16 ObjectsStartIndex = 0; // Index of the instances start in the ObjectsBuffer (set internally).
|
uint16 ObjectsStartIndex = 0; // Index of the instances start in the ObjectsBuffer (set internally).
|
||||||
Array<struct ShaderObjectData, RendererAllocation> Instances;
|
Array<struct ShaderObjectData, ConcurrentArenaAllocation> Instances;
|
||||||
|
|
||||||
|
BatchedDrawCall() { CRASH; } // Don't use it
|
||||||
|
BatchedDrawCall(RenderList* list);
|
||||||
|
BatchedDrawCall(BatchedDrawCall&& other) noexcept;
|
||||||
};
|
};
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
@@ -298,6 +303,11 @@ API_CLASS(Sealed) class FLAXENGINE_API RenderList : public ScriptingObject
|
|||||||
static void CleanupCache();
|
static void CleanupCache();
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
/// <summary>
|
||||||
|
/// Memory storage with all draw-related data that lives during a single frame rendering time. Thread-safe to allocate memory during rendering jobs.
|
||||||
|
/// </summary>
|
||||||
|
ConcurrentArenaAllocator Memory;
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// All scenes for rendering.
|
/// All scenes for rendering.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
|
|||||||
Reference in New Issue
Block a user