From 27739491977c643916f4ca30f71043d74b448f90 Mon Sep 17 00:00:00 2001 From: Wojtek Figat Date: Sat, 22 Jun 2024 09:58:20 +0200 Subject: [PATCH 01/11] Optimize wait signal in Job System to wake waiting threads only when job batch ends --- Source/Engine/Threading/JobSystem.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Source/Engine/Threading/JobSystem.cpp b/Source/Engine/Threading/JobSystem.cpp index f278d7d02..090bab2ea 100644 --- a/Source/Engine/Threading/JobSystem.cpp +++ b/Source/Engine/Threading/JobSystem.cpp @@ -200,16 +200,19 @@ int32 JobSystemThread::Run() data.Job(data.Index); // Move forward with the job queue + bool notifyWaiting = false; JobsLocker.Lock(); JobContext& context = JobContexts.At(data.JobKey); if (Platform::InterlockedDecrement(&context.JobsLeft) <= 0) { ASSERT_LOW_LAYER(context.JobsLeft <= 0); JobContexts.Remove(data.JobKey); + notifyWaiting = true; } JobsLocker.Unlock(); - WaitSignal.NotifyAll(); + if (notifyWaiting) + WaitSignal.NotifyAll(); data.Job.Unbind(); } From 861d8a683f8158efb5dba5ec73457e9a7ca644c0 Mon Sep 17 00:00:00 2001 From: Wojtek Figat Date: Mon, 24 Jun 2024 13:12:48 +0200 Subject: [PATCH 02/11] Add `JobSystem::Dispatch` that accepts dependent jobs that needs to be completed before --- Source/Engine/Threading/JobSystem.cpp | 131 +++++++++++++++++++++----- Source/Engine/Threading/JobSystem.h | 12 +++ 2 files changed, 119 insertions(+), 24 deletions(-) diff --git a/Source/Engine/Threading/JobSystem.cpp b/Source/Engine/Threading/JobSystem.cpp index 090bab2ea..d3f879124 100644 --- a/Source/Engine/Threading/JobSystem.cpp +++ b/Source/Engine/Threading/JobSystem.cpp @@ -5,6 +5,7 @@ #include "Engine/Platform/CPUInfo.h" #include "Engine/Platform/Thread.h" #include "Engine/Platform/ConditionVariable.h" +#include "Engine/Core/Types/Span.h" #include "Engine/Core/Collections/Dictionary.h" #include "Engine/Engine/EngineService.h" #include "Engine/Profiler/ProfilerCPU.h" @@ -48,13 +49,26 @@ public: struct 
JobData { - Function Job; int32 Index; int64 JobKey; }; template<> struct TIsPODType +{ + enum { Value = true }; +}; + +struct JobContext +{ + volatile int64 JobsLeft; + volatile int64 DependenciesLeft; + Function Job; + Array Dependants; +}; + +template<> +struct TIsPODType { enum { Value = false }; }; @@ -79,17 +93,6 @@ public: } }; -struct JobContext -{ - volatile int64 JobsLeft; -}; - -template<> -struct TIsPODType -{ - enum { Value = true }; -}; - namespace { JobSystemService JobSystemInstance; @@ -158,6 +161,7 @@ int32 JobSystemThread::Run() Platform::SetThreadAffinityMask(1ull << Index); JobData data; + Function job; bool attachCSharpThread = true; #if !JOB_SYSTEM_USE_MUTEX moodycamel::ConsumerToken consumerToken(Jobs); @@ -174,18 +178,23 @@ int32 JobSystemThread::Run() { data = Jobs.PeekFront(); Jobs.PopFront(); + const JobContext& context = ((const Dictionary&)JobContexts).At(data.JobKey); + job = context.Job; } JobsLocker.Unlock(); #else - if (!Jobs.try_dequeue(consumerToken, data)) - data.Job.Unbind(); + if (Jobs.try_dequeue(consumerToken, data)) + { + const JobContext& context = ((const Dictionary&)JobContexts).At(data.JobKey); + job = context.Job; + } #endif #if JOB_SYSTEM_USE_STATS Platform::InterlockedIncrement(&DequeueCount); Platform::InterlockedAdd(&DequeueSum, Platform::GetTimeCycles() - start); #endif - if (data.Job.IsBinded()) + if (job.IsBinded()) { #if USE_CSHARP // Ensure to have C# thread attached to this thead (late init due to MCore being initialized after Job System) @@ -197,7 +206,7 @@ int32 JobSystemThread::Run() #endif // Run job - data.Job(data.Index); + job(data.Index); // Move forward with the job queue bool notifyWaiting = false; @@ -205,16 +214,33 @@ int32 JobSystemThread::Run() JobContext& context = JobContexts.At(data.JobKey); if (Platform::InterlockedDecrement(&context.JobsLeft) <= 0) { - ASSERT_LOW_LAYER(context.JobsLeft <= 0); + // Update any dependant jobs + for (int64 dependant : context.Dependants) + { + JobContext& 
dependantContext = JobContexts.At(dependant); + if (Platform::InterlockedDecrement(&dependantContext.DependenciesLeft) <= 0) + { + // Dispatch dependency when it's ready + JobData dependantData; + dependantData.JobKey = dependant; + for (dependantData.Index = 0; dependantData.Index < dependantContext.JobsLeft; dependantData.Index++) +#if JOB_SYSTEM_USE_MUTEX + Jobs.PushBack(dependantData); +#else + Jobs.enqueue(dependantData); +#endif + } + } + + // Remove completed context JobContexts.Remove(data.JobKey); notifyWaiting = true; } JobsLocker.Unlock(); - if (notifyWaiting) WaitSignal.NotifyAll(); - data.Job.Unbind(); + job.Unbind(); } else { @@ -250,9 +276,9 @@ void JobSystem::Execute(const Function& job, int32 jobCount) int64 JobSystem::Dispatch(const Function& job, int32 jobCount) { - PROFILE_CPU(); if (jobCount <= 0) return 0; + PROFILE_CPU(); #if JOB_SYSTEM_ENABLED #if JOB_SYSTEM_USE_STATS const auto start = Platform::GetTimeCycles(); @@ -260,21 +286,20 @@ int64 JobSystem::Dispatch(const Function& job, int32 jobCount) const auto label = Platform::InterlockedAdd(&JobLabel, (int64)jobCount) + jobCount; JobData data; - data.Job = job; data.JobKey = label; JobContext context; + context.Job = job; context.JobsLeft = jobCount; + context.DependenciesLeft = 0; -#if JOB_SYSTEM_USE_MUTEX JobsLocker.Lock(); JobContexts.Add(label, context); +#if JOB_SYSTEM_USE_MUTEX for (data.Index = 0; data.Index < jobCount; data.Index++) Jobs.PushBack(data); JobsLocker.Unlock(); #else - JobsLocker.Lock(); - JobContexts.Add(label, context); JobsLocker.Unlock(); for (data.Index = 0; data.Index < jobCount; data.Index++) Jobs.enqueue(data); @@ -300,6 +325,64 @@ int64 JobSystem::Dispatch(const Function& job, int32 jobCount) #endif } +int64 JobSystem::Dispatch(const Function& job, Span dependencies, int32 jobCount) +{ + if (jobCount <= 0) + return 0; + PROFILE_CPU(); +#if JOB_SYSTEM_ENABLED + const auto label = Platform::InterlockedAdd(&JobLabel, (int64)jobCount) + jobCount; + + JobData data; + 
data.JobKey = label; + + JobContext context; + context.Job = job; + context.JobsLeft = jobCount; + context.DependenciesLeft = 0; + + JobsLocker.Lock(); + for (int64 dependency : dependencies) + { + if (JobContext* dependencyContext = JobContexts.TryGet(dependency)) + { + context.DependenciesLeft++; + dependencyContext->Dependants.Add(label); + } + } + JobContexts.Add(label, context); +#if JOB_SYSTEM_USE_MUTEX + if (context.DependenciesLeft == 0) + { + for (data.Index = 0; data.Index < jobCount; data.Index++) + Jobs.PushBack(data); + } + JobsLocker.Unlock(); +#else + JobsLocker.Unlock(); + if (dispatchNow) + { + for (data.Index = 0; data.Index < jobCount; data.Index++) + Jobs.enqueue(data); + } +#endif + + if (context.DependenciesLeft == 0 && JobStartingOnDispatch) + { + if (jobCount == 1) + JobsSignal.NotifyOne(); + else + JobsSignal.NotifyAll(); + } + + return label; +#else + for (int32 i = 0; i < jobCount; i++) + job(i); + return 0; +#endif +} + void JobSystem::Wait() { #if JOB_SYSTEM_ENABLED diff --git a/Source/Engine/Threading/JobSystem.h b/Source/Engine/Threading/JobSystem.h index d269aa196..c6b4500fa 100644 --- a/Source/Engine/Threading/JobSystem.h +++ b/Source/Engine/Threading/JobSystem.h @@ -4,6 +4,9 @@ #include "Engine/Core/Delegate.h" +template +class Span; + /// /// Lightweight multi-threaded jobs execution scheduler. Uses a pool of threads and supports work-stealing concept. /// @@ -26,6 +29,15 @@ API_CLASS(Static) class FLAXENGINE_API JobSystem /// The label identifying this dispatch. Can be used to wait for the execution end. API_FUNCTION() static int64 Dispatch(const Function& job, int32 jobCount = 1); + /// + /// Dispatches the job for the execution after all of dependant jobs will complete. + /// + /// The job. Argument is an index of the job execution. + /// The list of dependant jobs that need to complete in order to start executing this job. + /// The job executions count. + /// The label identifying this dispatch. 
Can be used to wait for the execution end. + API_FUNCTION() static int64 Dispatch(const Function& job, Span dependencies, int32 jobCount = 1); + /// /// Waits for all dispatched jobs to finish. /// From 59bbb9e0584f916c6e2a54e98418c6aa76faa5d8 Mon Sep 17 00:00:00 2001 From: Wojtek Figat Date: Mon, 24 Jun 2024 13:15:05 +0200 Subject: [PATCH 03/11] Fix error when reopening project --- Source/Editor/Modules/UIModule.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/Source/Editor/Modules/UIModule.cs b/Source/Editor/Modules/UIModule.cs index 00200900d..4282d5305 100644 --- a/Source/Editor/Modules/UIModule.cs +++ b/Source/Editor/Modules/UIModule.cs @@ -841,7 +841,6 @@ namespace FlaxEditor.Modules { // Open project, then close it Editor.OpenProject(Editor.GameProject.ProjectPath); - Editor.Windows.MainWindow.Close(ClosingReason.User); } private void OnMenuFileShowHide(Control control) From 8190d7f171ccc323909f1a9067311f8afdf7b876 Mon Sep 17 00:00:00 2001 From: Wojtek Figat Date: Mon, 24 Jun 2024 13:15:45 +0200 Subject: [PATCH 04/11] Optimize Global Surface Atlas setup and objects buffer writing to be async --- .../Renderer/GI/GlobalSurfaceAtlasPass.cpp | 245 +++++++++--------- 1 file changed, 125 insertions(+), 120 deletions(-) diff --git a/Source/Engine/Renderer/GI/GlobalSurfaceAtlasPass.cpp b/Source/Engine/Renderer/GI/GlobalSurfaceAtlasPass.cpp index 3fd2060d9..0fb63af62 100644 --- a/Source/Engine/Renderer/GI/GlobalSurfaceAtlasPass.cpp +++ b/Source/Engine/Renderer/GI/GlobalSurfaceAtlasPass.cpp @@ -131,6 +131,7 @@ class GlobalSurfaceAtlasCustomBuffer : public RenderBuffers::CustomBuffer, publi { public: int32 Resolution = 0; + float ResolutionInv; int32 AtlasPixelsTotal = 0; int32 AtlasPixelsUsed = 0; uint64 LastFrameAtlasInsertFail = 0; @@ -155,7 +156,6 @@ public: Array DirtyObjectsBuffer; Vector4 CullingPosDistance; uint64 CurrentFrame; - float ResolutionInv; Float3 ViewPosition; float TileTexelsPerWorldUnit; float DistanceScalingStart; @@ -164,7 +164,7 @@ public: 
float MinObjectRadius; // Async objects drawing cache - Array> AsyncDrawWaitLabels; + Array> AsyncDrawWaitLabels; RenderListBuffer AsyncFreeTiles; RenderListBuffer AsyncNewObjects; RenderListBuffer AsyncNewTiles; @@ -257,9 +257,9 @@ public: int32 resolution; float distance; GetOptions(renderContext, resolution, distance); - const float resolutionInv = 1.0f / (float)resolution; + if (Resolution != resolution) + return; // Not yet initialized const auto currentFrame = Engine::FrameCount; - if (Resolution == resolution) { // Perform atlas defragmentation if needed constexpr float maxUsageToDefrag = 0.8f; @@ -281,7 +281,6 @@ public: // Setup data for rendering CurrentFrame = currentFrame; - ResolutionInv = resolutionInv; ViewPosition = renderContext.View.Position; TileTexelsPerWorldUnit = 1.0f / METERS_TO_UNITS(0.1f); // Scales the tiles resolution DistanceScalingStart = METERS_TO_UNITS(20.0f); // Distance from camera at which the tiles resolution starts to be scaled down @@ -310,11 +309,16 @@ public: // Run sync actors drawing now or force in async (different drawing path doesn't interfere with normal scene drawing) func.Bind(this); AsyncDrawWaitLabels.Add(JobSystem::Dispatch(func, jobCount)); + + // Run dependant job that will process objects data in async + func.Bind(this); + AsyncDrawWaitLabels.Add(JobSystem::Dispatch(func, ToSpan(AsyncDrawWaitLabels))); } else { DrawActorsJob(-1); DrawActorsJob(0); + SetupJob(0); } } @@ -325,18 +329,14 @@ public: AsyncDrawWaitLabels.Clear(); } - void PostDrawActors() + void FlushNewObjects() { - PROFILE_CPU_NAMED("Post Draw"); + PROFILE_CPU_NAMED("Flush Atlas"); - // Flush atlas tiles freeing for (auto* tile : AsyncFreeTiles) - { Atlas.Free(tile, this); - } AsyncFreeTiles.Clear(); - // Flush new objects adding for (auto& newObject : AsyncNewObjects) { auto& object = Objects[newObject.ActorObject]; @@ -350,7 +350,6 @@ public: } AsyncNewObjects.Clear(); - // Flush new tiles adding for (auto& newTile : AsyncNewTiles) { auto& object = 
Objects[newTile.ActorObject]; @@ -371,6 +370,114 @@ public: AsyncNewTiles.Clear(); } + void CompactObjects() + { + PROFILE_CPU_NAMED("Compact Objects"); + for (auto it = Objects.Begin(); it.IsNotEnd(); ++it) + { + if (it->Value.LastFrameUsed != CurrentFrame) + { + for (auto& tile : it->Value.Tiles) + { + if (tile) + Atlas.Free(tile, this); + } + Objects.Remove(it); + } + } + } + + void WriteObjects() + { + PROFILE_CPU_NAMED("Write Objects"); + DirtyObjectsBuffer.Clear(); + ObjectsBuffer.Clear(); + for (auto& e : Objects) + { + auto& object = e.Value; + if (object.Dirty) + { + // Collect dirty objects + object.LastFrameUpdated = CurrentFrame; + object.LightingUpdateFrame = CurrentFrame; + DirtyObjectsBuffer.Add(e.Key); + } + + Matrix3x3 worldToLocalRotation; + Matrix3x3::RotationQuaternion(object.Bounds.Transformation.Orientation.Conjugated(), worldToLocalRotation); + Float3 worldPosition = object.Bounds.Transformation.Translation; + Float3 worldExtents = object.Bounds.Extents * object.Bounds.Transformation.Scale; + + // Write to objects buffer (this must match unpacking logic in HLSL) + uint32 objectAddress = ObjectsBuffer.Data.Count() / sizeof(Float4); + auto* objectData = ObjectsBuffer.WriteReserve(GLOBAL_SURFACE_ATLAS_OBJECT_DATA_STRIDE); + objectData[0] = Float4(object.Position, object.Radius); + objectData[1] = Float4::Zero; + objectData[2] = Float4(worldToLocalRotation.M11, worldToLocalRotation.M12, worldToLocalRotation.M13, worldPosition.X); + objectData[3] = Float4(worldToLocalRotation.M21, worldToLocalRotation.M22, worldToLocalRotation.M23, worldPosition.Y); + objectData[4] = Float4(worldToLocalRotation.M31, worldToLocalRotation.M32, worldToLocalRotation.M33, worldPosition.Z); + objectData[5] = Float4(worldExtents, object.UseVisibility ? 
1.0f : 0.0f); + auto tileOffsets = reinterpret_cast(&objectData[1]); // xyz used for tile offsets packed into uint16 + auto objectDataSize = reinterpret_cast(&objectData[1].W); // w used for object size (count of Float4s for object+tiles) + *objectDataSize = GLOBAL_SURFACE_ATLAS_OBJECT_DATA_STRIDE; + for (int32 tileIndex = 0; tileIndex < 6; tileIndex++) + { + auto* tile = object.Tiles[tileIndex]; + if (!tile) + continue; + tile->ObjectAddressOffset = *objectDataSize; + tile->Address = objectAddress + tile->ObjectAddressOffset; + tileOffsets[tileIndex] = tile->ObjectAddressOffset; + *objectDataSize += GLOBAL_SURFACE_ATLAS_TILE_DATA_STRIDE; + + // Setup view to render object from the side + Float3 xAxis, yAxis, zAxis = Float3::Zero; + zAxis.Raw[tileIndex / 2] = tileIndex & 1 ? 1.0f : -1.0f; + yAxis = tileIndex == 2 || tileIndex == 3 ? Float3::Right : Float3::Up; + Float3::Cross(yAxis, zAxis, xAxis); + Float3 localSpaceOffset = -zAxis * object.Bounds.Extents; + xAxis = object.Bounds.Transformation.LocalToWorldVector(xAxis); + yAxis = object.Bounds.Transformation.LocalToWorldVector(yAxis); + zAxis = object.Bounds.Transformation.LocalToWorldVector(zAxis); + xAxis.NormalizeFast(); + yAxis.NormalizeFast(); + zAxis.NormalizeFast(); + tile->ViewPosition = object.Bounds.Transformation.LocalToWorld(localSpaceOffset); + tile->ViewDirection = zAxis; + + // Create view matrix + tile->ViewMatrix.SetColumn1(Float4(xAxis, -Float3::Dot(xAxis, tile->ViewPosition))); + tile->ViewMatrix.SetColumn2(Float4(yAxis, -Float3::Dot(yAxis, tile->ViewPosition))); + tile->ViewMatrix.SetColumn3(Float4(zAxis, -Float3::Dot(zAxis, tile->ViewPosition))); + tile->ViewMatrix.SetColumn4(Float4(0, 0, 0, 1)); + + // Calculate object bounds size in the view + OrientedBoundingBox viewBounds(object.Bounds); + viewBounds.Transform(tile->ViewMatrix); + Float3 viewExtent = viewBounds.Transformation.LocalToWorldVector(viewBounds.Extents); + tile->ViewBoundsSize = viewExtent.GetAbsolute() * 2.0f; + + // Per-tile 
data + const float tileWidth = (float)tile->Width - GLOBAL_SURFACE_ATLAS_TILE_PADDING; + const float tileHeight = (float)tile->Height - GLOBAL_SURFACE_ATLAS_TILE_PADDING; + auto* tileData = ObjectsBuffer.WriteReserve(GLOBAL_SURFACE_ATLAS_TILE_DATA_STRIDE); + tileData[0] = Float4(tile->X, tile->Y, tileWidth, tileHeight) * ResolutionInv; + tileData[1] = Float4(tile->ViewMatrix.M11, tile->ViewMatrix.M12, tile->ViewMatrix.M13, tile->ViewMatrix.M41); + tileData[2] = Float4(tile->ViewMatrix.M21, tile->ViewMatrix.M22, tile->ViewMatrix.M23, tile->ViewMatrix.M42); + tileData[3] = Float4(tile->ViewMatrix.M31, tile->ViewMatrix.M32, tile->ViewMatrix.M33, tile->ViewMatrix.M43); + tileData[4] = Float4(tile->ViewBoundsSize, 0.0f); // w unused + } + } + } + + void SetupJob(int32) + { + PROFILE_CPU(); + FlushNewObjects(); + CompactObjects(); + WriteObjects(); + } + // [ISceneRenderingListener] void OnSceneRenderingAddActor(Actor* a) override { @@ -587,10 +694,6 @@ bool GlobalSurfaceAtlasPass::Render(RenderContext& renderContext, GPUContext* co surfaceAtlasData.LastFrameUsed = currentFrame; PROFILE_GPU_CPU_NAMED("Global Surface Atlas"); - // Start objects drawing (in case not et started earlier this frame) - _surfaceAtlasData = &surfaceAtlasData; - surfaceAtlasData.StartDrawActors(renderContext); - // Setup options int32 resolution; float distance; @@ -617,6 +720,7 @@ bool GlobalSurfaceAtlasPass::Render(RenderContext& renderContext, GPUContext* co INIT_ATLAS_TEXTURE(AtlasDepth, PixelFormat::D16_UNorm); #undef INIT_ATLAS_TEXTURE surfaceAtlasData.Resolution = resolution; + surfaceAtlasData.ResolutionInv = resolutionInv; surfaceAtlasData.AtlasPixelsTotal = resolution * resolution; if (!surfaceAtlasData.ChunksBuffer) { @@ -632,6 +736,11 @@ bool GlobalSurfaceAtlasPass::Render(RenderContext& renderContext, GPUContext* co if (!_vertexBuffer) _vertexBuffer = New(0u, (uint32)sizeof(AtlasTileVertex), TEXT("GlobalSurfaceAtlas.VertexBuffer")); + // Ensure that async objects drawing ended + 
_surfaceAtlasData = &surfaceAtlasData; + surfaceAtlasData.StartDrawActors(renderContext); // (ignored if not started earlier this frame) + surfaceAtlasData.WaitForDrawActors(); + // Utility for writing into tiles vertex buffer const Float2 posToClipMul(2.0f * resolutionInv, -2.0f * resolutionInv); const Float2 posToClipAdd(-1.0f, 1.0f); @@ -662,110 +771,6 @@ bool GlobalSurfaceAtlasPass::Render(RenderContext& renderContext, GPUContext* co context->BindVB(ToSpan(&vb, 1)); \ context->DrawInstanced(_vertexBuffer->Data.Count() / sizeof(AtlasTileVertex), 1); - // Ensure that async objects drawing ended - surfaceAtlasData.WaitForDrawActors(); - surfaceAtlasData.PostDrawActors(); - - // Remove unused objects - { - PROFILE_GPU_CPU_NAMED("Compact Objects"); - for (auto it = surfaceAtlasData.Objects.Begin(); it.IsNotEnd(); ++it) - { - if (it->Value.LastFrameUsed != currentFrame) - { - for (auto& tile : it->Value.Tiles) - { - if (tile) - surfaceAtlasData.Atlas.Free(tile, &surfaceAtlasData); - } - surfaceAtlasData.Objects.Remove(it); - } - } - } - - // Write objects to the data buffer - { - PROFILE_CPU_NAMED("Write Objects"); - surfaceAtlasData.DirtyObjectsBuffer.Clear(); - surfaceAtlasData.ObjectsBuffer.Clear(); - for (auto& e : surfaceAtlasData.Objects) - { - auto& object = e.Value; - if (object.Dirty) - { - // Collect dirty objects - object.LastFrameUpdated = surfaceAtlasData.CurrentFrame; - object.LightingUpdateFrame = surfaceAtlasData.CurrentFrame; - surfaceAtlasData.DirtyObjectsBuffer.Add(e.Key); - } - - Matrix3x3 worldToLocalRotation; - Matrix3x3::RotationQuaternion(object.Bounds.Transformation.Orientation.Conjugated(), worldToLocalRotation); - Float3 worldPosition = object.Bounds.Transformation.Translation; - Float3 worldExtents = object.Bounds.Extents * object.Bounds.Transformation.Scale; - - // Write to objects buffer (this must match unpacking logic in HLSL) - uint32 objectAddress = surfaceAtlasData.ObjectsBuffer.Data.Count() / sizeof(Float4); - auto* objectData = 
surfaceAtlasData.ObjectsBuffer.WriteReserve(GLOBAL_SURFACE_ATLAS_OBJECT_DATA_STRIDE); - objectData[0] = Float4(object.Position, object.Radius); - objectData[1] = Float4::Zero; - objectData[2] = Float4(worldToLocalRotation.M11, worldToLocalRotation.M12, worldToLocalRotation.M13, worldPosition.X); - objectData[3] = Float4(worldToLocalRotation.M21, worldToLocalRotation.M22, worldToLocalRotation.M23, worldPosition.Y); - objectData[4] = Float4(worldToLocalRotation.M31, worldToLocalRotation.M32, worldToLocalRotation.M33, worldPosition.Z); - objectData[5] = Float4(worldExtents, object.UseVisibility ? 1.0f : 0.0f); - auto tileOffsets = reinterpret_cast(&objectData[1]); // xyz used for tile offsets packed into uint16 - auto objectDataSize = reinterpret_cast(&objectData[1].W); // w used for object size (count of Float4s for object+tiles) - *objectDataSize = GLOBAL_SURFACE_ATLAS_OBJECT_DATA_STRIDE; - for (int32 tileIndex = 0; tileIndex < 6; tileIndex++) - { - auto* tile = object.Tiles[tileIndex]; - if (!tile) - continue; - tile->ObjectAddressOffset = *objectDataSize; - tile->Address = objectAddress + tile->ObjectAddressOffset; - tileOffsets[tileIndex] = tile->ObjectAddressOffset; - *objectDataSize += GLOBAL_SURFACE_ATLAS_TILE_DATA_STRIDE; - - // Setup view to render object from the side - Float3 xAxis, yAxis, zAxis = Float3::Zero; - zAxis.Raw[tileIndex / 2] = tileIndex & 1 ? 1.0f : -1.0f; - yAxis = tileIndex == 2 || tileIndex == 3 ? 
Float3::Right : Float3::Up; - Float3::Cross(yAxis, zAxis, xAxis); - Float3 localSpaceOffset = -zAxis * object.Bounds.Extents; - xAxis = object.Bounds.Transformation.LocalToWorldVector(xAxis); - yAxis = object.Bounds.Transformation.LocalToWorldVector(yAxis); - zAxis = object.Bounds.Transformation.LocalToWorldVector(zAxis); - xAxis.NormalizeFast(); - yAxis.NormalizeFast(); - zAxis.NormalizeFast(); - tile->ViewPosition = object.Bounds.Transformation.LocalToWorld(localSpaceOffset); - tile->ViewDirection = zAxis; - - // Create view matrix - tile->ViewMatrix.SetColumn1(Float4(xAxis, -Float3::Dot(xAxis, tile->ViewPosition))); - tile->ViewMatrix.SetColumn2(Float4(yAxis, -Float3::Dot(yAxis, tile->ViewPosition))); - tile->ViewMatrix.SetColumn3(Float4(zAxis, -Float3::Dot(zAxis, tile->ViewPosition))); - tile->ViewMatrix.SetColumn4(Float4(0, 0, 0, 1)); - - // Calculate object bounds size in the view - OrientedBoundingBox viewBounds(object.Bounds); - viewBounds.Transform(tile->ViewMatrix); - Float3 viewExtent = viewBounds.Transformation.LocalToWorldVector(viewBounds.Extents); - tile->ViewBoundsSize = viewExtent.GetAbsolute() * 2.0f; - - // Per-tile data - const float tileWidth = (float)tile->Width - GLOBAL_SURFACE_ATLAS_TILE_PADDING; - const float tileHeight = (float)tile->Height - GLOBAL_SURFACE_ATLAS_TILE_PADDING; - auto* tileData = surfaceAtlasData.ObjectsBuffer.WriteReserve(GLOBAL_SURFACE_ATLAS_TILE_DATA_STRIDE); - tileData[0] = Float4(tile->X, tile->Y, tileWidth, tileHeight) * surfaceAtlasData.ResolutionInv; - tileData[1] = Float4(tile->ViewMatrix.M11, tile->ViewMatrix.M12, tile->ViewMatrix.M13, tile->ViewMatrix.M41); - tileData[2] = Float4(tile->ViewMatrix.M21, tile->ViewMatrix.M22, tile->ViewMatrix.M23, tile->ViewMatrix.M42); - tileData[3] = Float4(tile->ViewMatrix.M31, tile->ViewMatrix.M32, tile->ViewMatrix.M33, tile->ViewMatrix.M43); - tileData[4] = Float4(tile->ViewBoundsSize, 0.0f); // w unused - } - } - } - // Rasterize world geometry material properties into Global 
Surface Atlas if (surfaceAtlasData.DirtyObjectsBuffer.Count() != 0) { From a1c251c3b7ec7c65caac848e7488a8fe646cf351 Mon Sep 17 00:00:00 2001 From: Wojtek Figat Date: Mon, 24 Jun 2024 19:01:35 +0200 Subject: [PATCH 05/11] Add various optimizations to Global Surface Atlas --- .../Renderer/GI/GlobalSurfaceAtlasPass.cpp | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/Source/Engine/Renderer/GI/GlobalSurfaceAtlasPass.cpp b/Source/Engine/Renderer/GI/GlobalSurfaceAtlasPass.cpp index 0fb63af62..a64a06635 100644 --- a/Source/Engine/Renderer/GI/GlobalSurfaceAtlasPass.cpp +++ b/Source/Engine/Renderer/GI/GlobalSurfaceAtlasPass.cpp @@ -14,7 +14,6 @@ #include "Engine/Content/Content.h" #include "Engine/Graphics/GPUContext.h" #include "Engine/Graphics/GPUDevice.h" -#include "Engine/Graphics/Graphics.h" #include "Engine/Graphics/RenderTask.h" #include "Engine/Graphics/RenderBuffers.h" #include "Engine/Graphics/RenderTargetPool.h" @@ -40,6 +39,8 @@ #define GLOBAL_SURFACE_ATLAS_DEBUG_FORCE_REDRAW_TILES 0 // Forces to redraw all object tiles every frame #define GLOBAL_SURFACE_ATLAS_DEBUG_DRAW_OBJECTS 0 // Debug draws object bounds on redraw (and tile draw projection locations) #define GLOBAL_SURFACE_ATLAS_DEBUG_DRAW_CHUNKS 0 // Debug draws culled chunks bounds (non-empty) +#define GLOBAL_SURFACE_ATLAS_MAX_NEW_OBJECTS_PER_FRAME 500 // Limits the amount of newly added objects to atlas per-frame to reduce hitches on 1st frame or camera-cut +#define GLOBAL_SURFACE_ATLAS_DIRTY_FRAMES(flags) (EnumHasAnyFlags(flags, StaticFlags::Lightmap) ? 
200 : 10) // Amount of frames after which update object (less frequent updates for static scenes) #if GLOBAL_SURFACE_ATLAS_DEBUG_DRAW_OBJECTS || GLOBAL_SURFACE_ATLAS_DEBUG_DRAW_CHUNKS #include "Engine/Debug/DebugDraw.h" @@ -300,14 +301,14 @@ public: if (enableAsync) { - // Run in async via Job System + // Run sync actors drawing now or force in async (different drawing path doesn't interfere with normal scene drawing) Function func; - func.Bind(this); + func.Bind(this); const int32 jobCount = Math::Max(JobSystem::GetThreadsCount() - 1, 1); // Leave 1 thread unused to not block the main-thread (jobs will overlap with rendering) AsyncDrawWaitLabels.Add(JobSystem::Dispatch(func, jobCount)); - // Run sync actors drawing now or force in async (different drawing path doesn't interfere with normal scene drawing) - func.Bind(this); + // Run in async via Job System + func.Bind(this); AsyncDrawWaitLabels.Add(JobSystem::Dispatch(func, jobCount)); // Run dependant job that will process objects data in async @@ -797,7 +798,7 @@ bool GlobalSurfaceAtlasPass::Render(RenderContext& renderContext, GPUContext* co context->SetRenderTarget(depthBuffer, ToSpan(targetBuffers, ARRAY_COUNT(targetBuffers))); { PROFILE_GPU_CPU_NAMED("Clear"); - if (noCache || GLOBAL_SURFACE_ATLAS_DEBUG_FORCE_REDRAW_TILES || !GPU_SPREAD_WORKLOAD) + if (noCache || GLOBAL_SURFACE_ATLAS_DEBUG_FORCE_REDRAW_TILES) { // Full-atlas hardware clear context->ClearDepth(depthBuffer); @@ -1086,7 +1087,7 @@ bool GlobalSurfaceAtlasPass::Render(RenderContext& renderContext, GPUContext* co { GlobalSurfaceAtlasLight& lightData = surfaceAtlasData.Lights[light.ID]; lightData.LastFrameUsed = currentFrame; - uint32 redrawFramesCount = EnumHasAnyFlags(light.StaticFlags, StaticFlags::Lightmap) ? 
120 : 4; + uint32 redrawFramesCount = GLOBAL_SURFACE_ATLAS_DIRTY_FRAMES(light.StaticFlags); if (surfaceAtlasData.CurrentFrame - lightData.LastFrameUpdated < (redrawFramesCount + (light.ID.D & redrawFramesCount))) continue; lightData.LastFrameUpdated = currentFrame; @@ -1121,7 +1122,7 @@ bool GlobalSurfaceAtlasPass::Render(RenderContext& renderContext, GPUContext* co { GlobalSurfaceAtlasLight& lightData = surfaceAtlasData.Lights[light.ID]; lightData.LastFrameUsed = currentFrame; - uint32 redrawFramesCount = EnumHasAnyFlags(light.StaticFlags, StaticFlags::Lightmap) ? 120 : 4; + uint32 redrawFramesCount = GLOBAL_SURFACE_ATLAS_DIRTY_FRAMES(light.StaticFlags); if (surfaceAtlasData.CurrentFrame - lightData.LastFrameUpdated < (redrawFramesCount + (light.ID.D & redrawFramesCount))) continue; lightData.LastFrameUpdated = currentFrame; @@ -1143,7 +1144,7 @@ bool GlobalSurfaceAtlasPass::Render(RenderContext& renderContext, GPUContext* co { GlobalSurfaceAtlasLight& lightData = surfaceAtlasData.Lights[light.ID]; lightData.LastFrameUsed = currentFrame; - uint32 redrawFramesCount = EnumHasAnyFlags(light.StaticFlags, StaticFlags::Lightmap) ? 
120 : 4; + uint32 redrawFramesCount = GLOBAL_SURFACE_ATLAS_DIRTY_FRAMES(light.StaticFlags); if (surfaceAtlasData.CurrentFrame - lightData.LastFrameUpdated < (redrawFramesCount + (light.ID.D & redrawFramesCount))) continue; lightData.LastFrameUpdated = currentFrame; @@ -1459,9 +1460,9 @@ void GlobalSurfaceAtlasPass::RasterizeActor(Actor* actor, void* actorObject, con const float distanceScale = Math::Lerp(1.0f, surfaceAtlasData.DistanceScaling, Math::InverseLerp(surfaceAtlasData.DistanceScalingStart, surfaceAtlasData.DistanceScalingEnd, (float)CollisionsHelper::DistanceSpherePoint(actorObjectBounds, surfaceAtlasData.ViewPosition))); const float tilesScale = surfaceAtlasData.TileTexelsPerWorldUnit * distanceScale * qualityScale; GlobalSurfaceAtlasObject* object = surfaceAtlasData.Objects.TryGet(actorObject); - if (!object && surfaceAtlasData.AsyncNewObjects.Count() >= 512) + if (!object && surfaceAtlasData.AsyncNewObjects.Count() >= GLOBAL_SURFACE_ATLAS_MAX_NEW_OBJECTS_PER_FRAME) return; // Reduce load on 1st frame and add more objects during next frames to balance performance - bool anyTile = false, dirty = GLOBAL_SURFACE_ATLAS_DEBUG_FORCE_REDRAW_TILES || !GPU_SPREAD_WORKLOAD; + bool anyTile = false, dirty = GLOBAL_SURFACE_ATLAS_DEBUG_FORCE_REDRAW_TILES; for (int32 tileIndex = 0; tileIndex < 6; tileIndex++) { if (((1 << tileIndex) & tilesMask) == 0) @@ -1517,7 +1518,7 @@ void GlobalSurfaceAtlasPass::RasterizeActor(Actor* actor, void* actorObject, con if (object) { // Redraw objects from time-to-time (dynamic objects can be animated, static objects can have textures streamed) - uint32 redrawFramesCount = actor->HasStaticFlag(StaticFlags::Lightmap) ? 
120 : 4; + uint32 redrawFramesCount = GLOBAL_SURFACE_ATLAS_DIRTY_FRAMES(actor->GetStaticFlags()); if (surfaceAtlasData.CurrentFrame - object->LastFrameUpdated >= (redrawFramesCount + (actor->GetID().D & redrawFramesCount))) dirty = true; @@ -1527,7 +1528,7 @@ void GlobalSurfaceAtlasPass::RasterizeActor(Actor* actor, void* actorObject, con object->Bounds = bounds; object->Position = (Float3)actorObjectBounds.Center; // TODO: large worlds object->Radius = (float)actorObjectBounds.Radius; - object->Dirty = dirty; + object->Dirty |= dirty; object->UseVisibility = useVisibility; } else From 3bbaa8dad0c774d8090eca0dae50000b32d9565e Mon Sep 17 00:00:00 2001 From: Wojtek Figat Date: Mon, 24 Jun 2024 19:02:38 +0200 Subject: [PATCH 06/11] Remove unused job system stats and concurrent queue code --- Source/Engine/Threading/JobSystem.cpp | 80 +-------------------------- 1 file changed, 1 insertion(+), 79 deletions(-) diff --git a/Source/Engine/Threading/JobSystem.cpp b/Source/Engine/Threading/JobSystem.cpp index d3f879124..34e2b4b9a 100644 --- a/Source/Engine/Threading/JobSystem.cpp +++ b/Source/Engine/Threading/JobSystem.cpp @@ -7,30 +7,14 @@ #include "Engine/Platform/ConditionVariable.h" #include "Engine/Core/Types/Span.h" #include "Engine/Core/Collections/Dictionary.h" +#include "Engine/Core/Collections/RingBuffer.h" #include "Engine/Engine/EngineService.h" #include "Engine/Profiler/ProfilerCPU.h" #if USE_CSHARP #include "Engine/Scripting/ManagedCLR/MCore.h" #endif -// Jobs storage perf info: -// (500 jobs, i7 9th gen) -// JOB_SYSTEM_USE_MUTEX=1, enqueue=130-280 cycles, dequeue=2-6 cycles -// JOB_SYSTEM_USE_MUTEX=0, enqueue=300-700 cycles, dequeue=10-16 cycles -// So using RingBuffer+Mutex+Signals is better than moodycamel::ConcurrentQueue - #define JOB_SYSTEM_ENABLED 1 -#define JOB_SYSTEM_USE_MUTEX 1 -#define JOB_SYSTEM_USE_STATS 0 - -#if JOB_SYSTEM_USE_STATS -#include "Engine/Core/Log.h" -#endif -#if JOB_SYSTEM_USE_MUTEX -#include "Engine/Core/Collections/RingBuffer.h" 
-#else -#include "ConcurrentQueue.h" -#endif #if JOB_SYSTEM_ENABLED @@ -107,15 +91,7 @@ namespace ConditionVariable WaitSignal; CriticalSection WaitMutex; CriticalSection JobsLocker; -#if JOB_SYSTEM_USE_MUTEX RingBuffer Jobs; -#else - ConcurrentQueue Jobs; -#endif -#if JOB_SYSTEM_USE_STATS - int64 DequeueCount = 0; - int64 DequeueSum = 0; -#endif } bool JobSystemService::Init() @@ -163,16 +139,9 @@ int32 JobSystemThread::Run() JobData data; Function job; bool attachCSharpThread = true; -#if !JOB_SYSTEM_USE_MUTEX - moodycamel::ConsumerToken consumerToken(Jobs); -#endif while (Platform::AtomicRead(&ExitFlag) == 0) { // Try to get a job -#if JOB_SYSTEM_USE_STATS - const auto start = Platform::GetTimeCycles(); -#endif -#if JOB_SYSTEM_USE_MUTEX JobsLocker.Lock(); if (Jobs.Count() != 0) { @@ -182,17 +151,6 @@ int32 JobSystemThread::Run() job = context.Job; } JobsLocker.Unlock(); -#else - if (Jobs.try_dequeue(consumerToken, data)) - { - const JobContext& context = ((const Dictionary&)JobContexts).At(data.JobKey); - job = context.Job; - } -#endif -#if JOB_SYSTEM_USE_STATS - Platform::InterlockedIncrement(&DequeueCount); - Platform::InterlockedAdd(&DequeueSum, Platform::GetTimeCycles() - start); -#endif if (job.IsBinded()) { @@ -224,11 +182,7 @@ int32 JobSystemThread::Run() JobData dependantData; dependantData.JobKey = dependant; for (dependantData.Index = 0; dependantData.Index < dependantContext.JobsLeft; dependantData.Index++) -#if JOB_SYSTEM_USE_MUTEX Jobs.PushBack(dependantData); -#else - Jobs.enqueue(dependantData); -#endif } } @@ -280,9 +234,6 @@ int64 JobSystem::Dispatch(const Function& job, int32 jobCount) return 0; PROFILE_CPU(); #if JOB_SYSTEM_ENABLED -#if JOB_SYSTEM_USE_STATS - const auto start = Platform::GetTimeCycles(); -#endif const auto label = Platform::InterlockedAdd(&JobLabel, (int64)jobCount) + jobCount; JobData data; @@ -295,19 +246,9 @@ int64 JobSystem::Dispatch(const Function& job, int32 jobCount) JobsLocker.Lock(); JobContexts.Add(label, context); 
-#if JOB_SYSTEM_USE_MUTEX for (data.Index = 0; data.Index < jobCount; data.Index++) Jobs.PushBack(data); JobsLocker.Unlock(); -#else - JobsLocker.Unlock(); - for (data.Index = 0; data.Index < jobCount; data.Index++) - Jobs.enqueue(data); -#endif - -#if JOB_SYSTEM_USE_STATS - LOG(Info, "Job enqueue time: {0} cycles", (int64)(Platform::GetTimeCycles() - start)); -#endif if (JobStartingOnDispatch) { @@ -351,21 +292,12 @@ int64 JobSystem::Dispatch(const Function& job, Span dependen } } JobContexts.Add(label, context); -#if JOB_SYSTEM_USE_MUTEX if (context.DependenciesLeft == 0) { for (data.Index = 0; data.Index < jobCount; data.Index++) Jobs.PushBack(data); } JobsLocker.Unlock(); -#else - JobsLocker.Unlock(); - if (dispatchNow) - { - for (data.Index = 0; data.Index < jobCount; data.Index++) - Jobs.enqueue(data); - } -#endif if (context.DependenciesLeft == 0 && JobStartingOnDispatch) { @@ -426,11 +358,6 @@ void JobSystem::Wait(int64 label) // Wake up any thread to prevent stalling in highly multi-threaded environment JobsSignal.NotifyOne(); } - -#if JOB_SYSTEM_USE_STATS - LOG(Info, "Job average dequeue time: {0} cycles", DequeueSum / DequeueCount); - DequeueSum = DequeueCount = 0; -#endif #endif } @@ -438,16 +365,11 @@ void JobSystem::SetJobStartingOnDispatch(bool value) { #if JOB_SYSTEM_ENABLED JobStartingOnDispatch = value; - if (value) { -#if JOB_SYSTEM_USE_MUTEX JobsLocker.Lock(); const int32 count = Jobs.Count(); JobsLocker.Unlock(); -#else - const int32 count = Jobs.Count(); -#endif if (count == 1) JobsSignal.NotifyOne(); else if (count != 0) From b545d8800c0b5c4683f4ec9418c75f636f2130cb Mon Sep 17 00:00:00 2001 From: Wojtek Figat Date: Mon, 24 Jun 2024 23:19:01 +0200 Subject: [PATCH 07/11] Optimize job system memory allocations --- Source/Engine/Core/Collections/RingBuffer.h | 7 ++ .../Engine/Core/Memory/SimpleHeapAllocation.h | 86 +++++++++++++++++++ Source/Engine/Renderer/RenderList.cpp | 16 +--- Source/Engine/Renderer/RendererAllocation.h | 79 
+---------------- Source/Engine/Threading/JobSystem.cpp | 52 +++++++++-- 5 files changed, 145 insertions(+), 95 deletions(-) create mode 100644 Source/Engine/Core/Memory/SimpleHeapAllocation.h diff --git a/Source/Engine/Core/Collections/RingBuffer.h b/Source/Engine/Core/Collections/RingBuffer.h index 898c51c67..4dcc75e33 100644 --- a/Source/Engine/Core/Collections/RingBuffer.h +++ b/Source/Engine/Core/Collections/RingBuffer.h @@ -5,6 +5,7 @@ #include "Engine/Platform/Platform.h" #include "Engine/Core/Memory/Memory.h" #include "Engine/Core/Memory/Allocation.h" +#include "Engine/Core/Math/Math.h" /// /// Template for ring buffer with variable capacity. @@ -98,4 +99,10 @@ public: Memory::DestructItems(Get() + Math::Min(_front, _back), _count); _front = _back = _count = 0; } + + void Release() + { + Clear(); + _allocation.Free(); + } }; diff --git a/Source/Engine/Core/Memory/SimpleHeapAllocation.h b/Source/Engine/Core/Memory/SimpleHeapAllocation.h new file mode 100644 index 000000000..0428d1d7f --- /dev/null +++ b/Source/Engine/Core/Memory/SimpleHeapAllocation.h @@ -0,0 +1,86 @@ +// Copyright (c) 2012-2024 Wojciech Figat. All rights reserved. + +#pragma once + +#include "Engine/Core/Memory/Memory.h" +#include "Engine/Core/Types/BaseTypes.h" + +// Base class for custom heap-based allocators (eg. with local pooling/paging). Expects only Allocate/Free methods to be provided. +template +class SimpleHeapAllocation +{ +public: + enum { HasSwap = true }; + + template + class Data + { + T* _data = nullptr; + uintptr _size; + + public: + FORCE_INLINE Data() + { + } + + FORCE_INLINE ~Data() + { + if (_data) + This::Free(_data, _size); + } + + FORCE_INLINE T* Get() + { + return _data; + } + + FORCE_INLINE const T* Get() const + { + return _data; + } + + FORCE_INLINE int32 CalculateCapacityGrow(int32 capacity, int32 minCapacity) const + { + capacity = capacity ? 
capacity * 2 : InitialCapacity; + if (capacity < minCapacity) + capacity = minCapacity; + return capacity; + } + + FORCE_INLINE void Allocate(uint64 capacity) + { + _size = capacity * sizeof(T); + _data = (T*)This::Allocate(_size); + } + + FORCE_INLINE void Relocate(uint64 capacity, int32 oldCount, int32 newCount) + { + T* newData = capacity != 0 ? (T*)This::Allocate(capacity * sizeof(T)) : nullptr; + if (oldCount) + { + if (newCount > 0) + Memory::MoveItems(newData, _data, newCount); + Memory::DestructItems(_data, oldCount); + } + if (_data) + This::Free(_data, _size); + _data = newData; + _size = capacity * sizeof(T); + } + + FORCE_INLINE void Free() + { + if (_data) + { + This::Free(_data, _size); + _data = nullptr; + } + } + + FORCE_INLINE void Swap(Data& other) + { + ::Swap(_data, other._data); + ::Swap(_size, other._size); + } + }; +}; diff --git a/Source/Engine/Renderer/RenderList.cpp b/Source/Engine/Renderer/RenderList.cpp index 09e7f9b62..1ca5cf244 100644 --- a/Source/Engine/Renderer/RenderList.cpp +++ b/Source/Engine/Renderer/RenderList.cpp @@ -30,13 +30,7 @@ namespace Array SortingBatches; Array FreeRenderList; - struct MemPoolEntry - { - void* Ptr; - uintptr Size; - }; - - Array MemPool; + Array> MemPool; CriticalSection MemPoolLocker; } @@ -147,18 +141,16 @@ void* RendererAllocation::Allocate(uintptr size) MemPoolLocker.Lock(); for (int32 i = 0; i < MemPool.Count(); i++) { - if (MemPool[i].Size == size) + if (MemPool.Get()[i].Second == size) { - result = MemPool[i].Ptr; + result = MemPool.Get()[i].First; MemPool.RemoveAt(i); break; } } MemPoolLocker.Unlock(); if (!result) - { result = Platform::Allocate(size, 16); - } return result; } @@ -201,7 +193,7 @@ void RenderList::CleanupCache() SortingIndices.Resize(0); FreeRenderList.ClearDelete(); for (auto& e : MemPool) - Platform::Free(e.Ptr); + Platform::Free(e.First); MemPool.Clear(); } diff --git a/Source/Engine/Renderer/RendererAllocation.h b/Source/Engine/Renderer/RendererAllocation.h index 
42cd5e755..c0ef46a91 100644 --- a/Source/Engine/Renderer/RendererAllocation.h +++ b/Source/Engine/Renderer/RendererAllocation.h @@ -2,86 +2,11 @@ #pragma once -#include "Engine/Core/Memory/Memory.h" -#include "Engine/Core/Types/BaseTypes.h" +#include "Engine/Core/Memory/SimpleHeapAllocation.h" -class RendererAllocation +class RendererAllocation : public SimpleHeapAllocation { public: static FLAXENGINE_API void* Allocate(uintptr size); static FLAXENGINE_API void Free(void* ptr, uintptr size); - - enum { HasSwap = true }; - - template - class Data - { - T* _data = nullptr; - uintptr _size; - - public: - FORCE_INLINE Data() - { - } - - FORCE_INLINE ~Data() - { - if (_data) - RendererAllocation::Free(_data, _size); - } - - FORCE_INLINE T* Get() - { - return _data; - } - - FORCE_INLINE const T* Get() const - { - return _data; - } - - FORCE_INLINE int32 CalculateCapacityGrow(int32 capacity, int32 minCapacity) const - { - capacity = capacity ? capacity * 2 : 64; - if (capacity < minCapacity) - capacity = minCapacity; - return capacity; - } - - FORCE_INLINE void Allocate(uint64 capacity) - { - _size = capacity * sizeof(T); - _data = (T*)RendererAllocation::Allocate(_size); - } - - FORCE_INLINE void Relocate(uint64 capacity, int32 oldCount, int32 newCount) - { - T* newData = capacity != 0 ? 
(T*)RendererAllocation::Allocate(capacity * sizeof(T)) : nullptr; - if (oldCount) - { - if (newCount > 0) - Memory::MoveItems(newData, _data, newCount); - Memory::DestructItems(_data, oldCount); - } - if (_data) - RendererAllocation::Free(_data, _size); - _data = newData; - _size = capacity * sizeof(T); - } - - FORCE_INLINE void Free() - { - if (_data) - { - RendererAllocation::Free(_data, _size); - _data = nullptr; - } - } - - FORCE_INLINE void Swap(Data& other) - { - ::Swap(_data, other._data); - ::Swap(_size, other._size); - } - }; }; diff --git a/Source/Engine/Threading/JobSystem.cpp b/Source/Engine/Threading/JobSystem.cpp index 34e2b4b9a..847291bbc 100644 --- a/Source/Engine/Threading/JobSystem.cpp +++ b/Source/Engine/Threading/JobSystem.cpp @@ -6,6 +6,8 @@ #include "Engine/Platform/Thread.h" #include "Engine/Platform/ConditionVariable.h" #include "Engine/Core/Types/Span.h" +#include "Engine/Core/Types/Pair.h" +#include "Engine/Core/Memory/SimpleHeapAllocation.h" #include "Engine/Core/Collections/Dictionary.h" #include "Engine/Core/Collections/RingBuffer.h" #include "Engine/Engine/EngineService.h" @@ -18,6 +20,14 @@ #if JOB_SYSTEM_ENABLED +// Local allocator for job system memory that uses internal pooling and assumes that JobsLocker is taken (write access owned by the calling thread). 
+class JobSystemAllocation : public SimpleHeapAllocation +{ +public: + static void* Allocate(uintptr size); + static void Free(void* ptr, uintptr size); +}; + class JobSystemService : public EngineService { public: @@ -46,9 +56,9 @@ struct TIsPODType struct JobContext { volatile int64 JobsLeft; - volatile int64 DependenciesLeft; + int32 DependenciesLeft; Function Job; - Array Dependants; + Array Dependants; }; template<> @@ -80,12 +90,13 @@ public: namespace { JobSystemService JobSystemInstance; + Array> MemPool; Thread* Threads[PLATFORM_THREADS_LIMIT / 2] = {}; int32 ThreadsCount = 0; bool JobStartingOnDispatch = true; volatile int64 ExitFlag = 0; volatile int64 JobLabel = 0; - Dictionary JobContexts; + Dictionary JobContexts; ConditionVariable JobsSignal; CriticalSection JobsMutex; ConditionVariable WaitSignal; @@ -94,6 +105,28 @@ namespace RingBuffer Jobs; } +void* JobSystemAllocation::Allocate(uintptr size) +{ + void* result = nullptr; + for (int32 i = 0; i < MemPool.Count(); i++) + { + if (MemPool.Get()[i].Second == size) + { + result = MemPool.Get()[i].First; + MemPool.RemoveAt(i); + break; + } + } + if (!result) + result = Platform::Allocate(size, 16); + return result; +} + +void JobSystemAllocation::Free(void* ptr, uintptr size) +{ + MemPool.Add({ ptr, size }); +} + bool JobSystemService::Init() { ThreadsCount = Math::Min(Platform::GetCPUInfo().LogicalProcessorCount, ARRAY_COUNT(Threads)); @@ -130,6 +163,12 @@ void JobSystemService::Dispose() Threads[i] = nullptr; } } + + JobContexts.SetCapacity(0); + Jobs.Release(); + for (auto& e : MemPool) + Platform::Free(e.First); + MemPool.Clear(); } int32 JobSystemThread::Run() @@ -176,7 +215,7 @@ int32 JobSystemThread::Run() for (int64 dependant : context.Dependants) { JobContext& dependantContext = JobContexts.At(dependant); - if (Platform::InterlockedDecrement(&dependantContext.DependenciesLeft) <= 0) + if (--dependantContext.DependenciesLeft <= 0) { // Dispatch dependency when it's ready JobData dependantData; @@ 
-245,7 +284,7 @@ int64 JobSystem::Dispatch(const Function& job, int32 jobCount) context.DependenciesLeft = 0; JobsLocker.Lock(); - JobContexts.Add(label, context); + JobContexts.Add(label, MoveTemp(context)); for (data.Index = 0; data.Index < jobCount; data.Index++) Jobs.PushBack(data); JobsLocker.Unlock(); @@ -291,9 +330,10 @@ int64 JobSystem::Dispatch(const Function& job, Span dependen dependencyContext->Dependants.Add(label); } } - JobContexts.Add(label, context); + JobContexts.Add(label, MoveTemp(context)); if (context.DependenciesLeft == 0) { + // No dependencies left to complete so dispatch now for (data.Index = 0; data.Index < jobCount; data.Index++) Jobs.PushBack(data); } From 18c3f274f8b3b10243f39ac72934ecf812705e09 Mon Sep 17 00:00:00 2001 From: Wojtek Figat Date: Wed, 26 Jun 2024 18:16:58 +0200 Subject: [PATCH 08/11] Optimize Global SDF drawing with async job system --- .../Renderer/GlobalSignDistanceFieldPass.cpp | 704 +++++++++++------- .../Renderer/GlobalSignDistanceFieldPass.h | 22 +- Source/Engine/Renderer/Renderer.cpp | 2 + 3 files changed, 428 insertions(+), 300 deletions(-) diff --git a/Source/Engine/Renderer/GlobalSignDistanceFieldPass.cpp b/Source/Engine/Renderer/GlobalSignDistanceFieldPass.cpp index 2d35ac09e..7a676f584 100644 --- a/Source/Engine/Renderer/GlobalSignDistanceFieldPass.cpp +++ b/Source/Engine/Renderer/GlobalSignDistanceFieldPass.cpp @@ -18,9 +18,9 @@ #include "Engine/Graphics/Shaders/GPUShader.h" #include "Engine/Level/Scene/SceneRendering.h" #include "Engine/Level/Actors/StaticModel.h" +#include "Engine/Threading/JobSystem.h" // Some of those constants must match in shader -// TODO: try using R8 format for Global SDF #define GLOBAL_SDF_FORMAT PixelFormat::R16_Float #define GLOBAL_SDF_RASTERIZE_MODEL_MAX_COUNT 28 // The maximum amount of models to rasterize at once as a batch into Global SDF. #define GLOBAL_SDF_RASTERIZE_HEIGHTFIELD_MAX_COUNT 2 // The maximum amount of heightfields to store in a single chunk. 
@@ -30,7 +30,7 @@ #define GLOBAL_SDF_RASTERIZE_MIP_FACTOR 4 // Global SDF mip resolution downscale factor. #define GLOBAL_SDF_MIP_GROUP_SIZE 4 #define GLOBAL_SDF_MIP_FLOODS 5 // Amount of flood fill passes for mip. -#define GLOBAL_SDF_DEBUG_CHUNKS 0 +#define GLOBAL_SDF_DEBUG_CHUNKS 0 // Toggles debug drawing of Global SDF chunks bounds including objects count label (only for the first cascade) #define GLOBAL_SDF_DEBUG_FORCE_REDRAW 0 // Forces to redraw all SDF cascades every frame #define GLOBAL_SDF_ACTOR_IS_STATIC(actor) EnumHasAllFlags(actor->GetStaticFlags(), StaticFlags::Lightmap | StaticFlags::Transform) @@ -130,13 +130,30 @@ uint32 GetHash(const RasterizeChunkKey& key) struct CascadeData { + bool Dirty; + int32 Index; + float ChunkSize; + float MaxDistance; Float3 Position; float VoxelSize; BoundingBox Bounds; + BoundingBox CullingBounds; + BoundingBox RasterizeBounds; + Vector3 OriginMin; + Vector3 OriginMax; HashSet NonEmptyChunks; HashSet StaticChunks; - FORCE_INLINE void OnSceneRenderingDirty(const BoundingBox& objectBounds) + // Cache + Dictionary Chunks; + Array RasterizeObjects; + Array ObjectsData; + Array ObjectsTextures; + Dictionary ObjectIndexToDataIndex; + HashSet PendingSDFTextures; + HashSet PendingObjectTypes; + + void OnSceneRenderingDirty(const BoundingBox& objectBounds) { if (StaticChunks.IsEmpty() || !Bounds.Intersects(objectBounds)) return; @@ -147,9 +164,8 @@ struct CascadeData Vector3::Subtract(objectBoundsCascade.Minimum, Bounds.Minimum, objectBoundsCascade.Minimum); Vector3::Clamp(objectBounds.Maximum + objectMargin, Bounds.Minimum, Bounds.Maximum, objectBoundsCascade.Maximum); Vector3::Subtract(objectBoundsCascade.Maximum, Bounds.Minimum, objectBoundsCascade.Maximum); - const float chunkSize = VoxelSize * GLOBAL_SDF_RASTERIZE_CHUNK_SIZE; - const Int3 objectChunkMin(objectBoundsCascade.Minimum / chunkSize); - const Int3 objectChunkMax(objectBoundsCascade.Maximum / chunkSize); + const Int3 objectChunkMin(objectBoundsCascade.Minimum / 
ChunkSize); + const Int3 objectChunkMax(objectBoundsCascade.Maximum / ChunkSize); // Invalidate static chunks intersecting with dirty bounds RasterizeChunkKey key; @@ -181,8 +197,13 @@ public: HashSet SDFTextures; GlobalSignDistanceFieldPass::BindingData Result; + // Async objects drawing cache + Array> AsyncDrawWaitLabels; + RenderContext AsyncRenderContext; + ~GlobalSignDistanceFieldCustomBuffer() { + WaitForDrawing(); for (const auto& e : SDFTextures) { e.Item->Deleted.Unbind(this); @@ -215,6 +236,145 @@ public: } } + const float CascadesDistanceScales[4] = { 1.0f, 2.5f, 5.0f, 10.0f }; + + void GetOptions(const RenderContext& renderContext, int32& resolution, int32& cascadesCount, int32& resolutionMip, float& distance) + { + switch (Graphics::GlobalSDFQuality) + { + case Quality::Low: + resolution = 128; + cascadesCount = 2; + break; + case Quality::Medium: + resolution = 128; + cascadesCount = 3; + break; + case Quality::High: + resolution = 192; + cascadesCount = 4; + break; + case Quality::Ultra: + default: + resolution = 256; + cascadesCount = 4; + break; + } + resolutionMip = Math::DivideAndRoundUp(resolution, GLOBAL_SDF_RASTERIZE_MIP_FACTOR); + auto& giSettings = renderContext.List->Settings.GlobalIllumination; + distance = GraphicsSettings::Get()->GlobalSDFDistance; + if (giSettings.Mode == GlobalIlluminationMode::DDGI) + distance = Math::Max(distance, giSettings.Distance); + distance = Math::Min(distance, renderContext.View.Far); + } + + void DrawCascadeActors(const CascadeData& cascade); + void UpdateCascadeChunks(CascadeData& cascade); + void WriteCascadeObjects(CascadeData& cascade); + void DrawCascadeJob(int32 cascadeIndex); + + void StartDrawing(const RenderContext& renderContext, bool enableAsync = false, bool reset = false) + { + if (AsyncDrawWaitLabels.HasItems()) + return; // Already started earlier this frame + int32 resolution, cascadesCount, resolutionMip; + float distance; + GetOptions(renderContext, resolution, cascadesCount, resolutionMip, 
distance); + if (Cascades.Count() != cascadesCount || Resolution != resolution || Origin != renderContext.View.Origin) + return; // Not yet initialized + PROFILE_CPU(); + + // Calculate origin for Global SDF by shifting it towards the view direction to account for better view frustum coverage + const float distanceExtent = distance / CascadesDistanceScales[cascadesCount - 1]; + Float3 viewPosition = renderContext.View.Position; + { + Float3 viewDirection = renderContext.View.Direction; + const float cascade0Distance = distanceExtent * CascadesDistanceScales[0]; + const Vector2 viewRayHit = CollisionsHelper::LineHitsBox(viewPosition, viewPosition + viewDirection * (cascade0Distance * 2.0f), viewPosition - cascade0Distance, viewPosition + cascade0Distance); + const float viewOriginOffset = (float)viewRayHit.Y * cascade0Distance * 0.6f; + viewPosition += viewDirection * viewOriginOffset; + } + + // Setup data for rendering + if (FrameIndex++ > 128) + FrameIndex = 0; + AsyncRenderContext = renderContext; + AsyncRenderContext.View.Pass = DrawPass::GlobalSDF; + const bool useCache = !reset && !GLOBAL_SDF_DEBUG_FORCE_REDRAW && GPU_SPREAD_WORKLOAD; + static_assert(GLOBAL_SDF_RASTERIZE_CHUNK_SIZE % GLOBAL_SDF_RASTERIZE_GROUP_SIZE == 0, "Invalid chunk size for Global SDF rasterization group size."); + const int32 rasterizeChunks = Math::CeilToInt((float)resolution / (float)GLOBAL_SDF_RASTERIZE_CHUNK_SIZE); + const bool updateEveryFrame = false; // true if update all cascades every frame + const int32 maxCascadeUpdatesPerFrame = 1; // maximum cascades to update at a single frame + + // Rasterize world geometry into Global SDF + for (int32 cascadeIndex = 0; cascadeIndex < cascadesCount; cascadeIndex++) + { + // Reduce frequency of the updates + auto& cascade = Cascades[cascadeIndex]; + cascade.Index = cascadeIndex; + cascade.Dirty = !useCache || RenderTools::ShouldUpdateCascade(FrameIndex, cascadeIndex, cascadesCount, maxCascadeUpdatesPerFrame, updateEveryFrame); + if 
(!cascade.Dirty) + continue; + const float cascadeDistance = distanceExtent * CascadesDistanceScales[cascadeIndex]; + const float cascadeMaxDistance = cascadeDistance * 2; + const float cascadeVoxelSize = cascadeMaxDistance / (float)resolution; + const float cascadeChunkSize = cascadeVoxelSize * GLOBAL_SDF_RASTERIZE_CHUNK_SIZE; + static_assert(GLOBAL_SDF_RASTERIZE_CHUNK_SIZE % GLOBAL_SDF_RASTERIZE_MIP_FACTOR == 0, "Adjust chunk size to match the mip factor scale."); + const Float3 center = Float3::Floor(viewPosition / cascadeChunkSize) * cascadeChunkSize; + //const Float3 center = Float3::Zero; + BoundingBox cascadeBounds(center - cascadeDistance, center + cascadeDistance); + + // Clear cascade before rasterization + cascade.Chunks.Clear(); + // TODO: consider using RendererAllocation for Chunks and RasterizeObjects to share memory with other rendering internals (ensure to release memory after SDF draw ends) + cascade.Chunks.EnsureCapacity(rasterizeChunks * rasterizeChunks, false); + // TODO: cache RasterizeObjects size from the previous frame (for this cascade) and preallocate it here once RendererAllocation is used + cascade.RasterizeObjects.Clear(); + cascade.PendingSDFTextures.Clear(); + + // Check if cascade center has been moved + if (!(useCache && Float3::NearEqual(cascade.Position, center, cascadeVoxelSize))) + { + // TODO: optimize for moving camera (use chunkCoords scrolling) + cascade.StaticChunks.Clear(); + } + + // Setup cascade info + cascade.Position = center; + cascade.VoxelSize = cascadeVoxelSize; + cascade.ChunkSize = cascadeVoxelSize * GLOBAL_SDF_RASTERIZE_CHUNK_SIZE; + cascade.MaxDistance = cascadeMaxDistance; + cascade.Bounds = cascadeBounds; + cascade.RasterizeBounds = cascadeBounds; + cascade.RasterizeBounds.Minimum += 0.1f; // Adjust to prevent overflowing chunk keys (cascade bounds are used for clamping object bounds) + cascade.RasterizeBounds.Maximum -= 0.1f; // Adjust to prevent overflowing chunk keys (cascade bounds are used for clamping
object bounds) + cascade.CullingBounds = cascadeBounds.MakeOffsetted(Origin); + const float objectMargin = cascadeVoxelSize * GLOBAL_SDF_RASTERIZE_CHUNK_MARGIN; + cascade.OriginMin = -Origin - objectMargin; + cascade.OriginMax = -Origin + objectMargin; + } + if (enableAsync) + { + // Draw all dirty cascades in async (separate job for each cascade) + Function func; + func.Bind(this); + AsyncDrawWaitLabels.Add(JobSystem::Dispatch(func, cascadesCount)); + } + else + { + // Synchronized drawing in sequence + for (int32 cascadeIndex = 0; cascadeIndex < cascadesCount; cascadeIndex++) + DrawCascadeJob(cascadeIndex); + } + } + + void WaitForDrawing() + { + for (int64 label : AsyncDrawWaitLabels) + JobSystem::Wait(label); + AsyncDrawWaitLabels.Clear(); + } + FORCE_INLINE void OnSceneRenderingDirty(const BoundingBox& objectBounds) { for (auto& cascade : Cascades) @@ -256,9 +416,161 @@ public: namespace { - Dictionary ChunksCache; - Array RasterizeObjectsCache; - Dictionary ObjectIndexToDataIndexCache; + GlobalSignDistanceFieldCustomBuffer* Current = nullptr; + ThreadLocal CurrentCascade; +} + +void GlobalSignDistanceFieldCustomBuffer::DrawCascadeActors(const CascadeData& cascade) +{ + PROFILE_CPU(); + const BoundingBox cullingBounds = cascade.CullingBounds; + const uint32 viewMask = AsyncRenderContext.View.RenderLayersMask; + // TODO: add scene detail scale factor to PostFx settings (eg. 
to increase or decrease scene details and quality) + const float minObjectRadius = Math::Max(20.0f, cascade.VoxelSize * 2.0f); // Skip too small objects for this cascade + int32 actorsDrawn = 0; + SceneRendering::DrawCategory drawCategories[] = { SceneRendering::SceneDraw, SceneRendering::SceneDrawAsync }; + for (auto* scene : AsyncRenderContext.List->Scenes) + { + for (SceneRendering::DrawCategory drawCategory : drawCategories) + { + auto& list = scene->Actors[drawCategory]; + for (const auto& e : list) + { + if (e.Bounds.Radius >= minObjectRadius && viewMask & e.LayerMask && CollisionsHelper::BoxIntersectsSphere(cullingBounds, e.Bounds)) + { + //PROFILE_CPU_ACTOR(e.Actor); + e.Actor->Draw(AsyncRenderContext); +#if COMPILE_WITH_PROFILER + actorsDrawn++; +#endif + } + } + } + } + ZoneValue(actorsDrawn); +} + +void GlobalSignDistanceFieldCustomBuffer::UpdateCascadeChunks(CascadeData& cascade) +{ + PROFILE_CPU(); + + // Update static chunks + for (auto it = cascade.Chunks.Begin(); it.IsNotEnd(); ++it) + { + auto& e = *it; + if (e.Key.Layer != 0) + continue; + if (e.Value.Dynamic) + { + // Remove static chunk with dynamic objects + cascade.StaticChunks.Remove(e.Key); + } + else if (cascade.StaticChunks.Contains(e.Key)) + { + // Skip updating static chunk + auto key = e.Key; + while (cascade.Chunks.Remove(key)) + key.NextLayer(); + } + else + { + // Add to cache (render now but skip next frame) + cascade.StaticChunks.Add(e.Key); + } + } +} + +void GlobalSignDistanceFieldCustomBuffer::WriteCascadeObjects(CascadeData& cascade) +{ + PROFILE_CPU(); + + // Write all objects to the buffer + int32 objectsBufferCount = 0; + cascade.ObjectsData.Clear(); + cascade.ObjectsTextures.Clear(); + cascade.ObjectIndexToDataIndex.Clear(); + for (const auto& e : cascade.Chunks) + { + auto& chunk = e.Value; + for (int32 i = 0; i < chunk.ModelsCount; i++) + { + auto objectIndex = chunk.Models[i]; + if (cascade.ObjectIndexToDataIndex.ContainsKey(objectIndex)) + continue; + const auto& object 
= cascade.RasterizeObjects.Get()[objectIndex]; + + // Pick the SDF mip for the cascade + int32 mipLevelIndex = 1; + float worldUnitsPerVoxel = object.SDF->WorldUnitsPerVoxel * object.LocalToWorld.Scale.MaxValue() * 4; + const int32 mipLevels = object.SDF->Texture->MipLevels(); + while (cascade.VoxelSize > worldUnitsPerVoxel && mipLevelIndex < mipLevels) + { + mipLevelIndex++; + worldUnitsPerVoxel *= 2.0f; + } + mipLevelIndex--; + + // Add object data for the GPU buffer + uint16 dataIndex = objectsBufferCount++; + ObjectRasterizeData objectData; + Platform::MemoryClear(&objectData, sizeof(objectData)); + Matrix localToWorld, worldToLocal, volumeToWorld; + Matrix::Transformation(object.LocalToWorld.Scale, object.LocalToWorld.Orientation, object.LocalToWorld.Translation - Origin, localToWorld); + Matrix::Invert(localToWorld, worldToLocal); + BoundingBox localVolumeBounds(object.SDF->LocalBoundsMin, object.SDF->LocalBoundsMax); + Float3 volumeLocalBoundsExtent = localVolumeBounds.GetSize() * 0.5f; + Matrix worldToVolume = worldToLocal * Matrix::Translation(-(localVolumeBounds.Minimum + volumeLocalBoundsExtent)); + Matrix::Invert(worldToVolume, volumeToWorld); + objectData.WorldToVolume.SetMatrixTranspose(worldToVolume); + objectData.VolumeToWorld.SetMatrixTranspose(volumeToWorld); + objectData.VolumeLocalBoundsExtent = volumeLocalBoundsExtent; + objectData.VolumeToUVWMul = object.SDF->LocalToUVWMul; + objectData.VolumeToUVWAdd = object.SDF->LocalToUVWAdd + (localVolumeBounds.Minimum + volumeLocalBoundsExtent) * object.SDF->LocalToUVWMul; + objectData.MipOffset = (float)mipLevelIndex; + objectData.DecodeMul = 2.0f * object.SDF->MaxDistance; + objectData.DecodeAdd = -object.SDF->MaxDistance; + cascade.ObjectsData.Add((const byte*)&objectData, sizeof(objectData)); + cascade.ObjectsTextures.Add(object.SDF->Texture->ViewVolume()); + cascade.PendingObjectTypes.Add(object.Actor->GetTypeHandle()); + cascade.ObjectIndexToDataIndex.Add(objectIndex, dataIndex); + } + for (int32 i 
= 0; i < chunk.HeightfieldsCount; i++) + { + auto objectIndex = chunk.Heightfields[i]; + if (cascade.ObjectIndexToDataIndex.ContainsKey(objectIndex)) + continue; + const auto& object = cascade.RasterizeObjects.Get()[objectIndex]; + + // Add object data for the GPU buffer + uint16 dataIndex = objectsBufferCount++; + ObjectRasterizeData objectData; + Platform::MemoryClear(&objectData, sizeof(objectData)); + Matrix localToWorld, worldToLocal; + Matrix::Transformation(object.LocalToWorld.Scale, object.LocalToWorld.Orientation, object.LocalToWorld.Translation - Origin, localToWorld); + Matrix::Invert(localToWorld, worldToLocal); + objectData.WorldToVolume.SetMatrixTranspose(worldToLocal); + objectData.VolumeToWorld.SetMatrixTranspose(localToWorld); + objectData.VolumeToUVWMul = Float3(object.LocalToUV.X, 1.0f, object.LocalToUV.Y); + objectData.VolumeToUVWAdd = Float3(object.LocalToUV.Z, 0.0f, object.LocalToUV.W); + objectData.MipOffset = (float)cascade.Index * 0.5f; // Use lower-quality mip for far cascades + cascade.ObjectsData.Add((const byte*)&objectData, sizeof(objectData)); + cascade.ObjectsTextures.Add(object.Heightfield->View()); + cascade.PendingObjectTypes.Add(object.Actor->GetTypeHandle()); + cascade.ObjectIndexToDataIndex.Add(objectIndex, dataIndex); + } + } +} + +void GlobalSignDistanceFieldCustomBuffer::DrawCascadeJob(int32 cascadeIndex) +{ + auto& cascade = Cascades[cascadeIndex]; + if (!cascade.Dirty) + return; + PROFILE_CPU(); + CurrentCascade.Set(&cascade); + DrawCascadeActors(cascade); + UpdateCascadeChunks(cascade); + WriteCascadeObjects(cascade); } String GlobalSignDistanceFieldPass::ToString() const @@ -309,7 +621,7 @@ bool GlobalSignDistanceFieldPass::setupResources() // Init buffer if (!_objectsBuffer) - _objectsBuffer = New(64u * (uint32)sizeof(ObjectRasterizeData), (uint32)sizeof(ObjectRasterizeData), false, TEXT("GlobalSDF.ObjectsBuffer")); + _objectsBuffer = New(0, (uint32)sizeof(ObjectRasterizeData), false, TEXT("GlobalSDF.ObjectsBuffer")); 
// Create pipeline state GPUPipelineState::Description psDesc = GPUPipelineState::Description::DefaultFullscreenTriangle; @@ -347,12 +659,22 @@ void GlobalSignDistanceFieldPass::Dispose() // Cleanup SAFE_DELETE(_objectsBuffer); - _objectsTextures.Resize(0); SAFE_DELETE_GPU_RESOURCE(_psDebug); _shader = nullptr; - ChunksCache.SetCapacity(0); - RasterizeObjectsCache.SetCapacity(0); - ObjectIndexToDataIndexCache.SetCapacity(0); +} + +void GlobalSignDistanceFieldPass::OnCollectDrawCalls(RenderContextBatch& renderContextBatch) +{ + // Check if Global SDF will be used this frame + PROFILE_CPU_NAMED("Global SDF"); + if (checkIfSkipPass()) + return; + RenderContext& renderContext = renderContextBatch.GetMainContext(); + if (renderContext.List->Scenes.Count() == 0) + return; + auto& sdfData = *renderContext.Buffers->GetCustomBuffer(TEXT("GlobalSignDistanceField")); + Current = &sdfData; + sdfData.StartDrawing(renderContext, renderContextBatch.EnableAsync); } bool GlobalSignDistanceFieldPass::Get(const RenderBuffers* buffers, BindingData& result) @@ -386,44 +708,19 @@ bool GlobalSignDistanceFieldPass::Render(RenderContext& renderContext, GPUContex PROFILE_GPU_CPU("Global SDF"); // Setup options - int32 resolution, cascadesCount; - switch (Graphics::GlobalSDFQuality) - { - case Quality::Low: - resolution = 128; - cascadesCount = 2; - break; - case Quality::Medium: - resolution = 128; - cascadesCount = 3; - break; - case Quality::High: - resolution = 192; - cascadesCount = 4; - break; - case Quality::Ultra: - default: - resolution = 256; - cascadesCount = 4; - break; - } - const int32 resolutionMip = Math::DivideAndRoundUp(resolution, GLOBAL_SDF_RASTERIZE_MIP_FACTOR); - auto& giSettings = renderContext.List->Settings.GlobalIllumination; - float distance = GraphicsSettings::Get()->GlobalSDFDistance; - if (giSettings.Mode == GlobalIlluminationMode::DDGI) - distance = Math::Max(distance, giSettings.Distance); - distance = Math::Min(distance, renderContext.View.Far); - const float 
cascadesDistanceScales[] = { 1.0f, 2.5f, 5.0f, 10.0f }; - const float distanceExtent = distance / cascadesDistanceScales[cascadesCount - 1]; + int32 resolution, cascadesCount, resolutionMip; + float distance; + sdfData.GetOptions(renderContext, resolution, cascadesCount, resolutionMip, distance); + const float distanceExtent = distance / sdfData.CascadesDistanceScales[cascadesCount - 1]; // Initialize buffers - bool updated = false; + bool reset = false; if (sdfData.Cascades.Count() != cascadesCount || sdfData.Resolution != resolution) { sdfData.Cascades.Resize(cascadesCount); sdfData.Resolution = resolution; sdfData.FrameIndex = 0; - updated = true; + reset = true; auto desc = GPUTextureDescription::New3D(resolution * cascadesCount, resolution, resolution, GLOBAL_SDF_FORMAT, GPUTextureFlags::ShaderResource | GPUTextureFlags::UnorderedAccess, 1); { GPUTexture*& texture = sdfData.Texture; @@ -463,10 +760,10 @@ bool GlobalSignDistanceFieldPass::Render(RenderContext& renderContext, GPUContex if (sdfData.Origin != renderContext.View.Origin) { sdfData.Origin = renderContext.View.Origin; - updated = true; + reset = true; } GPUTexture* tmpMip = nullptr; - if (updated) + if (reset) { PROFILE_GPU_CPU_NAMED("Init"); for (auto& cascade : sdfData.Cascades) @@ -480,126 +777,60 @@ bool GlobalSignDistanceFieldPass::Render(RenderContext& renderContext, GPUContex for (SceneRendering* scene : renderContext.List->Scenes) sdfData.ListenSceneRendering(scene); - // Calculate origin for Global SDF by shifting it towards the view direction to account for better view frustum coverage - Float3 viewPosition = renderContext.View.Position; - { - Float3 viewDirection = renderContext.View.Direction; - const float cascade0Distance = distanceExtent * cascadesDistanceScales[0]; - const Vector2 viewRayHit = CollisionsHelper::LineHitsBox(viewPosition, viewPosition + viewDirection * (cascade0Distance * 2.0f), viewPosition - cascade0Distance, viewPosition + cascade0Distance); - const float 
viewOriginOffset = (float)viewRayHit.Y * cascade0Distance * 0.6f; - viewPosition += viewDirection * viewOriginOffset; - } + // Ensure that async objects drawing ended + Current = &sdfData; + sdfData.StartDrawing(renderContext, false, reset); // (ignored if not started earlier this frame) + sdfData.WaitForDrawing(); // Rasterize world geometry into Global SDF - renderContext.View.Pass = DrawPass::GlobalSDF; - uint32 viewMask = renderContext.View.RenderLayersMask; - const bool useCache = !updated && !GLOBAL_SDF_DEBUG_FORCE_REDRAW && GPU_SPREAD_WORKLOAD; - static_assert(GLOBAL_SDF_RASTERIZE_CHUNK_SIZE % GLOBAL_SDF_RASTERIZE_GROUP_SIZE == 0, "Invalid chunk size for Global SDF rasterization group size."); - const int32 rasterizeChunks = Math::CeilToInt((float)resolution / (float)GLOBAL_SDF_RASTERIZE_CHUNK_SIZE); - auto& chunks = ChunksCache; - chunks.EnsureCapacity(rasterizeChunks * rasterizeChunks, false); bool anyDraw = false; - const bool updateEveryFrame = false; // true if update all cascades every frame - const int32 maxCascadeUpdatesPerFrame = 1; // maximum cascades to update at a single frame GPUTextureView* textureView = sdfData.Texture->ViewVolume(); GPUTextureView* textureMipView = sdfData.TextureMip->ViewVolume(); - if (sdfData.FrameIndex++ > 128) - sdfData.FrameIndex = 0; for (int32 cascadeIndex = 0; cascadeIndex < cascadesCount; cascadeIndex++) { - // Reduce frequency of the updates - if (useCache && !RenderTools::ShouldUpdateCascade(sdfData.FrameIndex, cascadeIndex, cascadesCount, maxCascadeUpdatesPerFrame, updateEveryFrame)) - continue; auto& cascade = sdfData.Cascades[cascadeIndex]; - const float cascadeDistance = distanceExtent * cascadesDistanceScales[cascadeIndex]; - const float cascadeMaxDistance = cascadeDistance * 2; - const float cascadeVoxelSize = cascadeMaxDistance / (float)resolution; - const float cascadeChunkSize = cascadeVoxelSize * GLOBAL_SDF_RASTERIZE_CHUNK_SIZE; - static_assert(GLOBAL_SDF_RASTERIZE_CHUNK_SIZE % 
GLOBAL_SDF_RASTERIZE_MIP_FACTOR == 0, "Adjust chunk size to match the mip factor scale."); - const Float3 center = Float3::Floor(viewPosition / cascadeChunkSize) * cascadeChunkSize; - //const Float3 center = Float3::Zero; - BoundingBox cascadeBounds(center - cascadeDistance, center + cascadeDistance); - // TODO: add scene detail scale factor to PostFx settings (eg. to increase or decrease scene details and quality) - const float minObjectRadius = Math::Max(20.0f, cascadeVoxelSize * 2.0f); // Skip too small objects for this cascade + if (!cascade.Dirty) + continue; - // Clear cascade before rasterization + // Process all pending SDF textures tracking + for (auto& e : cascade.PendingSDFTextures) { - PROFILE_CPU_NAMED("Clear"); - chunks.Clear(); - RasterizeObjectsCache.Clear(); - _objectsBuffer->Clear(); - _objectsTextures.Clear(); - } - - // Check if cascade center has been moved - if (!(useCache && Float3::NearEqual(cascade.Position, center, cascadeVoxelSize))) - { - // TODO: optimize for moving camera (copy sdf for cached chunks) - cascade.StaticChunks.Clear(); - } - cascade.Position = center; - cascade.VoxelSize = cascadeVoxelSize; - cascade.Bounds = cascadeBounds; - - // Draw all objects from all scenes into the cascade - _objectsBufferCount = 0; - _voxelSize = cascadeVoxelSize; - _chunkSize = _voxelSize * GLOBAL_SDF_RASTERIZE_CHUNK_SIZE; - _cascadeBounds = cascadeBounds; - _cascadeBounds.Minimum += 0.1f; // Adjust to prevent overflowing chunk keys (cascade bounds are used for clamping object bounds) - _cascadeBounds.Maximum -= 0.1f; // Adjust to prevent overflowing chunk keys (cascade bounds are used for clamping object bounds) - _cascadeIndex = cascadeIndex; - _sdfData = &sdfData; - const float objectMargin = _voxelSize * GLOBAL_SDF_RASTERIZE_CHUNK_MARGIN; - _sdfDataOriginMin = -sdfData.Origin - objectMargin; - _sdfDataOriginMax = -sdfData.Origin + objectMargin; - { - PROFILE_CPU_NAMED("Draw"); - BoundingBox cascadeBoundsWorld = 
cascadeBounds.MakeOffsetted(sdfData.Origin); - _cascadeCullingBounds = cascadeBoundsWorld; - int32 actorsDrawn = 0; - SceneRendering::DrawCategory drawCategories[] = { SceneRendering::SceneDraw, SceneRendering::SceneDrawAsync }; - for (auto* scene : renderContext.List->Scenes) + GPUTexture* texture = e.Item; + if (Current->SDFTextures.Add(texture)) { - for (SceneRendering::DrawCategory drawCategory : drawCategories) - { - auto& list = scene->Actors[drawCategory]; - for (const auto& e : list) - { - if (e.Bounds.Radius >= minObjectRadius && viewMask & e.LayerMask && CollisionsHelper::BoxIntersectsSphere(cascadeBoundsWorld, e.Bounds)) - { - //PROFILE_CPU_ACTOR(e.Actor); - e.Actor->Draw(renderContext); - actorsDrawn++; - } - } - } + texture->Deleted.Bind(Current); + texture->ResidentMipsChanged.Bind(Current); } - ZoneValue(actorsDrawn); } + cascade.PendingSDFTextures.Clear(); + + // Process all pending object types tracking + for (auto& e : cascade.PendingObjectTypes) + sdfData.ObjectTypes.Add(e.Item); // Perform batched chunks rasterization anyDraw = true; context->ResetSR(); ModelsRasterizeData data; - data.CascadeCoordToPosMul = (Float3)cascadeBounds.GetSize() / (float)resolution; - data.CascadeCoordToPosAdd = (Float3)cascadeBounds.Minimum + cascadeVoxelSize * 0.5f; - data.MaxDistance = cascadeMaxDistance; + data.CascadeCoordToPosMul = (Float3)cascade.Bounds.GetSize() / (float)resolution; + data.CascadeCoordToPosAdd = (Float3)cascade.Bounds.Minimum + cascade.VoxelSize * 0.5f; + data.MaxDistance = cascade.MaxDistance; data.CascadeResolution = resolution; data.CascadeMipResolution = resolutionMip; data.CascadeIndex = cascadeIndex; data.CascadeMipFactor = GLOBAL_SDF_RASTERIZE_MIP_FACTOR; - data.CascadeVoxelSize = cascadeVoxelSize; + data.CascadeVoxelSize = cascade.VoxelSize; context->BindUA(0, textureView); context->BindCB(1, _cb1); - const int32 chunkDispatchGroups = GLOBAL_SDF_RASTERIZE_CHUNK_SIZE / GLOBAL_SDF_RASTERIZE_GROUP_SIZE; + constexpr int32 
chunkDispatchGroups = GLOBAL_SDF_RASTERIZE_CHUNK_SIZE / GLOBAL_SDF_RASTERIZE_GROUP_SIZE; bool anyChunkDispatch = false; + if (!reset) { PROFILE_GPU_CPU_NAMED("Clear Chunks"); for (auto it = cascade.NonEmptyChunks.Begin(); it.IsNotEnd(); ++it) { auto& key = it->Item; - if (chunks.ContainsKey(key)) + if (cascade.Chunks.ContainsKey(key) || cascade.StaticChunks.Contains(key)) continue; // Clear empty chunk @@ -614,121 +845,21 @@ bool GlobalSignDistanceFieldPass::Render(RenderContext& renderContext, GPUContex { PROFILE_GPU_CPU_NAMED("Rasterize Chunks"); - // Update static chunks - for (auto it = chunks.Begin(); it.IsNotEnd(); ++it) - { - auto& e = *it; - if (e.Key.Layer != 0) - continue; - if (e.Value.Dynamic) - { - // Remove static chunk with dynamic objects - cascade.StaticChunks.Remove(e.Key); - } - else if (cascade.StaticChunks.Contains(e.Key)) - { - // Skip updating static chunk - auto key = e.Key; - while (chunks.Remove(key)) - key.NextLayer(); - } - else - { - // Add to cache (render now but skip next frame) - cascade.StaticChunks.Add(e.Key); - } - } - // Send models data to the GPU - const auto& objectIndexToDataIndex = ObjectIndexToDataIndexCache; - if (chunks.Count() != 0) + const auto& objectIndexToDataIndex = cascade.ObjectIndexToDataIndex; + GPUTextureView** objectsTextures = cascade.ObjectsTextures.Get(); + if (cascade.Chunks.Count() != 0) { - PROFILE_GPU_CPU_NAMED("Update Objects"); - auto& objectIndexToDataIndexCache = ObjectIndexToDataIndexCache; - objectIndexToDataIndexCache.Clear(); - - // Write used objects to the buffer - const auto& rasterizeObjectsCache = RasterizeObjectsCache; - for (const auto& e : chunks) - { - auto& chunk = e.Value; - for (int32 i = 0; i < chunk.ModelsCount; i++) - { - auto objectIndex = chunk.Models[i]; - if (objectIndexToDataIndexCache.ContainsKey(objectIndex)) - continue; - const auto& object = rasterizeObjectsCache.Get()[objectIndex]; - - // Pick the SDF mip for the cascade - int32 mipLevelIndex = 1; - float 
worldUnitsPerVoxel = object.SDF->WorldUnitsPerVoxel * object.LocalToWorld.Scale.MaxValue() * 4; - const int32 mipLevels = object.SDF->Texture->MipLevels(); - while (_voxelSize > worldUnitsPerVoxel && mipLevelIndex < mipLevels) - { - mipLevelIndex++; - worldUnitsPerVoxel *= 2.0f; - } - mipLevelIndex--; - - // Add object data for the GPU buffer - uint16 dataIndex = _objectsBufferCount++; - ObjectRasterizeData objectData; - Matrix localToWorld, worldToLocal, volumeToWorld; - Matrix::Transformation(object.LocalToWorld.Scale, object.LocalToWorld.Orientation, object.LocalToWorld.Translation - _sdfData->Origin, localToWorld); - Matrix::Invert(localToWorld, worldToLocal); - BoundingBox localVolumeBounds(object.SDF->LocalBoundsMin, object.SDF->LocalBoundsMax); - Float3 volumeLocalBoundsExtent = localVolumeBounds.GetSize() * 0.5f; - Matrix worldToVolume = worldToLocal * Matrix::Translation(-(localVolumeBounds.Minimum + volumeLocalBoundsExtent)); - Matrix::Invert(worldToVolume, volumeToWorld); - objectData.WorldToVolume.SetMatrixTranspose(worldToVolume); - objectData.VolumeToWorld.SetMatrixTranspose(volumeToWorld); - objectData.VolumeLocalBoundsExtent = volumeLocalBoundsExtent; - objectData.VolumeToUVWMul = object.SDF->LocalToUVWMul; - objectData.VolumeToUVWAdd = object.SDF->LocalToUVWAdd + (localVolumeBounds.Minimum + volumeLocalBoundsExtent) * object.SDF->LocalToUVWMul; - objectData.MipOffset = (float)mipLevelIndex; - objectData.DecodeMul = 2.0f * object.SDF->MaxDistance; - objectData.DecodeAdd = -object.SDF->MaxDistance; - _objectsBuffer->Write(objectData); - _objectsTextures.Add(object.SDF->Texture->ViewVolume()); - _sdfData->ObjectTypes.Add(object.Actor->GetTypeHandle()); - - // Cache the mapping - objectIndexToDataIndexCache.Add(objectIndex, dataIndex); - } - for (int32 i = 0; i < chunk.HeightfieldsCount; i++) - { - auto objectIndex = chunk.Heightfields[i]; - if (objectIndexToDataIndexCache.ContainsKey(objectIndex)) - continue; - const auto& object = 
rasterizeObjectsCache.Get()[objectIndex]; - - // Add object data for the GPU buffer - uint16 dataIndex = _objectsBufferCount++; - ObjectRasterizeData objectData; - Matrix localToWorld, worldToLocal; - Matrix::Transformation(object.LocalToWorld.Scale, object.LocalToWorld.Orientation, object.LocalToWorld.Translation - _sdfData->Origin, localToWorld); - Matrix::Invert(localToWorld, worldToLocal); - objectData.WorldToVolume.SetMatrixTranspose(worldToLocal); - objectData.VolumeToWorld.SetMatrixTranspose(localToWorld); - objectData.VolumeToUVWMul = Float3(object.LocalToUV.X, 1.0f, object.LocalToUV.Y); - objectData.VolumeToUVWAdd = Float3(object.LocalToUV.Z, 0.0f, object.LocalToUV.W); - objectData.MipOffset = (float)_cascadeIndex * 0.5f; // Use lower-quality mip for far cascades - _objectsBuffer->Write(objectData); - _objectsTextures.Add(object.Heightfield->View()); - _sdfData->ObjectTypes.Add(object.Actor->GetTypeHandle()); - - // Cache the mapping - objectIndexToDataIndexCache.Add(objectIndex, dataIndex); - } - } - - // Flush buffer + // Flush buffer but don't allocate any CPU memory by swapping Data pointer with the cascade ObjectsData + PROFILE_CPU_NAMED("Update Objects"); + _objectsBuffer->Data.Swap(cascade.ObjectsData); _objectsBuffer->Flush(context); + _objectsBuffer->Data.Swap(cascade.ObjectsData); } context->BindSR(0, _objectsBuffer->GetBuffer() ? 
_objectsBuffer->GetBuffer()->View() : nullptr); // Rasterize non-empty chunks (first layer so can override existing chunk data) - for (const auto& e : chunks) + for (const auto& e : cascade.Chunks) { if (e.Key.Layer != 0) continue; @@ -739,7 +870,7 @@ bool GlobalSignDistanceFieldPass::Render(RenderContext& renderContext, GPUContex { auto objectIndex = objectIndexToDataIndex.At(chunk.Models[i]); data.Objects[i] = objectIndex; - context->BindSR(i + 1, _objectsTextures[objectIndex]); + context->BindSR(i + 1, objectsTextures[objectIndex]); } for (int32 i = chunk.ModelsCount; i < GLOBAL_SDF_RASTERIZE_HEIGHTFIELD_MAX_COUNT; i++) context->UnBindSR(i + 1); @@ -758,7 +889,7 @@ bool GlobalSignDistanceFieldPass::Render(RenderContext& renderContext, GPUContex { auto objectIndex = objectIndexToDataIndex.At(chunk.Heightfields[i]); data.Objects[i] = objectIndex; - context->BindSR(i + 1, _objectsTextures[objectIndex]); + context->BindSR(i + 1, objectsTextures[objectIndex]); } for (int32 i = chunk.HeightfieldsCount; i < GLOBAL_SDF_RASTERIZE_HEIGHTFIELD_MAX_COUNT; i++) context->UnBindSR(i + 1); @@ -774,21 +905,21 @@ bool GlobalSignDistanceFieldPass::Render(RenderContext& renderContext, GPUContex int32 count = chunk.ModelsCount + chunk.HeightfieldsCount; RasterizeChunkKey tmp = e.Key; tmp.NextLayer(); - while (chunks.ContainsKey(tmp)) + while (cascade.Chunks.ContainsKey(tmp)) { - count += chunks[tmp].ModelsCount + chunks[tmp].HeightfieldsCount; + count += cascade.Chunks[tmp].ModelsCount + cascade.Chunks[tmp].HeightfieldsCount; tmp.NextLayer(); } - Float3 chunkMin = cascadeBounds.Minimum + Float3(e.Key.Coord) * cascadeChunkSize; - BoundingBox chunkBounds(chunkMin, chunkMin + cascadeChunkSize); + Float3 chunkMin = cascade.Bounds.Minimum + Float3(e.Key.Coord) * cascade.ChunkSize; + BoundingBox chunkBounds(chunkMin, chunkMin + cascade.ChunkSize); DebugDraw::DrawWireBox(chunkBounds, Color::Red, 0, false); DebugDraw::DrawText(StringUtils::ToString(count), chunkBounds.GetCenter(), 
Color::Red); } #endif } - // Rasterize non-empty chunks (additive layers so so need combine with existing chunk data) - for (const auto& e : chunks) + // Rasterize non-empty chunks (additive layers so need combine with existing chunk data) + for (const auto& e : cascade.Chunks) { if (e.Key.Layer == 0) continue; @@ -802,7 +933,7 @@ bool GlobalSignDistanceFieldPass::Render(RenderContext& renderContext, GPUContex { auto objectIndex = objectIndexToDataIndex.At(chunk.Models[i]); data.Objects[i] = objectIndex; - context->BindSR(i + 1, _objectsTextures[objectIndex]); + context->BindSR(i + 1, objectsTextures[objectIndex]); } for (int32 i = chunk.ModelsCount; i < GLOBAL_SDF_RASTERIZE_HEIGHTFIELD_MAX_COUNT; i++) context->UnBindSR(i + 1); @@ -818,7 +949,7 @@ bool GlobalSignDistanceFieldPass::Render(RenderContext& renderContext, GPUContex { auto objectIndex = objectIndexToDataIndex.At(chunk.Heightfields[i]); data.Objects[i] = objectIndex; - context->BindSR(i + 1, _objectsTextures[objectIndex]); + context->BindSR(i + 1, objectsTextures[objectIndex]); } for (int32 i = chunk.HeightfieldsCount; i < GLOBAL_SDF_RASTERIZE_HEIGHTFIELD_MAX_COUNT; i++) context->UnBindSR(i + 1); @@ -831,13 +962,13 @@ bool GlobalSignDistanceFieldPass::Render(RenderContext& renderContext, GPUContex } // Generate mip out of cascade (empty chunks have distance value 1 which is incorrect so mip will be used as a fallback - lower res) - if (updated || anyChunkDispatch) + if (reset || anyChunkDispatch) { PROFILE_GPU_CPU_NAMED("Generate Mip"); context->ResetUA(); const int32 mipDispatchGroups = Math::DivideAndRoundUp(resolutionMip, GLOBAL_SDF_MIP_GROUP_SIZE); static_assert((GLOBAL_SDF_MIP_FLOODS % 2) == 1, "Invalid Global SDF mip flood iterations count."); - int32 floodFillIterations = chunks.Count() == 0 ? 1 : GLOBAL_SDF_MIP_FLOODS; + int32 floodFillIterations = cascade.Chunks.Count() == 0 ? 
1 : GLOBAL_SDF_MIP_FLOODS; if (!tmpMip) { // Use temporary texture to flood fill mip @@ -850,7 +981,6 @@ bool GlobalSignDistanceFieldPass::Render(RenderContext& renderContext, GPUContex GPUTextureView* tmpMipView = tmpMip->ViewVolume(); // Tex -> Mip - // TODO: use push constants on DX12/Vulkan to provide those 4 uints to the shader data.GenerateMipTexResolution = data.CascadeResolution; data.GenerateMipCoordScale = data.CascadeMipFactor; data.GenerateMipTexOffsetX = data.CascadeIndex * data.CascadeResolution; @@ -903,7 +1033,7 @@ bool GlobalSignDistanceFieldPass::Render(RenderContext& renderContext, GPUContex for (int32 cascadeIndex = 0; cascadeIndex < cascadesCount; cascadeIndex++) { auto& cascade = sdfData.Cascades[cascadeIndex]; - const float cascadeDistance = distanceExtent * cascadesDistanceScales[cascadeIndex]; + const float cascadeDistance = distanceExtent * sdfData.CascadesDistanceScales[cascadeIndex]; const float cascadeMaxDistance = cascadeDistance * 2; const float cascadeVoxelSize = cascadeMaxDistance / (float)resolution; const Float3 center = cascade.Position; @@ -952,26 +1082,33 @@ void GlobalSignDistanceFieldPass::RenderDebug(RenderContext& renderContext, GPUC context->DrawFullscreenTriangle(); } +void GlobalSignDistanceFieldPass::GetCullingData(BoundingBox& bounds) const +{ + auto& cascade = *CurrentCascade.Get(); + bounds = cascade.CullingBounds; +} + void GlobalSignDistanceFieldPass::RasterizeModelSDF(Actor* actor, const ModelBase::SDFData& sdf, const Transform& localToWorld, const BoundingBox& objectBounds) { if (!sdf.Texture) return; + auto& cascade = *CurrentCascade.Get(); const bool dynamic = !GLOBAL_SDF_ACTOR_IS_STATIC(actor); const int32 residentMipLevels = sdf.Texture->ResidentMipLevels(); if (residentMipLevels != 0) { // Setup object data BoundingBox objectBoundsCascade; - Vector3::Clamp(objectBounds.Minimum + _sdfDataOriginMin, _cascadeBounds.Minimum, _cascadeBounds.Maximum, objectBoundsCascade.Minimum); - 
Vector3::Subtract(objectBoundsCascade.Minimum, _cascadeBounds.Minimum, objectBoundsCascade.Minimum); - Vector3::Clamp(objectBounds.Maximum + _sdfDataOriginMax, _cascadeBounds.Minimum, _cascadeBounds.Maximum, objectBoundsCascade.Maximum); - Vector3::Subtract(objectBoundsCascade.Maximum, _cascadeBounds.Minimum, objectBoundsCascade.Maximum); - const Int3 objectChunkMin(objectBoundsCascade.Minimum / _chunkSize); - const Int3 objectChunkMax(objectBoundsCascade.Maximum / _chunkSize); + Vector3::Clamp(objectBounds.Minimum + cascade.OriginMin, cascade.RasterizeBounds.Minimum, cascade.RasterizeBounds.Maximum, objectBoundsCascade.Minimum); + Vector3::Subtract(objectBoundsCascade.Minimum, cascade.RasterizeBounds.Minimum, objectBoundsCascade.Minimum); + Vector3::Clamp(objectBounds.Maximum + cascade.OriginMax, cascade.RasterizeBounds.Minimum, cascade.RasterizeBounds.Maximum, objectBoundsCascade.Maximum); + Vector3::Subtract(objectBoundsCascade.Maximum, cascade.RasterizeBounds.Minimum, objectBoundsCascade.Maximum); + const Int3 objectChunkMin(objectBoundsCascade.Minimum / cascade.ChunkSize); + const Int3 objectChunkMax(objectBoundsCascade.Maximum / cascade.ChunkSize); // Add object data - const uint16 dataIndex = RasterizeObjectsCache.Count(); - auto& data = RasterizeObjectsCache.AddOne(); + const uint16 dataIndex = cascade.RasterizeObjects.Count(); + auto& data = cascade.RasterizeObjects.AddOne(); data.Actor = actor; data.SDF = &sdf; data.LocalToWorld = localToWorld; @@ -979,7 +1116,7 @@ void GlobalSignDistanceFieldPass::RasterizeModelSDF(Actor* actor, const ModelBas // Inject object into the intersecting cascade chunks RasterizeChunkKey key; - auto& chunks = ChunksCache; + auto& chunks = cascade.Chunks; for (key.Coord.Z = objectChunkMin.Z; key.Coord.Z <= objectChunkMax.Z; key.Coord.Z++) { for (key.Coord.Y = objectChunkMin.Y; key.Coord.Y <= objectChunkMax.Y; key.Coord.Y++) @@ -1005,11 +1142,9 @@ void GlobalSignDistanceFieldPass::RasterizeModelSDF(Actor* actor, const ModelBas } 
// Track streaming for textures used in static chunks to invalidate cache - if (!dynamic && residentMipLevels != sdf.Texture->MipLevels() && !_sdfData->SDFTextures.Contains(sdf.Texture)) + if (!dynamic && residentMipLevels != sdf.Texture->MipLevels() && !Current->SDFTextures.Contains(sdf.Texture)) { - sdf.Texture->Deleted.Bind(_sdfData); - sdf.Texture->ResidentMipsChanged.Bind(_sdfData); - _sdfData->SDFTextures.Add(sdf.Texture); + cascade.PendingSDFTextures.Add(sdf.Texture); } } @@ -1017,22 +1152,23 @@ void GlobalSignDistanceFieldPass::RasterizeHeightfield(Actor* actor, GPUTexture* { if (!heightfield) return; + auto& cascade = *CurrentCascade.Get(); const bool dynamic = !GLOBAL_SDF_ACTOR_IS_STATIC(actor); const int32 residentMipLevels = heightfield->ResidentMipLevels(); if (residentMipLevels != 0) { // Setup object data BoundingBox objectBoundsCascade; - Vector3::Clamp(objectBounds.Minimum + _sdfDataOriginMin, _cascadeBounds.Minimum, _cascadeBounds.Maximum, objectBoundsCascade.Minimum); - Vector3::Subtract(objectBoundsCascade.Minimum, _cascadeBounds.Minimum, objectBoundsCascade.Minimum); - Vector3::Clamp(objectBounds.Maximum + _sdfDataOriginMax, _cascadeBounds.Minimum, _cascadeBounds.Maximum, objectBoundsCascade.Maximum); - Vector3::Subtract(objectBoundsCascade.Maximum, _cascadeBounds.Minimum, objectBoundsCascade.Maximum); - const Int3 objectChunkMin(objectBoundsCascade.Minimum / _chunkSize); - const Int3 objectChunkMax(objectBoundsCascade.Maximum / _chunkSize); + Vector3::Clamp(objectBounds.Minimum + cascade.OriginMin, cascade.RasterizeBounds.Minimum, cascade.RasterizeBounds.Maximum, objectBoundsCascade.Minimum); + Vector3::Subtract(objectBoundsCascade.Minimum, cascade.RasterizeBounds.Minimum, objectBoundsCascade.Minimum); + Vector3::Clamp(objectBounds.Maximum + cascade.OriginMax, cascade.RasterizeBounds.Minimum, cascade.RasterizeBounds.Maximum, objectBoundsCascade.Maximum); + Vector3::Subtract(objectBoundsCascade.Maximum, cascade.RasterizeBounds.Minimum, 
objectBoundsCascade.Maximum); + const Int3 objectChunkMin(objectBoundsCascade.Minimum / cascade.ChunkSize); + const Int3 objectChunkMax(objectBoundsCascade.Maximum / cascade.ChunkSize); // Add object data - const uint16 dataIndex = RasterizeObjectsCache.Count(); - auto& data = RasterizeObjectsCache.AddOne(); + const uint16 dataIndex = cascade.RasterizeObjects.Count(); + auto& data = cascade.RasterizeObjects.AddOne(); data.Actor = actor; data.Heightfield = heightfield; data.LocalToWorld = localToWorld; @@ -1041,7 +1177,7 @@ void GlobalSignDistanceFieldPass::RasterizeHeightfield(Actor* actor, GPUTexture* // Inject object into the intersecting cascade chunks RasterizeChunkKey key; - auto& chunks = ChunksCache; + auto& chunks = cascade.Chunks; for (key.Coord.Z = objectChunkMin.Z; key.Coord.Z <= objectChunkMax.Z; key.Coord.Z++) { for (key.Coord.Y = objectChunkMin.Y; key.Coord.Y <= objectChunkMax.Y; key.Coord.Y++) @@ -1067,10 +1203,8 @@ void GlobalSignDistanceFieldPass::RasterizeHeightfield(Actor* actor, GPUTexture* } // Track streaming for textures used in static chunks to invalidate cache - if (!dynamic && residentMipLevels != heightfield->MipLevels() && !_sdfData->SDFTextures.Contains(heightfield)) + if (!dynamic && residentMipLevels != heightfield->MipLevels() && !Current->SDFTextures.Contains(heightfield)) { - heightfield->Deleted.Bind(_sdfData); - heightfield->ResidentMipsChanged.Bind(_sdfData); - _sdfData->SDFTextures.Add(heightfield); + cascade.PendingSDFTextures.Add(heightfield); } } diff --git a/Source/Engine/Renderer/GlobalSignDistanceFieldPass.h b/Source/Engine/Renderer/GlobalSignDistanceFieldPass.h index 71ee97bea..104aae790 100644 --- a/Source/Engine/Renderer/GlobalSignDistanceFieldPass.h +++ b/Source/Engine/Renderer/GlobalSignDistanceFieldPass.h @@ -39,20 +39,15 @@ private: GPUShaderProgramCS* _csGenerateMip = nullptr; GPUConstantBuffer* _cb0 = nullptr; GPUConstantBuffer* _cb1 = nullptr; - - // Rasterization cache class DynamicStructuredBuffer* 
_objectsBuffer = nullptr; - Array _objectsTextures; - uint16 _objectsBufferCount; - int32 _cascadeIndex; - float _voxelSize, _chunkSize; - BoundingBox _cascadeBounds; - BoundingBox _cascadeCullingBounds; - class GlobalSignDistanceFieldCustomBuffer* _sdfData; - Vector3 _sdfDataOriginMin; - Vector3 _sdfDataOriginMax; public: + /// + /// Calls drawing scene objects in async early in the frame. + /// + /// The rendering context batch. + void OnCollectDrawCalls(RenderContextBatch& renderContextBatch); + /// /// Gets the Global SDF (only if enabled in Graphics Settings). /// @@ -78,10 +73,7 @@ public: /// The output buffer. void RenderDebug(RenderContext& renderContext, GPUContext* context, GPUTexture* output); - void GetCullingData(BoundingBox& bounds) const - { - bounds = _cascadeCullingBounds; - } + void GetCullingData(BoundingBox& bounds) const; // Rasterize Model SDF into the Global SDF. Call it from actor Draw() method during DrawPass::GlobalSDF. void RasterizeModelSDF(Actor* actor, const ModelBase::SDFData& sdf, const Transform& localToWorld, const BoundingBox& objectBounds); diff --git a/Source/Engine/Renderer/Renderer.cpp b/Source/Engine/Renderer/Renderer.cpp index b33e3ad47..d6fbb17cc 100644 --- a/Source/Engine/Renderer/Renderer.cpp +++ b/Source/Engine/Renderer/Renderer.cpp @@ -409,6 +409,8 @@ void RenderInner(SceneRenderTask* task, RenderContext& renderContext, RenderCont JobSystem::SetJobStartingOnDispatch(false); task->OnCollectDrawCalls(renderContextBatch, SceneRendering::DrawCategory::SceneDraw); task->OnCollectDrawCalls(renderContextBatch, SceneRendering::DrawCategory::SceneDrawAsync); + if (setup.UseGlobalSDF) + GlobalSignDistanceFieldPass::Instance()->OnCollectDrawCalls(renderContextBatch); if (setup.UseGlobalSurfaceAtlas) GlobalSurfaceAtlasPass::Instance()->OnCollectDrawCalls(renderContextBatch); From 91d3216a005b4179ce4448d2537d80ae72ad9126 Mon Sep 17 00:00:00 2001 From: Wojtek Figat Date: Wed, 26 Jun 2024 18:19:36 +0200 Subject: [PATCH 09/11] Fix 
crash on shutdown when physical material asset was left alone --- Source/Engine/Physics/PhysX/PhysicsBackendPhysX.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Source/Engine/Physics/PhysX/PhysicsBackendPhysX.cpp b/Source/Engine/Physics/PhysX/PhysicsBackendPhysX.cpp index 80d264857..8bdb87122 100644 --- a/Source/Engine/Physics/PhysX/PhysicsBackendPhysX.cpp +++ b/Source/Engine/Physics/PhysX/PhysicsBackendPhysX.cpp @@ -4476,6 +4476,8 @@ void PhysicsBackend::DestroyController(void* controller) void PhysicsBackend::DestroyMaterial(void* material) { + if (!PhysX) + return; // Skip when called by Content unload after Physics is disposed ASSERT_LOW_LAYER(material); auto materialPhysX = (PxMaterial*)material; materialPhysX->userData = nullptr; @@ -4486,6 +4488,8 @@ void PhysicsBackend::DestroyMaterial(void* material) void PhysicsBackend::DestroyObject(void* object) { + if (!PhysX) + return; // Skip when called by Content unload after Physics is disposed ASSERT_LOW_LAYER(object); auto objectPhysX = (PxBase*)object; FlushLocker.Lock(); From 8eaa635385f5bf96ad6d9c024e57684963a5f2bf Mon Sep 17 00:00:00 2001 From: Wojtek Figat Date: Wed, 26 Jun 2024 18:19:48 +0200 Subject: [PATCH 10/11] Minor tweaks to GlobalSA --- .../Engine/Renderer/GI/GlobalSurfaceAtlasPass.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/Source/Engine/Renderer/GI/GlobalSurfaceAtlasPass.cpp b/Source/Engine/Renderer/GI/GlobalSurfaceAtlasPass.cpp index a64a06635..e76c19b4e 100644 --- a/Source/Engine/Renderer/GI/GlobalSurfaceAtlasPass.cpp +++ b/Source/Engine/Renderer/GI/GlobalSurfaceAtlasPass.cpp @@ -179,7 +179,7 @@ public: void ClearObjects() { - WaitForDrawActors(); + WaitForDrawing(); CulledObjectsCounterIndex = -1; CulledObjectsUsageHistory.Clear(); LastFrameAtlasDefragmentation = Engine::FrameCount; @@ -251,7 +251,7 @@ public: } } - void StartDrawActors(const RenderContext& renderContext, bool enableAsync = false) + void StartDrawing(const RenderContext& 
renderContext, bool enableAsync = false) { if (AsyncDrawWaitLabels.HasItems()) return; // Already started earlier this frame @@ -260,6 +260,7 @@ public: GetOptions(renderContext, resolution, distance); if (Resolution != resolution) return; // Not yet initialized + PROFILE_CPU(); const auto currentFrame = Engine::FrameCount; { // Perform atlas defragmentation if needed @@ -323,7 +324,7 @@ public: } } - void WaitForDrawActors() + void WaitForDrawing() { for (int64 label : AsyncDrawWaitLabels) JobSystem::Wait(label); @@ -655,7 +656,7 @@ void GlobalSurfaceAtlasPass::Dispose() void GlobalSurfaceAtlasPass::OnCollectDrawCalls(RenderContextBatch& renderContextBatch) { // Check if Global Surface Atlas will be used this frame - PROFILE_GPU_CPU_NAMED("Global Surface Atlas"); + PROFILE_CPU_NAMED("Global Surface Atlas"); if (checkIfSkipPass()) return; RenderContext& renderContext = renderContextBatch.GetMainContext(); @@ -668,7 +669,7 @@ void GlobalSurfaceAtlasPass::OnCollectDrawCalls(RenderContextBatch& renderContex return; auto& surfaceAtlasData = *renderContext.Buffers->GetCustomBuffer(TEXT("GlobalSurfaceAtlas")); _surfaceAtlasData = &surfaceAtlasData; - surfaceAtlasData.StartDrawActors(renderContext, renderContextBatch.EnableAsync); + surfaceAtlasData.StartDrawing(renderContext, renderContextBatch.EnableAsync); } bool GlobalSurfaceAtlasPass::Render(RenderContext& renderContext, GPUContext* context, BindingData& result) @@ -739,8 +740,8 @@ bool GlobalSurfaceAtlasPass::Render(RenderContext& renderContext, GPUContext* co // Ensure that async objects drawing ended _surfaceAtlasData = &surfaceAtlasData; - surfaceAtlasData.StartDrawActors(renderContext); // (ignored if not started earlier this frame) - surfaceAtlasData.WaitForDrawActors(); + surfaceAtlasData.StartDrawing(renderContext); // (ignored if not started earlier this frame) + surfaceAtlasData.WaitForDrawing(); // Utility for writing into tiles vertex buffer const Float2 posToClipMul(2.0f * resolutionInv, -2.0f * 
resolutionInv); From 7b5edc363a7b064a7c233e92d757e735247e061a Mon Sep 17 00:00:00 2001 From: Wojtek Figat Date: Thu, 27 Jun 2024 09:29:09 +0200 Subject: [PATCH 11/11] Fix GlobalSDF update when not using workload spread and add dispatches count to profiler zone data --- .../Renderer/GlobalSignDistanceFieldPass.cpp | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/Source/Engine/Renderer/GlobalSignDistanceFieldPass.cpp b/Source/Engine/Renderer/GlobalSignDistanceFieldPass.cpp index 7a676f584..8e176f849 100644 --- a/Source/Engine/Renderer/GlobalSignDistanceFieldPass.cpp +++ b/Source/Engine/Renderer/GlobalSignDistanceFieldPass.cpp @@ -300,10 +300,10 @@ public: FrameIndex = 0; AsyncRenderContext = renderContext; AsyncRenderContext.View.Pass = DrawPass::GlobalSDF; - const bool useCache = !reset && !GLOBAL_SDF_DEBUG_FORCE_REDRAW && GPU_SPREAD_WORKLOAD; + const bool useCache = !reset && !GLOBAL_SDF_DEBUG_FORCE_REDRAW; static_assert(GLOBAL_SDF_RASTERIZE_CHUNK_SIZE % GLOBAL_SDF_RASTERIZE_GROUP_SIZE == 0, "Invalid chunk size for Global SDF rasterization group size."); const int32 rasterizeChunks = Math::CeilToInt((float)resolution / (float)GLOBAL_SDF_RASTERIZE_CHUNK_SIZE); - const bool updateEveryFrame = false; // true if update all cascades every frame + const bool updateEveryFrame = !GPU_SPREAD_WORKLOAD; // true if update all cascades every frame const int32 maxCascadeUpdatesPerFrame = 1; // maximum cascades to update at a single frame // Rasterize world geometry into Global SDF @@ -462,19 +462,19 @@ void GlobalSignDistanceFieldCustomBuffer::UpdateCascadeChunks(CascadeData& casca continue; if (e.Value.Dynamic) { - // Remove static chunk with dynamic objects + // Remove static chunk if it contains any dynamic object cascade.StaticChunks.Remove(e.Key); } else if (cascade.StaticChunks.Contains(e.Key)) { - // Skip updating static chunk + // Remove chunk from update since it's static auto key = e.Key; while (cascade.Chunks.Remove(key))
key.NextLayer(); } else { - // Add to cache (render now but skip next frame) + // Add to static cache (render now but skip next frame) cascade.StaticChunks.Add(e.Key); } } @@ -823,7 +823,7 @@ bool GlobalSignDistanceFieldPass::Render(RenderContext& renderContext, GPUContex context->BindUA(0, textureView); context->BindCB(1, _cb1); constexpr int32 chunkDispatchGroups = GLOBAL_SDF_RASTERIZE_CHUNK_SIZE / GLOBAL_SDF_RASTERIZE_GROUP_SIZE; - bool anyChunkDispatch = false; + int32 chunkDispatches = 0; if (!reset) { PROFILE_GPU_CPU_NAMED("Clear Chunks"); @@ -838,9 +838,10 @@ bool GlobalSignDistanceFieldPass::Render(RenderContext& renderContext, GPUContex data.ChunkCoord = key.Coord * GLOBAL_SDF_RASTERIZE_CHUNK_SIZE; context->UpdateCB(_cb1, &data); context->Dispatch(_csClearChunk, chunkDispatchGroups, chunkDispatchGroups, chunkDispatchGroups); - anyChunkDispatch = true; + chunkDispatches++; // TODO: don't stall with UAV barrier on D3D12/Vulkan if UAVs don't change between dispatches } + ZoneValue(chunkDispatches); } { PROFILE_GPU_CPU_NAMED("Rasterize Chunks"); @@ -879,7 +880,7 @@ bool GlobalSignDistanceFieldPass::Render(RenderContext& renderContext, GPUContex context->UpdateCB(_cb1, &data); auto cs = data.ObjectsCount != 0 ? _csRasterizeModel0 : _csClearChunk; // Terrain-only chunk can be quickly cleared context->Dispatch(cs, chunkDispatchGroups, chunkDispatchGroups, chunkDispatchGroups); - anyChunkDispatch = true; + chunkDispatches++; // TODO: don't stall with UAV barrier on D3D12/Vulkan if UAVs don't change between dispatches (maybe cache per-shader write/read flags for all UAVs?) 
if (chunk.HeightfieldsCount != 0) @@ -896,6 +897,7 @@ bool GlobalSignDistanceFieldPass::Render(RenderContext& renderContext, GPUContex data.ObjectsCount = chunk.HeightfieldsCount; context->UpdateCB(_cb1, &data); context->Dispatch(_csRasterizeHeightfield, chunkDispatchGroups, chunkDispatchGroups, chunkDispatchGroups); + chunkDispatches++; } #if GLOBAL_SDF_DEBUG_CHUNKS @@ -940,6 +942,7 @@ bool GlobalSignDistanceFieldPass::Render(RenderContext& renderContext, GPUContex data.ObjectsCount = chunk.ModelsCount; context->UpdateCB(_cb1, &data); context->Dispatch(_csRasterizeModel1, chunkDispatchGroups, chunkDispatchGroups, chunkDispatchGroups); + chunkDispatches++; } if (chunk.HeightfieldsCount != 0) @@ -956,13 +959,15 @@ bool GlobalSignDistanceFieldPass::Render(RenderContext& renderContext, GPUContex data.ObjectsCount = chunk.HeightfieldsCount; context->UpdateCB(_cb1, &data); context->Dispatch(_csRasterizeHeightfield, chunkDispatchGroups, chunkDispatchGroups, chunkDispatchGroups); + chunkDispatches++; } - anyChunkDispatch = true; } + + ZoneValue(chunkDispatches); } // Generate mip out of cascade (empty chunks have distance value 1 which is incorrect so mip will be used as a fallback - lower res) - if (reset || anyChunkDispatch) + if (reset || chunkDispatches != 0) { PROFILE_GPU_CPU_NAMED("Generate Mip"); context->ResetUA();