From 80de56f469b4dcae7917dbf20b3bfb68d72364cf Mon Sep 17 00:00:00 2001 From: Wojtek Figat Date: Wed, 3 Sep 2025 23:12:55 +0200 Subject: [PATCH] Optimize Global SDF dispatches with UAV write overlaps 2-3x faster rasterization --- .../Renderer/GlobalSignDistanceFieldPass.cpp | 169 +++++++++--------- 1 file changed, 83 insertions(+), 86 deletions(-) diff --git a/Source/Engine/Renderer/GlobalSignDistanceFieldPass.cpp b/Source/Engine/Renderer/GlobalSignDistanceFieldPass.cpp index c8c997180..85502cc97 100644 --- a/Source/Engine/Renderer/GlobalSignDistanceFieldPass.cpp +++ b/Source/Engine/Renderer/GlobalSignDistanceFieldPass.cpp @@ -10,6 +10,7 @@ #include "Engine/Content/Content.h" #include "Engine/Graphics/GPUContext.h" #include "Engine/Graphics/GPUDevice.h" +#include "Engine/Graphics/GPUPass.h" #include "Engine/Graphics/Graphics.h" #include "Engine/Graphics/RenderTask.h" #include "Engine/Graphics/RenderBuffers.h" @@ -841,9 +842,10 @@ bool GlobalSignDistanceFieldPass::Render(RenderContext& renderContext, GPUContex context->BindCB(1, _cb1); constexpr int32 chunkDispatchGroups = GLOBAL_SDF_RASTERIZE_CHUNK_SIZE / GLOBAL_SDF_RASTERIZE_GROUP_SIZE; int32 chunkDispatches = 0; - if (!reset) + if (!reset && cascade.NonEmptyChunks.HasItems()) { PROFILE_GPU_CPU_NAMED("Clear Chunks"); + GPUComputePass pass(context); for (auto it = cascade.NonEmptyChunks.Begin(); it.IsNotEnd(); ++it) { auto& key = it->Item; @@ -856,7 +858,6 @@ bool GlobalSignDistanceFieldPass::Render(RenderContext& renderContext, GPUContex context->UpdateCB(_cb1, &data); context->Dispatch(_csClearChunk, chunkDispatchGroups, chunkDispatchGroups, chunkDispatchGroups); chunkDispatches++; - // TODO: don't stall with UAV barrier on D3D12/Vulkan if UAVs don't change between dispatches } ZoneValue(chunkDispatches); } @@ -877,106 +878,102 @@ bool GlobalSignDistanceFieldPass::Render(RenderContext& renderContext, GPUContex context->BindSR(0, _objectsBuffer->GetBuffer() ? _objectsBuffer->GetBuffer()->View() : nullptr); // Rasterize non-empty chunks (first layer so can override existing chunk data) - for (const auto& e : cascade.Chunks) + uint32 maxLayer = 0; { - if (e.Key.Layer != 0) - continue; - auto& chunk = e.Value; - cascade.NonEmptyChunks.Add(e.Key); - - for (int32 i = 0; i < chunk.ModelsCount; i++) + GPUComputePass pass(context); + for (const auto& e : cascade.Chunks) { - auto objectIndex = objectIndexToDataIndex.At(chunk.Models[i]); - data.Objects[i] = objectIndex; - context->BindSR(i + 1, objectsTextures[objectIndex]); - } - for (int32 i = chunk.ModelsCount; i < GLOBAL_SDF_RASTERIZE_HEIGHTFIELD_MAX_COUNT; i++) - context->UnBindSR(i + 1); - data.ChunkCoord = e.Key.Coord * GLOBAL_SDF_RASTERIZE_CHUNK_SIZE; - data.ObjectsCount = chunk.ModelsCount; - context->UpdateCB(_cb1, &data); - auto cs = data.ObjectsCount != 0 ? _csRasterizeModel0 : _csClearChunk; // Terrain-only chunk can be quickly cleared - context->Dispatch(cs, chunkDispatchGroups, chunkDispatchGroups, chunkDispatchGroups); - chunkDispatches++; - // TODO: don't stall with UAV barrier on D3D12/Vulkan if UAVs don't change between dispatches (maybe cache per-shader write/read flags for all UAVs?) + if (e.Key.Layer != 0) + continue; + auto& chunk = e.Value; + data.ChunkCoord = e.Key.Coord * GLOBAL_SDF_RASTERIZE_CHUNK_SIZE; + cascade.NonEmptyChunks.Add(e.Key); - if (chunk.HeightfieldsCount != 0) - { - // Inject heightfield (additive) - for (int32 i = 0; i < chunk.HeightfieldsCount; i++) - { - auto objectIndex = objectIndexToDataIndex.At(chunk.Heightfields[i]); - data.Objects[i] = objectIndex; - context->BindSR(i + 1, objectsTextures[objectIndex]); - } - for (int32 i = chunk.HeightfieldsCount; i < GLOBAL_SDF_RASTERIZE_HEIGHTFIELD_MAX_COUNT; i++) - context->UnBindSR(i + 1); - data.ObjectsCount = chunk.HeightfieldsCount; - context->UpdateCB(_cb1, &data); - context->Dispatch(_csRasterizeHeightfield, chunkDispatchGroups, chunkDispatchGroups, chunkDispatchGroups); - chunkDispatches++; - } - -#if GLOBAL_SDF_DEBUG_CHUNKS - // Debug draw chunk bounds in world space with number of models in it - if (cascadeIndex + 1 == GLOBAL_SDF_DEBUG_CHUNKS) - { - int32 count = chunk.ModelsCount + chunk.HeightfieldsCount; - RasterizeChunkKey tmp = e.Key; - tmp.NextLayer(); - while (cascade.Chunks.ContainsKey(tmp)) - { - count += cascade.Chunks[tmp].ModelsCount + cascade.Chunks[tmp].HeightfieldsCount; - tmp.NextLayer(); - } - Float3 chunkMin = cascade.Bounds.Minimum + Float3(e.Key.Coord) * cascade.ChunkSize; - BoundingBox chunkBounds(chunkMin, chunkMin + cascade.ChunkSize); - DebugDraw::DrawWireBox(chunkBounds, Color::Red, 0, false); - DebugDraw::DrawText(StringUtils::ToString(count), chunkBounds.GetCenter(), Color::Red); - } -#endif - } - - // Rasterize non-empty chunks (additive layers so need combine with existing chunk data) - for (const auto& e : cascade.Chunks) - { - if (e.Key.Layer == 0) - continue; - auto& chunk = e.Value; - data.ChunkCoord = e.Key.Coord * GLOBAL_SDF_RASTERIZE_CHUNK_SIZE; - - if (chunk.ModelsCount != 0) - { - // Inject models (additive) for (int32 i = 0; i < chunk.ModelsCount; i++) { auto objectIndex = objectIndexToDataIndex.At(chunk.Models[i]); data.Objects[i] = objectIndex; context->BindSR(i + 1, objectsTextures[objectIndex]); } - for (int32 i = chunk.ModelsCount; i < GLOBAL_SDF_RASTERIZE_HEIGHTFIELD_MAX_COUNT; i++) + for (int32 i = chunk.ModelsCount; i < GLOBAL_SDF_RASTERIZE_MODEL_MAX_COUNT; i++) context->UnBindSR(i + 1); data.ObjectsCount = chunk.ModelsCount; context->UpdateCB(_cb1, &data); - context->Dispatch(_csRasterizeModel1, chunkDispatchGroups, chunkDispatchGroups, chunkDispatchGroups); + auto cs = data.ObjectsCount != 0 ? _csRasterizeModel0 : _csClearChunk; // Terrain-only chunk can be quickly cleared + context->Dispatch(cs, chunkDispatchGroups, chunkDispatchGroups, chunkDispatchGroups); chunkDispatches++; - } - if (chunk.HeightfieldsCount != 0) - { - // Inject heightfields (additive) - for (int32 i = 0; i < chunk.HeightfieldsCount; i++) +#if GLOBAL_SDF_DEBUG_CHUNKS + // Debug draw chunk bounds in world space with number of models in it + if (cascadeIndex + 1 == GLOBAL_SDF_DEBUG_CHUNKS) { - auto objectIndex = objectIndexToDataIndex.At(chunk.Heightfields[i]); - data.Objects[i] = objectIndex; - context->BindSR(i + 1, objectsTextures[objectIndex]); + int32 count = chunk.ModelsCount + chunk.HeightfieldsCount; + RasterizeChunkKey tmp = e.Key; + tmp.NextLayer(); + while (cascade.Chunks.ContainsKey(tmp)) + { + count += cascade.Chunks[tmp].ModelsCount + cascade.Chunks[tmp].HeightfieldsCount; + tmp.NextLayer(); + } + Float3 chunkMin = cascade.Bounds.Minimum + Float3(e.Key.Coord) * cascade.ChunkSize; + BoundingBox chunkBounds(chunkMin, chunkMin + cascade.ChunkSize); + DebugDraw::DrawWireBox(chunkBounds, Color::Red, 0, false); + DebugDraw::DrawText(StringUtils::ToString(count), chunkBounds.GetCenter(), Color::Red); + } +#endif + } + } + +#if PLATFORM_WINDOWS + // Hack to fix D3D11 bug that doesn't insert UAV barrier after overlap region ends (between two GPUComputePass) + if (context->GetDevice()->GetRendererType() == RendererType::DirectX11) + context->Dispatch(_csRasterizeModel0, chunkDispatchGroups, chunkDispatchGroups, chunkDispatchGroups); +#endif + + // Rasterize non-empty chunks (additive layers so need combine with existing chunk data) + for (uint32 layer = 0; layer <= maxLayer; layer++) + { + GPUComputePass pass(context); + for (const auto& e : cascade.Chunks) + { + if (e.Key.Layer != layer) + continue; + auto& chunk = e.Value; + data.ChunkCoord = e.Key.Coord * GLOBAL_SDF_RASTERIZE_CHUNK_SIZE; + + if (chunk.ModelsCount != 0 && layer != 0) // Models from layer 0 has been already written + { + // Inject models (additive) + for (int32 i = 0; i < chunk.ModelsCount; i++) + { + auto objectIndex = objectIndexToDataIndex.At(chunk.Models[i]); + data.Objects[i] = objectIndex; + context->BindSR(i + 1, objectsTextures[objectIndex]); + } + for (int32 i = chunk.ModelsCount; i < GLOBAL_SDF_RASTERIZE_MODEL_MAX_COUNT; i++) + context->UnBindSR(i + 1); + data.ObjectsCount = chunk.ModelsCount; + context->UpdateCB(_cb1, &data); + context->Dispatch(_csRasterizeModel1, chunkDispatchGroups, chunkDispatchGroups, chunkDispatchGroups); + chunkDispatches++; + } + + if (chunk.HeightfieldsCount != 0) + { + // Inject heightfields (additive) + for (int32 i = 0; i < chunk.HeightfieldsCount; i++) + { + auto objectIndex = objectIndexToDataIndex.At(chunk.Heightfields[i]); + data.Objects[i] = objectIndex; + context->BindSR(i + 1, objectsTextures[objectIndex]); + } + for (int32 i = chunk.HeightfieldsCount; i < GLOBAL_SDF_RASTERIZE_HEIGHTFIELD_MAX_COUNT; i++) + context->UnBindSR(i + 1); + data.ObjectsCount = chunk.HeightfieldsCount; + context->UpdateCB(_cb1, &data); + context->Dispatch(_csRasterizeHeightfield, chunkDispatchGroups, chunkDispatchGroups, chunkDispatchGroups); + chunkDispatches++; } - for (int32 i = chunk.HeightfieldsCount; i < GLOBAL_SDF_RASTERIZE_HEIGHTFIELD_MAX_COUNT; i++) - context->UnBindSR(i + 1); - data.ObjectsCount = chunk.HeightfieldsCount; - context->UpdateCB(_cb1, &data); - context->Dispatch(_csRasterizeHeightfield, chunkDispatchGroups, chunkDispatchGroups, chunkDispatchGroups); - chunkDispatches++; } }