diff --git a/Source/Engine/Renderer/GI/GlobalSurfaceAtlasPass.cpp b/Source/Engine/Renderer/GI/GlobalSurfaceAtlasPass.cpp
index 65a5ba7d6..34b745e59 100644
--- a/Source/Engine/Renderer/GI/GlobalSurfaceAtlasPass.cpp
+++ b/Source/Engine/Renderer/GI/GlobalSurfaceAtlasPass.cpp
@@ -151,6 +151,7 @@ public:
     GPUBuffer* ChunksBuffer = nullptr;
     GPUBuffer* CulledObjectsBuffer = nullptr;
     DynamicTypedBuffer ObjectsBuffer;
+    DynamicTypedBuffer ObjectsListBuffer;
     int32 CulledObjectsCounterIndex = -1;
     GlobalSurfaceAtlasPass::BindingData Result;
     RectPackAtlas Atlas;
@@ -179,6 +180,7 @@ public:
 
     GlobalSurfaceAtlasCustomBuffer()
         : ObjectsBuffer(256 * (GLOBAL_SURFACE_ATLAS_OBJECT_DATA_STRIDE + GLOBAL_SURFACE_ATLAS_TILE_DATA_STRIDE * 3 / 4), PixelFormat::R32G32B32A32_Float, false, TEXT("GlobalSurfaceAtlas.ObjectsBuffer"))
+        , ObjectsListBuffer(0, PixelFormat::R32_UInt, false, TEXT("GlobalSurfaceAtlas.ObjectsListBuffer"))
     {
     }
 
@@ -403,6 +405,8 @@ public:
             PROFILE_CPU_NAMED("Write Objects");
             DirtyObjectsBuffer.Clear();
             ObjectsBuffer.Clear();
+            ObjectsListBuffer.Clear();
+            ObjectsListBuffer.Data.EnsureCapacity(Objects.Count() * sizeof(uint32));
             for (auto& e : Objects)
             {
                 auto& object = e.Value;
@@ -421,6 +425,7 @@ public:
 
                 // Write to objects buffer (this must match unpacking logic in HLSL)
                 uint32 objectAddress = ObjectsBuffer.Data.Count() / sizeof(Float4);
+                ObjectsListBuffer.Write(objectAddress);
                 auto* objectData = ObjectsBuffer.WriteReserve(GLOBAL_SURFACE_ATLAS_OBJECT_DATA_STRIDE);
                 objectData[0] = Float4(object.Position, object.Radius);
                 objectData[1] = Float4::Zero;
@@ -912,6 +917,7 @@ bool GlobalSurfaceAtlasPass::Render(RenderContext& renderContext, GPUContext* co
     {
         PROFILE_GPU_CPU_NAMED("Update Objects");
         surfaceAtlasData.ObjectsBuffer.Flush(context);
+        surfaceAtlasData.ObjectsListBuffer.Flush(context);
     }
 
     // Init constants
@@ -924,7 +930,7 @@ bool GlobalSurfaceAtlasPass::Render(RenderContext& renderContext, GPUContext* co
     bool notReady = false;
 
     // Cull objects into chunks (for faster Atlas sampling)
-    if (surfaceAtlasData.Objects.Count() != 0)
+    if (surfaceAtlasData.Objects.Count() != 0 && surfaceAtlasData.ChunksBuffer)
     {
         // Each chunk (ChunksBuffer) contains uint with address of the culled objects data start in CulledObjectsBuffer.
         // If chunk has address=0 then it's unused/empty.
@@ -935,55 +941,52 @@ bool GlobalSurfaceAtlasPass::Render(RenderContext& renderContext, GPUContext* co
         uint32 objectsBufferCapacity = (uint32)((float)surfaceAtlasData.Objects.Count() * 1.3f);
 
         // Copy counter from ChunksBuffer into staging buffer to access current chunks memory usage to adapt dynamically to the scene complexity
-        if (surfaceAtlasData.ChunksBuffer)
+        if (!_culledObjectsSizeBuffer)
         {
-            if (!_culledObjectsSizeBuffer)
+            Platform::MemoryClear(_culledObjectsSizeFrames, sizeof(_culledObjectsSizeFrames));
+            _culledObjectsSizeBuffer = GPUDevice::Instance->CreateBuffer(TEXT("GlobalSurfaceAtlas.CulledObjectsSizeBuffer"));
+            const GPUBufferDescription desc = GPUBufferDescription::Buffer(ARRAY_COUNT(_culledObjectsSizeFrames) * sizeof(uint32), GPUBufferFlags::None, PixelFormat::R32_UInt, _culledObjectsSizeFrames, sizeof(uint32), GPUResourceUsage::StagingReadback);
+            if (_culledObjectsSizeBuffer->Init(desc))
+                return true;
+        }
+        if (surfaceAtlasData.CulledObjectsCounterIndex != -1)
+        {
+            // Get the last counter value (accept staging readback delay or not available data yet)
+            notReady = true;
+            auto data = (uint32*)_culledObjectsSizeBuffer->Map(GPUResourceMapMode::Read);
+            if (data)
             {
-                Platform::MemoryClear(_culledObjectsSizeFrames, sizeof(_culledObjectsSizeFrames));
-                _culledObjectsSizeBuffer = GPUDevice::Instance->CreateBuffer(TEXT("GlobalSurfaceAtlas.CulledObjectsSizeBuffer"));
-                const GPUBufferDescription desc = GPUBufferDescription::Buffer(ARRAY_COUNT(_culledObjectsSizeFrames) * sizeof(uint32), GPUBufferFlags::None, PixelFormat::R32_UInt, _culledObjectsSizeFrames, sizeof(uint32), GPUResourceUsage::StagingReadback);
-                if (_culledObjectsSizeBuffer->Init(desc))
-                    return true;
-            }
-            if (surfaceAtlasData.CulledObjectsCounterIndex != -1)
-            {
-                // Get the last counter value (accept staging readback delay or not available data yet)
-                notReady = true;
-                auto data = (uint32*)_culledObjectsSizeBuffer->Map(GPUResourceMapMode::Read);
-                if (data)
+                uint32 counter = data[surfaceAtlasData.CulledObjectsCounterIndex];
+                if (counter > 0)
                 {
-                    uint32 counter = data[surfaceAtlasData.CulledObjectsCounterIndex];
-                    if (counter > 0)
-                    {
-                        objectsBufferCapacity = counter;
-                        notReady = false;
-                    }
-                    _culledObjectsSizeBuffer->Unmap();
-                }
-
-                // Allow to be ready if the buffer was already used
-                if (notReady && surfaceAtlasData.CulledObjectsBuffer && surfaceAtlasData.CulledObjectsBuffer->IsAllocated())
+                    objectsBufferCapacity = counter;
                     notReady = false;
+                }
+                _culledObjectsSizeBuffer->Unmap();
             }
-            if (surfaceAtlasData.CulledObjectsCounterIndex == -1)
+
+            // Allow to be ready if the buffer was already used
+            if (notReady && surfaceAtlasData.CulledObjectsBuffer && surfaceAtlasData.CulledObjectsBuffer->IsAllocated())
+                notReady = false;
+        }
+        if (surfaceAtlasData.CulledObjectsCounterIndex == -1)
+        {
+            // Find a free timer slot
+            notReady = true;
+            for (int32 i = 0; i < ARRAY_COUNT(_culledObjectsSizeFrames); i++)
             {
-                // Find a free timer slot
-                notReady = true;
-                for (int32 i = 0; i < ARRAY_COUNT(_culledObjectsSizeFrames); i++)
+                if (currentFrame - _culledObjectsSizeFrames[i] > GPU_ASYNC_LATENCY)
                 {
-                    if (currentFrame - _culledObjectsSizeFrames[i] > GPU_ASYNC_LATENCY)
-                    {
-                        surfaceAtlasData.CulledObjectsCounterIndex = i;
-                        break;
-                    }
+                    surfaceAtlasData.CulledObjectsCounterIndex = i;
+                    break;
                 }
             }
-            if (surfaceAtlasData.CulledObjectsCounterIndex != -1 && surfaceAtlasData.CulledObjectsBuffer)
-            {
-                // Copy current counter value
-                _culledObjectsSizeFrames[surfaceAtlasData.CulledObjectsCounterIndex] = currentFrame;
-                context->CopyBuffer(_culledObjectsSizeBuffer, surfaceAtlasData.CulledObjectsBuffer, sizeof(uint32), surfaceAtlasData.CulledObjectsCounterIndex * sizeof(uint32), 0);
-            }
+        }
+        if (surfaceAtlasData.CulledObjectsCounterIndex != -1 && surfaceAtlasData.CulledObjectsBuffer)
+        {
+            // Copy current counter value
+            _culledObjectsSizeFrames[surfaceAtlasData.CulledObjectsCounterIndex] = currentFrame;
+            context->CopyBuffer(_culledObjectsSizeBuffer, surfaceAtlasData.CulledObjectsBuffer, sizeof(uint32), surfaceAtlasData.CulledObjectsCounterIndex * sizeof(uint32), 0);
         }
 
         // Calculate optimal capacity for the objects buffer
@@ -1024,6 +1027,7 @@ bool GlobalSurfaceAtlasPass::Render(RenderContext& renderContext, GPUContext* co
         static_assert(GLOBAL_SURFACE_ATLAS_CHUNKS_RESOLUTION % GLOBAL_SURFACE_ATLAS_CHUNKS_GROUP_SIZE == 0, "Invalid chunks resolution/groups setting.");
         const int32 chunkDispatchGroups = GLOBAL_SURFACE_ATLAS_CHUNKS_RESOLUTION / GLOBAL_SURFACE_ATLAS_CHUNKS_GROUP_SIZE;
         context->BindSR(0, surfaceAtlasData.ObjectsBuffer.GetBuffer()->View());
+        context->BindSR(1, surfaceAtlasData.ObjectsListBuffer.GetBuffer()->View());
         context->BindUA(0, surfaceAtlasData.ChunksBuffer->View());
         context->BindUA(1, surfaceAtlasData.CulledObjectsBuffer->View());
         context->Dispatch(_csCullObjects, chunkDispatchGroups, chunkDispatchGroups, chunkDispatchGroups);
diff --git a/Source/Shaders/GI/GlobalSurfaceAtlas.shader b/Source/Shaders/GI/GlobalSurfaceAtlas.shader
index 2a762f5be..f8066549f 100644
--- a/Source/Shaders/GI/GlobalSurfaceAtlas.shader
+++ b/Source/Shaders/GI/GlobalSurfaceAtlas.shader
@@ -201,41 +201,67 @@ float4 PS_Lighting(AtlasVertexOutput input) : SV_Target
 RWByteAddressBuffer RWGlobalSurfaceAtlasChunks : register(u0);
 RWByteAddressBuffer RWGlobalSurfaceAtlasCulledObjects : register(u1);
 Buffer GlobalSurfaceAtlasObjects : register(t0);
+Buffer<uint> GlobalSurfaceAtlasObjectsList : register(t1);
 
-#define GLOBAL_SURFACE_ATLAS_CULL_LOCAL_SIZE 32 // Amount of objects to cache locally per-thread for culling
+#define GLOBAL_SURFACE_ATLAS_SHARED_CULL_SIZE 255 // Limit of objects that can be culled for a whole group of 4x4x4 threads (64 chunks)
+
+groupshared uint SharedCulledObjectsCount;
+groupshared uint SharedCulledObjects[GLOBAL_SURFACE_ATLAS_SHARED_CULL_SIZE];
 
 // Compute shader for culling objects into chunks
 META_CS(true, FEATURE_LEVEL_SM5)
 [numthreads(GLOBAL_SURFACE_ATLAS_CHUNKS_GROUP_SIZE, GLOBAL_SURFACE_ATLAS_CHUNKS_GROUP_SIZE, GLOBAL_SURFACE_ATLAS_CHUNKS_GROUP_SIZE)]
-void CS_CullObjects(uint3 DispatchThreadId : SV_DispatchThreadID)
+void CS_CullObjects(uint3 DispatchThreadId : SV_DispatchThreadID, uint3 GroupId : SV_GroupID, uint3 GroupThreadId : SV_GroupThreadID)
 {
     uint3 chunkCoord = DispatchThreadId;
     uint chunkAddress = (chunkCoord.z * (GLOBAL_SURFACE_ATLAS_CHUNKS_RESOLUTION * GLOBAL_SURFACE_ATLAS_CHUNKS_RESOLUTION) + chunkCoord.y * GLOBAL_SURFACE_ATLAS_CHUNKS_RESOLUTION + chunkCoord.x) * 4;
     float3 chunkMin = GlobalSurfaceAtlas.ViewPos + (chunkCoord - (GLOBAL_SURFACE_ATLAS_CHUNKS_RESOLUTION * 0.5f)) * GlobalSurfaceAtlas.ChunkSize;
-    float3 chunkMax = chunkMin + GlobalSurfaceAtlas.ChunkSize;
+    float3 chunkMax = chunkMin + GlobalSurfaceAtlas.ChunkSize.xxx;
+    uint groupIndex = (GroupThreadId.z * GLOBAL_SURFACE_ATLAS_CHUNKS_GROUP_SIZE + GroupThreadId.y) * GLOBAL_SURFACE_ATLAS_CHUNKS_GROUP_SIZE + GroupThreadId.x;
+    float3 groupMin = GlobalSurfaceAtlas.ViewPos + (GroupId * GLOBAL_SURFACE_ATLAS_CHUNKS_GROUP_SIZE - (GLOBAL_SURFACE_ATLAS_CHUNKS_RESOLUTION * 0.5f)) * GlobalSurfaceAtlas.ChunkSize;
+    float3 groupMax = groupMin + (GlobalSurfaceAtlas.ChunkSize * GLOBAL_SURFACE_ATLAS_CHUNKS_GROUP_SIZE).xxx;
 
-    // Count objects in this chunk
-    uint objectAddress = 0, objectsCount = 0;
-    // TODO: pre-cull objects within a thread group
-    uint localCulledObjects[GLOBAL_SURFACE_ATLAS_CULL_LOCAL_SIZE];
+    // Clear shared memory
+    if (groupIndex == 0)
+    {
+        SharedCulledObjectsCount = 0;
+    }
+    GroupMemoryBarrierWithGroupSync();
+
+    // Shared culling of all objects by all threads for a whole group
     LOOP
-    for (uint objectIndex = 0; objectIndex < GlobalSurfaceAtlas.ObjectsCount; objectIndex++)
+    for (uint objectIndex = groupIndex; objectIndex < GlobalSurfaceAtlas.ObjectsCount; objectIndex += GLOBAL_SURFACE_ATLAS_CHUNKS_GROUP_SIZE * GLOBAL_SURFACE_ATLAS_CHUNKS_GROUP_SIZE * GLOBAL_SURFACE_ATLAS_CHUNKS_GROUP_SIZE)
     {
+        uint objectAddress = GlobalSurfaceAtlasObjectsList.Load(objectIndex);
         float4 objectBounds = LoadGlobalSurfaceAtlasObjectBounds(GlobalSurfaceAtlasObjects, objectAddress);
-        uint objectSize = LoadGlobalSurfaceAtlasObjectDataSize(GlobalSurfaceAtlasObjects, objectAddress);
-        if (BoxIntersectsSphere(chunkMin, chunkMax, objectBounds.xyz, objectBounds.w))
+        if (BoxIntersectsSphere(groupMin, groupMax, objectBounds.xyz, objectBounds.w))
         {
-            localCulledObjects[objectsCount % GLOBAL_SURFACE_ATLAS_CULL_LOCAL_SIZE] = objectAddress;
-            objectsCount++;
+            uint sharedIndex;
+            InterlockedAdd(SharedCulledObjectsCount, 1, sharedIndex);
+            if (sharedIndex < GLOBAL_SURFACE_ATLAS_SHARED_CULL_SIZE)
+                SharedCulledObjects[sharedIndex] = objectAddress;
         }
-        objectAddress += objectSize;
     }
-    if (objectsCount == 0)
+    GroupMemoryBarrierWithGroupSync();
+
+    // Cull objects from the shared buffer against active thread's chunk (count is clamped as the counter keeps growing once the shared buffer is full)
+    uint objectsCount = 0;
+    LOOP
+    for (uint i = 0; i < min(SharedCulledObjectsCount, GLOBAL_SURFACE_ATLAS_SHARED_CULL_SIZE); i++)
     {
-        // Empty chunk
-        RWGlobalSurfaceAtlasChunks.Store(chunkAddress, 0);
-        return;
+        uint objectAddress = SharedCulledObjects[i];
+        float4 objectBounds = LoadGlobalSurfaceAtlasObjectBounds(GlobalSurfaceAtlasObjects, objectAddress);
+        if (BoxIntersectsSphere(chunkMin, chunkMax, objectBounds.xyz, objectBounds.w))
+        {
+            objectsCount++;
+        }
     }
+    if (objectsCount == 0)
+    {
+        // Empty chunk
+        RWGlobalSurfaceAtlasChunks.Store(chunkAddress, 0);
+        return;
+    }
 
     // Allocate object data size in the buffer
     uint objectsStart;
@@ -254,34 +280,17 @@ void CS_CullObjects(uint3 DispatchThreadId : SV_DispatchThreadID)
     // Write objects count before actual objects indices
     RWGlobalSurfaceAtlasCulledObjects.Store(objectsStart * 4, objectsCount);
 
-    // Copy objects data in this chunk
-    if (objectsCount <= GLOBAL_SURFACE_ATLAS_CULL_LOCAL_SIZE)
-    {
-        // Reuse locally cached objects
-        LOOP
-        for (uint objectIndex = 0; objectIndex < objectsCount; objectIndex++)
+    // Copy objects data in this chunk (cull from the shared buffer, using the same clamped range as the counting pass)
+    LOOP
+    for (uint i = 0; i < min(SharedCulledObjectsCount, GLOBAL_SURFACE_ATLAS_SHARED_CULL_SIZE); i++)
+    {
+        uint objectAddress = SharedCulledObjects[i];
+        float4 objectBounds = LoadGlobalSurfaceAtlasObjectBounds(GlobalSurfaceAtlasObjects, objectAddress);
+        if (BoxIntersectsSphere(chunkMin, chunkMax, objectBounds.xyz, objectBounds.w))
         {
-            objectAddress = localCulledObjects[objectIndex];
             objectsStart++;
             RWGlobalSurfaceAtlasCulledObjects.Store(objectsStart * 4, objectAddress);
         }
-    }
-    else
-    {
-        // Brute-force culling
-        objectAddress = 0;
-        LOOP
-        for (uint objectIndex = 0; objectIndex < GlobalSurfaceAtlas.ObjectsCount; objectIndex++)
-        {
-            float4 objectBounds = LoadGlobalSurfaceAtlasObjectBounds(GlobalSurfaceAtlasObjects, objectAddress);
-            uint objectSize = LoadGlobalSurfaceAtlasObjectDataSize(GlobalSurfaceAtlasObjects, objectAddress);
-            if (BoxIntersectsSphere(chunkMin, chunkMax, objectBounds.xyz, objectBounds.w))
-            {
-                objectsStart++;
-                RWGlobalSurfaceAtlasCulledObjects.Store(objectsStart * 4, objectAddress);
-            }
-            objectAddress += objectSize;
-        }
     }
 }
 