Optimize Global Surface Atlas objects culling with shared thread group pre-cull

This commit is contained in:
Wojtek Figat
2024-07-24 11:46:20 +02:00
parent 4976a2ef6a
commit 6f3d1cdd0a
2 changed files with 95 additions and 82 deletions

View File

@@ -151,6 +151,7 @@ public:
GPUBuffer* ChunksBuffer = nullptr;
GPUBuffer* CulledObjectsBuffer = nullptr;
DynamicTypedBuffer ObjectsBuffer;
DynamicTypedBuffer ObjectsListBuffer;
int32 CulledObjectsCounterIndex = -1;
GlobalSurfaceAtlasPass::BindingData Result;
RectPackAtlas<GlobalSurfaceAtlasTile> Atlas;
@@ -179,6 +180,7 @@ public:
GlobalSurfaceAtlasCustomBuffer()
: ObjectsBuffer(256 * (GLOBAL_SURFACE_ATLAS_OBJECT_DATA_STRIDE + GLOBAL_SURFACE_ATLAS_TILE_DATA_STRIDE * 3 / 4), PixelFormat::R32G32B32A32_Float, false, TEXT("GlobalSurfaceAtlas.ObjectsBuffer"))
, ObjectsListBuffer(0, PixelFormat::R32_UInt, false, TEXT("GlobalSurfaceAtlas.ObjectsListBuffer"))
{
}
@@ -403,6 +405,8 @@ public:
PROFILE_CPU_NAMED("Write Objects");
DirtyObjectsBuffer.Clear();
ObjectsBuffer.Clear();
ObjectsListBuffer.Clear();
ObjectsListBuffer.Data.EnsureCapacity(Objects.Count() * sizeof(uint32));
for (auto& e : Objects)
{
auto& object = e.Value;
@@ -421,6 +425,7 @@ public:
// Write to objects buffer (this must match unpacking logic in HLSL)
uint32 objectAddress = ObjectsBuffer.Data.Count() / sizeof(Float4);
ObjectsListBuffer.Write(objectAddress);
auto* objectData = ObjectsBuffer.WriteReserve<Float4>(GLOBAL_SURFACE_ATLAS_OBJECT_DATA_STRIDE);
objectData[0] = Float4(object.Position, object.Radius);
objectData[1] = Float4::Zero;
@@ -912,6 +917,7 @@ bool GlobalSurfaceAtlasPass::Render(RenderContext& renderContext, GPUContext* co
{
PROFILE_GPU_CPU_NAMED("Update Objects");
surfaceAtlasData.ObjectsBuffer.Flush(context);
surfaceAtlasData.ObjectsListBuffer.Flush(context);
}
// Init constants
@@ -924,7 +930,7 @@ bool GlobalSurfaceAtlasPass::Render(RenderContext& renderContext, GPUContext* co
bool notReady = false;
// Cull objects into chunks (for faster Atlas sampling)
if (surfaceAtlasData.Objects.Count() != 0)
if (surfaceAtlasData.Objects.Count() != 0 && surfaceAtlasData.ChunksBuffer)
{
// Each chunk (an entry in ChunksBuffer) contains a uint with the start address of its culled objects data in CulledObjectsBuffer.
// If chunk has address=0 then it's unused/empty.
@@ -935,55 +941,52 @@ bool GlobalSurfaceAtlasPass::Render(RenderContext& renderContext, GPUContext* co
uint32 objectsBufferCapacity = (uint32)((float)surfaceAtlasData.Objects.Count() * 1.3f);
// Copy counter from CulledObjectsBuffer into staging buffer to access current chunks memory usage to adapt dynamically to the scene complexity
if (surfaceAtlasData.ChunksBuffer)
if (!_culledObjectsSizeBuffer)
{
if (!_culledObjectsSizeBuffer)
Platform::MemoryClear(_culledObjectsSizeFrames, sizeof(_culledObjectsSizeFrames));
_culledObjectsSizeBuffer = GPUDevice::Instance->CreateBuffer(TEXT("GlobalSurfaceAtlas.CulledObjectsSizeBuffer"));
const GPUBufferDescription desc = GPUBufferDescription::Buffer(ARRAY_COUNT(_culledObjectsSizeFrames) * sizeof(uint32), GPUBufferFlags::None, PixelFormat::R32_UInt, _culledObjectsSizeFrames, sizeof(uint32), GPUResourceUsage::StagingReadback);
if (_culledObjectsSizeBuffer->Init(desc))
return true;
}
if (surfaceAtlasData.CulledObjectsCounterIndex != -1)
{
// Get the last counter value (accept staging readback delay or not available data yet)
notReady = true;
auto data = (uint32*)_culledObjectsSizeBuffer->Map(GPUResourceMapMode::Read);
if (data)
{
Platform::MemoryClear(_culledObjectsSizeFrames, sizeof(_culledObjectsSizeFrames));
_culledObjectsSizeBuffer = GPUDevice::Instance->CreateBuffer(TEXT("GlobalSurfaceAtlas.CulledObjectsSizeBuffer"));
const GPUBufferDescription desc = GPUBufferDescription::Buffer(ARRAY_COUNT(_culledObjectsSizeFrames) * sizeof(uint32), GPUBufferFlags::None, PixelFormat::R32_UInt, _culledObjectsSizeFrames, sizeof(uint32), GPUResourceUsage::StagingReadback);
if (_culledObjectsSizeBuffer->Init(desc))
return true;
}
if (surfaceAtlasData.CulledObjectsCounterIndex != -1)
{
// Get the last counter value (accept staging readback delay or not available data yet)
notReady = true;
auto data = (uint32*)_culledObjectsSizeBuffer->Map(GPUResourceMapMode::Read);
if (data)
uint32 counter = data[surfaceAtlasData.CulledObjectsCounterIndex];
if (counter > 0)
{
uint32 counter = data[surfaceAtlasData.CulledObjectsCounterIndex];
if (counter > 0)
{
objectsBufferCapacity = counter;
notReady = false;
}
_culledObjectsSizeBuffer->Unmap();
}
// Allow to be ready if the buffer was already used
if (notReady && surfaceAtlasData.CulledObjectsBuffer && surfaceAtlasData.CulledObjectsBuffer->IsAllocated())
objectsBufferCapacity = counter;
notReady = false;
}
_culledObjectsSizeBuffer->Unmap();
}
if (surfaceAtlasData.CulledObjectsCounterIndex == -1)
// Allow to be ready if the buffer was already used
if (notReady && surfaceAtlasData.CulledObjectsBuffer && surfaceAtlasData.CulledObjectsBuffer->IsAllocated())
notReady = false;
}
if (surfaceAtlasData.CulledObjectsCounterIndex == -1)
{
// Find a free counter readback slot (one not used within the last GPU_ASYNC_LATENCY frames)
notReady = true;
for (int32 i = 0; i < ARRAY_COUNT(_culledObjectsSizeFrames); i++)
{
// Find a free counter readback slot (one not used within the last GPU_ASYNC_LATENCY frames)
notReady = true;
for (int32 i = 0; i < ARRAY_COUNT(_culledObjectsSizeFrames); i++)
if (currentFrame - _culledObjectsSizeFrames[i] > GPU_ASYNC_LATENCY)
{
if (currentFrame - _culledObjectsSizeFrames[i] > GPU_ASYNC_LATENCY)
{
surfaceAtlasData.CulledObjectsCounterIndex = i;
break;
}
surfaceAtlasData.CulledObjectsCounterIndex = i;
break;
}
}
if (surfaceAtlasData.CulledObjectsCounterIndex != -1 && surfaceAtlasData.CulledObjectsBuffer)
{
// Copy current counter value
_culledObjectsSizeFrames[surfaceAtlasData.CulledObjectsCounterIndex] = currentFrame;
context->CopyBuffer(_culledObjectsSizeBuffer, surfaceAtlasData.CulledObjectsBuffer, sizeof(uint32), surfaceAtlasData.CulledObjectsCounterIndex * sizeof(uint32), 0);
}
}
if (surfaceAtlasData.CulledObjectsCounterIndex != -1 && surfaceAtlasData.CulledObjectsBuffer)
{
// Copy current counter value
_culledObjectsSizeFrames[surfaceAtlasData.CulledObjectsCounterIndex] = currentFrame;
context->CopyBuffer(_culledObjectsSizeBuffer, surfaceAtlasData.CulledObjectsBuffer, sizeof(uint32), surfaceAtlasData.CulledObjectsCounterIndex * sizeof(uint32), 0);
}
// Calculate optimal capacity for the objects buffer
@@ -1024,6 +1027,7 @@ bool GlobalSurfaceAtlasPass::Render(RenderContext& renderContext, GPUContext* co
static_assert(GLOBAL_SURFACE_ATLAS_CHUNKS_RESOLUTION % GLOBAL_SURFACE_ATLAS_CHUNKS_GROUP_SIZE == 0, "Invalid chunks resolution/groups setting.");
const int32 chunkDispatchGroups = GLOBAL_SURFACE_ATLAS_CHUNKS_RESOLUTION / GLOBAL_SURFACE_ATLAS_CHUNKS_GROUP_SIZE;
context->BindSR(0, surfaceAtlasData.ObjectsBuffer.GetBuffer()->View());
context->BindSR(1, surfaceAtlasData.ObjectsListBuffer.GetBuffer()->View());
context->BindUA(0, surfaceAtlasData.ChunksBuffer->View());
context->BindUA(1, surfaceAtlasData.CulledObjectsBuffer->View());
context->Dispatch(_csCullObjects, chunkDispatchGroups, chunkDispatchGroups, chunkDispatchGroups);

View File

@@ -201,41 +201,67 @@ float4 PS_Lighting(AtlasVertexOutput input) : SV_Target
RWByteAddressBuffer RWGlobalSurfaceAtlasChunks : register(u0);
RWByteAddressBuffer RWGlobalSurfaceAtlasCulledObjects : register(u1);
Buffer<float4> GlobalSurfaceAtlasObjects : register(t0);
Buffer<uint> GlobalSurfaceAtlasObjectsList : register(t1);
#define GLOBAL_SURFACE_ATLAS_CULL_LOCAL_SIZE 32 // Amount of objects to cache locally per-thread for culling
#define GLOBAL_SURFACE_ATLAS_SHARED_CULL_SIZE 255 // Limit of objects that can be culled for a whole group of 4x4x4 threads (64 chunks)
groupshared uint SharedCulledObjectsCount;
groupshared uint SharedCulledObjects[GLOBAL_SURFACE_ATLAS_SHARED_CULL_SIZE];
// Compute shader for culling objects into chunks
META_CS(true, FEATURE_LEVEL_SM5)
[numthreads(GLOBAL_SURFACE_ATLAS_CHUNKS_GROUP_SIZE, GLOBAL_SURFACE_ATLAS_CHUNKS_GROUP_SIZE, GLOBAL_SURFACE_ATLAS_CHUNKS_GROUP_SIZE)]
void CS_CullObjects(uint3 DispatchThreadId : SV_DispatchThreadID)
void CS_CullObjects(uint3 DispatchThreadId : SV_DispatchThreadID, uint3 GroupId : SV_GroupID, uint3 GroupThreadId : SV_GroupThreadID)
{
uint3 chunkCoord = DispatchThreadId;
uint chunkAddress = (chunkCoord.z * (GLOBAL_SURFACE_ATLAS_CHUNKS_RESOLUTION * GLOBAL_SURFACE_ATLAS_CHUNKS_RESOLUTION) + chunkCoord.y * GLOBAL_SURFACE_ATLAS_CHUNKS_RESOLUTION + chunkCoord.x) * 4;
float3 chunkMin = GlobalSurfaceAtlas.ViewPos + (chunkCoord - (GLOBAL_SURFACE_ATLAS_CHUNKS_RESOLUTION * 0.5f)) * GlobalSurfaceAtlas.ChunkSize;
float3 chunkMax = chunkMin + GlobalSurfaceAtlas.ChunkSize;
float3 chunkMax = chunkMin + GlobalSurfaceAtlas.ChunkSize.xxx;
uint groupIndex = (GroupThreadId.z * GLOBAL_SURFACE_ATLAS_CHUNKS_GROUP_SIZE + GroupThreadId.y) * GLOBAL_SURFACE_ATLAS_CHUNKS_GROUP_SIZE + GroupThreadId.x;
float3 groupMin = GlobalSurfaceAtlas.ViewPos + (GroupId * GLOBAL_SURFACE_ATLAS_CHUNKS_GROUP_SIZE - (GLOBAL_SURFACE_ATLAS_CHUNKS_RESOLUTION * 0.5f)) * GlobalSurfaceAtlas.ChunkSize;
float3 groupMax = groupMin + (GlobalSurfaceAtlas.ChunkSize * GLOBAL_SURFACE_ATLAS_CHUNKS_GROUP_SIZE).xxx;
// Count objects in this chunk
uint objectAddress = 0, objectsCount = 0;
// TODO: pre-cull objects within a thread group
uint localCulledObjects[GLOBAL_SURFACE_ATLAS_CULL_LOCAL_SIZE];
// Clear shared memory
if (groupIndex == 0)
{
SharedCulledObjectsCount = 0;
}
GroupMemoryBarrierWithGroupSync();
// Shared culling of all objects by all threads for a whole group
LOOP
for (uint objectIndex = 0; objectIndex < GlobalSurfaceAtlas.ObjectsCount; objectIndex++)
for (uint objectIndex = groupIndex; objectIndex < GlobalSurfaceAtlas.ObjectsCount; objectIndex += GLOBAL_SURFACE_ATLAS_CHUNKS_GROUP_SIZE * GLOBAL_SURFACE_ATLAS_CHUNKS_GROUP_SIZE * GLOBAL_SURFACE_ATLAS_CHUNKS_GROUP_SIZE)
{
uint objectAddress = GlobalSurfaceAtlasObjectsList.Load(objectIndex);
float4 objectBounds = LoadGlobalSurfaceAtlasObjectBounds(GlobalSurfaceAtlasObjects, objectAddress);
uint objectSize = LoadGlobalSurfaceAtlasObjectDataSize(GlobalSurfaceAtlasObjects, objectAddress);
if (BoxIntersectsSphere(chunkMin, chunkMax, objectBounds.xyz, objectBounds.w))
if (BoxIntersectsSphere(groupMin, groupMax, objectBounds.xyz, objectBounds.w))
{
localCulledObjects[objectsCount % GLOBAL_SURFACE_ATLAS_CULL_LOCAL_SIZE] = objectAddress;
objectsCount++;
uint sharedIndex;
InterlockedAdd(SharedCulledObjectsCount, 1, sharedIndex);
if (sharedIndex < GLOBAL_SURFACE_ATLAS_SHARED_CULL_SIZE)
SharedCulledObjects[sharedIndex] = objectAddress;
}
objectAddress += objectSize;
}
if (objectsCount == 0)
GroupMemoryBarrierWithGroupSync();
// Cull objects from the shared buffer against active thread's chunk
uint objectsCount = 0;
LOOP
for (uint i = 0; i < SharedCulledObjectsCount; i++)
{
// Empty chunk
RWGlobalSurfaceAtlasChunks.Store(chunkAddress, 0);
return;
uint objectAddress = SharedCulledObjects[i];
float4 objectBounds = LoadGlobalSurfaceAtlasObjectBounds(GlobalSurfaceAtlasObjects, objectAddress);
if (BoxIntersectsSphere(chunkMin, chunkMax, objectBounds.xyz, objectBounds.w))
{
objectsCount++;
}
}
if (objectsCount == 0)
{
// Empty chunk
RWGlobalSurfaceAtlasChunks.Store(chunkAddress, 0);
return;
}
// Allocate object data size in the buffer
uint objectsStart;
@@ -254,34 +280,17 @@ void CS_CullObjects(uint3 DispatchThreadId : SV_DispatchThreadID)
// Write objects count before actual objects indices
RWGlobalSurfaceAtlasCulledObjects.Store(objectsStart * 4, objectsCount);
// Copy objects data in this chunk
if (objectsCount <= GLOBAL_SURFACE_ATLAS_CULL_LOCAL_SIZE)
{
// Reuse locally cached objects
LOOP
for (uint objectIndex = 0; objectIndex < objectsCount; objectIndex++)
// Copy objects data in this chunk (cull from the shared buffer)
LOOP
for (uint i = 0; i < SharedCulledObjectsCount; i++)
{
uint objectAddress = SharedCulledObjects[i];
float4 objectBounds = LoadGlobalSurfaceAtlasObjectBounds(GlobalSurfaceAtlasObjects, objectAddress);
if (BoxIntersectsSphere(chunkMin, chunkMax, objectBounds.xyz, objectBounds.w))
{
objectAddress = localCulledObjects[objectIndex];
objectsStart++;
RWGlobalSurfaceAtlasCulledObjects.Store(objectsStart * 4, objectAddress);
}
}
else
{
// Brute-force culling
objectAddress = 0;
LOOP
for (uint objectIndex = 0; objectIndex < GlobalSurfaceAtlas.ObjectsCount; objectIndex++)
{
float4 objectBounds = LoadGlobalSurfaceAtlasObjectBounds(GlobalSurfaceAtlasObjects, objectAddress);
uint objectSize = LoadGlobalSurfaceAtlasObjectDataSize(GlobalSurfaceAtlasObjects, objectAddress);
if (BoxIntersectsSphere(chunkMin, chunkMax, objectBounds.xyz, objectBounds.w))
{
objectsStart++;
RWGlobalSurfaceAtlasCulledObjects.Store(objectsStart * 4, objectAddress);
}
objectAddress += objectSize;
}
}
}