Optimize Global Surface Atlas objects culling with shared thread group pre-cull
This commit is contained in:
@@ -201,41 +201,67 @@ float4 PS_Lighting(AtlasVertexOutput input) : SV_Target
|
||||
RWByteAddressBuffer RWGlobalSurfaceAtlasChunks : register(u0);
|
||||
RWByteAddressBuffer RWGlobalSurfaceAtlasCulledObjects : register(u1);
|
||||
Buffer<float4> GlobalSurfaceAtlasObjects : register(t0);
|
||||
Buffer<uint> GlobalSurfaceAtlasObjectsList : register(t1);
|
||||
|
||||
#define GLOBAL_SURFACE_ATLAS_CULL_LOCAL_SIZE 32 // Amount of objects to cache locally per-thread for culling
|
||||
#define GLOBAL_SURFACE_ATLAS_SHARED_CULL_SIZE 255 // Limit of objects that can be culled for a whole group of 4x4x4 threads (64 chunks)
|
||||
|
||||
groupshared uint SharedCulledObjectsCount;
|
||||
groupshared uint SharedCulledObjects[GLOBAL_SURFACE_ATLAS_SHARED_CULL_SIZE];
|
||||
|
||||
// Compute shader for culling objects into chunks
|
||||
META_CS(true, FEATURE_LEVEL_SM5)
|
||||
[numthreads(GLOBAL_SURFACE_ATLAS_CHUNKS_GROUP_SIZE, GLOBAL_SURFACE_ATLAS_CHUNKS_GROUP_SIZE, GLOBAL_SURFACE_ATLAS_CHUNKS_GROUP_SIZE)]
|
||||
void CS_CullObjects(uint3 DispatchThreadId : SV_DispatchThreadID)
|
||||
void CS_CullObjects(uint3 DispatchThreadId : SV_DispatchThreadID, uint3 GroupId : SV_GroupID, uint3 GroupThreadId : SV_GroupThreadID)
|
||||
{
|
||||
uint3 chunkCoord = DispatchThreadId;
|
||||
uint chunkAddress = (chunkCoord.z * (GLOBAL_SURFACE_ATLAS_CHUNKS_RESOLUTION * GLOBAL_SURFACE_ATLAS_CHUNKS_RESOLUTION) + chunkCoord.y * GLOBAL_SURFACE_ATLAS_CHUNKS_RESOLUTION + chunkCoord.x) * 4;
|
||||
float3 chunkMin = GlobalSurfaceAtlas.ViewPos + (chunkCoord - (GLOBAL_SURFACE_ATLAS_CHUNKS_RESOLUTION * 0.5f)) * GlobalSurfaceAtlas.ChunkSize;
|
||||
float3 chunkMax = chunkMin + GlobalSurfaceAtlas.ChunkSize;
|
||||
float3 chunkMax = chunkMin + GlobalSurfaceAtlas.ChunkSize.xxx;
|
||||
uint groupIndex = (GroupThreadId.z * GLOBAL_SURFACE_ATLAS_CHUNKS_GROUP_SIZE + GroupThreadId.y) * GLOBAL_SURFACE_ATLAS_CHUNKS_GROUP_SIZE + GroupThreadId.x;
|
||||
float3 groupMin = GlobalSurfaceAtlas.ViewPos + (GroupId * GLOBAL_SURFACE_ATLAS_CHUNKS_GROUP_SIZE - (GLOBAL_SURFACE_ATLAS_CHUNKS_RESOLUTION * 0.5f)) * GlobalSurfaceAtlas.ChunkSize;
|
||||
float3 groupMax = groupMin + (GlobalSurfaceAtlas.ChunkSize * GLOBAL_SURFACE_ATLAS_CHUNKS_GROUP_SIZE).xxx;
|
||||
|
||||
// Count objects in this chunk
|
||||
uint objectAddress = 0, objectsCount = 0;
|
||||
// TODO: pre-cull objects within a thread group
|
||||
uint localCulledObjects[GLOBAL_SURFACE_ATLAS_CULL_LOCAL_SIZE];
|
||||
// Clear shared memory
|
||||
if (groupIndex == 0)
|
||||
{
|
||||
SharedCulledObjectsCount = 0;
|
||||
}
|
||||
GroupMemoryBarrierWithGroupSync();
|
||||
|
||||
// Shared culling of all objects by all threads for a whole group
|
||||
LOOP
|
||||
for (uint objectIndex = 0; objectIndex < GlobalSurfaceAtlas.ObjectsCount; objectIndex++)
|
||||
for (uint objectIndex = groupIndex; objectIndex < GlobalSurfaceAtlas.ObjectsCount; objectIndex += GLOBAL_SURFACE_ATLAS_CHUNKS_GROUP_SIZE * GLOBAL_SURFACE_ATLAS_CHUNKS_GROUP_SIZE * GLOBAL_SURFACE_ATLAS_CHUNKS_GROUP_SIZE)
|
||||
{
|
||||
uint objectAddress = GlobalSurfaceAtlasObjectsList.Load(objectIndex);
|
||||
float4 objectBounds = LoadGlobalSurfaceAtlasObjectBounds(GlobalSurfaceAtlasObjects, objectAddress);
|
||||
uint objectSize = LoadGlobalSurfaceAtlasObjectDataSize(GlobalSurfaceAtlasObjects, objectAddress);
|
||||
if (BoxIntersectsSphere(chunkMin, chunkMax, objectBounds.xyz, objectBounds.w))
|
||||
if (BoxIntersectsSphere(groupMin, groupMax, objectBounds.xyz, objectBounds.w))
|
||||
{
|
||||
localCulledObjects[objectsCount % GLOBAL_SURFACE_ATLAS_CULL_LOCAL_SIZE] = objectAddress;
|
||||
objectsCount++;
|
||||
uint sharedIndex;
|
||||
InterlockedAdd(SharedCulledObjectsCount, 1, sharedIndex);
|
||||
if (sharedIndex < GLOBAL_SURFACE_ATLAS_SHARED_CULL_SIZE)
|
||||
SharedCulledObjects[sharedIndex] = objectAddress;
|
||||
}
|
||||
objectAddress += objectSize;
|
||||
}
|
||||
if (objectsCount == 0)
|
||||
GroupMemoryBarrierWithGroupSync();
|
||||
|
||||
// Cull objects from the shared buffer against active thread's chunk
|
||||
uint objectsCount = 0;
|
||||
LOOP
|
||||
for (uint i = 0; i < SharedCulledObjectsCount; i++)
|
||||
{
|
||||
// Empty chunk
|
||||
RWGlobalSurfaceAtlasChunks.Store(chunkAddress, 0);
|
||||
return;
|
||||
uint objectAddress = SharedCulledObjects[i];
|
||||
float4 objectBounds = LoadGlobalSurfaceAtlasObjectBounds(GlobalSurfaceAtlasObjects, objectAddress);
|
||||
if (BoxIntersectsSphere(chunkMin, chunkMax, objectBounds.xyz, objectBounds.w))
|
||||
{
|
||||
objectsCount++;
|
||||
}
|
||||
}
|
||||
if (objectsCount == 0)
|
||||
{
|
||||
// Empty chunk
|
||||
RWGlobalSurfaceAtlasChunks.Store(chunkAddress, 0);
|
||||
return;
|
||||
}
|
||||
|
||||
// Allocate object data size in the buffer
|
||||
uint objectsStart;
|
||||
@@ -254,34 +280,17 @@ void CS_CullObjects(uint3 DispatchThreadId : SV_DispatchThreadID)
|
||||
// Write objects count before actual objects indices
|
||||
RWGlobalSurfaceAtlasCulledObjects.Store(objectsStart * 4, objectsCount);
|
||||
|
||||
// Copy objects data in this chunk
|
||||
if (objectsCount <= GLOBAL_SURFACE_ATLAS_CULL_LOCAL_SIZE)
|
||||
{
|
||||
// Reuse locally cached objects
|
||||
LOOP
|
||||
for (uint objectIndex = 0; objectIndex < objectsCount; objectIndex++)
|
||||
// Copy objects data in this chunk (cull from the shared buffer)
|
||||
LOOP
|
||||
for (uint i = 0; i < SharedCulledObjectsCount; i++)
|
||||
{
|
||||
uint objectAddress = SharedCulledObjects[i];
|
||||
float4 objectBounds = LoadGlobalSurfaceAtlasObjectBounds(GlobalSurfaceAtlasObjects, objectAddress);
|
||||
if (BoxIntersectsSphere(chunkMin, chunkMax, objectBounds.xyz, objectBounds.w))
|
||||
{
|
||||
objectAddress = localCulledObjects[objectIndex];
|
||||
objectsStart++;
|
||||
RWGlobalSurfaceAtlasCulledObjects.Store(objectsStart * 4, objectAddress);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Brute-force culling
|
||||
objectAddress = 0;
|
||||
LOOP
|
||||
for (uint objectIndex = 0; objectIndex < GlobalSurfaceAtlas.ObjectsCount; objectIndex++)
|
||||
{
|
||||
float4 objectBounds = LoadGlobalSurfaceAtlasObjectBounds(GlobalSurfaceAtlasObjects, objectAddress);
|
||||
uint objectSize = LoadGlobalSurfaceAtlasObjectDataSize(GlobalSurfaceAtlasObjects, objectAddress);
|
||||
if (BoxIntersectsSphere(chunkMin, chunkMax, objectBounds.xyz, objectBounds.w))
|
||||
{
|
||||
objectsStart++;
|
||||
RWGlobalSurfaceAtlasCulledObjects.Store(objectsStart * 4, objectAddress);
|
||||
}
|
||||
objectAddress += objectSize;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user