Optimize Global Surface Atlas objects culling to cache up to 32 objects locally per-thread

This commit is contained in:
Wojciech Figat
2022-06-27 16:31:39 +02:00
parent d8e79101e7
commit d5a529e00a
2 changed files with 35 additions and 17 deletions

View File

@@ -34,7 +34,7 @@
#define GLOBAL_SURFACE_ATLAS_TILE_PROJ_PLANE_OFFSET 0.1f // Small offset to prevent clipping with the closest triangles (shifts near and far planes)
#define GLOBAL_SURFACE_ATLAS_DEBUG_FORCE_REDRAW_TILES 0 // Forces to redraw all object tiles every frame
#define GLOBAL_SURFACE_ATLAS_DEBUG_DRAW_OBJECTS 0 // Debug draws object bounds on redraw (and tile draw projection locations)
#define GLOBAL_SURFACE_ATLAS_DEBUG_DRAW_CHUNKS 0 // Debug draws culled chunks bounds (non-empty
#define GLOBAL_SURFACE_ATLAS_DEBUG_DRAW_CHUNKS 0 // Debug draws culled chunks bounds (non-empty)
#if GLOBAL_SURFACE_ATLAS_DEBUG_DRAW_OBJECTS || GLOBAL_SURFACE_ATLAS_DEBUG_DRAW_CHUNKS
#include "Engine/Debug/DebugDraw.h"
@@ -704,11 +704,11 @@ bool GlobalSurfaceAtlasPass::Render(RenderContext& renderContext, GPUContext* co
{
for (int32 x = 0; x < GLOBAL_SURFACE_ATLAS_CHUNKS_RESOLUTION; x++)
{
Float3 chunkCoord(x, y, z);
Float3 chunkMin = result.GlobalSurfaceAtlas.ViewPos + (chunkCoord - (GLOBAL_SURFACE_ATLAS_CHUNKS_RESOLUTION * 0.5f)) * result.GlobalSurfaceAtlas.ChunkSize;
Float3 chunkMax = chunkMin + result.GlobalSurfaceAtlas.ChunkSize;
Float3 chunkCoord((float)x, (float)y, (float)z);
Float3 chunkMin = result.Constants.ViewPos + (chunkCoord - (GLOBAL_SURFACE_ATLAS_CHUNKS_RESOLUTION * 0.5f)) * result.Constants.ChunkSize;
Float3 chunkMax = chunkMin + result.Constants.ChunkSize;
BoundingBox chunkBounds(chunkMin, chunkMax);
if (Float3::Distance(chunkBounds.GetCenter(), result.GlobalSurfaceAtlas.ViewPos) >= 2000.0f)
if (Float3::Distance(chunkBounds.GetCenter(), result.Constants.ViewPos) >= 2000.0f)
continue;
int32 count = 0;

View File

@@ -190,6 +190,8 @@ RWByteAddressBuffer RWGlobalSurfaceAtlasChunks : register(u0);
RWByteAddressBuffer RWGlobalSurfaceAtlasCulledObjects : register(u1);
Buffer<float4> GlobalSurfaceAtlasObjects : register(t0);
#define GLOBAL_SURFACE_ATLAS_CULL_LOCAL_SIZE 32 // Amount of objects to cache locally per-thread for culling
// Compute shader for culling objects into chunks
META_CS(true, FEATURE_LEVEL_SM5)
[numthreads(GLOBAL_SURFACE_ATLAS_CHUNKS_GROUP_SIZE, GLOBAL_SURFACE_ATLAS_CHUNKS_GROUP_SIZE, GLOBAL_SURFACE_ATLAS_CHUNKS_GROUP_SIZE)]
@@ -203,7 +205,7 @@ void CS_CullObjects(uint3 DispatchThreadId : SV_DispatchThreadID)
// Count objects in this chunk
uint objectAddress = 0, objectsCount = 0;
// TODO: pre-cull objects within a thread group
// TODO: maybe cache 20-30 culled object indices in thread memory to skip culling them again when copying data (maybe reuse chunk size to get smaller objects count per chunk)?
uint localCulledObjects[GLOBAL_SURFACE_ATLAS_CULL_LOCAL_SIZE];
LOOP
for (uint objectIndex = 0; objectIndex < GlobalSurfaceAtlas.ObjectsCount; objectIndex++)
{
@@ -211,6 +213,7 @@ void CS_CullObjects(uint3 DispatchThreadId : SV_DispatchThreadID)
uint objectSize = LoadGlobalSurfaceAtlasObjectDataSize(GlobalSurfaceAtlasObjects, objectAddress);
if (BoxIntersectsSphere(chunkMin, chunkMax, objectBounds.xyz, objectBounds.w))
{
localCulledObjects[objectsCount % GLOBAL_SURFACE_ATLAS_CULL_LOCAL_SIZE] = objectAddress;
objectsCount++;
}
objectAddress += objectSize;
@@ -240,19 +243,34 @@ void CS_CullObjects(uint3 DispatchThreadId : SV_DispatchThreadID)
RWGlobalSurfaceAtlasCulledObjects.Store(objectsStart * 4, objectsCount);
// Copy objects data in this chunk
objectAddress = 0;
LOOP
for (uint objectIndex = 0; objectIndex < GlobalSurfaceAtlas.ObjectsCount; objectIndex++)
if (objectsCount <= GLOBAL_SURFACE_ATLAS_CULL_LOCAL_SIZE)
{
float4 objectBounds = LoadGlobalSurfaceAtlasObjectBounds(GlobalSurfaceAtlasObjects, objectAddress);
uint objectSize = LoadGlobalSurfaceAtlasObjectDataSize(GlobalSurfaceAtlasObjects, objectAddress);
if (BoxIntersectsSphere(chunkMin, chunkMax, objectBounds.xyz, objectBounds.w))
{
objectsStart++;
RWGlobalSurfaceAtlasCulledObjects.Store(objectsStart * 4, objectAddress);
}
objectAddress += objectSize;
// Reuse locally cached objects
LOOP
for (uint objectIndex = 0; objectIndex < objectsCount; objectIndex++)
{
objectAddress = localCulledObjects[objectIndex];
objectsStart++;
RWGlobalSurfaceAtlasCulledObjects.Store(objectsStart * 4, objectAddress);
}
}
else
{
// Brute-force culling
objectAddress = 0;
LOOP
for (uint objectIndex = 0; objectIndex < GlobalSurfaceAtlas.ObjectsCount; objectIndex++)
{
float4 objectBounds = LoadGlobalSurfaceAtlasObjectBounds(GlobalSurfaceAtlasObjects, objectAddress);
uint objectSize = LoadGlobalSurfaceAtlasObjectDataSize(GlobalSurfaceAtlasObjects, objectAddress);
if (BoxIntersectsSphere(chunkMin, chunkMax, objectBounds.xyz, objectBounds.w))
{
objectsStart++;
RWGlobalSurfaceAtlasCulledObjects.Store(objectsStart * 4, objectAddress);
}
objectAddress += objectSize;
}
}
}
#endif