Optimize DDGI probes border pixels to be copied within probe update, rather than via separate dispatch

This commit is contained in:
Wojtek Figat
2024-07-18 19:51:01 +02:00
parent aeff147b6d
commit 03898a064a
3 changed files with 85 additions and 114 deletions

View File

@@ -219,10 +219,6 @@ bool DynamicDiffuseGlobalIlluminationPass::setupResources()
_csTraceRays[3] = shader->GetCS("CS_TraceRays", 3);
_csUpdateProbesIrradiance = shader->GetCS("CS_UpdateProbes", 0);
_csUpdateProbesDistance = shader->GetCS("CS_UpdateProbes", 1);
_csUpdateBordersIrradianceRow = shader->GetCS("CS_UpdateBorders", 0);
_csUpdateBordersIrradianceCollumn = shader->GetCS("CS_UpdateBorders", 1);
_csUpdateBordersDistanceRow = shader->GetCS("CS_UpdateBorders", 2);
_csUpdateBordersDistanceCollumn = shader->GetCS("CS_UpdateBorders", 3);
auto device = GPUDevice::Instance;
auto psDesc = GPUPipelineState::Description::DefaultFullscreenTriangle;
if (!_psIndirectLighting)
@@ -250,10 +246,6 @@ void DynamicDiffuseGlobalIlluminationPass::OnShaderReloading(Asset* obj)
_csTraceRays[3] = nullptr;
_csUpdateProbesIrradiance = nullptr;
_csUpdateProbesDistance = nullptr;
_csUpdateBordersIrradianceRow = nullptr;
_csUpdateBordersIrradianceCollumn = nullptr;
_csUpdateBordersDistanceRow = nullptr;
_csUpdateBordersDistanceCollumn = nullptr;
SAFE_DELETE_GPU_RESOURCE(_psIndirectLighting);
invalidateResources();
}
@@ -542,7 +534,6 @@ bool DynamicDiffuseGlobalIlluminationPass::RenderInner(RenderContext& renderCont
// Update probes
{
PROFILE_GPU_CPU_NAMED("Probes Update");
bool anyDirty = false;
uint32 threadGroupsX, threadGroupsY;
#if DDGI_DEBUG_STATS
uint32 zero[4] = {};
@@ -552,7 +543,6 @@ bool DynamicDiffuseGlobalIlluminationPass::RenderInner(RenderContext& renderCont
{
if (cascadeSkipUpdate[cascadeIndex])
continue;
anyDirty = true;
// Classify probes (activation/deactivation and relocation)
{
@@ -667,33 +657,6 @@ bool DynamicDiffuseGlobalIlluminationPass::RenderInner(RenderContext& renderCont
}
}
#endif
// Update probes border pixels
if (anyDirty)
{
PROFILE_GPU_CPU_NAMED("Update Borders");
// Irradiance
context->BindUA(0, ddgiData.Result.ProbesIrradiance);
threadGroupsX = Math::DivideAndRoundUp(probesCountTotalX * (DDGI_PROBE_RESOLUTION_IRRADIANCE + 2), DDGI_PROBE_UPDATE_BORDERS_GROUP_SIZE);
threadGroupsY = Math::DivideAndRoundUp(probesCountTotalY, DDGI_PROBE_UPDATE_BORDERS_GROUP_SIZE);
context->Dispatch(_csUpdateBordersIrradianceRow, threadGroupsX, threadGroupsY, 1);
threadGroupsX = Math::DivideAndRoundUp(probesCountTotalX, DDGI_PROBE_UPDATE_BORDERS_GROUP_SIZE);
threadGroupsY = Math::DivideAndRoundUp(probesCountTotalY * (DDGI_PROBE_RESOLUTION_IRRADIANCE + 2), DDGI_PROBE_UPDATE_BORDERS_GROUP_SIZE);
context->Dispatch(_csUpdateBordersIrradianceCollumn, threadGroupsX, threadGroupsY, 1);
// Distance
context->BindUA(0, ddgiData.Result.ProbesDistance);
threadGroupsX = Math::DivideAndRoundUp(probesCountTotalX * (DDGI_PROBE_RESOLUTION_DISTANCE + 2), DDGI_PROBE_UPDATE_BORDERS_GROUP_SIZE);
threadGroupsY = Math::DivideAndRoundUp(probesCountTotalY, DDGI_PROBE_UPDATE_BORDERS_GROUP_SIZE);
context->Dispatch(_csUpdateBordersDistanceRow, threadGroupsX, threadGroupsY, 1);
threadGroupsX = Math::DivideAndRoundUp(probesCountTotalX, DDGI_PROBE_UPDATE_BORDERS_GROUP_SIZE);
threadGroupsY = Math::DivideAndRoundUp(probesCountTotalY * (DDGI_PROBE_RESOLUTION_DISTANCE + 2), DDGI_PROBE_UPDATE_BORDERS_GROUP_SIZE);
context->Dispatch(_csUpdateBordersDistanceCollumn, threadGroupsX, threadGroupsY, 1);
context->ResetUA();
context->ResetSR();
}
}
return false;

View File

@@ -47,10 +47,6 @@ private:
GPUShaderProgramCS* _csTraceRays[4];
GPUShaderProgramCS* _csUpdateProbesIrradiance;
GPUShaderProgramCS* _csUpdateProbesDistance;
GPUShaderProgramCS* _csUpdateBordersIrradianceRow;
GPUShaderProgramCS* _csUpdateBordersIrradianceCollumn;
GPUShaderProgramCS* _csUpdateBordersDistanceRow;
GPUShaderProgramCS* _csUpdateBordersDistanceCollumn;
GPUPipelineState* _psIndirectLighting;
#if USE_EDITOR
AssetReference<Model> _debugModel;

View File

@@ -415,16 +415,22 @@ void CS_TraceRays(uint3 DispatchThreadId : SV_DispatchThreadID)
#endif
#if defined(_CS_UpdateProbes) || defined(_CS_UpdateBorders)
#if defined(_CS_UpdateProbes)
#if DDGI_PROBE_UPDATE_MODE == 0
// Update irradiance
#define DDGI_PROBE_RESOLUTION DDGI_PROBE_RESOLUTION_IRRADIANCE
groupshared float4 CachedProbesTraceRadiance[DDGI_TRACE_RAYS_LIMIT];
groupshared float OutputInstability[DDGI_PROBE_RESOLUTION * DDGI_PROBE_RESOLUTION];
#else
// Update distance
#define DDGI_PROBE_RESOLUTION DDGI_PROBE_RESOLUTION_DISTANCE
groupshared float CachedProbesTraceDistance[DDGI_TRACE_RAYS_LIMIT];
#endif
// Source: https://github.com/turanszkij/WickedEngine
#define BorderOffsetsSize (4 * DDGI_PROBE_RESOLUTION + 4)
#if DDGI_PROBE_RESOLUTION == 6
static const uint4 BorderOffsets[BorderOffsetsSize] = {
uint4(6, 1, 1, 0),
uint4(5, 1, 2, 0),
@@ -457,12 +463,77 @@ static const uint4 BorderOffsets[BorderOffsetsSize] = {
uint4(1, 1, 7, 7),
uint4(6, 1, 0, 7),
uint4(1, 6, 7, 0),
uint4(6, 6, 0, 0),
uint4(6, 6, 0, 0)
};
#elif DDGI_PROBE_RESOLUTION == 14
// Lookup table of border texel copies for a probe with a 14x14 interior (16x16 texels including the 1-texel border).
// Each entry is (srcX, srcY, dstX, dstY) relative to the probe's base texel: the copy loop reads the texel at .xy and writes it to .zw.
// Coordinates 1..14 address interior texels, while 0 and 15 address the border ring.
// The table holds 4 * 14 edge entries followed by 4 corner entries (BorderOffsetsSize in total).
static const uint4 BorderOffsets[BorderOffsetsSize] = {
// Top border row (y = 0): copied from the first interior row (y = 1), mirrored along X
uint4(14, 1, 1, 0),
uint4(13, 1, 2, 0),
uint4(12, 1, 3, 0),
uint4(11, 1, 4, 0),
uint4(10, 1, 5, 0),
uint4(9, 1, 6, 0),
uint4(8, 1, 7, 0),
uint4(7, 1, 8, 0),
uint4(6, 1, 9, 0),
uint4(5, 1, 10, 0),
uint4(4, 1, 11, 0),
uint4(3, 1, 12, 0),
uint4(2, 1, 13, 0),
uint4(1, 1, 14, 0),
// Bottom border row (y = 15): copied from the last interior row (y = 14), mirrored along X
uint4(14, 14, 1, 15),
uint4(13, 14, 2, 15),
uint4(12, 14, 3, 15),
uint4(11, 14, 4, 15),
uint4(10, 14, 5, 15),
uint4(9, 14, 6, 15),
uint4(8, 14, 7, 15),
uint4(7, 14, 8, 15),
uint4(6, 14, 9, 15),
uint4(5, 14, 10, 15),
uint4(4, 14, 11, 15),
uint4(3, 14, 12, 15),
uint4(2, 14, 13, 15),
uint4(1, 14, 14, 15),
// Left border column (x = 0): copied from the first interior column (x = 1), mirrored along Y
uint4(1, 14, 0, 1),
uint4(1, 13, 0, 2),
uint4(1, 12, 0, 3),
uint4(1, 11, 0, 4),
uint4(1, 10, 0, 5),
uint4(1, 9, 0, 6),
uint4(1, 8, 0, 7),
uint4(1, 7, 0, 8),
uint4(1, 6, 0, 9),
uint4(1, 5, 0, 10),
uint4(1, 4, 0, 11),
uint4(1, 3, 0, 12),
uint4(1, 2, 0, 13),
uint4(1, 1, 0, 14),
// Right border column (x = 15): copied from the last interior column (x = 14), mirrored along Y
uint4(14, 14, 15, 1),
uint4(14, 13, 15, 2),
uint4(14, 12, 15, 3),
uint4(14, 11, 15, 4),
uint4(14, 10, 15, 5),
uint4(14, 9, 15, 6),
uint4(14, 8, 15, 7),
uint4(14, 7, 15, 8),
uint4(14, 6, 15, 9),
uint4(14, 5, 15, 10),
uint4(14, 4, 15, 11),
uint4(14, 3, 15, 12),
uint4(14, 2, 15, 13),
uint4(14, 1, 15, 14),
// Corner texels: each one is copied from the diagonally opposite interior corner
uint4(14, 14, 0, 0),
uint4(1, 14, 15, 0),
uint4(14, 1, 0, 15),
uint4(1, 1, 15, 15)
};
#else
// Update distance
#define DDGI_PROBE_RESOLUTION DDGI_PROBE_RESOLUTION_DISTANCE
groupshared float CachedProbesTraceDistance[DDGI_TRACE_RAYS_LIMIT];
#error "Unsupported probe size for border values copy."
#endif
groupshared float3 CachedProbesTraceDirection[DDGI_TRACE_RAYS_LIMIT];
@@ -635,9 +706,11 @@ void CS_UpdateProbes(uint3 GroupThreadId : SV_GroupThreadID, uint3 GroupId : SV_
RWOutput[outputCoords] = result;
GroupMemoryBarrierWithGroupSync();
uint2 baseCoords = GetDDGIProbeTexelCoords(DDGI, CascadeIndex, probeIndex) * (DDGI_PROBE_RESOLUTION + 2);
#if DDGI_PROBE_UPDATE_MODE == 0
// The first thread updates the probe attention based on the instability of all texels
GroupMemoryBarrierWithGroupSync();
BRANCH
if (GroupIndex == 0 && probeState != DDGI_PROBE_STATE_INACTIVE)
{
@@ -665,7 +738,6 @@ void CS_UpdateProbes(uint3 GroupThreadId : SV_GroupThreadID, uint3 GroupId : SV_
#if DDGI_DEBUG_INSTABILITY
// Copy border pixels
uint2 baseCoords = GetDDGIProbeTexelCoords(DDGI, CascadeIndex, probeIndex) * (DDGI_PROBE_RESOLUTION + 2);
for (uint borderIndex = GroupIndex; borderIndex < BorderOffsetsSize; borderIndex += DDGI_PROBE_RESOLUTION * DDGI_PROBE_RESOLUTION)
{
uint4 borderOffsets = BorderOffsets[borderIndex];
@@ -673,73 +745,13 @@ void CS_UpdateProbes(uint3 GroupThreadId : SV_GroupThreadID, uint3 GroupId : SV_
}
#endif
#endif
}
// Compute shader for updating probes irradiance or distance texture borders (fills gaps between probes to support bilinear filtering)
META_CS(true, FEATURE_LEVEL_SM5)
META_PERMUTATION_2(DDGI_PROBE_UPDATE_MODE=0, BORDER_ROW=1)
META_PERMUTATION_2(DDGI_PROBE_UPDATE_MODE=0, BORDER_ROW=0)
META_PERMUTATION_2(DDGI_PROBE_UPDATE_MODE=1, BORDER_ROW=1)
META_PERMUTATION_2(DDGI_PROBE_UPDATE_MODE=1, BORDER_ROW=0)
[numthreads(DDGI_PROBE_UPDATE_BORDERS_GROUP_SIZE, DDGI_PROBE_UPDATE_BORDERS_GROUP_SIZE, 1)]
void CS_UpdateBorders(uint3 DispatchThreadId : SV_DispatchThreadID)
{
#define COPY_PIXEL RWOutput[threadCoordinates] = RWOutput[copyCoordinates]
#define COPY_PIXEL_DEBUG RWOutput[threadCoordinates] = float4(5, 0, 0, 1)
uint probeSideLength = DDGI_PROBE_RESOLUTION + 2;
uint probeSideLengthMinusOne = probeSideLength - 1;
uint2 copyCoordinates = uint2(0, 0);
uint2 threadCoordinates = DispatchThreadId.xy;
#if BORDER_ROW
threadCoordinates.y *= probeSideLength;
uint corner = DispatchThreadId.x % probeSideLength;
#else
threadCoordinates.x *= probeSideLength;
uint corner = threadCoordinates.y % probeSideLength;
#endif
if (corner == 0 || corner == probeSideLengthMinusOne)
{
#if !BORDER_ROW
// Left corner
copyCoordinates.x = threadCoordinates.x + DDGI_PROBE_RESOLUTION;
copyCoordinates.y = threadCoordinates.y - sign((int)corner - 1) * DDGI_PROBE_RESOLUTION;
COPY_PIXEL;
// Right corner
threadCoordinates.x += probeSideLengthMinusOne;
copyCoordinates.x = threadCoordinates.x - DDGI_PROBE_RESOLUTION;
COPY_PIXEL;
#endif
return;
}
#if BORDER_ROW
// Top row
uint probeStart = uint(threadCoordinates.x / probeSideLength) * probeSideLength;
uint offset = probeSideLengthMinusOne - (threadCoordinates.x % probeSideLength);
copyCoordinates = uint2(probeStart + offset, threadCoordinates.y + 1);
#else
// Left column
uint probeStart = uint(threadCoordinates.y / probeSideLength) * probeSideLength;
uint offset = probeSideLengthMinusOne - (threadCoordinates.y % probeSideLength);
copyCoordinates = uint2(threadCoordinates.x + 1, probeStart + offset);
#endif
COPY_PIXEL;
#if BORDER_ROW
// Bottom row
threadCoordinates.y += probeSideLengthMinusOne;
copyCoordinates = uint2(probeStart + offset, threadCoordinates.y - 1);
#else
// Right column
threadCoordinates.x += probeSideLengthMinusOne;
copyCoordinates = uint2(threadCoordinates.x - 1, probeStart + offset);
#endif
COPY_PIXEL;
#undef COPY_PIXEL
#undef COPY_PIXEL_DEBUG
// Copy border pixels
for (uint borderIndex = GroupIndex; borderIndex < BorderOffsetsSize; borderIndex += DDGI_PROBE_RESOLUTION * DDGI_PROBE_RESOLUTION)
{
uint4 borderOffsets = BorderOffsets[borderIndex];
RWOutput[baseCoords + borderOffsets.zw] = RWOutput[baseCoords + borderOffsets.xy];
}
}
#endif