Optimize DDGI probes border pixels to be copied within probe update, rather than via separate dispatch
This commit is contained in:
@@ -219,10 +219,6 @@ bool DynamicDiffuseGlobalIlluminationPass::setupResources()
|
||||
_csTraceRays[3] = shader->GetCS("CS_TraceRays", 3);
|
||||
_csUpdateProbesIrradiance = shader->GetCS("CS_UpdateProbes", 0);
|
||||
_csUpdateProbesDistance = shader->GetCS("CS_UpdateProbes", 1);
|
||||
_csUpdateBordersIrradianceRow = shader->GetCS("CS_UpdateBorders", 0);
|
||||
_csUpdateBordersIrradianceCollumn = shader->GetCS("CS_UpdateBorders", 1);
|
||||
_csUpdateBordersDistanceRow = shader->GetCS("CS_UpdateBorders", 2);
|
||||
_csUpdateBordersDistanceCollumn = shader->GetCS("CS_UpdateBorders", 3);
|
||||
auto device = GPUDevice::Instance;
|
||||
auto psDesc = GPUPipelineState::Description::DefaultFullscreenTriangle;
|
||||
if (!_psIndirectLighting)
|
||||
@@ -250,10 +246,6 @@ void DynamicDiffuseGlobalIlluminationPass::OnShaderReloading(Asset* obj)
|
||||
_csTraceRays[3] = nullptr;
|
||||
_csUpdateProbesIrradiance = nullptr;
|
||||
_csUpdateProbesDistance = nullptr;
|
||||
_csUpdateBordersIrradianceRow = nullptr;
|
||||
_csUpdateBordersIrradianceCollumn = nullptr;
|
||||
_csUpdateBordersDistanceRow = nullptr;
|
||||
_csUpdateBordersDistanceCollumn = nullptr;
|
||||
SAFE_DELETE_GPU_RESOURCE(_psIndirectLighting);
|
||||
invalidateResources();
|
||||
}
|
||||
@@ -542,7 +534,6 @@ bool DynamicDiffuseGlobalIlluminationPass::RenderInner(RenderContext& renderCont
|
||||
// Update probes
|
||||
{
|
||||
PROFILE_GPU_CPU_NAMED("Probes Update");
|
||||
bool anyDirty = false;
|
||||
uint32 threadGroupsX, threadGroupsY;
|
||||
#if DDGI_DEBUG_STATS
|
||||
uint32 zero[4] = {};
|
||||
@@ -552,7 +543,6 @@ bool DynamicDiffuseGlobalIlluminationPass::RenderInner(RenderContext& renderCont
|
||||
{
|
||||
if (cascadeSkipUpdate[cascadeIndex])
|
||||
continue;
|
||||
anyDirty = true;
|
||||
|
||||
// Classify probes (activation/deactivation and relocation)
|
||||
{
|
||||
@@ -667,33 +657,6 @@ bool DynamicDiffuseGlobalIlluminationPass::RenderInner(RenderContext& renderCont
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// Update probes border pixels
|
||||
if (anyDirty)
|
||||
{
|
||||
PROFILE_GPU_CPU_NAMED("Update Borders");
|
||||
|
||||
// Irradiance
|
||||
context->BindUA(0, ddgiData.Result.ProbesIrradiance);
|
||||
threadGroupsX = Math::DivideAndRoundUp(probesCountTotalX * (DDGI_PROBE_RESOLUTION_IRRADIANCE + 2), DDGI_PROBE_UPDATE_BORDERS_GROUP_SIZE);
|
||||
threadGroupsY = Math::DivideAndRoundUp(probesCountTotalY, DDGI_PROBE_UPDATE_BORDERS_GROUP_SIZE);
|
||||
context->Dispatch(_csUpdateBordersIrradianceRow, threadGroupsX, threadGroupsY, 1);
|
||||
threadGroupsX = Math::DivideAndRoundUp(probesCountTotalX, DDGI_PROBE_UPDATE_BORDERS_GROUP_SIZE);
|
||||
threadGroupsY = Math::DivideAndRoundUp(probesCountTotalY * (DDGI_PROBE_RESOLUTION_IRRADIANCE + 2), DDGI_PROBE_UPDATE_BORDERS_GROUP_SIZE);
|
||||
context->Dispatch(_csUpdateBordersIrradianceCollumn, threadGroupsX, threadGroupsY, 1);
|
||||
|
||||
// Distance
|
||||
context->BindUA(0, ddgiData.Result.ProbesDistance);
|
||||
threadGroupsX = Math::DivideAndRoundUp(probesCountTotalX * (DDGI_PROBE_RESOLUTION_DISTANCE + 2), DDGI_PROBE_UPDATE_BORDERS_GROUP_SIZE);
|
||||
threadGroupsY = Math::DivideAndRoundUp(probesCountTotalY, DDGI_PROBE_UPDATE_BORDERS_GROUP_SIZE);
|
||||
context->Dispatch(_csUpdateBordersDistanceRow, threadGroupsX, threadGroupsY, 1);
|
||||
threadGroupsX = Math::DivideAndRoundUp(probesCountTotalX, DDGI_PROBE_UPDATE_BORDERS_GROUP_SIZE);
|
||||
threadGroupsY = Math::DivideAndRoundUp(probesCountTotalY * (DDGI_PROBE_RESOLUTION_DISTANCE + 2), DDGI_PROBE_UPDATE_BORDERS_GROUP_SIZE);
|
||||
context->Dispatch(_csUpdateBordersDistanceCollumn, threadGroupsX, threadGroupsY, 1);
|
||||
|
||||
context->ResetUA();
|
||||
context->ResetSR();
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
|
||||
@@ -47,10 +47,6 @@ private:
|
||||
GPUShaderProgramCS* _csTraceRays[4];
|
||||
GPUShaderProgramCS* _csUpdateProbesIrradiance;
|
||||
GPUShaderProgramCS* _csUpdateProbesDistance;
|
||||
GPUShaderProgramCS* _csUpdateBordersIrradianceRow;
|
||||
GPUShaderProgramCS* _csUpdateBordersIrradianceCollumn;
|
||||
GPUShaderProgramCS* _csUpdateBordersDistanceRow;
|
||||
GPUShaderProgramCS* _csUpdateBordersDistanceCollumn;
|
||||
GPUPipelineState* _psIndirectLighting;
|
||||
#if USE_EDITOR
|
||||
AssetReference<Model> _debugModel;
|
||||
|
||||
@@ -415,16 +415,22 @@ void CS_TraceRays(uint3 DispatchThreadId : SV_DispatchThreadID)
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(_CS_UpdateProbes) || defined(_CS_UpdateBorders)
|
||||
#if defined(_CS_UpdateProbes)
|
||||
|
||||
#if DDGI_PROBE_UPDATE_MODE == 0
|
||||
// Update irradiance
|
||||
#define DDGI_PROBE_RESOLUTION DDGI_PROBE_RESOLUTION_IRRADIANCE
|
||||
groupshared float4 CachedProbesTraceRadiance[DDGI_TRACE_RAYS_LIMIT];
|
||||
groupshared float OutputInstability[DDGI_PROBE_RESOLUTION * DDGI_PROBE_RESOLUTION];
|
||||
#else
|
||||
// Update distance
|
||||
#define DDGI_PROBE_RESOLUTION DDGI_PROBE_RESOLUTION_DISTANCE
|
||||
groupshared float CachedProbesTraceDistance[DDGI_TRACE_RAYS_LIMIT];
|
||||
#endif
|
||||
|
||||
// Source: https://github.com/turanszkij/WickedEngine
|
||||
#define BorderOffsetsSize (4 * DDGI_PROBE_RESOLUTION + 4)
|
||||
#if DDGI_PROBE_RESOLUTION == 6
|
||||
static const uint4 BorderOffsets[BorderOffsetsSize] = {
|
||||
uint4(6, 1, 1, 0),
|
||||
uint4(5, 1, 2, 0),
|
||||
@@ -457,12 +463,77 @@ static const uint4 BorderOffsets[BorderOffsetsSize] = {
|
||||
uint4(1, 1, 7, 7),
|
||||
uint4(6, 1, 0, 7),
|
||||
uint4(1, 6, 7, 0),
|
||||
uint4(6, 6, 0, 0),
|
||||
uint4(6, 6, 0, 0)
|
||||
};
|
||||
#elif DDGI_PROBE_RESOLUTION == 14
|
||||
static const uint4 BorderOffsets[BorderOffsetsSize] = {
|
||||
uint4(14, 1, 1, 0),
|
||||
uint4(13, 1, 2, 0),
|
||||
uint4(12, 1, 3, 0),
|
||||
uint4(11, 1, 4, 0),
|
||||
uint4(10, 1, 5, 0),
|
||||
uint4(9, 1, 6, 0),
|
||||
uint4(8, 1, 7, 0),
|
||||
uint4(7, 1, 8, 0),
|
||||
uint4(6, 1, 9, 0),
|
||||
uint4(5, 1, 10, 0),
|
||||
uint4(4, 1, 11, 0),
|
||||
uint4(3, 1, 12, 0),
|
||||
uint4(2, 1, 13, 0),
|
||||
uint4(1, 1, 14, 0),
|
||||
|
||||
uint4(14, 14, 1, 15),
|
||||
uint4(13, 14, 2, 15),
|
||||
uint4(12, 14, 3, 15),
|
||||
uint4(11, 14, 4, 15),
|
||||
uint4(10, 14, 5, 15),
|
||||
uint4(9, 14, 6, 15),
|
||||
uint4(8, 14, 7, 15),
|
||||
uint4(7, 14, 8, 15),
|
||||
uint4(6, 14, 9, 15),
|
||||
uint4(5, 14, 10, 15),
|
||||
uint4(4, 14, 11, 15),
|
||||
uint4(3, 14, 12, 15),
|
||||
uint4(2, 14, 13, 15),
|
||||
uint4(1, 14, 14, 15),
|
||||
|
||||
uint4(1, 14, 0, 1),
|
||||
uint4(1, 13, 0, 2),
|
||||
uint4(1, 12, 0, 3),
|
||||
uint4(1, 11, 0, 4),
|
||||
uint4(1, 10, 0, 5),
|
||||
uint4(1, 9, 0, 6),
|
||||
uint4(1, 8, 0, 7),
|
||||
uint4(1, 7, 0, 8),
|
||||
uint4(1, 6, 0, 9),
|
||||
uint4(1, 5, 0, 10),
|
||||
uint4(1, 4, 0, 11),
|
||||
uint4(1, 3, 0, 12),
|
||||
uint4(1, 2, 0, 13),
|
||||
uint4(1, 1, 0, 14),
|
||||
|
||||
uint4(14, 14, 15, 1),
|
||||
uint4(14, 13, 15, 2),
|
||||
uint4(14, 12, 15, 3),
|
||||
uint4(14, 11, 15, 4),
|
||||
uint4(14, 10, 15, 5),
|
||||
uint4(14, 9, 15, 6),
|
||||
uint4(14, 8, 15, 7),
|
||||
uint4(14, 7, 15, 8),
|
||||
uint4(14, 6, 15, 9),
|
||||
uint4(14, 5, 15, 10),
|
||||
uint4(14, 4, 15, 11),
|
||||
uint4(14, 3, 15, 12),
|
||||
uint4(14, 2, 15, 13),
|
||||
uint4(14, 1, 15, 14),
|
||||
|
||||
uint4(14, 14, 0, 0),
|
||||
uint4(1, 14, 15, 0),
|
||||
uint4(14, 1, 0, 15),
|
||||
uint4(1, 1, 15, 15)
|
||||
};
|
||||
#else
|
||||
// Update distance
|
||||
#define DDGI_PROBE_RESOLUTION DDGI_PROBE_RESOLUTION_DISTANCE
|
||||
groupshared float CachedProbesTraceDistance[DDGI_TRACE_RAYS_LIMIT];
|
||||
#error "Unsupported probe size for border values copy."
|
||||
#endif
|
||||
|
||||
groupshared float3 CachedProbesTraceDirection[DDGI_TRACE_RAYS_LIMIT];
|
||||
@@ -635,9 +706,11 @@ void CS_UpdateProbes(uint3 GroupThreadId : SV_GroupThreadID, uint3 GroupId : SV_
|
||||
|
||||
RWOutput[outputCoords] = result;
|
||||
|
||||
GroupMemoryBarrierWithGroupSync();
|
||||
uint2 baseCoords = GetDDGIProbeTexelCoords(DDGI, CascadeIndex, probeIndex) * (DDGI_PROBE_RESOLUTION + 2);
|
||||
|
||||
#if DDGI_PROBE_UPDATE_MODE == 0
|
||||
// The first thread updates the probe attention based on the instability of all texels
|
||||
GroupMemoryBarrierWithGroupSync();
|
||||
BRANCH
|
||||
if (GroupIndex == 0 && probeState != DDGI_PROBE_STATE_INACTIVE)
|
||||
{
|
||||
@@ -665,7 +738,6 @@ void CS_UpdateProbes(uint3 GroupThreadId : SV_GroupThreadID, uint3 GroupId : SV_
|
||||
|
||||
#if DDGI_DEBUG_INSTABILITY
|
||||
// Copy border pixels
|
||||
uint2 baseCoords = GetDDGIProbeTexelCoords(DDGI, CascadeIndex, probeIndex) * (DDGI_PROBE_RESOLUTION + 2);
|
||||
for (uint borderIndex = GroupIndex; borderIndex < BorderOffsetsSize; borderIndex += DDGI_PROBE_RESOLUTION * DDGI_PROBE_RESOLUTION)
|
||||
{
|
||||
uint4 borderOffsets = BorderOffsets[borderIndex];
|
||||
@@ -673,73 +745,13 @@ void CS_UpdateProbes(uint3 GroupThreadId : SV_GroupThreadID, uint3 GroupId : SV_
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
// Compute shader for updating probes irradiance or distance texture borders (fills gaps between probes to support bilinear filtering)
|
||||
META_CS(true, FEATURE_LEVEL_SM5)
|
||||
META_PERMUTATION_2(DDGI_PROBE_UPDATE_MODE=0, BORDER_ROW=1)
|
||||
META_PERMUTATION_2(DDGI_PROBE_UPDATE_MODE=0, BORDER_ROW=0)
|
||||
META_PERMUTATION_2(DDGI_PROBE_UPDATE_MODE=1, BORDER_ROW=1)
|
||||
META_PERMUTATION_2(DDGI_PROBE_UPDATE_MODE=1, BORDER_ROW=0)
|
||||
[numthreads(DDGI_PROBE_UPDATE_BORDERS_GROUP_SIZE, DDGI_PROBE_UPDATE_BORDERS_GROUP_SIZE, 1)]
|
||||
void CS_UpdateBorders(uint3 DispatchThreadId : SV_DispatchThreadID)
|
||||
{
|
||||
#define COPY_PIXEL RWOutput[threadCoordinates] = RWOutput[copyCoordinates]
|
||||
#define COPY_PIXEL_DEBUG RWOutput[threadCoordinates] = float4(5, 0, 0, 1)
|
||||
|
||||
uint probeSideLength = DDGI_PROBE_RESOLUTION + 2;
|
||||
uint probeSideLengthMinusOne = probeSideLength - 1;
|
||||
uint2 copyCoordinates = uint2(0, 0);
|
||||
uint2 threadCoordinates = DispatchThreadId.xy;
|
||||
#if BORDER_ROW
|
||||
threadCoordinates.y *= probeSideLength;
|
||||
uint corner = DispatchThreadId.x % probeSideLength;
|
||||
#else
|
||||
threadCoordinates.x *= probeSideLength;
|
||||
uint corner = threadCoordinates.y % probeSideLength;
|
||||
#endif
|
||||
if (corner == 0 || corner == probeSideLengthMinusOne)
|
||||
{
|
||||
#if !BORDER_ROW
|
||||
// Left corner
|
||||
copyCoordinates.x = threadCoordinates.x + DDGI_PROBE_RESOLUTION;
|
||||
copyCoordinates.y = threadCoordinates.y - sign((int)corner - 1) * DDGI_PROBE_RESOLUTION;
|
||||
COPY_PIXEL;
|
||||
|
||||
// Right corner
|
||||
threadCoordinates.x += probeSideLengthMinusOne;
|
||||
copyCoordinates.x = threadCoordinates.x - DDGI_PROBE_RESOLUTION;
|
||||
COPY_PIXEL;
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
#if BORDER_ROW
|
||||
// Top row
|
||||
uint probeStart = uint(threadCoordinates.x / probeSideLength) * probeSideLength;
|
||||
uint offset = probeSideLengthMinusOne - (threadCoordinates.x % probeSideLength);
|
||||
copyCoordinates = uint2(probeStart + offset, threadCoordinates.y + 1);
|
||||
#else
|
||||
// Left column
|
||||
uint probeStart = uint(threadCoordinates.y / probeSideLength) * probeSideLength;
|
||||
uint offset = probeSideLengthMinusOne - (threadCoordinates.y % probeSideLength);
|
||||
copyCoordinates = uint2(threadCoordinates.x + 1, probeStart + offset);
|
||||
#endif
|
||||
COPY_PIXEL;
|
||||
|
||||
#if BORDER_ROW
|
||||
// Bottom row
|
||||
threadCoordinates.y += probeSideLengthMinusOne;
|
||||
copyCoordinates = uint2(probeStart + offset, threadCoordinates.y - 1);
|
||||
#else
|
||||
// Right column
|
||||
threadCoordinates.x += probeSideLengthMinusOne;
|
||||
copyCoordinates = uint2(threadCoordinates.x - 1, probeStart + offset);
|
||||
#endif
|
||||
COPY_PIXEL;
|
||||
|
||||
#undef COPY_PIXEL
|
||||
#undef COPY_PIXEL_DEBUG
|
||||
// Copy border pixels
|
||||
for (uint borderIndex = GroupIndex; borderIndex < BorderOffsetsSize; borderIndex += DDGI_PROBE_RESOLUTION * DDGI_PROBE_RESOLUTION)
|
||||
{
|
||||
uint4 borderOffsets = BorderOffsets[borderIndex];
|
||||
RWOutput[baseCoords + borderOffsets.zw] = RWOutput[baseCoords + borderOffsets.xy];
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user