Optimize thread group size for DDGI trace rays compute shader

This commit is contained in:
Wojciech Figat
2022-06-21 10:32:25 +02:00
parent 10adf1eba1
commit 4637017707
3 changed files with 19 additions and 12 deletions

View File

@@ -34,7 +34,6 @@
// This must match HLSL
#define DDGI_TRACE_RAYS_PROBES_COUNT_LIMIT 4096 // Maximum amount of probes to update at once during rays tracing and blending
#define DDGI_TRACE_RAYS_GROUP_SIZE_X 32
#define DDGI_TRACE_RAYS_LIMIT 256 // Limit of rays per-probe (runtime value can be smaller)
#define DDGI_PROBE_RESOLUTION_IRRADIANCE 6 // Resolution (in texels) for probe irradiance data (excluding 1px padding on each side)
#define DDGI_PROBE_RESOLUTION_DISTANCE 14 // Resolution (in texels) for probe distance data (excluding 1px padding on each side)
@@ -175,7 +174,10 @@ bool DynamicDiffuseGlobalIlluminationPass::setupResources()
if (!_cb0 || !_cb1)
return true;
_csClassify = shader->GetCS("CS_Classify");
_csTraceRays = shader->GetCS("CS_TraceRays");
_csTraceRays[0] = shader->GetCS("CS_TraceRays", 0);
_csTraceRays[1] = shader->GetCS("CS_TraceRays", 1);
_csTraceRays[2] = shader->GetCS("CS_TraceRays", 2);
_csTraceRays[3] = shader->GetCS("CS_TraceRays", 3);
_csUpdateProbesIrradiance = shader->GetCS("CS_UpdateProbes", 0);
_csUpdateProbesDistance = shader->GetCS("CS_UpdateProbes", 1);
_csUpdateBordersIrradianceRow = shader->GetCS("CS_UpdateBorders", 0);
@@ -202,7 +204,10 @@ void DynamicDiffuseGlobalIlluminationPass::OnShaderReloading(Asset* obj)
{
LastFrameShaderReload = Engine::FrameCount;
_csClassify = nullptr;
_csTraceRays = nullptr;
_csTraceRays[0] = nullptr;
_csTraceRays[1] = nullptr;
_csTraceRays[2] = nullptr;
_csTraceRays[3] = nullptr;
_csUpdateProbesIrradiance = nullptr;
_csUpdateProbesDistance = nullptr;
_csUpdateBordersIrradianceRow = nullptr;
@@ -222,7 +227,6 @@ void DynamicDiffuseGlobalIlluminationPass::Dispose()
// Cleanup
_cb0 = nullptr;
_cb1 = nullptr;
_csTraceRays = nullptr;
_shader = nullptr;
SAFE_DELETE_GPU_RESOURCE(_psIndirectLighting);
#if USE_EDITOR
@@ -272,7 +276,7 @@ bool DynamicDiffuseGlobalIlluminationPass::Render(RenderContext& renderContext,
auto* graphicsSettings = GraphicsSettings::Get();
const float probesSpacing = Math::Clamp(graphicsSettings->GIProbesSpacing, 10.0f, 1000.0f); // GI probes placement spacing nearby camera (for closest cascade; gets automatically reduced for further cascades)
int32 probeRaysCount; // Amount of rays to trace randomly around each probe
switch (Graphics::GIQuality)
switch (Graphics::GIQuality) // Ensure to match CS_TraceRays permutations
{
case Quality::Low:
probeRaysCount = 96;
@@ -284,11 +288,12 @@ bool DynamicDiffuseGlobalIlluminationPass::Render(RenderContext& renderContext,
probeRaysCount = 192;
break;
case Quality::Ultra:
default:
probeRaysCount = 256;
break;
default:
return true;
}
ASSERT_LOW_LAYER(Math::Min(Math::AlignUp(probeRaysCount, DDGI_TRACE_RAYS_GROUP_SIZE_X), DDGI_TRACE_RAYS_LIMIT) == probeRaysCount);
ASSERT_LOW_LAYER(probeRaysCount <= DDGI_TRACE_RAYS_LIMIT);
bool debugProbes = renderContext.View.Mode == ViewMode::GlobalIllumination;
const float indirectLightingIntensity = settings.Intensity;
const float probeHistoryWeight = Math::Clamp(settings.TemporalResponse, 0.0f, 0.98f);
@@ -534,7 +539,6 @@ bool DynamicDiffuseGlobalIlluminationPass::Render(RenderContext& renderContext,
PROFILE_GPU_CPU("Trace Rays");
// Global SDF with Global Surface Atlas software raytracing (thread X - per probe ray, thread Y - per probe)
ASSERT_LOW_LAYER((probeRaysCount % DDGI_TRACE_RAYS_GROUP_SIZE_X) == 0);
context->BindSR(0, bindingDataSDF.Texture ? bindingDataSDF.Texture->ViewVolume() : nullptr);
context->BindSR(1, bindingDataSDF.TextureMip ? bindingDataSDF.TextureMip->ViewVolume() : nullptr);
context->BindSR(2, bindingDataSurfaceAtlas.Chunks ? bindingDataSurfaceAtlas.Chunks->View() : nullptr);
@@ -544,7 +548,7 @@ bool DynamicDiffuseGlobalIlluminationPass::Render(RenderContext& renderContext,
context->BindSR(6, ddgiData.Result.ProbesState);
context->BindSR(7, skybox);
context->BindUA(0, ddgiData.ProbesTrace->View());
context->Dispatch(_csTraceRays, probeRaysCount / DDGI_TRACE_RAYS_GROUP_SIZE_X, probesBatchSize, 1);
context->Dispatch(_csTraceRays[(int32)Graphics::GIQuality], 1, probesBatchSize, 1);
context->ResetUA();
context->ResetSR();
#if 0

View File

@@ -46,7 +46,7 @@ private:
GPUConstantBuffer* _cb0 = nullptr;
GPUConstantBuffer* _cb1 = nullptr;
GPUShaderProgramCS* _csClassify;
GPUShaderProgramCS* _csTraceRays;
GPUShaderProgramCS* _csTraceRays[4];
GPUShaderProgramCS* _csUpdateProbesIrradiance;
GPUShaderProgramCS* _csUpdateProbesDistance;
GPUShaderProgramCS* _csUpdateBordersIrradianceRow;

View File

@@ -18,7 +18,6 @@
// This must match C++
#define DDGI_TRACE_RAYS_LIMIT 256 // Limit of rays per-probe (runtime value can be smaller)
#define DDGI_TRACE_RAYS_GROUP_SIZE_X 32
#define DDGI_PROBE_UPDATE_BORDERS_GROUP_SIZE 8
#define DDGI_PROBE_CLASSIFY_GROUP_SIZE 32
@@ -151,7 +150,11 @@ TextureCube Skybox : register(t7);
// Compute shader for tracing rays for probes using Global SDF and Global Surface Atlas.
META_CS(true, FEATURE_LEVEL_SM5)
[numthreads(DDGI_TRACE_RAYS_GROUP_SIZE_X, 1, 1)]
META_PERMUTATION_1(DDGI_TRACE_RAYS_COUNT=96)
META_PERMUTATION_1(DDGI_TRACE_RAYS_COUNT=128)
META_PERMUTATION_1(DDGI_TRACE_RAYS_COUNT=192)
META_PERMUTATION_1(DDGI_TRACE_RAYS_COUNT=256)
[numthreads(DDGI_TRACE_RAYS_COUNT, 1, 1)]
void CS_TraceRays(uint3 DispatchThreadId : SV_DispatchThreadID)
{
uint rayIndex = DispatchThreadId.x;