Cleanup DOF shader to use shader permutations for vertical/horizontal passes

This commit is contained in:
Wojtek Figat
2022-08-21 13:19:17 +02:00
parent 0b71549834
commit d50908e10b
3 changed files with 67 additions and 196 deletions

BIN
Content/Shaders/DepthOfField.flax (Stored with Git LFS)

Binary file not shown.

View File

@@ -9,7 +9,6 @@
#include "Engine/Graphics/GPUContext.h"
#include "Engine/Graphics/GPUDevice.h"
#include "Engine/Graphics/GPULimits.h"
#include "Engine/Graphics/PostProcessBase.h"
#include "Engine/Graphics/RenderTargetPool.h"
#include "Engine/Graphics/RenderBuffers.h"
#include "Engine/Graphics/RenderTask.h"
@@ -300,7 +299,7 @@ GPUTexture* DepthOfFieldPass::Render(RenderContext& renderContext, GPUTexture* i
uint32 groupCountX = (cocWidth / DOF_GRID_SIZE) + ((cocWidth % DOF_GRID_SIZE) > 0 ? 1 : 0);
uint32 groupCountY = cocHeight;
//
context->Dispatch(shader->GetCS("CS_CoCSpreadH"), groupCountX, groupCountY, 1);
context->Dispatch(shader->GetCS("CS_CoCSpread", 0), groupCountX, groupCountY, 1);
// Vertical pass
context->BindSR(0, tempTarget);
@@ -310,7 +309,7 @@ GPUTexture* DepthOfFieldPass::Render(RenderContext& renderContext, GPUTexture* i
groupCountX = cocWidth;
// Round up so a trailing partial grid still gets its own thread group. The ternary has to be parenthesized: `+` and `>` bind tighter than `?:`, so without the parentheses the whole sum is compared against 0 and the result collapses to 0 or 1.
groupCountY = (cocHeight / DOF_GRID_SIZE) + ((cocHeight % DOF_GRID_SIZE) > 0 ? 1 : 0);
//
context->Dispatch(shader->GetCS("CS_CoCSpreadV"), groupCountX, groupCountY, 1);
context->Dispatch(shader->GetCS("CS_CoCSpread", 1), groupCountX, groupCountY, 1);
// Cleanup
context->ResetRenderTarget();
@@ -389,7 +388,7 @@ GPUTexture* DepthOfFieldPass::Render(RenderContext& renderContext, GPUTexture* i
uint32 groupCountX = (dofWidth / DOF_GRID_SIZE) + ((dofWidth % DOF_GRID_SIZE) > 0 ? 1 : 0);
uint32 groupCountY = dofHeight;
//
context->Dispatch(shader->GetCS("CS_DepthOfFieldH"), groupCountX, groupCountY, 1);
context->Dispatch(shader->GetCS("CS_DepthOfField", 0), groupCountX, groupCountY, 1);
// Cleanup
context->ResetRenderTarget();
@@ -406,7 +405,7 @@ GPUTexture* DepthOfFieldPass::Render(RenderContext& renderContext, GPUTexture* i
groupCountY = (dofHeight / DOF_GRID_SIZE) + ((dofHeight % DOF_GRID_SIZE) > 0 ? 1 : 0);
//
// TODO: cache Compute Shaders
context->Dispatch(shader->GetCS("CS_DepthOfFieldV"), groupCountX, groupCountY, 1);
context->Dispatch(shader->GetCS("CS_DepthOfField", 1), groupCountX, groupCountY, 1);
context->ResetRenderTarget();
// Cleanup

View File

@@ -117,7 +117,7 @@ float4 PS_DofDepthBlurGeneration(Quad_VS2PS input) : SV_Target
return float4(depth, blur, 1.0f, 1.0f);
}
#if defined(_CS_DepthOfFieldH) || defined(_CS_DepthOfFieldV)
#if defined(_CS_DepthOfField)
RWTexture2D<float4> OutputTexture : register(u0);
@@ -131,24 +131,37 @@ struct DOFSample
// Shared memory for actual depth of field pass
groupshared DOFSample Samples[DOF_THREAD_GROUP_SIZE];
// Performs the horizontal pass for the DOF blur
#if HORIZONTAL
#define _CS_DepthOfField_X DOF_THREAD_GROUP_SIZE
#define _CS_DepthOfField_Y 1
#define DOF_COMP x
#else
#define _CS_DepthOfField_X 1
#define _CS_DepthOfField_Y DOF_THREAD_GROUP_SIZE
#define DOF_COMP y
#endif
// Performs the blur pass for the DOF
META_CS(true, FEATURE_LEVEL_SM5)
[numthreads(DOF_THREAD_GROUP_SIZE, 1, 1)]
void CS_DepthOfFieldH(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
META_PERMUTATION_1(HORIZONTAL=1)
META_PERMUTATION_1(HORIZONTAL=0)
[numthreads(_CS_DepthOfField_X, _CS_DepthOfField_Y, 1)]
void CS_DepthOfField(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
// These positions are relative to the "grid", AKA the group of pixels (horizontal or vertical, depending on the HORIZONTAL permutation) that this thread group is writing to
const int gridStartX = groupID.x * DOF_GRID_SIZE;
const int gridX = groupThreadID.x - DOF_APRON_SIZE;
const int gridStart = groupID.DOF_COMP * DOF_GRID_SIZE;
const int grid = groupThreadID.DOF_COMP - DOF_APRON_SIZE;
// These positions are relative to the pixel coordinates
const uint sampleX = max(gridStartX + gridX, 0);
const uint sampleY = groupID.y;
#if HORIZONTAL
const uint2 samplePos = uint2(max(gridStart + grid, 0), groupID.y);
#else
const uint2 samplePos = uint2(groupID.x, max(gridStart + grid, 0));
#endif
uint2 textureSize;
Input0.GetDimensions(textureSize.x, textureSize.y);
const uint2 samplePos = uint2(sampleX, sampleY);
// Sample the textures
#if USE_CS_HALF_PIXEL_OFFSET
float2 sampleCoord = saturate(((float2)samplePos + 0.5f) / float2(textureSize));
@@ -167,14 +180,13 @@ void CS_DepthOfFieldH(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_Group
float cocSize = blur * DOF_MAX_SAMPLE_RADIUS;
// Store in shared memory
Samples[groupThreadID.x].Color = color.rgb;
Samples[groupThreadID.x].Depth = depth;
Samples[groupThreadID.x].Blur = blur;
Samples[groupThreadID.DOF_COMP].Color = color.rgb;
Samples[groupThreadID.DOF_COMP].Depth = depth;
Samples[groupThreadID.DOF_COMP].Blur = blur;
GroupMemoryBarrierWithGroupSync();
// Don't continue for threads in the apron, and threads outside the render target size
if (gridX >= 0 && gridX < DOF_GRID_SIZE && sampleX < textureSize.x)
if (grid >= 0 && grid < DOF_GRID_SIZE && samplePos.DOF_COMP < textureSize.DOF_COMP)
{
BRANCH
if (cocSize > 0.0f)
@@ -183,14 +195,14 @@ void CS_DepthOfFieldH(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_Group
float totalContribution = 0.0f;
// Gather sample taps inside the radius
for (int x = -DOF_MAX_SAMPLE_RADIUS; x <= DOF_MAX_SAMPLE_RADIUS; x++)
for (int i = -DOF_MAX_SAMPLE_RADIUS; i <= DOF_MAX_SAMPLE_RADIUS; i++)
{
// Grab the sample from shared memory
uint groupTapX = groupThreadID.x + x;
DOFSample tap = Samples[groupTapX];
uint groupTap = groupThreadID.DOF_COMP + i;
DOFSample tap = Samples[groupTap];
// Reject the sample if it's outside the CoC radius
float cocWeight = saturate(cocSize + 1.0f - abs(float(x)));
float cocWeight = saturate(cocSize + 1.0f - abs(float(i)));
// Reject foreground samples, unless they're blurred as well
float depthWeight = tap.Depth >= depth;
@@ -212,88 +224,7 @@ void CS_DepthOfFieldH(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_Group
}
}
// Performs the vertical DOF pass
META_CS(true, FEATURE_LEVEL_SM5)
[numthreads(1, DOF_THREAD_GROUP_SIZE, 1)]
void CS_DepthOfFieldV(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
// These positions are relative to the "grid", AKA the vertical group of pixels that this thread group is writing to
const int gridStartY = groupID.y * DOF_GRID_SIZE;
const int gridY = groupThreadID.y - DOF_APRON_SIZE;
// These positions are relative to the pixel coordinates
const uint sampleX = groupID.x;
const uint sampleY = max(gridStartY + gridY, 0);
uint2 textureSize;
Input0.GetDimensions(textureSize.x, textureSize.y);
const uint2 samplePos = uint2(sampleX, sampleY);
// Sample the textures
#if USE_CS_HALF_PIXEL_OFFSET
float2 sampleCoord = saturate(((float2)samplePos + 0.5f) / float2(textureSize));
#else
float2 sampleCoord = saturate(samplePos / float2(textureSize));
#endif
#if USE_CS_LINEAR_SAMPLING
float4 color = Input0.SampleLevel(SamplerLinearClamp, sampleCoord, 0.0f).rgba;
float2 depthBlur = Input1.SampleLevel(SamplerLinearClamp, sampleCoord, 0.0f).xy;
#else
float4 color = Input0.SampleLevel(SamplerPointClamp, sampleCoord, 0.0f).rgba;
float2 depthBlur = Input1.SampleLevel(SamplerPointClamp, sampleCoord, 0.0f).xy;
#endif
float depth = depthBlur.x;
float blur = depthBlur.y;
float cocSize = blur * DOF_MAX_SAMPLE_RADIUS;
// Store in shared memory
Samples[groupThreadID.y].Color = color.rgb;
Samples[groupThreadID.y].Depth = depth;
Samples[groupThreadID.y].Blur = blur;
GroupMemoryBarrierWithGroupSync();
// Don't continue for threads in the apron, and threads outside the render target size
if (gridY >= 0 && gridY < DOF_GRID_SIZE && sampleY < textureSize.y)
{
BRANCH
if (cocSize > 0.0f)
{
float3 outputColor = 0.0f;
float totalContribution = 0.0f;
// Gather sample taps inside the radius
for (int y = -DOF_MAX_SAMPLE_RADIUS; y <= DOF_MAX_SAMPLE_RADIUS; y++)
{
// Grab the sample from shared memory
uint groupTapY = groupThreadID.y + y;
DOFSample tap = Samples[groupTapY];
// Reject the sample if it's outside the CoC radius
float cocWeight = saturate(cocSize + 1.0f - abs(float(y)));
// Reject foreground samples, unless they're blurred as well
float depthWeight = tap.Depth >= depth;
float blurWeight = tap.Blur;
float tapWeight = cocWeight * saturate(depthWeight + blurWeight);
outputColor += tap.Color * tapWeight;
totalContribution += tapWeight;
}
// Write out the result
outputColor /= totalContribution;
OutputTexture[samplePos] = float4(max(outputColor, 0), color.a);
}
else
{
OutputTexture[samplePos] = color;
}
}
}
#elif defined(_CS_CoCSpreadH) || defined(_CS_CoCSpreadV)
#elif defined(_CS_CoCSpread)
struct CoCSample
{
@@ -305,24 +236,37 @@ RWTexture2D<float2> OutputTexture : register(u0);
groupshared CoCSample Samples[DOF_THREAD_GROUP_SIZE];
// Performs the horizontal CoC spread
#if HORIZONTAL
#define _CS_CoCSpread_X DOF_THREAD_GROUP_SIZE
#define _CS_CoCSpread_Y 1
#define DOF_COMP x
#else
#define _CS_CoCSpread_X 1
#define _CS_CoCSpread_Y DOF_THREAD_GROUP_SIZE
#define DOF_COMP y
#endif
// Performs the CoC spread
META_CS(true, FEATURE_LEVEL_SM5)
[numthreads(DOF_THREAD_GROUP_SIZE, 1, 1)]
void CS_CoCSpreadH(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
META_PERMUTATION_1(HORIZONTAL=1)
META_PERMUTATION_1(HORIZONTAL=0)
[numthreads(_CS_CoCSpread_X, _CS_CoCSpread_Y, 1)]
void CS_CoCSpread(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
// These positions are relative to the "grid", AKA the group of pixels (horizontal or vertical, depending on the HORIZONTAL permutation) that this thread group is writing to
const int gridStartX = groupID.x * DOF_GRID_SIZE;
const int gridX = groupThreadID.x - DOF_APRON_SIZE;
const int gridStart = groupID.DOF_COMP * DOF_GRID_SIZE;
const int grid = groupThreadID.DOF_COMP - DOF_APRON_SIZE;
// These positions are relative to the pixel coordinates
const uint sampleX = max(gridStartX + gridX, 0);
const uint sampleY = groupID.y;
#if HORIZONTAL
const uint2 samplePos = uint2(max(gridStart + grid, 0), groupID.y);
#else
const uint2 samplePos = uint2(groupID.x, max(gridStart + grid, 0));
#endif
uint2 textureSize;
Input0.GetDimensions(textureSize.x, textureSize.y);
const uint2 samplePos = uint2(sampleX, sampleY);
// Sample the textures
#if USE_CS_HALF_PIXEL_OFFSET
float2 sampleCoord = saturate(((float2)samplePos + 0.5f) / float2(textureSize));
@@ -334,29 +278,27 @@ void CS_CoCSpreadH(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThr
#else
float2 depthBlur = Input0.SampleLevel(SamplerPointClamp, sampleCoord, 0.0f).xy;
#endif
float depth = depthBlur.x;
float blur = depthBlur.y;
float cocSize = blur * DOF_MAX_SAMPLE_RADIUS;
// Store in shared memory
Samples[groupThreadID.x].Depth = depth;
Samples[groupThreadID.x].Blur = blur;
Samples[groupThreadID.DOF_COMP].Depth = depth;
Samples[groupThreadID.DOF_COMP].Blur = blur;
GroupMemoryBarrierWithGroupSync();
// Don't continue for threads in the apron, and threads outside the render target size
if (gridX >= 0 && gridX < DOF_GRID_SIZE && sampleX < textureSize.x)
if (grid >= 0 && grid < DOF_GRID_SIZE && samplePos.DOF_COMP < textureSize.DOF_COMP)
{
float outputBlur = 0.0f;
float totalContribution = 0.0f;
// Gather sample taps inside the radius
for (int x = -DOF_MAX_SAMPLE_RADIUS; x <= DOF_MAX_SAMPLE_RADIUS; x++)
for (int i = -DOF_MAX_SAMPLE_RADIUS; i <= DOF_MAX_SAMPLE_RADIUS; i++)
{
// Grab the sample from shared memory
uint groupTapX = groupThreadID.x + x;
CoCSample tap = Samples[groupTapX];
uint groupTap = groupThreadID.DOF_COMP + i;
CoCSample tap = Samples[groupTap];
// Only accept samples if they're from the foreground, and have a higher blur amount
float depthWeight = tap.Depth <= depth;
@@ -364,77 +306,7 @@ void CS_CoCSpreadH(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThr
float tapWeight = depthWeight * blurWeight;
// If it's the center tap, set the weight to 1 and don't reject it
float centerWeight = x == 0 ? 1.0 : 0.0f;
tapWeight = saturate(tapWeight + centerWeight);
outputBlur += tap.Blur * tapWeight;
totalContribution += tapWeight;
}
// Write out the result
OutputTexture[samplePos] = float2(depth, outputBlur / totalContribution);
}
}
// Performs the vertical CoC spread
META_CS(true, FEATURE_LEVEL_SM5)
[numthreads(1, DOF_THREAD_GROUP_SIZE, 1)]
void CS_CoCSpreadV(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
// These positions are relative to the "grid", AKA the vertical group of pixels that this thread group is writing to
const int gridStartY = groupID.y * DOF_GRID_SIZE;
const int gridY = groupThreadID.y - DOF_APRON_SIZE;
// These positions are relative to the pixel coordinates
const uint sampleX = groupID.x;
const uint sampleY = max(gridStartY + gridY, 0);
uint2 textureSize;
Input0.GetDimensions(textureSize.x, textureSize.y);
const uint2 samplePos = uint2(sampleX, sampleY);
// Sample the textures
#if USE_CS_HALF_PIXEL_OFFSET
float2 sampleCoord = saturate(((float2)samplePos + 0.5f) / float2(textureSize));
#else
float2 sampleCoord = saturate(samplePos / float2(textureSize));
#endif
#if USE_CS_LINEAR_SAMPLING
float2 depthBlur = Input0.SampleLevel(SamplerLinearClamp, sampleCoord, 0.0f).xy;
#else
float2 depthBlur = Input0.SampleLevel(SamplerPointClamp, sampleCoord, 0.0f).xy;
#endif
float depth = depthBlur.x;
float blur = depthBlur.y;
float cocSize = blur * DOF_MAX_SAMPLE_RADIUS;
// Store in shared memory
Samples[groupThreadID.y].Depth = depth;
Samples[groupThreadID.y].Blur = blur;
GroupMemoryBarrierWithGroupSync();
// Don't continue for threads in the apron, and threads outside the render target size
if (gridY >= 0 && gridY < DOF_GRID_SIZE && sampleY < textureSize.y)
{
float outputBlur = 0.0f;
float totalContribution = 0.0f;
// Gather sample taps inside the radius
for (int y = -DOF_MAX_SAMPLE_RADIUS; y <= DOF_MAX_SAMPLE_RADIUS; y++)
{
// Grab the sample from shared memory
uint groupTapY = groupThreadID.y + y;
CoCSample tap = Samples[groupTapY];
// Only accept samples if they're from the foreground, and have a higher blur amount
float depthWeight = tap.Depth <= depth;
float blurWeight = saturate(tap.Blur - blur);
float tapWeight = depthWeight * blurWeight;
// If it's the center tap, set the weight to 1 and don't reject it
float centerWeight = y == 0 ? 1.0 : 0.0f;
float centerWeight = i == 0 ? 1.0 : 0.0f;
tapWeight = saturate(tapWeight + centerWeight);
outputBlur += tap.Blur * tapWeight;