Add minor optimizations to particles drawing

This commit is contained in:
Wojtek Figat
2025-08-08 13:11:05 +02:00
parent 0369d9b2cb
commit 90d1e63b58
5 changed files with 33 additions and 38 deletions

BIN
Content/Shaders/BitonicSort.flax (Stored with Git LFS)

Binary file not shown.

View File

@@ -845,6 +845,7 @@ void DrawEmittersGPU(RenderContextBatch& renderContextBatch)
if (sorting)
{
PROFILE_GPU_CPU_NAMED("Sort Particles");
context->BindCB(0, GPUParticlesSortingCB);
// Generate sort keys for each particle
for (const GPUEmitterDraw& draw : GPUEmitterDraws)
@@ -917,7 +918,6 @@ void DrawEmittersGPU(RenderContextBatch& renderContextBatch)
#endif
}
context->UpdateCB(GPUParticlesSortingCB, &data);
context->BindCB(0, GPUParticlesSortingCB);
context->BindSR(0, draw.Buffer->GPU.Buffer->View());
context->BindUA(0, draw.Buffer->GPU.SortingKeysBuffer->View());
const int32 threadGroupSize = 1024;
@@ -939,7 +939,7 @@ void DrawEmittersGPU(RenderContextBatch& renderContextBatch)
auto module = emitter->Graph.SortModules[moduleIndex];
const auto sortMode = (ParticleSortMode)module->Values[2].AsInt;
bool sortAscending = sortMode == ParticleSortMode::CustomAscending;
BitonicSort::Instance()->Sort(context, draw.Buffer->GPU.SortingKeysBuffer, draw.Buffer->GPU.Buffer, draw.Buffer->GPU.ParticleCounterOffset, sortAscending, draw.Buffer->GPU.SortedIndices);
BitonicSort::Instance()->Sort(context, draw.Buffer->GPU.SortingKeysBuffer, draw.Buffer->GPU.Buffer, draw.Buffer->GPU.ParticleCounterOffset, sortAscending, draw.Buffer->GPU.SortedIndices, draw.Buffer->GPU.ParticlesCountMax);
// TODO: split sorted keys copy with another loop to give time for UAV transition
// TODO: use args buffer from GPUIndirectArgsBuffer instead of internal from BitonicSort to get rid of UAV barrier
}

View File

@@ -6,15 +6,9 @@
#include "Engine/Graphics/GPUContext.h"
#include "Engine/Graphics/GPULimits.h"
// The sorting keys buffer item structure template. Matches the shader type.
struct Item
{
float Key;
uint32 Value;
};
GPU_CB_STRUCT(Data {
Item NullItem;
float NullItemKey;
uint32 NullItemValue;
uint32 CounterOffset;
uint32 MaxIterations;
uint32 LoopK;
@@ -86,22 +80,22 @@ void BitonicSort::Dispose()
_shader = nullptr;
}
void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuffer* countBuffer, uint32 counterOffset, bool sortAscending, GPUBuffer* sortedIndicesBuffer)
void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuffer* countBuffer, uint32 counterOffset, bool sortAscending, GPUBuffer* sortedIndicesBuffer, uint32 maxElements)
{
ASSERT(context && sortingKeysBuffer && countBuffer);
if (checkIfSkipPass())
return;
PROFILE_GPU_CPU("Bitonic Sort");
const uint32 elementSizeBytes = sizeof(uint64);
const uint32 maxNumElements = sortingKeysBuffer->GetSize() / elementSizeBytes;
const uint32 maxNumElements = maxElements != 0 ? maxElements : sortingKeysBuffer->GetSize() / elementSizeBytes;
const uint32 alignedMaxNumElements = Math::RoundUpToPowerOf2(maxNumElements);
const uint32 maxIterations = (uint32)Math::Log2((float)Math::Max(2048u, alignedMaxNumElements)) - 10;
// Setup constants buffer
Data data;
data.CounterOffset = counterOffset;
data.NullItem.Key = sortAscending ? MAX_float : -MAX_float;
data.NullItem.Value = 0;
data.NullItemKey = sortAscending ? MAX_float : -MAX_float;
data.NullItemValue = 0;
data.KeySign = sortAscending ? -1.0f : 1.0f;
data.MaxIterations = maxIterations;
data.LoopK = 0;
@@ -128,7 +122,6 @@ void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuf
data.LoopK = k;
data.LoopJ = j;
context->UpdateCB(_cb, &data);
context->BindCB(0, _cb);
context->DispatchIndirect(_outerSortCS, _dispatchArgsBuffer, indirectArgsOffset);
indirectArgsOffset += sizeof(GPUDispatchIndirectArgs);

View File

@@ -34,7 +34,8 @@ public:
/// <param name="counterOffset">The offset into counter buffer to find count for this list. Must be a multiple of 4 bytes.</param>
/// <param name="sortAscending">True to sort in ascending order (smallest to largest), otherwise false to sort in descending order.</param>
/// <param name="sortedIndicesBuffer">The output buffer for sorted values extracted from the sorted sortingKeysBuffer after algorithm run. Valid for uint value types - used as RWBuffer.</param>
void Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuffer* countBuffer, uint32 counterOffset, bool sortAscending, GPUBuffer* sortedIndicesBuffer);
/// <param name="maxElements">Optional upper limit of elements to sort. Can be used to optimize indirect dispatches allocation. If zero, then it gets calculated based on the input item buffer size.</param>
void Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuffer* countBuffer, uint32 counterOffset, bool sortAscending, GPUBuffer* sortedIndicesBuffer, uint32 maxElements = 0);
public:

View File

@@ -36,14 +36,14 @@ uint InsertOneBit(uint value, uint oneBitMask)
// (effectively a negation) or leave the value alone. When the KeySign is
// 1, we are sorting descending, so when A < B, they should swap. For an
// ascending sort, -A < -B should swap.
bool ShouldSwap(Item a, Item b)
bool ShouldSwap(Item a, Item b, float keySign)
{
//return (a ^ NullItem) < (b ^ NullItem);
//return (a.Key) < (b.Key);
return (a.Key * KeySign) < (b.Key * KeySign);
return (a.Key * keySign) < (b.Key * keySign);
//return asfloat(a) < asfloat(b);
//return (asfloat(a) * KeySign) < (asfloat(b) * KeySign);
//return (asfloat(a) * keySign) < (asfloat(b) * keySign);
}
#ifdef _CS_IndirectArgs
@@ -136,6 +136,7 @@ void CS_PreSort(uint3 groupID : SV_GroupID, uint groupIndex : SV_GroupIndex)
GroupMemoryBarrierWithGroupSync();
float keySign = KeySign;
UNROLL
for (uint k = 2; k <= 2048; k <<= 1)
{
@@ -144,14 +145,14 @@ void CS_PreSort(uint3 groupID : SV_GroupID, uint groupIndex : SV_GroupIndex)
uint index2 = InsertOneBit(groupIndex, j);
uint index1 = index2 ^ (k == 2 * j ? k - 1 : j);
Item A = SortData[index1];
Item B = SortData[index2];
Item a = SortData[index1];
Item b = SortData[index2];
if (ShouldSwap(A, B))
if (ShouldSwap(a, b, keySign))
{
// Swap the items
SortData[index1] = B;
SortData[index2] = A;
SortData[index1] = b;
SortData[index2] = a;
}
GroupMemoryBarrierWithGroupSync();
@@ -182,20 +183,21 @@ void CS_InnerSort(uint3 groupID : SV_GroupID, uint groupIndex : SV_GroupIndex)
GroupMemoryBarrierWithGroupSync();
float keySign = KeySign;
UNROLL
for (uint j = 1024; j > 0; j /= 2)
{
uint index2 = InsertOneBit(groupIndex, j);
uint index1 = index2 ^ j;
Item A = SortData[index1];
Item B = SortData[index2];
Item a = SortData[index1];
Item b = SortData[index2];
if (ShouldSwap(A, B))
if (ShouldSwap(a, b, keySign))
{
// Swap the items
SortData[index1] = B;
SortData[index2] = A;
SortData[index1] = b;
SortData[index2] = a;
}
GroupMemoryBarrierWithGroupSync();
@@ -224,14 +226,15 @@ void CS_OuterSort(uint3 dispatchThreadId : SV_DispatchThreadID)
if (index2 >= count)
return;
Item A = SortBuffer[index1];
Item B = SortBuffer[index2];
Item a = SortBuffer[index1];
Item b = SortBuffer[index2];
if (ShouldSwap(A, B))
float keySign = KeySign;
if (ShouldSwap(a, b, keySign))
{
// Swap the items
SortBuffer[index1] = B;
SortBuffer[index2] = A;
SortBuffer[index1] = b;
SortBuffer[index2] = a;
}
}
@@ -248,12 +251,10 @@ void CS_CopyIndices(uint3 dispatchThreadId : SV_DispatchThreadID)
{
const uint count = CounterBuffer.Load(CounterOffset);
uint index = dispatchThreadId.x;
if (index >= count)
return;
Item element = SortBuffer[index];
SortedIndices[index] = element.Value;
}