Add minor optimizations to particles drawing
This commit is contained in:
BIN
Content/Shaders/BitonicSort.flax
(Stored with Git LFS)
BIN
Content/Shaders/BitonicSort.flax
(Stored with Git LFS)
Binary file not shown.
@@ -845,6 +845,7 @@ void DrawEmittersGPU(RenderContextBatch& renderContextBatch)
|
||||
if (sorting)
|
||||
{
|
||||
PROFILE_GPU_CPU_NAMED("Sort Particles");
|
||||
context->BindCB(0, GPUParticlesSortingCB);
|
||||
|
||||
// Generate sort keys for each particle
|
||||
for (const GPUEmitterDraw& draw : GPUEmitterDraws)
|
||||
@@ -917,7 +918,6 @@ void DrawEmittersGPU(RenderContextBatch& renderContextBatch)
|
||||
#endif
|
||||
}
|
||||
context->UpdateCB(GPUParticlesSortingCB, &data);
|
||||
context->BindCB(0, GPUParticlesSortingCB);
|
||||
context->BindSR(0, draw.Buffer->GPU.Buffer->View());
|
||||
context->BindUA(0, draw.Buffer->GPU.SortingKeysBuffer->View());
|
||||
const int32 threadGroupSize = 1024;
|
||||
@@ -939,7 +939,7 @@ void DrawEmittersGPU(RenderContextBatch& renderContextBatch)
|
||||
auto module = emitter->Graph.SortModules[moduleIndex];
|
||||
const auto sortMode = (ParticleSortMode)module->Values[2].AsInt;
|
||||
bool sortAscending = sortMode == ParticleSortMode::CustomAscending;
|
||||
BitonicSort::Instance()->Sort(context, draw.Buffer->GPU.SortingKeysBuffer, draw.Buffer->GPU.Buffer, draw.Buffer->GPU.ParticleCounterOffset, sortAscending, draw.Buffer->GPU.SortedIndices);
|
||||
BitonicSort::Instance()->Sort(context, draw.Buffer->GPU.SortingKeysBuffer, draw.Buffer->GPU.Buffer, draw.Buffer->GPU.ParticleCounterOffset, sortAscending, draw.Buffer->GPU.SortedIndices, draw.Buffer->GPU.ParticlesCountMax);
|
||||
// TODO: split sorted keys copy with another loop to give time for UAV transition
|
||||
// TODO: use args buffer from GPUIndirectArgsBuffer instead of internal from BitonicSort to get rid of UAV barrier
|
||||
}
|
||||
|
||||
@@ -6,15 +6,9 @@
|
||||
#include "Engine/Graphics/GPUContext.h"
|
||||
#include "Engine/Graphics/GPULimits.h"
|
||||
|
||||
// The sorting keys buffer item structure template. Matches the shader type.
|
||||
struct Item
|
||||
{
|
||||
float Key;
|
||||
uint32 Value;
|
||||
};
|
||||
|
||||
GPU_CB_STRUCT(Data {
|
||||
Item NullItem;
|
||||
float NullItemKey;
|
||||
uint32 NullItemValue;
|
||||
uint32 CounterOffset;
|
||||
uint32 MaxIterations;
|
||||
uint32 LoopK;
|
||||
@@ -86,22 +80,22 @@ void BitonicSort::Dispose()
|
||||
_shader = nullptr;
|
||||
}
|
||||
|
||||
void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuffer* countBuffer, uint32 counterOffset, bool sortAscending, GPUBuffer* sortedIndicesBuffer)
|
||||
void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuffer* countBuffer, uint32 counterOffset, bool sortAscending, GPUBuffer* sortedIndicesBuffer, uint32 maxElements)
|
||||
{
|
||||
ASSERT(context && sortingKeysBuffer && countBuffer);
|
||||
if (checkIfSkipPass())
|
||||
return;
|
||||
PROFILE_GPU_CPU("Bitonic Sort");
|
||||
const uint32 elementSizeBytes = sizeof(uint64);
|
||||
const uint32 maxNumElements = sortingKeysBuffer->GetSize() / elementSizeBytes;
|
||||
const uint32 maxNumElements = maxElements != 0 ? maxElements : sortingKeysBuffer->GetSize() / elementSizeBytes;
|
||||
const uint32 alignedMaxNumElements = Math::RoundUpToPowerOf2(maxNumElements);
|
||||
const uint32 maxIterations = (uint32)Math::Log2((float)Math::Max(2048u, alignedMaxNumElements)) - 10;
|
||||
|
||||
// Setup constants buffer
|
||||
Data data;
|
||||
data.CounterOffset = counterOffset;
|
||||
data.NullItem.Key = sortAscending ? MAX_float : -MAX_float;
|
||||
data.NullItem.Value = 0;
|
||||
data.NullItemKey = sortAscending ? MAX_float : -MAX_float;
|
||||
data.NullItemValue = 0;
|
||||
data.KeySign = sortAscending ? -1.0f : 1.0f;
|
||||
data.MaxIterations = maxIterations;
|
||||
data.LoopK = 0;
|
||||
@@ -128,7 +122,6 @@ void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuf
|
||||
data.LoopK = k;
|
||||
data.LoopJ = j;
|
||||
context->UpdateCB(_cb, &data);
|
||||
context->BindCB(0, _cb);
|
||||
|
||||
context->DispatchIndirect(_outerSortCS, _dispatchArgsBuffer, indirectArgsOffset);
|
||||
indirectArgsOffset += sizeof(GPUDispatchIndirectArgs);
|
||||
|
||||
@@ -34,7 +34,8 @@ public:
|
||||
/// <param name="counterOffset">The offset into counter buffer to find count for this list. Must be a multiple of 4 bytes.</param>
|
||||
/// <param name="sortAscending">True to sort in ascending order (smallest to largest), otherwise false to sort in descending order.</param>
|
||||
/// <param name="sortedIndicesBuffer">The output buffer for sorted values extracted from the sorted sortingKeysBuffer after algorithm run. Valid for uint value types - used as RWBuffer.</param>
|
||||
void Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuffer* countBuffer, uint32 counterOffset, bool sortAscending, GPUBuffer* sortedIndicesBuffer);
|
||||
/// <param name="maxElements">Optional upper limit of elements to sort. Can be used to optimize indirect dispatches allocation. If zero, then it gets calculated based on the input item buffer size.</param>
|
||||
void Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuffer* countBuffer, uint32 counterOffset, bool sortAscending, GPUBuffer* sortedIndicesBuffer, uint32 maxElements = 0);
|
||||
|
||||
public:
|
||||
|
||||
|
||||
@@ -36,14 +36,14 @@ uint InsertOneBit(uint value, uint oneBitMask)
|
||||
// (effectively a negation) or leave the value alone. When the KeySign is
|
||||
// 1, we are sorting descending, so when A < B, they should swap. For an
|
||||
// ascending sort, -A < -B should swap.
|
||||
bool ShouldSwap(Item a, Item b)
|
||||
bool ShouldSwap(Item a, Item b, float keySign)
|
||||
{
|
||||
//return (a ^ NullItem) < (b ^ NullItem);
|
||||
|
||||
//return (a.Key) < (b.Key);
|
||||
return (a.Key * KeySign) < (b.Key * KeySign);
|
||||
return (a.Key * keySign) < (b.Key * keySign);
|
||||
//return asfloat(a) < asfloat(b);
|
||||
//return (asfloat(a) * KeySign) < (asfloat(b) * KeySign);
|
||||
//return (asfloat(a) * keySign) < (asfloat(b) * keySign);
|
||||
}
|
||||
|
||||
#ifdef _CS_IndirectArgs
|
||||
@@ -136,6 +136,7 @@ void CS_PreSort(uint3 groupID : SV_GroupID, uint groupIndex : SV_GroupIndex)
|
||||
|
||||
GroupMemoryBarrierWithGroupSync();
|
||||
|
||||
float keySign = KeySign;
|
||||
UNROLL
|
||||
for (uint k = 2; k <= 2048; k <<= 1)
|
||||
{
|
||||
@@ -144,14 +145,14 @@ void CS_PreSort(uint3 groupID : SV_GroupID, uint groupIndex : SV_GroupIndex)
|
||||
uint index2 = InsertOneBit(groupIndex, j);
|
||||
uint index1 = index2 ^ (k == 2 * j ? k - 1 : j);
|
||||
|
||||
Item A = SortData[index1];
|
||||
Item B = SortData[index2];
|
||||
Item a = SortData[index1];
|
||||
Item b = SortData[index2];
|
||||
|
||||
if (ShouldSwap(A, B))
|
||||
if (ShouldSwap(a, b, keySign))
|
||||
{
|
||||
// Swap the items
|
||||
SortData[index1] = B;
|
||||
SortData[index2] = A;
|
||||
SortData[index1] = b;
|
||||
SortData[index2] = a;
|
||||
}
|
||||
|
||||
GroupMemoryBarrierWithGroupSync();
|
||||
@@ -182,20 +183,21 @@ void CS_InnerSort(uint3 groupID : SV_GroupID, uint groupIndex : SV_GroupIndex)
|
||||
|
||||
GroupMemoryBarrierWithGroupSync();
|
||||
|
||||
float keySign = KeySign;
|
||||
UNROLL
|
||||
for (uint j = 1024; j > 0; j /= 2)
|
||||
{
|
||||
uint index2 = InsertOneBit(groupIndex, j);
|
||||
uint index1 = index2 ^ j;
|
||||
|
||||
Item A = SortData[index1];
|
||||
Item B = SortData[index2];
|
||||
Item a = SortData[index1];
|
||||
Item b = SortData[index2];
|
||||
|
||||
if (ShouldSwap(A, B))
|
||||
if (ShouldSwap(a, b, keySign))
|
||||
{
|
||||
// Swap the items
|
||||
SortData[index1] = B;
|
||||
SortData[index2] = A;
|
||||
SortData[index1] = b;
|
||||
SortData[index2] = a;
|
||||
}
|
||||
|
||||
GroupMemoryBarrierWithGroupSync();
|
||||
@@ -224,14 +226,15 @@ void CS_OuterSort(uint3 dispatchThreadId : SV_DispatchThreadID)
|
||||
if (index2 >= count)
|
||||
return;
|
||||
|
||||
Item A = SortBuffer[index1];
|
||||
Item B = SortBuffer[index2];
|
||||
Item a = SortBuffer[index1];
|
||||
Item b = SortBuffer[index2];
|
||||
|
||||
if (ShouldSwap(A, B))
|
||||
float keySign = KeySign;
|
||||
if (ShouldSwap(a, b, keySign))
|
||||
{
|
||||
// Swap the items
|
||||
SortBuffer[index1] = B;
|
||||
SortBuffer[index2] = A;
|
||||
SortBuffer[index1] = b;
|
||||
SortBuffer[index2] = a;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -248,12 +251,10 @@ void CS_CopyIndices(uint3 dispatchThreadId : SV_DispatchThreadID)
|
||||
{
|
||||
const uint count = CounterBuffer.Load(CounterOffset);
|
||||
uint index = dispatchThreadId.x;
|
||||
|
||||
if (index >= count)
|
||||
return;
|
||||
|
||||
Item element = SortBuffer[index];
|
||||
|
||||
SortedIndices[index] = element.Value;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user