diff --git a/Content/Shaders/BitonicSort.flax b/Content/Shaders/BitonicSort.flax
index ee7db3c74..fa9adc1ef 100644
--- a/Content/Shaders/BitonicSort.flax
+++ b/Content/Shaders/BitonicSort.flax
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:924884da1dfef7a802b7190fd148eebbeece50d6fa4d69295c38238dd96331e6
-size 6538
+oid sha256:db9ca2435baf7cba079e22af86feca8397723688107fd4abd4f11466a445791e
+size 6669
diff --git a/Source/Engine/Particles/Particles.cpp b/Source/Engine/Particles/Particles.cpp
index 47f172636..5ea195a00 100644
--- a/Source/Engine/Particles/Particles.cpp
+++ b/Source/Engine/Particles/Particles.cpp
@@ -845,6 +845,7 @@ void DrawEmittersGPU(RenderContextBatch& renderContextBatch)
if (sorting)
{
PROFILE_GPU_CPU_NAMED("Sort Particles");
+ context->BindCB(0, GPUParticlesSortingCB);
// Generate sort keys for each particle
for (const GPUEmitterDraw& draw : GPUEmitterDraws)
@@ -917,7 +918,6 @@ void DrawEmittersGPU(RenderContextBatch& renderContextBatch)
#endif
}
context->UpdateCB(GPUParticlesSortingCB, &data);
- context->BindCB(0, GPUParticlesSortingCB);
context->BindSR(0, draw.Buffer->GPU.Buffer->View());
context->BindUA(0, draw.Buffer->GPU.SortingKeysBuffer->View());
const int32 threadGroupSize = 1024;
@@ -939,7 +939,7 @@ void DrawEmittersGPU(RenderContextBatch& renderContextBatch)
auto module = emitter->Graph.SortModules[moduleIndex];
const auto sortMode = (ParticleSortMode)module->Values[2].AsInt;
bool sortAscending = sortMode == ParticleSortMode::CustomAscending;
- BitonicSort::Instance()->Sort(context, draw.Buffer->GPU.SortingKeysBuffer, draw.Buffer->GPU.Buffer, draw.Buffer->GPU.ParticleCounterOffset, sortAscending, draw.Buffer->GPU.SortedIndices);
+ BitonicSort::Instance()->Sort(context, draw.Buffer->GPU.SortingKeysBuffer, draw.Buffer->GPU.Buffer, draw.Buffer->GPU.ParticleCounterOffset, sortAscending, draw.Buffer->GPU.SortedIndices, draw.Buffer->GPU.ParticlesCountMax);
// TODO: split sorted keys copy with another loop to give time for UAV transition
// TODO: use args buffer from GPUIndirectArgsBuffer instead of internal from BitonicSort to get rid of UAV barrier
}
diff --git a/Source/Engine/Renderer/Utils/BitonicSort.cpp b/Source/Engine/Renderer/Utils/BitonicSort.cpp
index 0834588ba..93f8dc97d 100644
--- a/Source/Engine/Renderer/Utils/BitonicSort.cpp
+++ b/Source/Engine/Renderer/Utils/BitonicSort.cpp
@@ -6,15 +6,9 @@
#include "Engine/Graphics/GPUContext.h"
#include "Engine/Graphics/GPULimits.h"
-// The sorting keys buffer item structure template. Matches the shader type.
-struct Item
-{
- float Key;
- uint32 Value;
-};
-
GPU_CB_STRUCT(Data {
- Item NullItem;
+ float NullItemKey;
+ uint32 NullItemValue;
uint32 CounterOffset;
uint32 MaxIterations;
uint32 LoopK;
@@ -86,22 +80,22 @@ void BitonicSort::Dispose()
_shader = nullptr;
}
-void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuffer* countBuffer, uint32 counterOffset, bool sortAscending, GPUBuffer* sortedIndicesBuffer)
+void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuffer* countBuffer, uint32 counterOffset, bool sortAscending, GPUBuffer* sortedIndicesBuffer, uint32 maxElements)
{
ASSERT(context && sortingKeysBuffer && countBuffer);
if (checkIfSkipPass())
return;
PROFILE_GPU_CPU("Bitonic Sort");
const uint32 elementSizeBytes = sizeof(uint64);
- const uint32 maxNumElements = sortingKeysBuffer->GetSize() / elementSizeBytes;
+ const uint32 maxNumElements = maxElements != 0 ? maxElements : sortingKeysBuffer->GetSize() / elementSizeBytes;
const uint32 alignedMaxNumElements = Math::RoundUpToPowerOf2(maxNumElements);
const uint32 maxIterations = (uint32)Math::Log2((float)Math::Max(2048u, alignedMaxNumElements)) - 10;
// Setup constants buffer
Data data;
data.CounterOffset = counterOffset;
- data.NullItem.Key = sortAscending ? MAX_float : -MAX_float;
- data.NullItem.Value = 0;
+ data.NullItemKey = sortAscending ? MAX_float : -MAX_float;
+ data.NullItemValue = 0;
data.KeySign = sortAscending ? -1.0f : 1.0f;
data.MaxIterations = maxIterations;
data.LoopK = 0;
@@ -128,7 +122,6 @@ void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuf
data.LoopK = k;
data.LoopJ = j;
context->UpdateCB(_cb, &data);
- context->BindCB(0, _cb);
context->DispatchIndirect(_outerSortCS, _dispatchArgsBuffer, indirectArgsOffset);
indirectArgsOffset += sizeof(GPUDispatchIndirectArgs);
diff --git a/Source/Engine/Renderer/Utils/BitonicSort.h b/Source/Engine/Renderer/Utils/BitonicSort.h
index 289905b09..99069e182 100644
--- a/Source/Engine/Renderer/Utils/BitonicSort.h
+++ b/Source/Engine/Renderer/Utils/BitonicSort.h
@@ -34,7 +34,8 @@ public:
/// The offset into counter buffer to find count for this list. Must be a multiple of 4 bytes.
/// True to sort in ascending order (smallest to largest), otherwise false to sort in descending order.
/// The output buffer for sorted values extracted from the sorted sortingKeysBuffer after algorithm run. Valid for uint value types - used as RWBuffer.
- void Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuffer* countBuffer, uint32 counterOffset, bool sortAscending, GPUBuffer* sortedIndicesBuffer);
+ /// Optional upper limit of elements to sort. Can be used to optimize indirect dispatches allocation. If zero, then it gets calculated based on the input item buffer size.
+ void Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuffer* countBuffer, uint32 counterOffset, bool sortAscending, GPUBuffer* sortedIndicesBuffer, uint32 maxElements = 0);
public:
diff --git a/Source/Shaders/BitonicSort.shader b/Source/Shaders/BitonicSort.shader
index c4a275862..9e49a8185 100644
--- a/Source/Shaders/BitonicSort.shader
+++ b/Source/Shaders/BitonicSort.shader
@@ -36,14 +36,14 @@ uint InsertOneBit(uint value, uint oneBitMask)
// (effectively a negation) or leave the value alone. When the KeySign is
// 1, we are sorting descending, so when A < B, they should swap. For an
// ascending sort, -A < -B should swap.
-bool ShouldSwap(Item a, Item b)
+bool ShouldSwap(Item a, Item b, float keySign)
{
//return (a ^ NullItem) < (b ^ NullItem);
//return (a.Key) < (b.Key);
- return (a.Key * KeySign) < (b.Key * KeySign);
+ return (a.Key * keySign) < (b.Key * keySign);
//return asfloat(a) < asfloat(b);
- //return (asfloat(a) * KeySign) < (asfloat(b) * KeySign);
+ //return (asfloat(a) * keySign) < (asfloat(b) * keySign);
}
#ifdef _CS_IndirectArgs
@@ -136,6 +136,7 @@ void CS_PreSort(uint3 groupID : SV_GroupID, uint groupIndex : SV_GroupIndex)
GroupMemoryBarrierWithGroupSync();
+ float keySign = KeySign;
UNROLL
for (uint k = 2; k <= 2048; k <<= 1)
{
@@ -144,14 +145,14 @@ void CS_PreSort(uint3 groupID : SV_GroupID, uint groupIndex : SV_GroupIndex)
uint index2 = InsertOneBit(groupIndex, j);
uint index1 = index2 ^ (k == 2 * j ? k - 1 : j);
- Item A = SortData[index1];
- Item B = SortData[index2];
+ Item a = SortData[index1];
+ Item b = SortData[index2];
- if (ShouldSwap(A, B))
+ if (ShouldSwap(a, b, keySign))
{
// Swap the items
- SortData[index1] = B;
- SortData[index2] = A;
+ SortData[index1] = b;
+ SortData[index2] = a;
}
GroupMemoryBarrierWithGroupSync();
@@ -182,20 +183,21 @@ void CS_InnerSort(uint3 groupID : SV_GroupID, uint groupIndex : SV_GroupIndex)
GroupMemoryBarrierWithGroupSync();
+ float keySign = KeySign;
UNROLL
for (uint j = 1024; j > 0; j /= 2)
{
uint index2 = InsertOneBit(groupIndex, j);
uint index1 = index2 ^ j;
- Item A = SortData[index1];
- Item B = SortData[index2];
+ Item a = SortData[index1];
+ Item b = SortData[index2];
- if (ShouldSwap(A, B))
+ if (ShouldSwap(a, b, keySign))
{
// Swap the items
- SortData[index1] = B;
- SortData[index2] = A;
+ SortData[index1] = b;
+ SortData[index2] = a;
}
GroupMemoryBarrierWithGroupSync();
@@ -224,14 +226,15 @@ void CS_OuterSort(uint3 dispatchThreadId : SV_DispatchThreadID)
if (index2 >= count)
return;
- Item A = SortBuffer[index1];
- Item B = SortBuffer[index2];
+ Item a = SortBuffer[index1];
+ Item b = SortBuffer[index2];
- if (ShouldSwap(A, B))
+ float keySign = KeySign;
+ if (ShouldSwap(a, b, keySign))
{
// Swap the items
- SortBuffer[index1] = B;
- SortBuffer[index2] = A;
+ SortBuffer[index1] = b;
+ SortBuffer[index2] = a;
}
}
@@ -248,12 +251,10 @@ void CS_CopyIndices(uint3 dispatchThreadId : SV_DispatchThreadID)
{
const uint count = CounterBuffer.Load(CounterOffset);
uint index = dispatchThreadId.x;
-
if (index >= count)
return;
Item element = SortBuffer[index];
-
SortedIndices[index] = element.Value;
}