From 90d1e63b58afe32a5515b3506eb218d0f2fa5edc Mon Sep 17 00:00:00 2001
From: Wojtek Figat <wojtek@figat.pl>
Date: Fri, 8 Aug 2025 13:11:05 +0200
Subject: [PATCH] Add minor optimizations to particles drawing

---
 Content/Shaders/BitonicSort.flax             |  4 +-
 Source/Engine/Particles/Particles.cpp        |  4 +-
 Source/Engine/Renderer/Utils/BitonicSort.cpp | 19 +++------
 Source/Engine/Renderer/Utils/BitonicSort.h   |  3 +-
 Source/Shaders/BitonicSort.shader            | 41 ++++++++++----------
 5 files changed, 33 insertions(+), 38 deletions(-)

diff --git a/Content/Shaders/BitonicSort.flax b/Content/Shaders/BitonicSort.flax
index ee7db3c74..fa9adc1ef 100644
--- a/Content/Shaders/BitonicSort.flax
+++ b/Content/Shaders/BitonicSort.flax
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:924884da1dfef7a802b7190fd148eebbeece50d6fa4d69295c38238dd96331e6
-size 6538
+oid sha256:db9ca2435baf7cba079e22af86feca8397723688107fd4abd4f11466a445791e
+size 6669
diff --git a/Source/Engine/Particles/Particles.cpp b/Source/Engine/Particles/Particles.cpp
index 47f172636..5ea195a00 100644
--- a/Source/Engine/Particles/Particles.cpp
+++ b/Source/Engine/Particles/Particles.cpp
@@ -845,6 +845,7 @@ void DrawEmittersGPU(RenderContextBatch& renderContextBatch)
     if (sorting)
     {
         PROFILE_GPU_CPU_NAMED("Sort Particles");
+        context->BindCB(0, GPUParticlesSortingCB);
 
         // Generate sort keys for each particle
         for (const GPUEmitterDraw& draw : GPUEmitterDraws)
@@ -917,7 +918,6 @@ void DrawEmittersGPU(RenderContextBatch& renderContextBatch)
 #endif
                 }
                 context->UpdateCB(GPUParticlesSortingCB, &data);
-                context->BindCB(0, GPUParticlesSortingCB);
                 context->BindSR(0, draw.Buffer->GPU.Buffer->View());
                 context->BindUA(0, draw.Buffer->GPU.SortingKeysBuffer->View());
                 const int32 threadGroupSize = 1024;
@@ -939,7 +939,7 @@ void DrawEmittersGPU(RenderContextBatch& renderContextBatch)
                 auto module = emitter->Graph.SortModules[moduleIndex];
                 const auto sortMode = (ParticleSortMode)module->Values[2].AsInt;
                 bool sortAscending = sortMode == ParticleSortMode::CustomAscending;
-                BitonicSort::Instance()->Sort(context, draw.Buffer->GPU.SortingKeysBuffer, draw.Buffer->GPU.Buffer, draw.Buffer->GPU.ParticleCounterOffset, sortAscending, draw.Buffer->GPU.SortedIndices);
+                BitonicSort::Instance()->Sort(context, draw.Buffer->GPU.SortingKeysBuffer, draw.Buffer->GPU.Buffer, draw.Buffer->GPU.ParticleCounterOffset, sortAscending, draw.Buffer->GPU.SortedIndices, draw.Buffer->GPU.ParticlesCountMax);
                 // TODO: split sorted keys copy with another loop to give time for UAV transition
                 // TODO: use args buffer from GPUIndirectArgsBuffer instead of internal from BitonicSort to get rid of UAV barrier
             }
diff --git a/Source/Engine/Renderer/Utils/BitonicSort.cpp b/Source/Engine/Renderer/Utils/BitonicSort.cpp
index 0834588ba..93f8dc97d 100644
--- a/Source/Engine/Renderer/Utils/BitonicSort.cpp
+++ b/Source/Engine/Renderer/Utils/BitonicSort.cpp
@@ -6,15 +6,9 @@
 #include "Engine/Graphics/GPUContext.h"
 #include "Engine/Graphics/GPULimits.h"
 
-// The sorting keys buffer item structure template. Matches the shader type.
-struct Item
-{
-    float Key;
-    uint32 Value;
-};
-
 GPU_CB_STRUCT(Data {
-    Item NullItem;
+    float NullItemKey;
+    uint32 NullItemValue;
     uint32 CounterOffset;
     uint32 MaxIterations;
     uint32 LoopK;
@@ -86,22 +80,22 @@ void BitonicSort::Dispose()
     _shader = nullptr;
 }
 
-void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuffer* countBuffer, uint32 counterOffset, bool sortAscending, GPUBuffer* sortedIndicesBuffer)
+void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuffer* countBuffer, uint32 counterOffset, bool sortAscending, GPUBuffer* sortedIndicesBuffer, uint32 maxElements)
 {
     ASSERT(context && sortingKeysBuffer && countBuffer);
     if (checkIfSkipPass())
         return;
     PROFILE_GPU_CPU("Bitonic Sort");
     const uint32 elementSizeBytes = sizeof(uint64);
-    const uint32 maxNumElements = sortingKeysBuffer->GetSize() / elementSizeBytes;
+    const uint32 maxNumElements = maxElements != 0 ? maxElements : sortingKeysBuffer->GetSize() / elementSizeBytes;
     const uint32 alignedMaxNumElements = Math::RoundUpToPowerOf2(maxNumElements);
     const uint32 maxIterations = (uint32)Math::Log2((float)Math::Max(2048u, alignedMaxNumElements)) - 10;
 
     // Setup constants buffer
     Data data;
     data.CounterOffset = counterOffset;
-    data.NullItem.Key = sortAscending ? MAX_float : -MAX_float;
-    data.NullItem.Value = 0;
+    data.NullItemKey = sortAscending ? MAX_float : -MAX_float;
+    data.NullItemValue = 0;
     data.KeySign = sortAscending ? -1.0f : 1.0f;
     data.MaxIterations = maxIterations;
     data.LoopK = 0;
@@ -128,7 +122,6 @@ void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuf
             data.LoopK = k;
             data.LoopJ = j;
             context->UpdateCB(_cb, &data);
-            context->BindCB(0, _cb);
 
             context->DispatchIndirect(_outerSortCS, _dispatchArgsBuffer, indirectArgsOffset);
             indirectArgsOffset += sizeof(GPUDispatchIndirectArgs);
diff --git a/Source/Engine/Renderer/Utils/BitonicSort.h b/Source/Engine/Renderer/Utils/BitonicSort.h
index 289905b09..99069e182 100644
--- a/Source/Engine/Renderer/Utils/BitonicSort.h
+++ b/Source/Engine/Renderer/Utils/BitonicSort.h
@@ -34,7 +34,8 @@ public:
     /// <param name="counterOffset">The offset into counter buffer to find count for this list. Must be a multiple of 4 bytes.</param>
     /// <param name="sortAscending">True to sort in ascending order (smallest to largest), otherwise false to sort in descending order.</param>
     /// <param name="sortedIndicesBuffer">The output buffer for sorted values extracted from the sorted sortingKeysBuffer after algorithm run. Valid for uint value types - used as RWBuffer.</param>
-    void Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuffer* countBuffer, uint32 counterOffset, bool sortAscending, GPUBuffer* sortedIndicesBuffer);
+    /// <param name="maxElements">Optional upper limit of elements to sort. Can be used to optimize indirect dispatches allocation. If zero, then it gets calculated based on the input item buffer size.</param>
+    void Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuffer* countBuffer, uint32 counterOffset, bool sortAscending, GPUBuffer* sortedIndicesBuffer, uint32 maxElements = 0);
 
 public:
 
diff --git a/Source/Shaders/BitonicSort.shader b/Source/Shaders/BitonicSort.shader
index c4a275862..9e49a8185 100644
--- a/Source/Shaders/BitonicSort.shader
+++ b/Source/Shaders/BitonicSort.shader
@@ -36,14 +36,14 @@ uint InsertOneBit(uint value, uint oneBitMask)
 // (effectively a negation) or leave the value alone. When the KeySign is
 // 1, we are sorting descending, so when A < B, they should swap. For an
 // ascending sort, -A < -B should swap.
-bool ShouldSwap(Item a, Item b)
+bool ShouldSwap(Item a, Item b, float keySign)
 {
 	//return (a ^ NullItem) < (b ^ NullItem);
 
 	//return (a.Key) < (b.Key);
-	return (a.Key * KeySign) < (b.Key * KeySign);
+	return (a.Key * keySign) < (b.Key * keySign);
 	//return asfloat(a) < asfloat(b);
-	//return (asfloat(a) * KeySign) < (asfloat(b) * KeySign);
+	//return (asfloat(a) * keySign) < (asfloat(b) * keySign);
 }
 
 #ifdef _CS_IndirectArgs
@@ -136,6 +136,7 @@ void CS_PreSort(uint3 groupID : SV_GroupID, uint groupIndex : SV_GroupIndex)
 
 	GroupMemoryBarrierWithGroupSync();
 
+    float keySign = KeySign;
 	UNROLL
 	for (uint k = 2; k <= 2048; k <<= 1)
 	{
@@ -144,14 +145,14 @@ void CS_PreSort(uint3 groupID : SV_GroupID, uint groupIndex : SV_GroupIndex)
 			uint index2 = InsertOneBit(groupIndex, j);
 			uint index1 = index2 ^ (k == 2 * j ? k - 1 : j);
 
-			Item A = SortData[index1];
-			Item B = SortData[index2];
+			Item a = SortData[index1];
+			Item b = SortData[index2];
 
-			if (ShouldSwap(A, B))
+			if (ShouldSwap(a, b, keySign))
 			{
 				// Swap the items
-				SortData[index1] = B;
-				SortData[index2] = A;
+				SortData[index1] = b;
+				SortData[index2] = a;
 			}
 
 			GroupMemoryBarrierWithGroupSync();
@@ -182,20 +183,21 @@ void CS_InnerSort(uint3 groupID : SV_GroupID, uint groupIndex : SV_GroupIndex)
 
 	GroupMemoryBarrierWithGroupSync();
 
+    float keySign = KeySign;
 	UNROLL
 	for (uint j = 1024; j > 0; j /= 2)
 	{
 		uint index2 = InsertOneBit(groupIndex, j);
 		uint index1 = index2 ^ j;
 
-		Item A = SortData[index1];
-		Item B = SortData[index2];
+		Item a = SortData[index1];
+		Item b = SortData[index2];
 
-		if (ShouldSwap(A, B))
+		if (ShouldSwap(a, b, keySign))
 		{
 			// Swap the items
-			SortData[index1] = B;
-			SortData[index2] = A;
+			SortData[index1] = b;
+			SortData[index2] = a;
 		}
 
 		GroupMemoryBarrierWithGroupSync();
@@ -224,14 +226,15 @@ void CS_OuterSort(uint3 dispatchThreadId : SV_DispatchThreadID)
 	if (index2 >= count)
 		return;
 
-	Item A = SortBuffer[index1];
-	Item B = SortBuffer[index2];
+	Item a = SortBuffer[index1];
+	Item b = SortBuffer[index2];
 
-	if (ShouldSwap(A, B))
+    float keySign = KeySign;
+	if (ShouldSwap(a, b, keySign))
 	{
 		// Swap the items
-		SortBuffer[index1] = B;
-		SortBuffer[index2] = A;
+		SortBuffer[index1] = b;
+		SortBuffer[index2] = a;
 	}
 }
 
@@ -248,12 +251,10 @@ void CS_CopyIndices(uint3 dispatchThreadId : SV_DispatchThreadID)
 {
 	const uint count = CounterBuffer.Load(CounterOffset);
 	uint index = dispatchThreadId.x;
-
 	if (index >= count)
 		return;
 
 	Item element = SortBuffer[index];
-
 	SortedIndices[index] = element.Value;
 }