From 90d1e63b58afe32a5515b3506eb218d0f2fa5edc Mon Sep 17 00:00:00 2001 From: Wojtek Figat Date: Fri, 8 Aug 2025 13:11:05 +0200 Subject: [PATCH] Add minor optimizations to particles drawing --- Content/Shaders/BitonicSort.flax | 4 +- Source/Engine/Particles/Particles.cpp | 4 +- Source/Engine/Renderer/Utils/BitonicSort.cpp | 19 +++------ Source/Engine/Renderer/Utils/BitonicSort.h | 3 +- Source/Shaders/BitonicSort.shader | 41 ++++++++++---------- 5 files changed, 33 insertions(+), 38 deletions(-) diff --git a/Content/Shaders/BitonicSort.flax b/Content/Shaders/BitonicSort.flax index ee7db3c74..fa9adc1ef 100644 --- a/Content/Shaders/BitonicSort.flax +++ b/Content/Shaders/BitonicSort.flax @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:924884da1dfef7a802b7190fd148eebbeece50d6fa4d69295c38238dd96331e6 -size 6538 +oid sha256:db9ca2435baf7cba079e22af86feca8397723688107fd4abd4f11466a445791e +size 6669 diff --git a/Source/Engine/Particles/Particles.cpp b/Source/Engine/Particles/Particles.cpp index 47f172636..5ea195a00 100644 --- a/Source/Engine/Particles/Particles.cpp +++ b/Source/Engine/Particles/Particles.cpp @@ -845,6 +845,7 @@ void DrawEmittersGPU(RenderContextBatch& renderContextBatch) if (sorting) { PROFILE_GPU_CPU_NAMED("Sort Particles"); + context->BindCB(0, GPUParticlesSortingCB); // Generate sort keys for each particle for (const GPUEmitterDraw& draw : GPUEmitterDraws) @@ -917,7 +918,6 @@ void DrawEmittersGPU(RenderContextBatch& renderContextBatch) #endif } context->UpdateCB(GPUParticlesSortingCB, &data); - context->BindCB(0, GPUParticlesSortingCB); context->BindSR(0, draw.Buffer->GPU.Buffer->View()); context->BindUA(0, draw.Buffer->GPU.SortingKeysBuffer->View()); const int32 threadGroupSize = 1024; @@ -939,7 +939,7 @@ void DrawEmittersGPU(RenderContextBatch& renderContextBatch) auto module = emitter->Graph.SortModules[moduleIndex]; const auto sortMode = (ParticleSortMode)module->Values[2].AsInt; bool sortAscending = sortMode == 
ParticleSortMode::CustomAscending; - BitonicSort::Instance()->Sort(context, draw.Buffer->GPU.SortingKeysBuffer, draw.Buffer->GPU.Buffer, draw.Buffer->GPU.ParticleCounterOffset, sortAscending, draw.Buffer->GPU.SortedIndices); + BitonicSort::Instance()->Sort(context, draw.Buffer->GPU.SortingKeysBuffer, draw.Buffer->GPU.Buffer, draw.Buffer->GPU.ParticleCounterOffset, sortAscending, draw.Buffer->GPU.SortedIndices, draw.Buffer->GPU.ParticlesCountMax); // TODO: split sorted keys copy with another loop to give time for UAV transition // TODO: use args buffer from GPUIndirectArgsBuffer instead of internal from BitonicSort to get rid of UAV barrier } diff --git a/Source/Engine/Renderer/Utils/BitonicSort.cpp b/Source/Engine/Renderer/Utils/BitonicSort.cpp index 0834588ba..93f8dc97d 100644 --- a/Source/Engine/Renderer/Utils/BitonicSort.cpp +++ b/Source/Engine/Renderer/Utils/BitonicSort.cpp @@ -6,15 +6,9 @@ #include "Engine/Graphics/GPUContext.h" #include "Engine/Graphics/GPULimits.h" -// The sorting keys buffer item structure template. Matches the shader type. -struct Item -{ - float Key; - uint32 Value; -}; - GPU_CB_STRUCT(Data { - Item NullItem; + float NullItemKey; + uint32 NullItemValue; uint32 CounterOffset; uint32 MaxIterations; uint32 LoopK; @@ -86,22 +80,22 @@ void BitonicSort::Dispose() _shader = nullptr; } -void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuffer* countBuffer, uint32 counterOffset, bool sortAscending, GPUBuffer* sortedIndicesBuffer) +void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuffer* countBuffer, uint32 counterOffset, bool sortAscending, GPUBuffer* sortedIndicesBuffer, uint32 maxElements) { ASSERT(context && sortingKeysBuffer && countBuffer); if (checkIfSkipPass()) return; PROFILE_GPU_CPU("Bitonic Sort"); const uint32 elementSizeBytes = sizeof(uint64); - const uint32 maxNumElements = sortingKeysBuffer->GetSize() / elementSizeBytes; + const uint32 maxNumElements = maxElements != 0 ? 
maxElements : sortingKeysBuffer->GetSize() / elementSizeBytes; const uint32 alignedMaxNumElements = Math::RoundUpToPowerOf2(maxNumElements); const uint32 maxIterations = (uint32)Math::Log2((float)Math::Max(2048u, alignedMaxNumElements)) - 10; // Setup constants buffer Data data; data.CounterOffset = counterOffset; - data.NullItem.Key = sortAscending ? MAX_float : -MAX_float; - data.NullItem.Value = 0; + data.NullItemKey = sortAscending ? MAX_float : -MAX_float; + data.NullItemValue = 0; data.KeySign = sortAscending ? -1.0f : 1.0f; data.MaxIterations = maxIterations; data.LoopK = 0; @@ -128,7 +122,6 @@ void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuf data.LoopK = k; data.LoopJ = j; context->UpdateCB(_cb, &data); - context->BindCB(0, _cb); context->DispatchIndirect(_outerSortCS, _dispatchArgsBuffer, indirectArgsOffset); indirectArgsOffset += sizeof(GPUDispatchIndirectArgs); diff --git a/Source/Engine/Renderer/Utils/BitonicSort.h b/Source/Engine/Renderer/Utils/BitonicSort.h index 289905b09..99069e182 100644 --- a/Source/Engine/Renderer/Utils/BitonicSort.h +++ b/Source/Engine/Renderer/Utils/BitonicSort.h @@ -34,7 +34,8 @@ public: /// The offset into counter buffer to find count for this list. Must be a multiple of 4 bytes. /// True to sort in ascending order (smallest to largest), otherwise false to sort in descending order. /// The output buffer for sorted values extracted from the sorted sortingKeysBuffer after algorithm run. Valid for uint value types - used as RWBuffer. - void Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuffer* countBuffer, uint32 counterOffset, bool sortAscending, GPUBuffer* sortedIndicesBuffer); + /// Optional upper limit of elements to sort. Can be used to optimize indirect dispatches allocation. If zero, then it gets calculated based on the input item buffer size. 
+ void Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuffer* countBuffer, uint32 counterOffset, bool sortAscending, GPUBuffer* sortedIndicesBuffer, uint32 maxElements = 0); public: diff --git a/Source/Shaders/BitonicSort.shader b/Source/Shaders/BitonicSort.shader index c4a275862..9e49a8185 100644 --- a/Source/Shaders/BitonicSort.shader +++ b/Source/Shaders/BitonicSort.shader @@ -36,14 +36,14 @@ uint InsertOneBit(uint value, uint oneBitMask) // (effectively a negation) or leave the value alone. When the KeySign is // 1, we are sorting descending, so when A < B, they should swap. For an // ascending sort, -A < -B should swap. -bool ShouldSwap(Item a, Item b) +bool ShouldSwap(Item a, Item b, float keySign) { //return (a ^ NullItem) < (b ^ NullItem); //return (a.Key) < (b.Key); - return (a.Key * KeySign) < (b.Key * KeySign); + return (a.Key * keySign) < (b.Key * keySign); //return asfloat(a) < asfloat(b); - //return (asfloat(a) * KeySign) < (asfloat(b) * KeySign); + //return (asfloat(a) * keySign) < (asfloat(b) * keySign); } #ifdef _CS_IndirectArgs @@ -136,6 +136,7 @@ void CS_PreSort(uint3 groupID : SV_GroupID, uint groupIndex : SV_GroupIndex) GroupMemoryBarrierWithGroupSync(); + float keySign = KeySign; UNROLL for (uint k = 2; k <= 2048; k <<= 1) { @@ -144,14 +145,14 @@ void CS_PreSort(uint3 groupID : SV_GroupID, uint groupIndex : SV_GroupIndex) uint index2 = InsertOneBit(groupIndex, j); uint index1 = index2 ^ (k == 2 * j ? 
k - 1 : j); - Item A = SortData[index1]; - Item B = SortData[index2]; + Item a = SortData[index1]; + Item b = SortData[index2]; - if (ShouldSwap(A, B)) + if (ShouldSwap(a, b, keySign)) { // Swap the items - SortData[index1] = B; - SortData[index2] = A; + SortData[index1] = b; + SortData[index2] = a; } GroupMemoryBarrierWithGroupSync(); @@ -182,20 +183,21 @@ void CS_InnerSort(uint3 groupID : SV_GroupID, uint groupIndex : SV_GroupIndex) GroupMemoryBarrierWithGroupSync(); + float keySign = KeySign; UNROLL for (uint j = 1024; j > 0; j /= 2) { uint index2 = InsertOneBit(groupIndex, j); uint index1 = index2 ^ j; - Item A = SortData[index1]; - Item B = SortData[index2]; + Item a = SortData[index1]; + Item b = SortData[index2]; - if (ShouldSwap(A, B)) + if (ShouldSwap(a, b, keySign)) { // Swap the items - SortData[index1] = B; - SortData[index2] = A; + SortData[index1] = b; + SortData[index2] = a; } GroupMemoryBarrierWithGroupSync(); @@ -224,14 +226,15 @@ void CS_OuterSort(uint3 dispatchThreadId : SV_DispatchThreadID) if (index2 >= count) return; - Item A = SortBuffer[index1]; - Item B = SortBuffer[index2]; + Item a = SortBuffer[index1]; + Item b = SortBuffer[index2]; - if (ShouldSwap(A, B)) + float keySign = KeySign; + if (ShouldSwap(a, b, keySign)) { // Swap the items - SortBuffer[index1] = B; - SortBuffer[index2] = A; + SortBuffer[index1] = b; + SortBuffer[index2] = a; } } @@ -248,12 +251,10 @@ void CS_CopyIndices(uint3 dispatchThreadId : SV_DispatchThreadID) { const uint count = CounterBuffer.Load(CounterOffset); uint index = dispatchThreadId.x; - if (index >= count) return; Item element = SortBuffer[index]; - SortedIndices[index] = element.Value; }