diff --git a/Content/Shaders/BitonicSort.flax b/Content/Shaders/BitonicSort.flax index fa9adc1ef..69d773379 100644 --- a/Content/Shaders/BitonicSort.flax +++ b/Content/Shaders/BitonicSort.flax @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:db9ca2435baf7cba079e22af86feca8397723688107fd4abd4f11466a445791e -size 6669 +oid sha256:07d45b7f2085a28938e3bef090e259c0698a1987f9cd69df952168524ce07193 +size 6877 diff --git a/Source/Engine/Particles/Particles.cpp b/Source/Engine/Particles/Particles.cpp index 9236efd9d..2423feed2 100644 --- a/Source/Engine/Particles/Particles.cpp +++ b/Source/Engine/Particles/Particles.cpp @@ -940,8 +940,7 @@ void DrawEmittersGPU(RenderContextBatch& renderContextBatch) const auto sortMode = (ParticleSortMode)module->Values[2].AsInt; bool sortAscending = sortMode == ParticleSortMode::CustomAscending; BitonicSort::Instance()->Sort(context, draw.Buffer->GPU.SortingKeysBuffer, draw.Buffer->GPU.Buffer, draw.Buffer->GPU.ParticleCounterOffset, sortAscending, draw.Buffer->GPU.SortedIndices, draw.Buffer->GPU.ParticlesCountMax); - // TODO: split sorted keys copy with another loop to give time for UAV transition - // TODO: use args buffer from GPUIndirectArgsBuffer instead of internal from BitonicSort to get rid of UAV barrier + // TODO: use args buffer from GPUIndirectArgsBuffer instead of internal from BitonicSort to get rid of UAV barrier (run all sorting in parallel) } } } diff --git a/Source/Engine/Renderer/Utils/BitonicSort.cpp b/Source/Engine/Renderer/Utils/BitonicSort.cpp index 93f8dc97d..ed7ece05b 100644 --- a/Source/Engine/Renderer/Utils/BitonicSort.cpp +++ b/Source/Engine/Renderer/Utils/BitonicSort.cpp @@ -56,7 +56,7 @@ bool BitonicSort::setupResources() // Cache compute shaders _indirectArgsCS = shader->GetCS("CS_IndirectArgs"); - _preSortCS = shader->GetCS("CS_PreSort"); + _preSortCS.Get(shader, "CS_PreSort"); _innerSortCS = shader->GetCS("CS_InnerSort"); _outerSortCS = shader->GetCS("CS_OuterSort"); _copyIndicesCS = shader->GetCS("CS_CopyIndices"); @@ -73,7 +73,7 @@ void BitonicSort::Dispose() SAFE_DELETE_GPU_RESOURCE(_dispatchArgsBuffer); _cb = nullptr; _indirectArgsCS = nullptr; - _preSortCS = nullptr; + _preSortCS.Clear(); _innerSortCS = nullptr; _outerSortCS = nullptr; _copyIndicesCS = nullptr; @@ -86,8 +86,9 @@ void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuf if (checkIfSkipPass()) return; PROFILE_GPU_CPU("Bitonic Sort"); - const uint32 elementSizeBytes = sizeof(uint64); - const uint32 maxNumElements = maxElements != 0 ? maxElements : sortingKeysBuffer->GetSize() / elementSizeBytes; + uint32 maxNumElements = sortingKeysBuffer->GetSize() / sizeof(uint64); + if (maxElements > 0 && maxElements < maxNumElements) + maxNumElements = maxElements; const uint32 alignedMaxNumElements = Math::RoundUpToPowerOf2(maxNumElements); const uint32 maxIterations = (uint32)Math::Log2((float)Math::Max(2048u, alignedMaxNumElements)) - 10; @@ -102,33 +103,44 @@ void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuf data.LoopJ = 0; context->UpdateCB(_cb, &data); context->BindCB(0, _cb); - - // Generate execute indirect arguments context->BindSR(0, countBuffer->View()); - context->BindUA(0, _dispatchArgsBuffer->View()); - context->Dispatch(_indirectArgsCS, 1, 1, 1); - // Pre-Sort the buffer up to k = 2048 (this also pads the list with invalid indices that will drift to the end of the sorted list) - context->BindUA(0, sortingKeysBuffer->View()); - context->DispatchIndirect(_preSortCS, _dispatchArgsBuffer, 0); - - // We have already pre-sorted up through k = 2048 when first writing our list, so we continue sorting with k = 4096 - // For really large values of k, these indirect dispatches will be skipped over with thread counts of 0 - uint32 indirectArgsOffset = sizeof(GPUDispatchIndirectArgs); - for (uint32 k = 4096; k <= alignedMaxNumElements; k *= 2) + // If item count is small we can do only presorting within a single dispatch thread group + if (maxNumElements <= 2048) { - for (uint32 j = k / 2; j >= 2048; j /= 2) - { - data.LoopK = k; - data.LoopJ = j; - context->UpdateCB(_cb, &data); + // Use pre-sort with smaller thread group size (eg. for small particle emitters sorting) + const int32 permutation = maxNumElements < 128 ? 1 : 0; + context->BindUA(0, sortingKeysBuffer->View()); + context->Dispatch(_preSortCS.Get(permutation), 1, 1, 1); + } + else + { + // Generate execute indirect arguments + context->BindUA(0, _dispatchArgsBuffer->View()); + context->Dispatch(_indirectArgsCS, 1, 1, 1); - context->DispatchIndirect(_outerSortCS, _dispatchArgsBuffer, indirectArgsOffset); + // Pre-Sort the buffer up to k = 2048 (this also pads the list with invalid indices that will drift to the end of the sorted list) + context->BindUA(0, sortingKeysBuffer->View()); + context->DispatchIndirect(_preSortCS.Get(0), _dispatchArgsBuffer, 0); + + // We have already pre-sorted up through k = 2048 when first writing our list, so we continue sorting with k = 4096 + // For really large values of k, these indirect dispatches will be skipped over with thread counts of 0 + uint32 indirectArgsOffset = sizeof(GPUDispatchIndirectArgs); + for (uint32 k = 4096; k <= alignedMaxNumElements; k *= 2) + { + for (uint32 j = k / 2; j >= 2048; j /= 2) + { + data.LoopK = k; + data.LoopJ = j; + context->UpdateCB(_cb, &data); + + context->DispatchIndirect(_outerSortCS, _dispatchArgsBuffer, indirectArgsOffset); + indirectArgsOffset += sizeof(GPUDispatchIndirectArgs); + } + + context->DispatchIndirect(_innerSortCS, _dispatchArgsBuffer, indirectArgsOffset); indirectArgsOffset += sizeof(GPUDispatchIndirectArgs); } - - context->DispatchIndirect(_innerSortCS, _dispatchArgsBuffer, indirectArgsOffset); - indirectArgsOffset += sizeof(GPUDispatchIndirectArgs); } context->ResetUA(); diff --git a/Source/Engine/Renderer/Utils/BitonicSort.h b/Source/Engine/Renderer/Utils/BitonicSort.h index 99069e182..4280d5965 100644 --- a/Source/Engine/Renderer/Utils/BitonicSort.h +++ b/Source/Engine/Renderer/Utils/BitonicSort.h @@ -18,7 +18,7 @@ private: GPUBuffer* _dispatchArgsBuffer = nullptr; GPUConstantBuffer* _cb; GPUShaderProgramCS* _indirectArgsCS; - GPUShaderProgramCS* _preSortCS; + ComputeShaderPermutation<2> _preSortCS; GPUShaderProgramCS* _innerSortCS; GPUShaderProgramCS* _outerSortCS; GPUShaderProgramCS* _copyIndicesCS; @@ -46,7 +46,7 @@ public: #if COMPILE_WITH_DEV_ENV void OnShaderReloading(Asset* obj) { - _preSortCS = nullptr; + _preSortCS.Clear(); _innerSortCS = nullptr; _outerSortCS = nullptr; invalidateResources(); diff --git a/Source/Shaders/BitonicSort.shader b/Source/Shaders/BitonicSort.shader index 9e49a8185..0f9d5e656 100644 --- a/Source/Shaders/BitonicSort.shader +++ b/Source/Shaders/BitonicSort.shader @@ -3,6 +3,10 @@ #include "./Flax/Common.hlsl" #include "./Flax/Math.hlsl" +#ifndef THREAD_GROUP_SIZE +#define THREAD_GROUP_SIZE 1024 +#endif + struct Item { float Key; @@ -36,14 +40,14 @@ uint InsertOneBit(uint value, uint oneBitMask) // (effectively a negation) or leave the value alone. When the KeySign is // 1, we are sorting descending, so when A < B, they should swap. For an // ascending sort, -A < -B should swap. -bool ShouldSwap(Item a, Item b, float keySign) +bool ShouldSwap(Item a, Item b) { //return (a ^ NullItem) < (b ^ NullItem); //return (a.Key) < (b.Key); - return (a.Key * keySign) < (b.Key * keySign); + return (a.Key * KeySign) < (b.Key * KeySign); //return asfloat(a) < asfloat(b); - //return (asfloat(a) * keySign) < (asfloat(b) * keySign); + //return (asfloat(a) * KeySign) < (asfloat(b) * KeySign); } #ifdef _CS_IndirectArgs @@ -91,7 +95,7 @@ void CS_IndirectArgs(uint groupIndex : SV_GroupIndex) RWStructuredBuffer SortBuffer : register(u0); -groupshared Item SortData[2048]; +groupshared Item SortData[THREAD_GROUP_SIZE * 2]; void LoadItem(uint element, uint count) { @@ -106,7 +110,7 @@ void LoadItem(uint element, uint count) item.Key = NullItemKey; item.Value = NullItemValue; } - SortData[element & 2047] = item; + SortData[element & (THREAD_GROUP_SIZE * 2 - 1)] = item; } void StoreItem(uint element, uint count) @@ -122,23 +126,24 @@ void StoreItem(uint element, uint count) #ifdef _CS_PreSort META_CS(true, FEATURE_LEVEL_SM5) -[numthreads(1024, 1, 1)] +META_PERMUTATION_1(THREAD_GROUP_SIZE=1024) +META_PERMUTATION_1(THREAD_GROUP_SIZE=64) +[numthreads(THREAD_GROUP_SIZE, 1, 1)] void CS_PreSort(uint3 groupID : SV_GroupID, uint groupIndex : SV_GroupIndex) { // Item index of the start of this group - const uint groupStart = groupID.x * 2048; + const uint groupStart = groupID.x * (THREAD_GROUP_SIZE * 2); // Actual number of items that need sorting const uint count = CounterBuffer.Load(CounterOffset); LoadItem(groupStart + groupIndex, count); - LoadItem(groupStart + groupIndex + 1024, count); + LoadItem(groupStart + groupIndex + THREAD_GROUP_SIZE, count); GroupMemoryBarrierWithGroupSync(); - float keySign = KeySign; UNROLL - for (uint k = 2; k <= 2048; k <<= 1) + for (uint k = 2; k <= THREAD_GROUP_SIZE * 2; k <<= 1) { for (uint j = k / 2; j > 0; j /= 2) { @@ -148,7 +153,7 @@ void CS_PreSort(uint3 groupID : SV_GroupID, uint groupIndex : SV_GroupIndex) Item a = SortData[index1]; Item b = SortData[index2]; - if (ShouldSwap(a, b, keySign)) + if (ShouldSwap(a, b)) { // Swap the items SortData[index1] = b; @@ -161,7 +166,7 @@ void CS_PreSort(uint3 groupID : SV_GroupID, uint groupIndex : SV_GroupIndex) // Write sorted results to memory StoreItem(groupStart + groupIndex, count); - StoreItem(groupStart + groupIndex + 1024, count); + StoreItem(groupStart + groupIndex + THREAD_GROUP_SIZE, count); } #endif @@ -169,23 +174,22 @@ void CS_PreSort(uint3 groupID : SV_GroupID, uint groupIndex : SV_GroupIndex) #ifdef _CS_InnerSort META_CS(true, FEATURE_LEVEL_SM5) -[numthreads(1024, 1, 1)] +[numthreads(THREAD_GROUP_SIZE, 1, 1)] void CS_InnerSort(uint3 groupID : SV_GroupID, uint groupIndex : SV_GroupIndex) { const uint count = CounterBuffer.Load(CounterOffset); // Item index of the start of this group - const uint groupStart = groupID.x * 2048; + const uint groupStart = groupID.x * (THREAD_GROUP_SIZE * 2); // Load from memory into LDS to prepare sort LoadItem(groupStart + groupIndex, count); - LoadItem(groupStart + groupIndex + 1024, count); + LoadItem(groupStart + groupIndex + THREAD_GROUP_SIZE, count); GroupMemoryBarrierWithGroupSync(); - float keySign = KeySign; UNROLL - for (uint j = 1024; j > 0; j /= 2) + for (uint j = THREAD_GROUP_SIZE; j > 0; j /= 2) { uint index2 = InsertOneBit(groupIndex, j); uint index1 = index2 ^ j; @@ -193,7 +197,7 @@ void CS_InnerSort(uint3 groupID : SV_GroupID, uint groupIndex : SV_GroupIndex) Item a = SortData[index1]; Item b = SortData[index2]; - if (ShouldSwap(a, b, keySign)) + if (ShouldSwap(a, b)) { // Swap the items SortData[index1] = b; @@ -204,7 +208,7 @@ void CS_InnerSort(uint3 groupID : SV_GroupID, uint groupIndex : SV_GroupIndex) } StoreItem(groupStart + groupIndex, count); - StoreItem(groupStart + groupIndex + 1024, count); + StoreItem(groupStart + groupIndex + THREAD_GROUP_SIZE, count); } #endif @@ -229,8 +233,7 @@ void CS_OuterSort(uint3 dispatchThreadId : SV_DispatchThreadID) Item a = SortBuffer[index1]; Item b = SortBuffer[index2]; - float keySign = KeySign; - if (ShouldSwap(a, b, keySign)) + if (ShouldSwap(a, b)) { // Swap the items SortBuffer[index1] = b;