From 854f3acd4c80ce5c6cb0f92bb661ea62bb1de899 Mon Sep 17 00:00:00 2001 From: Wojtek Figat Date: Fri, 8 Aug 2025 18:24:44 +0200 Subject: [PATCH] Optimize GPU particles Bitonic sort to use separate buffers for indices and keys to avoid additional buffer copy --- Content/Shaders/BitonicSort.flax | 4 +- Content/Shaders/GPUParticlesSorting.flax | 4 +- Source/Engine/Particles/Particles.cpp | 128 +++++++++---------- Source/Engine/Particles/ParticlesData.cpp | 16 +-- Source/Engine/Particles/ParticlesData.h | 2 +- Source/Engine/Renderer/Utils/BitonicSort.cpp | 43 ++----- Source/Engine/Renderer/Utils/BitonicSort.h | 8 +- Source/Shaders/BitonicSort.shader | 63 ++++----- Source/Shaders/GPUParticlesSorting.shader | 16 +-- 9 files changed, 115 insertions(+), 169 deletions(-) diff --git a/Content/Shaders/BitonicSort.flax b/Content/Shaders/BitonicSort.flax index 69d773379..4d388b3fc 100644 --- a/Content/Shaders/BitonicSort.flax +++ b/Content/Shaders/BitonicSort.flax @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:07d45b7f2085a28938e3bef090e259c0698a1987f9cd69df952168524ce07193 -size 6877 +oid sha256:190867e40ef793168988f358edddeb92819cc4f972f4cf9ac34cc764a06eb6e3 +size 6824 diff --git a/Content/Shaders/GPUParticlesSorting.flax b/Content/Shaders/GPUParticlesSorting.flax index 2045fd649..35cebf7b6 100644 --- a/Content/Shaders/GPUParticlesSorting.flax +++ b/Content/Shaders/GPUParticlesSorting.flax @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a16a973f4be075f8531a1b1551e33423b014da1e8b348f2672464ee21692e57a -size 2556 +oid sha256:80ed5f51cd982ea521e3588708db54d79b905ee41e88cfd41eff976b9b50514a +size 2518 diff --git a/Source/Engine/Particles/Particles.cpp b/Source/Engine/Particles/Particles.cpp index 2423feed2..dc2d9054d 100644 --- a/Source/Engine/Particles/Particles.cpp +++ b/Source/Engine/Particles/Particles.cpp @@ -848,81 +848,68 @@ void DrawEmittersGPU(RenderContextBatch& renderContextBatch) context->BindCB(0, GPUParticlesSortingCB); // Generate sort keys for each particle - for (const GPUEmitterDraw& draw : GPUEmitterDraws) { - if (!draw.Sorting) - continue; - ASSERT(draw.Buffer->GPU.SortingKeysBuffer); - - // Generate sort keys for particles - ParticleEmitter* emitter = draw.Buffer->Emitter; - for (int32 moduleIndex = 0; moduleIndex < emitter->Graph.SortModules.Count(); moduleIndex++) + PROFILE_GPU("Gen Sort Keys"); + for (const GPUEmitterDraw& draw : GPUEmitterDraws) { - auto module = emitter->Graph.SortModules[moduleIndex]; - const auto sortMode = (ParticleSortMode)module->Values[2].AsInt; - - // Generate sorting keys based on sorting mode - GPUParticlesSortingData data; - data.ParticleCounterOffset = draw.Buffer->GPU.ParticleCounterOffset; - data.ParticleStride = draw.Buffer->Stride; - data.ParticleCapacity = draw.Buffer->Capacity; - int32 permutationIndex; - switch (sortMode) + if (!draw.Sorting) + continue; + ASSERT(draw.Buffer->GPU.SortingKeys); + ParticleEmitter* emitter = draw.Buffer->Emitter; + for (int32 moduleIndex = 0; moduleIndex < emitter->Graph.SortModules.Count(); moduleIndex++) { - case ParticleSortMode::ViewDepth: - { - permutationIndex = 0; - data.PositionOffset = emitter->Graph.GetPositionAttributeOffset(); - const Matrix viewProjection = renderContextBatch.GetMainContext().View.ViewProjection(); - if (emitter->SimulationSpace == ParticlesSimulationSpace::Local) + auto module = emitter->Graph.SortModules[moduleIndex]; + // TODO: add support for module->SortedIndicesOffset (multiple sort modules) + const auto sortMode = (ParticleSortMode)module->Values[2].AsInt; + GPUParticlesSortingData data; + data.ParticleCounterOffset = draw.Buffer->GPU.ParticleCounterOffset; + data.ParticleStride = draw.Buffer->Stride; + data.ParticleCapacity = draw.Buffer->Capacity; + int32 permutationIndex; + switch (sortMode) { - Matrix matrix; - Matrix::Multiply(draw.DrawCall.World, viewProjection, matrix); - Matrix::Transpose(matrix, data.PositionTransform); - } - else + case ParticleSortMode::ViewDepth: { - Matrix::Transpose(viewProjection, data.PositionTransform); - } - break; - } - case ParticleSortMode::ViewDistance: - { - permutationIndex = 1; - data.PositionOffset = emitter->Graph.GetPositionAttributeOffset(); - data.ViewPosition = renderContextBatch.GetMainContext().View.Position; - if (emitter->SimulationSpace == ParticlesSimulationSpace::Local) - { - Matrix::Transpose(draw.DrawCall.World, data.PositionTransform); - } - else - { - Matrix::Transpose(Matrix::Identity, data.PositionTransform); - } - break; - } - case ParticleSortMode::CustomAscending: - case ParticleSortMode::CustomDescending: - { - permutationIndex = 2; - int32 attributeIdx = module->Attributes[0]; - if (attributeIdx == -1) + permutationIndex = 0; + data.PositionOffset = emitter->Graph.GetPositionAttributeOffset(); + const Matrix viewProjection = renderContextBatch.GetMainContext().View.ViewProjection(); + if (emitter->SimulationSpace == ParticlesSimulationSpace::Local) + Matrix::Transpose(draw.DrawCall.World * viewProjection, data.PositionTransform); + else + Matrix::Transpose(viewProjection, data.PositionTransform); break; - data.CustomOffset = emitter->Graph.Layout.Attributes[attributeIdx].Offset; - break; + } + case ParticleSortMode::ViewDistance: + { + permutationIndex = 1; + data.PositionOffset = emitter->Graph.GetPositionAttributeOffset(); + data.ViewPosition = renderContextBatch.GetMainContext().View.Position; + if (emitter->SimulationSpace == ParticlesSimulationSpace::Local) + Matrix::Transpose(draw.DrawCall.World, data.PositionTransform); + else + Matrix::Transpose(Matrix::Identity, data.PositionTransform); + break; + } + case ParticleSortMode::CustomAscending: + case ParticleSortMode::CustomDescending: + { + permutationIndex = 2; + int32 attributeIdx = module->Attributes[0]; + if (attributeIdx == -1) + break; + data.CustomOffset = emitter->Graph.Layout.Attributes[attributeIdx].Offset; + break; + } + } + context->UpdateCB(GPUParticlesSortingCB, &data); + context->BindSR(0, draw.Buffer->GPU.Buffer->View()); + context->BindUA(0, draw.Buffer->GPU.SortedIndices->View()); + context->BindUA(1, draw.Buffer->GPU.SortingKeys->View()); + const int32 threadGroupSize = 1024; + context->Dispatch(GPUParticlesSortingCS[permutationIndex], Math::DivideAndRoundUp(draw.Buffer->GPU.ParticlesCountMax, threadGroupSize), 1, 1); } -#if !BUILD_RELEASE - default: - CRASH; - return; -#endif - } - context->UpdateCB(GPUParticlesSortingCB, &data); - context->BindSR(0, draw.Buffer->GPU.Buffer->View()); - context->BindUA(0, draw.Buffer->GPU.SortingKeysBuffer->View()); - const int32 threadGroupSize = 1024; - context->Dispatch(GPUParticlesSortingCS[permutationIndex], Math::DivideAndRoundUp(draw.Buffer->GPU.ParticlesCountMax, threadGroupSize), 1, 1); } + context->ResetUA(); } // Run sorting @@ -930,17 +917,18 @@ void DrawEmittersGPU(RenderContextBatch& renderContextBatch) { if (!draw.Sorting) continue; - ASSERT(draw.Buffer->GPU.SortingKeysBuffer); // Execute all sorting modules ParticleEmitter* emitter = draw.Buffer->Emitter; for (int32 moduleIndex = 0; moduleIndex < emitter->Graph.SortModules.Count(); moduleIndex++) { auto module = emitter->Graph.SortModules[moduleIndex]; + // TODO: add support for module->SortedIndicesOffset (multiple sort modules) const auto sortMode = (ParticleSortMode)module->Values[2].AsInt; bool sortAscending = sortMode == ParticleSortMode::CustomAscending; - BitonicSort::Instance()->Sort(context, draw.Buffer->GPU.SortingKeysBuffer, draw.Buffer->GPU.Buffer, draw.Buffer->GPU.ParticleCounterOffset, sortAscending, draw.Buffer->GPU.SortedIndices, draw.Buffer->GPU.ParticlesCountMax); - // TODO: use args buffer from GPUIndirectArgsBuffer instead of internal from BitonicSort to get rid of UAV barrier (run all sorting in parallel) + BitonicSort::Instance()->Sort(context, draw.Buffer->GPU.SortedIndices, draw.Buffer->GPU.SortingKeys, draw.Buffer->GPU.Buffer, draw.Buffer->GPU.ParticleCounterOffset, sortAscending, draw.Buffer->GPU.ParticlesCountMax); + // TODO: use args buffer from GPUIndirectArgsBuffer instead of internal from BitonicSort to get rid of UAV barrier (all sorting in parallel) + // TODO: run small emitters sorting (less than 2k particles) sorting in separate loop as pass without UAV barriers (all sorting in parallel) } } } diff --git a/Source/Engine/Particles/ParticlesData.cpp b/Source/Engine/Particles/ParticlesData.cpp index 074cc73d6..226d06d3d 100644 --- a/Source/Engine/Particles/ParticlesData.cpp +++ b/Source/Engine/Particles/ParticlesData.cpp @@ -98,7 +98,7 @@ ParticleBuffer::~ParticleBuffer() { SAFE_DELETE_GPU_RESOURCE(GPU.Buffer); SAFE_DELETE_GPU_RESOURCE(GPU.BufferSecondary); - SAFE_DELETE_GPU_RESOURCE(GPU.SortingKeysBuffer); + SAFE_DELETE_GPU_RESOURCE(GPU.SortingKeys); SAFE_DELETE_GPU_RESOURCE(GPU.SortedIndices); SAFE_DELETE(GPU.RibbonIndexBufferDynamic); SAFE_DELETE(GPU.RibbonVertexBufferDynamic); @@ -161,7 +161,7 @@ bool ParticleBuffer::Init(ParticleEmitter* emitter) bool ParticleBuffer::AllocateSortBuffer() { - ASSERT(Emitter && GPU.SortedIndices == nullptr && GPU.SortingKeysBuffer == nullptr); + ASSERT(Emitter && GPU.SortedIndices == nullptr && GPU.SortingKeys == nullptr); if (Emitter->Graph.SortModules.IsEmpty()) return false; @@ -170,7 +170,7 @@ bool ParticleBuffer::AllocateSortBuffer() case ParticlesSimulationMode::CPU: { const int32 sortedIndicesSize = Capacity * sizeof(uint32) * Emitter->Graph.SortModules.Count(); - GPU.SortedIndices = GPUDevice::Instance->CreateBuffer(TEXT("SortedIndices")); + GPU.SortedIndices = GPUDevice::Instance->CreateBuffer(TEXT("ParticleSortedIndices")); if (GPU.SortedIndices->Init(GPUBufferDescription::Buffer(sortedIndicesSize, GPUBufferFlags::ShaderResource, PixelFormat::R32_UInt, nullptr, sizeof(uint32), GPUResourceUsage::Dynamic))) return true; break; @@ -178,12 +178,12 @@ bool ParticleBuffer::AllocateSortBuffer() #if COMPILE_WITH_GPU_PARTICLES case ParticlesSimulationMode::GPU: { - const int32 sortedIndicesSize = Capacity * sizeof(uint32) * Emitter->Graph.SortModules.Count(); - GPU.SortingKeysBuffer = GPUDevice::Instance->CreateBuffer(TEXT("ParticleSortingKeysBuffer")); - if (GPU.SortingKeysBuffer->Init(GPUBufferDescription::Structured(Capacity, sizeof(float) + sizeof(uint32), true))) + const int32 sortedIndicesCount = Capacity * Emitter->Graph.SortModules.Count(); + GPU.SortingKeys = GPUDevice::Instance->CreateBuffer(TEXT("ParticleSortingKeys")); + if (GPU.SortingKeys->Init(GPUBufferDescription::Buffer(sortedIndicesCount * sizeof(float), GPUBufferFlags::UnorderedAccess, PixelFormat::R32_Float, nullptr, sizeof(float)))) return true; - GPU.SortedIndices = GPUDevice::Instance->CreateBuffer(TEXT("SortedIndices")); - if (GPU.SortedIndices->Init(GPUBufferDescription::Buffer(sortedIndicesSize, GPUBufferFlags::ShaderResource | GPUBufferFlags::UnorderedAccess, PixelFormat::R32_UInt, nullptr, sizeof(uint32)))) + GPU.SortedIndices = GPUDevice::Instance->CreateBuffer(TEXT("ParticleSortedIndices")); + if (GPU.SortedIndices->Init(GPUBufferDescription::Buffer(sortedIndicesCount * sizeof(uint32), GPUBufferFlags::ShaderResource | GPUBufferFlags::UnorderedAccess, PixelFormat::R32_UInt, nullptr, sizeof(uint32)))) return true; break; } diff --git a/Source/Engine/Particles/ParticlesData.h b/Source/Engine/Particles/ParticlesData.h index 5a5ebcba4..521138f37 100644 --- a/Source/Engine/Particles/ParticlesData.h +++ b/Source/Engine/Particles/ParticlesData.h @@ -206,7 +206,7 @@ public: /// /// The GPU particles sorting buffer. Contains structure of particle index and the sorting key for every particle. Used to sort particles. /// - GPUBuffer* SortingKeysBuffer = nullptr; + GPUBuffer* SortingKeys = nullptr; /// /// The particles indices buffer (GPU side). diff --git a/Source/Engine/Renderer/Utils/BitonicSort.cpp b/Source/Engine/Renderer/Utils/BitonicSort.cpp index ed7ece05b..cd0f627f5 100644 --- a/Source/Engine/Renderer/Utils/BitonicSort.cpp +++ b/Source/Engine/Renderer/Utils/BitonicSort.cpp @@ -8,7 +8,7 @@ GPU_CB_STRUCT(Data { float NullItemKey; - uint32 NullItemValue; + uint32 NullItemIndex; uint32 CounterOffset; uint32 MaxIterations; uint32 LoopK; @@ -47,7 +47,6 @@ bool BitonicSort::Init() bool BitonicSort::setupResources() { - // Check if shader has not been loaded if (!_shader->IsLoaded()) return true; const auto shader = _shader->GetShader(); @@ -59,14 +58,12 @@ bool BitonicSort::setupResources() _preSortCS.Get(shader, "CS_PreSort"); _innerSortCS = shader->GetCS("CS_InnerSort"); _outerSortCS = shader->GetCS("CS_OuterSort"); - _copyIndicesCS = shader->GetCS("CS_CopyIndices"); return false; } void BitonicSort::Dispose() { - // Base RendererPass::Dispose(); // Cleanup @@ -76,17 +73,16 @@ void BitonicSort::Dispose() _preSortCS.Clear(); _innerSortCS = nullptr; _outerSortCS = nullptr; - _copyIndicesCS = nullptr; _shader = nullptr; } -void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuffer* countBuffer, uint32 counterOffset, bool sortAscending, GPUBuffer* sortedIndicesBuffer, uint32 maxElements) +void BitonicSort::Sort(GPUContext* context, GPUBuffer* indicesBuffer, GPUBuffer* keysBuffer, GPUBuffer* countBuffer, uint32 counterOffset, bool sortAscending, int32 maxElements) { - ASSERT(context && sortingKeysBuffer && countBuffer); + ASSERT(context && indicesBuffer && keysBuffer && countBuffer); if (checkIfSkipPass()) return; PROFILE_GPU_CPU("Bitonic Sort"); - uint32 maxNumElements = sortingKeysBuffer->GetSize() / sizeof(uint64); + uint32 maxNumElements = indicesBuffer->GetElementsCount(); if (maxElements > 0 && maxElements < maxNumElements) maxNumElements = maxElements; const uint32 alignedMaxNumElements = Math::RoundUpToPowerOf2(maxNumElements); @@ -96,7 +92,7 @@ void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuf Data data; data.CounterOffset = counterOffset; data.NullItemKey = sortAscending ? MAX_float : -MAX_float; - data.NullItemValue = 0; + data.NullItemIndex = 0; data.KeySign = sortAscending ? -1.0f : 1.0f; data.MaxIterations = maxIterations; data.LoopK = 0; @@ -110,7 +106,8 @@ void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuf { // Use pre-sort with smaller thread group size (eg. for small particle emitters sorting) const int32 permutation = maxNumElements < 128 ? 1 : 0; - context->BindUA(0, sortingKeysBuffer->View()); + context->BindUA(0, indicesBuffer->View()); + context->BindUA(1, keysBuffer->View()); context->Dispatch(_preSortCS.Get(permutation), 1, 1, 1); } else @@ -120,7 +117,8 @@ void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuf context->Dispatch(_indirectArgsCS, 1, 1, 1); // Pre-Sort the buffer up to k = 2048 (this also pads the list with invalid indices that will drift to the end of the sorted list) - context->BindUA(0, sortingKeysBuffer->View()); + context->BindUA(0, indicesBuffer->View()); + context->BindUA(1, keysBuffer->View()); context->DispatchIndirect(_preSortCS.Get(0), _dispatchArgsBuffer, 0); // We have already pre-sorted up through k = 2048 when first writing our list, so we continue sorting with k = 4096 @@ -144,27 +142,4 @@ void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuf } context->ResetUA(); - - if (sortedIndicesBuffer) - { - // Copy indices to another buffer -#if !BUILD_RELEASE - switch (sortedIndicesBuffer->GetDescription().Format) - { - case PixelFormat::R32_UInt: - case PixelFormat::R16_UInt: - case PixelFormat::R8_UInt: - break; - default: - LOG(Warning, "Invalid format {0} of sortedIndicesBuffer for BitonicSort. It needs to be UInt type.", (int32)sortedIndicesBuffer->GetDescription().Format); - } -#endif - context->BindSR(1, sortingKeysBuffer->View()); - context->BindUA(0, sortedIndicesBuffer->View()); - // TODO: use indirect dispatch to match the items count for copy - context->Dispatch(_copyIndicesCS, (alignedMaxNumElements + 1023) / 1024, 1, 1); - } - - context->ResetUA(); - context->ResetSR(); } diff --git a/Source/Engine/Renderer/Utils/BitonicSort.h b/Source/Engine/Renderer/Utils/BitonicSort.h index 4280d5965..1fd5d50cd 100644 --- a/Source/Engine/Renderer/Utils/BitonicSort.h +++ b/Source/Engine/Renderer/Utils/BitonicSort.h @@ -26,16 +26,16 @@ private: public: /// - /// Sorts the specified buffer of index-key pairs. + /// Sorts the specified buffers of index-key pairs. /// /// The GPU context. - /// The sorting keys buffer. Used as a structured buffer of type Item (see above). + /// The sorting indices buffer with an index for each item (sequence of: 0, 1, 2, 3...). After sorting represents actual items order based on their keys. Valid for uint value types - used as RWBuffer. + /// The sorting keys buffer with a sort value for each item (must match order of items in indicesBuffer). Valid for float value types - used as RWBuffer. /// The buffer that contains a items counter value. /// The offset into counter buffer to find count for this list. Must be a multiple of 4 bytes. /// True to sort in ascending order (smallest to largest), otherwise false to sort in descending order. - /// The output buffer for sorted values extracted from the sorted sortingKeysBuffer after algorithm run. Valid for uint value types - used as RWBuffer. /// Optional upper limit of elements to sort. Cna be used to optimize indirect dispatches allocation. If non-zero, then it gets calculated based on the input item buffer size. - void Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuffer* countBuffer, uint32 counterOffset, bool sortAscending, GPUBuffer* sortedIndicesBuffer, uint32 maxElements = 0); + void Sort(GPUContext* context, GPUBuffer* indicesBuffer, GPUBuffer* keysBuffer, GPUBuffer* countBuffer, uint32 counterOffset, bool sortAscending, int32 maxElements = 0); public: diff --git a/Source/Shaders/BitonicSort.shader b/Source/Shaders/BitonicSort.shader index 0f9d5e656..6538ff7ff 100644 --- a/Source/Shaders/BitonicSort.shader +++ b/Source/Shaders/BitonicSort.shader @@ -10,12 +10,12 @@ struct Item { float Key; - uint Value; + uint Index; }; META_CB_BEGIN(0, Data) float NullItemKey; -uint NullItemValue; +uint NullItemIndex; uint CounterOffset; uint MaxIterations; uint LoopK; @@ -40,12 +40,12 @@ uint InsertOneBit(uint value, uint oneBitMask) // (effectively a negation) or leave the value alone. When the KeySign is // 1, we are sorting descending, so when A < B, they should swap. For an // ascending sort, -A < -B should swap. -bool ShouldSwap(Item a, Item b) +bool ShouldSwap(float a, float b) { //return (a ^ NullItem) < (b ^ NullItem); - //return (a.Key) < (b.Key); - return (a.Key * KeySign) < (b.Key * KeySign); + //return (a) < (b); + return (a * KeySign) < (b * KeySign); //return asfloat(a) < asfloat(b); //return (asfloat(a) * KeySign) < (asfloat(b) * KeySign); } @@ -93,7 +93,8 @@ void CS_IndirectArgs(uint groupIndex : SV_GroupIndex) #if defined(_CS_PreSort) || defined(_CS_InnerSort) -RWStructuredBuffer SortBuffer : register(u0); +RWBuffer SortedIndices : register(u0); +RWBuffer SortingKeys : register(u1); groupshared Item SortData[THREAD_GROUP_SIZE * 2]; @@ -103,12 +104,13 @@ void LoadItem(uint element, uint count) Item item; if (element < count) { - item = SortBuffer[element]; + item.Key = SortingKeys[element]; + item.Index = SortedIndices[element]; } else { item.Key = NullItemKey; - item.Value = NullItemValue; + item.Index = NullItemIndex; } SortData[element & (THREAD_GROUP_SIZE * 2 - 1)] = item; } @@ -117,7 +119,9 @@ void StoreItem(uint element, uint count) { if (element < count) { - SortBuffer[element] = SortData[element & 2047]; + Item item = SortData[element & ((THREAD_GROUP_SIZE * 2 - 1))]; + SortingKeys[element] = item.Key; + SortedIndices[element] = item.Index; } } @@ -153,7 +157,7 @@ void CS_PreSort(uint3 groupID : SV_GroupID, uint groupIndex : SV_GroupIndex) Item a = SortData[index1]; Item b = SortData[index2]; - if (ShouldSwap(a, b)) + if (ShouldSwap(a.Key, b.Key)) { // Swap the items SortData[index1] = b; @@ -197,7 +201,7 @@ void CS_InnerSort(uint3 groupID : SV_GroupID, uint groupIndex : SV_GroupIndex) Item a = SortData[index1]; Item b = SortData[index2]; - if (ShouldSwap(a, b)) + if (ShouldSwap(a.Key, b.Key)) { // Swap the items SortData[index1] = b; @@ -215,7 +219,8 @@ void CS_InnerSort(uint3 groupID : SV_GroupID, uint groupIndex : SV_GroupIndex) #ifdef _CS_OuterSort -RWStructuredBuffer SortBuffer : register(u0); +RWBuffer SortedIndices : register(u0); +RWBuffer SortingKeys : register(u1); META_CS(true, FEATURE_LEVEL_SM5) [numthreads(1024, 1, 1)] @@ -230,35 +235,19 @@ void CS_OuterSort(uint3 dispatchThreadId : SV_DispatchThreadID) if (index2 >= count) return; - Item a = SortBuffer[index1]; - Item b = SortBuffer[index2]; + float aKey = SortingKeys[index1]; + float bKey = SortingKeys[index2]; - if (ShouldSwap(a, b)) + if (ShouldSwap(aKey, bKey)) { // Swap the items - SortBuffer[index1] = b; - SortBuffer[index2] = a; + SortingKeys[index1] = bKey; + SortingKeys[index2] = aKey; + uint aIndex = SortedIndices[index1]; + uint bIndex = SortedIndices[index2]; + SortedIndices[index1] = bIndex; + SortedIndices[index2] = aIndex; } } #endif - -#ifdef _CS_CopyIndices - -StructuredBuffer SortBuffer : register(t1); -RWBuffer SortedIndices : register(u0); - -META_CS(true, FEATURE_LEVEL_SM5) -[numthreads(1024, 1, 1)] -void CS_CopyIndices(uint3 dispatchThreadId : SV_DispatchThreadID) -{ - const uint count = CounterBuffer.Load(CounterOffset); - uint index = dispatchThreadId.x; - if (index >= count) - return; - - Item element = SortBuffer[index]; - SortedIndices[index] = element.Value; -} - -#endif diff --git a/Source/Shaders/GPUParticlesSorting.shader b/Source/Shaders/GPUParticlesSorting.shader index 395172327..113096421 100644 --- a/Source/Shaders/GPUParticlesSorting.shader +++ b/Source/Shaders/GPUParticlesSorting.shader @@ -20,13 +20,9 @@ META_CB_END // Particles data buffer ByteAddressBuffer ParticlesData : register(t0); -// Output sorting keys buffer (index + key) -struct Item -{ - float Key; - uint Value; -}; -RWStructuredBuffer SortingKeys : register(u0); +// Sorting data (per-particle) +RWBuffer SortedIndices : register(u0); +RWBuffer SortingKeys : register(u1); float GetParticleFloat(uint particleIndex, int offset) { @@ -78,8 +74,6 @@ void CS_Sort(uint3 dispatchThreadId : SV_DispatchThreadID) #endif // Write sorting index-key pair - Item item; - item.Key = sortKey; - item.Value = index; - SortingKeys[index] = item; + SortedIndices[index] = index; + SortingKeys[index] = sortKey; }