Optimize GPU particles Bitonic sort to use separate buffers for indices and keys to avoid additional buffer copy

2025-08-08 18:24:44 +02:00
parent 519a9c0a14
commit 854f3acd4c
9 changed files with 115 additions and 169 deletions
--- a/Content/Shaders/BitonicSort.flax
+++ b/Content/Shaders/BitonicSort.flax
--- a/Content/Shaders/GPUParticlesSorting.flax
+++ b/Content/Shaders/GPUParticlesSorting.flax
--- a/Source/Engine/Particles/Particles.cpp
+++ b/Source/Engine/Particles/Particles.cpp
@@ -848,81 +848,68 @@ void DrawEmittersGPU(RenderContextBatch& renderContextBatch)
        context->BindCB(0, GPUParticlesSortingCB);
        // Generate sort keys for each particle
        for (const GPUEmitterDraw& draw : GPUEmitterDraws)
        {
-            if (!draw.Sorting)
+            PROFILE_GPU("Gen Sort Keys");
-                continue;
+            for (const GPUEmitterDraw& draw : GPUEmitterDraws)
            ASSERT(draw.Buffer->GPU.SortingKeysBuffer);
            // Generate sort keys for particles
            ParticleEmitter* emitter = draw.Buffer->Emitter;
            for (int32 moduleIndex = 0; moduleIndex < emitter->Graph.SortModules.Count(); moduleIndex++)
            {
-                auto module = emitter->Graph.SortModules[moduleIndex];
+                if (!draw.Sorting)
-                const auto sortMode = (ParticleSortMode)module->Values[2].AsInt;
+                    continue;
-
+                ASSERT(draw.Buffer->GPU.SortingKeys);
-                // Generate sorting keys based on sorting mode
+                ParticleEmitter* emitter = draw.Buffer->Emitter;
-                GPUParticlesSortingData data;
+                for (int32 moduleIndex = 0; moduleIndex < emitter->Graph.SortModules.Count(); moduleIndex++)
                data.ParticleCounterOffset = draw.Buffer->GPU.ParticleCounterOffset;
                data.ParticleStride = draw.Buffer->Stride;
                data.ParticleCapacity = draw.Buffer->Capacity;
                int32 permutationIndex;
                switch (sortMode)
                {
-                case ParticleSortMode::ViewDepth:
+                    auto module = emitter->Graph.SortModules[moduleIndex];
-                {
+                    // TODO: add support for module->SortedIndicesOffset (multiple sort modules)
-                    permutationIndex = 0;
+                    const auto sortMode = (ParticleSortMode)module->Values[2].AsInt;
-                    data.PositionOffset = emitter->Graph.GetPositionAttributeOffset();
+                    GPUParticlesSortingData data;
-                    const Matrix viewProjection = renderContextBatch.GetMainContext().View.ViewProjection();
+                    data.ParticleCounterOffset = draw.Buffer->GPU.ParticleCounterOffset;
-                    if (emitter->SimulationSpace == ParticlesSimulationSpace::Local)
+                    data.ParticleStride = draw.Buffer->Stride;
                    data.ParticleCapacity = draw.Buffer->Capacity;
                    int32 permutationIndex;
                    switch (sortMode)
                    {
-                        Matrix matrix;
+                    case ParticleSortMode::ViewDepth:
                        Matrix::Multiply(draw.DrawCall.World, viewProjection, matrix);
                        Matrix::Transpose(matrix, data.PositionTransform);
                    }
                    else
                    {
-                        Matrix::Transpose(viewProjection, data.PositionTransform);
+                        permutationIndex = 0;
-                    }
+                        data.PositionOffset = emitter->Graph.GetPositionAttributeOffset();
-                    break;
+                        const Matrix viewProjection = renderContextBatch.GetMainContext().View.ViewProjection();
-                }
+                        if (emitter->SimulationSpace == ParticlesSimulationSpace::Local)
-                case ParticleSortMode::ViewDistance:
+                            Matrix::Transpose(draw.DrawCall.World * viewProjection, data.PositionTransform);
-                {
+                        else
-                    permutationIndex = 1;
+                            Matrix::Transpose(viewProjection, data.PositionTransform);
                    data.PositionOffset = emitter->Graph.GetPositionAttributeOffset();
                    data.ViewPosition = renderContextBatch.GetMainContext().View.Position;
                    if (emitter->SimulationSpace == ParticlesSimulationSpace::Local)
                    {
                        Matrix::Transpose(draw.DrawCall.World, data.PositionTransform);
                    }
                    else
                    {
                        Matrix::Transpose(Matrix::Identity, data.PositionTransform);
                    }
                    break;
                }
                case ParticleSortMode::CustomAscending:
                case ParticleSortMode::CustomDescending:
                {
                    permutationIndex = 2;
                    int32 attributeIdx = module->Attributes[0];
                    if (attributeIdx == -1)
                        break;
-                    data.CustomOffset = emitter->Graph.Layout.Attributes[attributeIdx].Offset;
+                    }
-                    break;
+                    case ParticleSortMode::ViewDistance:
                    {
                        permutationIndex = 1;
                        data.PositionOffset = emitter->Graph.GetPositionAttributeOffset();
                        data.ViewPosition = renderContextBatch.GetMainContext().View.Position;
                        if (emitter->SimulationSpace == ParticlesSimulationSpace::Local)
                            Matrix::Transpose(draw.DrawCall.World, data.PositionTransform);
                        else
                            Matrix::Transpose(Matrix::Identity, data.PositionTransform);
                        break;
                    }
                    case ParticleSortMode::CustomAscending:
                    case ParticleSortMode::CustomDescending:
                    {
                        permutationIndex = 2;
                        int32 attributeIdx = module->Attributes[0];
                        if (attributeIdx == -1)
                            break;
                        data.CustomOffset = emitter->Graph.Layout.Attributes[attributeIdx].Offset;
                        break;
                    }
                    }
                    context->UpdateCB(GPUParticlesSortingCB, &data);
                    context->BindSR(0, draw.Buffer->GPU.Buffer->View());
                    context->BindUA(0, draw.Buffer->GPU.SortedIndices->View());
                    context->BindUA(1, draw.Buffer->GPU.SortingKeys->View());
                    const int32 threadGroupSize = 1024;
                    context->Dispatch(GPUParticlesSortingCS[permutationIndex], Math::DivideAndRoundUp(draw.Buffer->GPU.ParticlesCountMax, threadGroupSize), 1, 1);
                }
 #if !BUILD_RELEASE
                default:
                    CRASH;
                    return;
 #endif
                }
                context->UpdateCB(GPUParticlesSortingCB, &data);
                context->BindSR(0, draw.Buffer->GPU.Buffer->View());
                context->BindUA(0, draw.Buffer->GPU.SortingKeysBuffer->View());
                const int32 threadGroupSize = 1024;
                context->Dispatch(GPUParticlesSortingCS[permutationIndex], Math::DivideAndRoundUp(draw.Buffer->GPU.ParticlesCountMax, threadGroupSize), 1, 1);
            }
            context->ResetUA();
        }
        // Run sorting
@@ -930,17 +917,18 @@ void DrawEmittersGPU(RenderContextBatch& renderContextBatch)
        {
            if (!draw.Sorting)
                continue;
            ASSERT(draw.Buffer->GPU.SortingKeysBuffer);
            // Execute all sorting modules
            ParticleEmitter* emitter = draw.Buffer->Emitter;
            for (int32 moduleIndex = 0; moduleIndex < emitter->Graph.SortModules.Count(); moduleIndex++)
            {
                auto module = emitter->Graph.SortModules[moduleIndex];
                // TODO: add support for module->SortedIndicesOffset (multiple sort modules)
                const auto sortMode = (ParticleSortMode)module->Values[2].AsInt;
                bool sortAscending = sortMode == ParticleSortMode::CustomAscending;
-                BitonicSort::Instance()->Sort(context, draw.Buffer->GPU.SortingKeysBuffer, draw.Buffer->GPU.Buffer, draw.Buffer->GPU.ParticleCounterOffset, sortAscending, draw.Buffer->GPU.SortedIndices, draw.Buffer->GPU.ParticlesCountMax);
+                BitonicSort::Instance()->Sort(context, draw.Buffer->GPU.SortedIndices, draw.Buffer->GPU.SortingKeys, draw.Buffer->GPU.Buffer, draw.Buffer->GPU.ParticleCounterOffset, sortAscending, draw.Buffer->GPU.ParticlesCountMax);
-                // TODO: use args buffer from GPUIndirectArgsBuffer instead of internal from BitonicSort to get rid of UAV barrier (run all sorting in parallel)
+                // TODO: use args buffer from GPUIndirectArgsBuffer instead of internal from BitonicSort to get rid of UAV barrier (all sorting in parallel)
                // TODO: run sorting of small emitters (less than 2k particles) in a separate loop as a pass without UAV barriers (all sorting in parallel)
            }
        }
    }
--- a/Source/Engine/Particles/ParticlesData.cpp
+++ b/Source/Engine/Particles/ParticlesData.cpp
@@ -98,7 +98,7 @@ ParticleBuffer::~ParticleBuffer()
 {
    SAFE_DELETE_GPU_RESOURCE(GPU.Buffer);
    SAFE_DELETE_GPU_RESOURCE(GPU.BufferSecondary);
-    SAFE_DELETE_GPU_RESOURCE(GPU.SortingKeysBuffer);
+    SAFE_DELETE_GPU_RESOURCE(GPU.SortingKeys);
    SAFE_DELETE_GPU_RESOURCE(GPU.SortedIndices);
    SAFE_DELETE(GPU.RibbonIndexBufferDynamic);
    SAFE_DELETE(GPU.RibbonVertexBufferDynamic);
@@ -161,7 +161,7 @@ bool ParticleBuffer::Init(ParticleEmitter* emitter)
 bool ParticleBuffer::AllocateSortBuffer()
 {
-    ASSERT(Emitter && GPU.SortedIndices == nullptr && GPU.SortingKeysBuffer == nullptr);
+    ASSERT(Emitter && GPU.SortedIndices == nullptr && GPU.SortingKeys == nullptr);
    if (Emitter->Graph.SortModules.IsEmpty())
        return false;
@@ -170,7 +170,7 @@ bool ParticleBuffer::AllocateSortBuffer()
    case ParticlesSimulationMode::CPU:
    {
        const int32 sortedIndicesSize = Capacity * sizeof(uint32) * Emitter->Graph.SortModules.Count();
-        GPU.SortedIndices = GPUDevice::Instance->CreateBuffer(TEXT("SortedIndices"));
+        GPU.SortedIndices = GPUDevice::Instance->CreateBuffer(TEXT("ParticleSortedIndices"));
        if (GPU.SortedIndices->Init(GPUBufferDescription::Buffer(sortedIndicesSize, GPUBufferFlags::ShaderResource, PixelFormat::R32_UInt, nullptr, sizeof(uint32), GPUResourceUsage::Dynamic)))
            return true;
        break;
@@ -178,12 +178,12 @@ bool ParticleBuffer::AllocateSortBuffer()
 #if COMPILE_WITH_GPU_PARTICLES
    case ParticlesSimulationMode::GPU:
    {
-        const int32 sortedIndicesSize = Capacity * sizeof(uint32) * Emitter->Graph.SortModules.Count();
+        const int32 sortedIndicesCount = Capacity * Emitter->Graph.SortModules.Count();
-        GPU.SortingKeysBuffer = GPUDevice::Instance->CreateBuffer(TEXT("ParticleSortingKeysBuffer"));
+        GPU.SortingKeys = GPUDevice::Instance->CreateBuffer(TEXT("ParticleSortingKeys"));
-        if (GPU.SortingKeysBuffer->Init(GPUBufferDescription::Structured(Capacity, sizeof(float) + sizeof(uint32), true)))
+        if (GPU.SortingKeys->Init(GPUBufferDescription::Buffer(sortedIndicesCount * sizeof(float), GPUBufferFlags::UnorderedAccess, PixelFormat::R32_Float, nullptr, sizeof(float))))
            return true;
-        GPU.SortedIndices = GPUDevice::Instance->CreateBuffer(TEXT("SortedIndices"));
+        GPU.SortedIndices = GPUDevice::Instance->CreateBuffer(TEXT("ParticleSortedIndices"));
-        if (GPU.SortedIndices->Init(GPUBufferDescription::Buffer(sortedIndicesSize, GPUBufferFlags::ShaderResource | GPUBufferFlags::UnorderedAccess, PixelFormat::R32_UInt, nullptr, sizeof(uint32))))
+        if (GPU.SortedIndices->Init(GPUBufferDescription::Buffer(sortedIndicesCount * sizeof(uint32), GPUBufferFlags::ShaderResource | GPUBufferFlags::UnorderedAccess, PixelFormat::R32_UInt, nullptr, sizeof(uint32))))
            return true;
        break;
    }
--- a/Source/Engine/Particles/ParticlesData.h
+++ b/Source/Engine/Particles/ParticlesData.h
@@ -206,7 +206,7 @@ public:
        /// <summary>
        /// The GPU particles sorting keys buffer. Contains the sorting key (float) for every particle; particle indices are stored separately in SortedIndices. Used to sort particles.
        /// </summary>
-        GPUBuffer* SortingKeysBuffer = nullptr;
+        GPUBuffer* SortingKeys = nullptr;
        /// <summary>
        /// The particles indices buffer (GPU side).
--- a/Source/Engine/Renderer/Utils/BitonicSort.cpp
+++ b/Source/Engine/Renderer/Utils/BitonicSort.cpp
@@ -8,7 +8,7 @@
 GPU_CB_STRUCT(Data {
    float NullItemKey;
-    uint32 NullItemValue;
+    uint32 NullItemIndex;
    uint32 CounterOffset;
    uint32 MaxIterations;
    uint32 LoopK;
@@ -47,7 +47,6 @@ bool BitonicSort::Init()
 bool BitonicSort::setupResources()
 {
    // Check if shader has not been loaded
    if (!_shader->IsLoaded())
        return true;
    const auto shader = _shader->GetShader();
@@ -59,14 +58,12 @@ bool BitonicSort::setupResources()
    _preSortCS.Get(shader, "CS_PreSort");
    _innerSortCS = shader->GetCS("CS_InnerSort");
    _outerSortCS = shader->GetCS("CS_OuterSort");
    _copyIndicesCS = shader->GetCS("CS_CopyIndices");
    return false;
 }
 void BitonicSort::Dispose()
 {
    // Base
    RendererPass::Dispose();
    // Cleanup
@@ -76,17 +73,16 @@ void BitonicSort::Dispose()
    _preSortCS.Clear();
    _innerSortCS = nullptr;
    _outerSortCS = nullptr;
    _copyIndicesCS = nullptr;
    _shader = nullptr;
 }
-void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuffer* countBuffer, uint32 counterOffset, bool sortAscending, GPUBuffer* sortedIndicesBuffer, uint32 maxElements)
+void BitonicSort::Sort(GPUContext* context, GPUBuffer* indicesBuffer, GPUBuffer* keysBuffer, GPUBuffer* countBuffer, uint32 counterOffset, bool sortAscending, int32 maxElements)
 {
-    ASSERT(context && sortingKeysBuffer && countBuffer);
+    ASSERT(context && indicesBuffer && keysBuffer && countBuffer);
    if (checkIfSkipPass())
        return;
    PROFILE_GPU_CPU("Bitonic Sort");
-    uint32 maxNumElements = sortingKeysBuffer->GetSize() / sizeof(uint64);
+    uint32 maxNumElements = indicesBuffer->GetElementsCount();
    if (maxElements > 0 && maxElements < maxNumElements)
        maxNumElements = maxElements;
    const uint32 alignedMaxNumElements = Math::RoundUpToPowerOf2(maxNumElements);
@@ -96,7 +92,7 @@ void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuf
    Data data;
    data.CounterOffset = counterOffset;
    data.NullItemKey = sortAscending ? MAX_float : -MAX_float;
-    data.NullItemValue = 0;
+    data.NullItemIndex = 0;
    data.KeySign = sortAscending ? -1.0f : 1.0f;
    data.MaxIterations = maxIterations;
    data.LoopK = 0;
@@ -110,7 +106,8 @@ void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuf
    {
        // Use pre-sort with smaller thread group size (eg. for small particle emitters sorting)
        const int32 permutation = maxNumElements < 128 ? 1 : 0;
-        context->BindUA(0, sortingKeysBuffer->View());
+        context->BindUA(0, indicesBuffer->View());
        context->BindUA(1, keysBuffer->View());
        context->Dispatch(_preSortCS.Get(permutation), 1, 1, 1);
    }
    else
@@ -120,7 +117,8 @@ void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuf
        context->Dispatch(_indirectArgsCS, 1, 1, 1);
        // Pre-Sort the buffer up to k = 2048 (this also pads the list with invalid indices that will drift to the end of the sorted list)
-        context->BindUA(0, sortingKeysBuffer->View());
+        context->BindUA(0, indicesBuffer->View());
        context->BindUA(1, keysBuffer->View());
        context->DispatchIndirect(_preSortCS.Get(0), _dispatchArgsBuffer, 0);
        // We have already pre-sorted up through k = 2048 when first writing our list, so we continue sorting with k = 4096
@@ -144,27 +142,4 @@ void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuf
    }
    context->ResetUA();
    if (sortedIndicesBuffer)
    {
        // Copy indices to another buffer
 #if !BUILD_RELEASE
        switch (sortedIndicesBuffer->GetDescription().Format)
        {
        case PixelFormat::R32_UInt:
        case PixelFormat::R16_UInt:
        case PixelFormat::R8_UInt:
            break;
        default:
            LOG(Warning, "Invalid format {0} of sortedIndicesBuffer for BitonicSort. It needs to be UInt type.", (int32)sortedIndicesBuffer->GetDescription().Format);
        }
 #endif
        context->BindSR(1, sortingKeysBuffer->View());
        context->BindUA(0, sortedIndicesBuffer->View());
        // TODO: use indirect dispatch to match the items count for copy
        context->Dispatch(_copyIndicesCS, (alignedMaxNumElements + 1023) / 1024, 1, 1);
    }
    context->ResetUA();
    context->ResetSR();
 }
--- a/Source/Engine/Renderer/Utils/BitonicSort.h
+++ b/Source/Engine/Renderer/Utils/BitonicSort.h
@@ -26,16 +26,16 @@ private:
 public:
    /// <summary>
-    /// Sorts the specified buffer of index-key pairs.
+    /// Sorts the specified buffers of index-key pairs.
    /// </summary>
    /// <param name="context">The GPU context.</param>
-    /// <param name="sortingKeysBuffer">The sorting keys buffer. Used as a structured buffer of type Item (see above).</param>
+    /// <param name="indicesBuffer">The sorting indices buffer with an index for each item (sequence of: 0, 1, 2, 3...). After sorting represents actual items order based on their keys. Valid for uint value types - used as RWBuffer.</param>
    /// <param name="keysBuffer">The sorting keys buffer with a sort value for each item (must match order of items in indicesBuffer). Valid for float value types - used as RWBuffer.</param>
    /// <param name="countBuffer">The buffer that contains the items counter value.</param>
    /// <param name="counterOffset">The offset into counter buffer to find count for this list. Must be a multiple of 4 bytes.</param>
    /// <param name="sortAscending">True to sort in ascending order (smallest to largest), otherwise false to sort in descending order.</param>
    /// <param name="sortedIndicesBuffer">The output buffer for sorted values extracted from the sorted sortingKeysBuffer after algorithm run. Valid for uint value types - used as RWBuffer.</param>
    /// <param name="maxElements">Optional upper limit of elements to sort. Can be used to optimize indirect dispatches allocation. If zero, then it gets calculated based on the indices buffer size.</param>
-    void Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuffer* countBuffer, uint32 counterOffset, bool sortAscending, GPUBuffer* sortedIndicesBuffer, uint32 maxElements = 0);
+    void Sort(GPUContext* context, GPUBuffer* indicesBuffer, GPUBuffer* keysBuffer, GPUBuffer* countBuffer, uint32 counterOffset, bool sortAscending, int32 maxElements = 0);
 public:
--- a/Source/Shaders/BitonicSort.shader
+++ b/Source/Shaders/BitonicSort.shader
@@ -10,12 +10,12 @@
 struct Item
 {
 	float Key;
-	uint Value;
+	uint Index;
 };
 META_CB_BEGIN(0, Data)
 float NullItemKey;
-uint NullItemValue;
+uint NullItemIndex;
 uint CounterOffset;
 uint MaxIterations;
 uint LoopK;
@@ -40,12 +40,12 @@ uint InsertOneBit(uint value, uint oneBitMask)
 // (effectively a negation) or leave the value alone. When the KeySign is
 // 1, we are sorting descending, so when A < B, they should swap. For an
 // ascending sort, -A < -B should swap.
-bool ShouldSwap(Item a, Item b)
+bool ShouldSwap(float a, float b)
 {
 	//return (a ^ NullItem) < (b ^ NullItem);
-	//return (a.Key) < (b.Key);
+	//return (a) < (b);
-	return (a.Key * KeySign) < (b.Key * KeySign);
+	return (a * KeySign) < (b * KeySign);
 	//return asfloat(a) < asfloat(b);
 	//return (asfloat(a) * KeySign) < (asfloat(b) * KeySign);
 }
@@ -93,7 +93,8 @@ void CS_IndirectArgs(uint groupIndex : SV_GroupIndex)
 #if defined(_CS_PreSort) || defined(_CS_InnerSort)
-RWStructuredBuffer<Item> SortBuffer : register(u0);
+RWBuffer<uint> SortedIndices : register(u0);
 RWBuffer<float> SortingKeys : register(u1);
 groupshared Item SortData[THREAD_GROUP_SIZE * 2];
@@ -103,12 +104,13 @@ void LoadItem(uint element, uint count)
 	Item item;
 	if (element < count)
 	{
-		item = SortBuffer[element];
+		item.Key = SortingKeys[element];
 		item.Index = SortedIndices[element];
 	}
 	else
 	{
 		item.Key = NullItemKey;
-		item.Value = NullItemValue;
+		item.Index = NullItemIndex;
 	}
 	SortData[element & (THREAD_GROUP_SIZE * 2 - 1)] = item;
 }
@@ -117,7 +119,9 @@ void StoreItem(uint element, uint count)
 {
 	if (element < count)
 	{
-		SortBuffer[element] = SortData[element & 2047];
+        Item item = SortData[element & ((THREAD_GROUP_SIZE * 2 - 1))];
 		SortingKeys[element] = item.Key;
 		SortedIndices[element] = item.Index;
 	}
 }
@@ -153,7 +157,7 @@ void CS_PreSort(uint3 groupID : SV_GroupID, uint groupIndex : SV_GroupIndex)
 			Item a = SortData[index1];
 			Item b = SortData[index2];
-			if (ShouldSwap(a, b))
+			if (ShouldSwap(a.Key, b.Key))
 			{
 				// Swap the items
 				SortData[index1] = b;
@@ -197,7 +201,7 @@ void CS_InnerSort(uint3 groupID : SV_GroupID, uint groupIndex : SV_GroupIndex)
 		Item a = SortData[index1];
 		Item b = SortData[index2];
-		if (ShouldSwap(a, b))
+		if (ShouldSwap(a.Key, b.Key))
 		{
 			// Swap the items
 			SortData[index1] = b;
@@ -215,7 +219,8 @@ void CS_InnerSort(uint3 groupID : SV_GroupID, uint groupIndex : SV_GroupIndex)
 #ifdef _CS_OuterSort
-RWStructuredBuffer<Item> SortBuffer : register(u0);
+RWBuffer<uint> SortedIndices : register(u0);
 RWBuffer<float> SortingKeys : register(u1);
 META_CS(true, FEATURE_LEVEL_SM5)
 [numthreads(1024, 1, 1)]
@@ -230,35 +235,19 @@ void CS_OuterSort(uint3 dispatchThreadId : SV_DispatchThreadID)
 	if (index2 >= count)
 		return;
-	Item a = SortBuffer[index1];
+	float aKey = SortingKeys[index1];
-	Item b = SortBuffer[index2];
+	float bKey = SortingKeys[index2];
-	if (ShouldSwap(a, b))
+	if (ShouldSwap(aKey, bKey))
 	{
 		// Swap the items
-		SortBuffer[index1] = b;
+		SortingKeys[index1] = bKey;
-		SortBuffer[index2] = a;
+		SortingKeys[index2] = aKey;
        uint aIndex = SortedIndices[index1];
        uint bIndex = SortedIndices[index2];
 		SortedIndices[index1] = bIndex;
 		SortedIndices[index2] = aIndex;
 	}
 }
 #endif
 #ifdef _CS_CopyIndices
 StructuredBuffer<Item> SortBuffer : register(t1);
 RWBuffer<uint> SortedIndices : register(u0);
 META_CS(true, FEATURE_LEVEL_SM5)
 [numthreads(1024, 1, 1)]
 void CS_CopyIndices(uint3 dispatchThreadId : SV_DispatchThreadID)
 {
 	const uint count = CounterBuffer.Load(CounterOffset);
 	uint index = dispatchThreadId.x;
 	if (index >= count)
 		return;
 	Item element = SortBuffer[index];
 	SortedIndices[index] = element.Value;
 }
 #endif
--- a/Source/Shaders/GPUParticlesSorting.shader
+++ b/Source/Shaders/GPUParticlesSorting.shader
@@ -20,13 +20,9 @@ META_CB_END
 // Particles data buffer
 ByteAddressBuffer ParticlesData : register(t0);
-// Output sorting keys buffer (index + key)
+// Sorting data (per-particle)
-struct Item
+RWBuffer<uint> SortedIndices : register(u0);
-{
+RWBuffer<float> SortingKeys : register(u1);
 	float Key;
 	uint Value;
 };
 RWStructuredBuffer<Item> SortingKeys : register(u0);
 float GetParticleFloat(uint particleIndex, int offset)
 {
@@ -78,8 +74,6 @@ void CS_Sort(uint3 dispatchThreadId : SV_DispatchThreadID)
 #endif
 	// Write sorting index-key pair
-	Item item;
+	SortedIndices[index] = index;
-	item.Key = sortKey;
+	SortingKeys[index] = sortKey;
 	item.Value = index;
 	SortingKeys[index] = item;
 }