Optimize GPU particles Bitonic sort to use separate buffers for indices and keys to avoid additional buffer copy
This commit is contained in:
BIN
Content/Shaders/BitonicSort.flax
(Stored with Git LFS)
BIN
Content/Shaders/BitonicSort.flax
(Stored with Git LFS)
Binary file not shown.
BIN
Content/Shaders/GPUParticlesSorting.flax
(Stored with Git LFS)
BIN
Content/Shaders/GPUParticlesSorting.flax
(Stored with Git LFS)
Binary file not shown.
@@ -848,81 +848,68 @@ void DrawEmittersGPU(RenderContextBatch& renderContextBatch)
|
|||||||
context->BindCB(0, GPUParticlesSortingCB);
|
context->BindCB(0, GPUParticlesSortingCB);
|
||||||
|
|
||||||
// Generate sort keys for each particle
|
// Generate sort keys for each particle
|
||||||
for (const GPUEmitterDraw& draw : GPUEmitterDraws)
|
|
||||||
{
|
{
|
||||||
if (!draw.Sorting)
|
PROFILE_GPU("Gen Sort Keys");
|
||||||
continue;
|
for (const GPUEmitterDraw& draw : GPUEmitterDraws)
|
||||||
ASSERT(draw.Buffer->GPU.SortingKeysBuffer);
|
|
||||||
|
|
||||||
// Generate sort keys for particles
|
|
||||||
ParticleEmitter* emitter = draw.Buffer->Emitter;
|
|
||||||
for (int32 moduleIndex = 0; moduleIndex < emitter->Graph.SortModules.Count(); moduleIndex++)
|
|
||||||
{
|
{
|
||||||
auto module = emitter->Graph.SortModules[moduleIndex];
|
if (!draw.Sorting)
|
||||||
const auto sortMode = (ParticleSortMode)module->Values[2].AsInt;
|
continue;
|
||||||
|
ASSERT(draw.Buffer->GPU.SortingKeys);
|
||||||
// Generate sorting keys based on sorting mode
|
ParticleEmitter* emitter = draw.Buffer->Emitter;
|
||||||
GPUParticlesSortingData data;
|
for (int32 moduleIndex = 0; moduleIndex < emitter->Graph.SortModules.Count(); moduleIndex++)
|
||||||
data.ParticleCounterOffset = draw.Buffer->GPU.ParticleCounterOffset;
|
|
||||||
data.ParticleStride = draw.Buffer->Stride;
|
|
||||||
data.ParticleCapacity = draw.Buffer->Capacity;
|
|
||||||
int32 permutationIndex;
|
|
||||||
switch (sortMode)
|
|
||||||
{
|
{
|
||||||
case ParticleSortMode::ViewDepth:
|
auto module = emitter->Graph.SortModules[moduleIndex];
|
||||||
{
|
// TODO: add support for module->SortedIndicesOffset (multiple sort modules)
|
||||||
permutationIndex = 0;
|
const auto sortMode = (ParticleSortMode)module->Values[2].AsInt;
|
||||||
data.PositionOffset = emitter->Graph.GetPositionAttributeOffset();
|
GPUParticlesSortingData data;
|
||||||
const Matrix viewProjection = renderContextBatch.GetMainContext().View.ViewProjection();
|
data.ParticleCounterOffset = draw.Buffer->GPU.ParticleCounterOffset;
|
||||||
if (emitter->SimulationSpace == ParticlesSimulationSpace::Local)
|
data.ParticleStride = draw.Buffer->Stride;
|
||||||
|
data.ParticleCapacity = draw.Buffer->Capacity;
|
||||||
|
int32 permutationIndex;
|
||||||
|
switch (sortMode)
|
||||||
{
|
{
|
||||||
Matrix matrix;
|
case ParticleSortMode::ViewDepth:
|
||||||
Matrix::Multiply(draw.DrawCall.World, viewProjection, matrix);
|
|
||||||
Matrix::Transpose(matrix, data.PositionTransform);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
{
|
||||||
Matrix::Transpose(viewProjection, data.PositionTransform);
|
permutationIndex = 0;
|
||||||
}
|
data.PositionOffset = emitter->Graph.GetPositionAttributeOffset();
|
||||||
break;
|
const Matrix viewProjection = renderContextBatch.GetMainContext().View.ViewProjection();
|
||||||
}
|
if (emitter->SimulationSpace == ParticlesSimulationSpace::Local)
|
||||||
case ParticleSortMode::ViewDistance:
|
Matrix::Transpose(draw.DrawCall.World * viewProjection, data.PositionTransform);
|
||||||
{
|
else
|
||||||
permutationIndex = 1;
|
Matrix::Transpose(viewProjection, data.PositionTransform);
|
||||||
data.PositionOffset = emitter->Graph.GetPositionAttributeOffset();
|
|
||||||
data.ViewPosition = renderContextBatch.GetMainContext().View.Position;
|
|
||||||
if (emitter->SimulationSpace == ParticlesSimulationSpace::Local)
|
|
||||||
{
|
|
||||||
Matrix::Transpose(draw.DrawCall.World, data.PositionTransform);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
Matrix::Transpose(Matrix::Identity, data.PositionTransform);
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case ParticleSortMode::CustomAscending:
|
|
||||||
case ParticleSortMode::CustomDescending:
|
|
||||||
{
|
|
||||||
permutationIndex = 2;
|
|
||||||
int32 attributeIdx = module->Attributes[0];
|
|
||||||
if (attributeIdx == -1)
|
|
||||||
break;
|
break;
|
||||||
data.CustomOffset = emitter->Graph.Layout.Attributes[attributeIdx].Offset;
|
}
|
||||||
break;
|
case ParticleSortMode::ViewDistance:
|
||||||
|
{
|
||||||
|
permutationIndex = 1;
|
||||||
|
data.PositionOffset = emitter->Graph.GetPositionAttributeOffset();
|
||||||
|
data.ViewPosition = renderContextBatch.GetMainContext().View.Position;
|
||||||
|
if (emitter->SimulationSpace == ParticlesSimulationSpace::Local)
|
||||||
|
Matrix::Transpose(draw.DrawCall.World, data.PositionTransform);
|
||||||
|
else
|
||||||
|
Matrix::Transpose(Matrix::Identity, data.PositionTransform);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case ParticleSortMode::CustomAscending:
|
||||||
|
case ParticleSortMode::CustomDescending:
|
||||||
|
{
|
||||||
|
permutationIndex = 2;
|
||||||
|
int32 attributeIdx = module->Attributes[0];
|
||||||
|
if (attributeIdx == -1)
|
||||||
|
break;
|
||||||
|
data.CustomOffset = emitter->Graph.Layout.Attributes[attributeIdx].Offset;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
context->UpdateCB(GPUParticlesSortingCB, &data);
|
||||||
|
context->BindSR(0, draw.Buffer->GPU.Buffer->View());
|
||||||
|
context->BindUA(0, draw.Buffer->GPU.SortedIndices->View());
|
||||||
|
context->BindUA(1, draw.Buffer->GPU.SortingKeys->View());
|
||||||
|
const int32 threadGroupSize = 1024;
|
||||||
|
context->Dispatch(GPUParticlesSortingCS[permutationIndex], Math::DivideAndRoundUp(draw.Buffer->GPU.ParticlesCountMax, threadGroupSize), 1, 1);
|
||||||
}
|
}
|
||||||
#if !BUILD_RELEASE
|
|
||||||
default:
|
|
||||||
CRASH;
|
|
||||||
return;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
context->UpdateCB(GPUParticlesSortingCB, &data);
|
|
||||||
context->BindSR(0, draw.Buffer->GPU.Buffer->View());
|
|
||||||
context->BindUA(0, draw.Buffer->GPU.SortingKeysBuffer->View());
|
|
||||||
const int32 threadGroupSize = 1024;
|
|
||||||
context->Dispatch(GPUParticlesSortingCS[permutationIndex], Math::DivideAndRoundUp(draw.Buffer->GPU.ParticlesCountMax, threadGroupSize), 1, 1);
|
|
||||||
}
|
}
|
||||||
|
context->ResetUA();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Run sorting
|
// Run sorting
|
||||||
@@ -930,17 +917,18 @@ void DrawEmittersGPU(RenderContextBatch& renderContextBatch)
|
|||||||
{
|
{
|
||||||
if (!draw.Sorting)
|
if (!draw.Sorting)
|
||||||
continue;
|
continue;
|
||||||
ASSERT(draw.Buffer->GPU.SortingKeysBuffer);
|
|
||||||
|
|
||||||
// Execute all sorting modules
|
// Execute all sorting modules
|
||||||
ParticleEmitter* emitter = draw.Buffer->Emitter;
|
ParticleEmitter* emitter = draw.Buffer->Emitter;
|
||||||
for (int32 moduleIndex = 0; moduleIndex < emitter->Graph.SortModules.Count(); moduleIndex++)
|
for (int32 moduleIndex = 0; moduleIndex < emitter->Graph.SortModules.Count(); moduleIndex++)
|
||||||
{
|
{
|
||||||
auto module = emitter->Graph.SortModules[moduleIndex];
|
auto module = emitter->Graph.SortModules[moduleIndex];
|
||||||
|
// TODO: add support for module->SortedIndicesOffset (multiple sort modules)
|
||||||
const auto sortMode = (ParticleSortMode)module->Values[2].AsInt;
|
const auto sortMode = (ParticleSortMode)module->Values[2].AsInt;
|
||||||
bool sortAscending = sortMode == ParticleSortMode::CustomAscending;
|
bool sortAscending = sortMode == ParticleSortMode::CustomAscending;
|
||||||
BitonicSort::Instance()->Sort(context, draw.Buffer->GPU.SortingKeysBuffer, draw.Buffer->GPU.Buffer, draw.Buffer->GPU.ParticleCounterOffset, sortAscending, draw.Buffer->GPU.SortedIndices, draw.Buffer->GPU.ParticlesCountMax);
|
BitonicSort::Instance()->Sort(context, draw.Buffer->GPU.SortedIndices, draw.Buffer->GPU.SortingKeys, draw.Buffer->GPU.Buffer, draw.Buffer->GPU.ParticleCounterOffset, sortAscending, draw.Buffer->GPU.ParticlesCountMax);
|
||||||
// TODO: use args buffer from GPUIndirectArgsBuffer instead of internal from BitonicSort to get rid of UAV barrier (run all sorting in parallel)
|
// TODO: use args buffer from GPUIndirectArgsBuffer instead of internal from BitonicSort to get rid of UAV barrier (all sorting in parallel)
|
||||||
|
// TODO: run sorting of small emitters (less than 2k particles) in a separate loop as a pass without UAV barriers (all sorting in parallel)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -98,7 +98,7 @@ ParticleBuffer::~ParticleBuffer()
|
|||||||
{
|
{
|
||||||
SAFE_DELETE_GPU_RESOURCE(GPU.Buffer);
|
SAFE_DELETE_GPU_RESOURCE(GPU.Buffer);
|
||||||
SAFE_DELETE_GPU_RESOURCE(GPU.BufferSecondary);
|
SAFE_DELETE_GPU_RESOURCE(GPU.BufferSecondary);
|
||||||
SAFE_DELETE_GPU_RESOURCE(GPU.SortingKeysBuffer);
|
SAFE_DELETE_GPU_RESOURCE(GPU.SortingKeys);
|
||||||
SAFE_DELETE_GPU_RESOURCE(GPU.SortedIndices);
|
SAFE_DELETE_GPU_RESOURCE(GPU.SortedIndices);
|
||||||
SAFE_DELETE(GPU.RibbonIndexBufferDynamic);
|
SAFE_DELETE(GPU.RibbonIndexBufferDynamic);
|
||||||
SAFE_DELETE(GPU.RibbonVertexBufferDynamic);
|
SAFE_DELETE(GPU.RibbonVertexBufferDynamic);
|
||||||
@@ -161,7 +161,7 @@ bool ParticleBuffer::Init(ParticleEmitter* emitter)
|
|||||||
|
|
||||||
bool ParticleBuffer::AllocateSortBuffer()
|
bool ParticleBuffer::AllocateSortBuffer()
|
||||||
{
|
{
|
||||||
ASSERT(Emitter && GPU.SortedIndices == nullptr && GPU.SortingKeysBuffer == nullptr);
|
ASSERT(Emitter && GPU.SortedIndices == nullptr && GPU.SortingKeys == nullptr);
|
||||||
if (Emitter->Graph.SortModules.IsEmpty())
|
if (Emitter->Graph.SortModules.IsEmpty())
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
@@ -170,7 +170,7 @@ bool ParticleBuffer::AllocateSortBuffer()
|
|||||||
case ParticlesSimulationMode::CPU:
|
case ParticlesSimulationMode::CPU:
|
||||||
{
|
{
|
||||||
const int32 sortedIndicesSize = Capacity * sizeof(uint32) * Emitter->Graph.SortModules.Count();
|
const int32 sortedIndicesSize = Capacity * sizeof(uint32) * Emitter->Graph.SortModules.Count();
|
||||||
GPU.SortedIndices = GPUDevice::Instance->CreateBuffer(TEXT("SortedIndices"));
|
GPU.SortedIndices = GPUDevice::Instance->CreateBuffer(TEXT("ParticleSortedIndices"));
|
||||||
if (GPU.SortedIndices->Init(GPUBufferDescription::Buffer(sortedIndicesSize, GPUBufferFlags::ShaderResource, PixelFormat::R32_UInt, nullptr, sizeof(uint32), GPUResourceUsage::Dynamic)))
|
if (GPU.SortedIndices->Init(GPUBufferDescription::Buffer(sortedIndicesSize, GPUBufferFlags::ShaderResource, PixelFormat::R32_UInt, nullptr, sizeof(uint32), GPUResourceUsage::Dynamic)))
|
||||||
return true;
|
return true;
|
||||||
break;
|
break;
|
||||||
@@ -178,12 +178,12 @@ bool ParticleBuffer::AllocateSortBuffer()
|
|||||||
#if COMPILE_WITH_GPU_PARTICLES
|
#if COMPILE_WITH_GPU_PARTICLES
|
||||||
case ParticlesSimulationMode::GPU:
|
case ParticlesSimulationMode::GPU:
|
||||||
{
|
{
|
||||||
const int32 sortedIndicesSize = Capacity * sizeof(uint32) * Emitter->Graph.SortModules.Count();
|
const int32 sortedIndicesCount = Capacity * Emitter->Graph.SortModules.Count();
|
||||||
GPU.SortingKeysBuffer = GPUDevice::Instance->CreateBuffer(TEXT("ParticleSortingKeysBuffer"));
|
GPU.SortingKeys = GPUDevice::Instance->CreateBuffer(TEXT("ParticleSortingKeys"));
|
||||||
if (GPU.SortingKeysBuffer->Init(GPUBufferDescription::Structured(Capacity, sizeof(float) + sizeof(uint32), true)))
|
if (GPU.SortingKeys->Init(GPUBufferDescription::Buffer(sortedIndicesCount * sizeof(float), GPUBufferFlags::UnorderedAccess, PixelFormat::R32_Float, nullptr, sizeof(float))))
|
||||||
return true;
|
return true;
|
||||||
GPU.SortedIndices = GPUDevice::Instance->CreateBuffer(TEXT("SortedIndices"));
|
GPU.SortedIndices = GPUDevice::Instance->CreateBuffer(TEXT("ParticleSortedIndices"));
|
||||||
if (GPU.SortedIndices->Init(GPUBufferDescription::Buffer(sortedIndicesSize, GPUBufferFlags::ShaderResource | GPUBufferFlags::UnorderedAccess, PixelFormat::R32_UInt, nullptr, sizeof(uint32))))
|
if (GPU.SortedIndices->Init(GPUBufferDescription::Buffer(sortedIndicesCount * sizeof(uint32), GPUBufferFlags::ShaderResource | GPUBufferFlags::UnorderedAccess, PixelFormat::R32_UInt, nullptr, sizeof(uint32))))
|
||||||
return true;
|
return true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -206,7 +206,7 @@ public:
|
|||||||
/// <summary>
|
/// <summary>
|
||||||
/// The GPU particles sorting buffer. Contains structure of particle index and the sorting key for every particle. Used to sort particles.
|
/// The GPU particles sorting keys buffer. Contains the sorting key (float) for every particle; the matching particle indices are stored separately in SortedIndices. Used to sort particles.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
GPUBuffer* SortingKeysBuffer = nullptr;
|
GPUBuffer* SortingKeys = nullptr;
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// The particles indices buffer (GPU side).
|
/// The particles indices buffer (GPU side).
|
||||||
|
|||||||
@@ -8,7 +8,7 @@
|
|||||||
|
|
||||||
GPU_CB_STRUCT(Data {
|
GPU_CB_STRUCT(Data {
|
||||||
float NullItemKey;
|
float NullItemKey;
|
||||||
uint32 NullItemValue;
|
uint32 NullItemIndex;
|
||||||
uint32 CounterOffset;
|
uint32 CounterOffset;
|
||||||
uint32 MaxIterations;
|
uint32 MaxIterations;
|
||||||
uint32 LoopK;
|
uint32 LoopK;
|
||||||
@@ -47,7 +47,6 @@ bool BitonicSort::Init()
|
|||||||
|
|
||||||
bool BitonicSort::setupResources()
|
bool BitonicSort::setupResources()
|
||||||
{
|
{
|
||||||
// Check if shader has not been loaded
|
|
||||||
if (!_shader->IsLoaded())
|
if (!_shader->IsLoaded())
|
||||||
return true;
|
return true;
|
||||||
const auto shader = _shader->GetShader();
|
const auto shader = _shader->GetShader();
|
||||||
@@ -59,14 +58,12 @@ bool BitonicSort::setupResources()
|
|||||||
_preSortCS.Get(shader, "CS_PreSort");
|
_preSortCS.Get(shader, "CS_PreSort");
|
||||||
_innerSortCS = shader->GetCS("CS_InnerSort");
|
_innerSortCS = shader->GetCS("CS_InnerSort");
|
||||||
_outerSortCS = shader->GetCS("CS_OuterSort");
|
_outerSortCS = shader->GetCS("CS_OuterSort");
|
||||||
_copyIndicesCS = shader->GetCS("CS_CopyIndices");
|
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
void BitonicSort::Dispose()
|
void BitonicSort::Dispose()
|
||||||
{
|
{
|
||||||
// Base
|
|
||||||
RendererPass::Dispose();
|
RendererPass::Dispose();
|
||||||
|
|
||||||
// Cleanup
|
// Cleanup
|
||||||
@@ -76,17 +73,16 @@ void BitonicSort::Dispose()
|
|||||||
_preSortCS.Clear();
|
_preSortCS.Clear();
|
||||||
_innerSortCS = nullptr;
|
_innerSortCS = nullptr;
|
||||||
_outerSortCS = nullptr;
|
_outerSortCS = nullptr;
|
||||||
_copyIndicesCS = nullptr;
|
|
||||||
_shader = nullptr;
|
_shader = nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuffer* countBuffer, uint32 counterOffset, bool sortAscending, GPUBuffer* sortedIndicesBuffer, uint32 maxElements)
|
void BitonicSort::Sort(GPUContext* context, GPUBuffer* indicesBuffer, GPUBuffer* keysBuffer, GPUBuffer* countBuffer, uint32 counterOffset, bool sortAscending, int32 maxElements)
|
||||||
{
|
{
|
||||||
ASSERT(context && sortingKeysBuffer && countBuffer);
|
ASSERT(context && indicesBuffer && keysBuffer && countBuffer);
|
||||||
if (checkIfSkipPass())
|
if (checkIfSkipPass())
|
||||||
return;
|
return;
|
||||||
PROFILE_GPU_CPU("Bitonic Sort");
|
PROFILE_GPU_CPU("Bitonic Sort");
|
||||||
uint32 maxNumElements = sortingKeysBuffer->GetSize() / sizeof(uint64);
|
uint32 maxNumElements = indicesBuffer->GetElementsCount();
|
||||||
if (maxElements > 0 && maxElements < maxNumElements)
|
if (maxElements > 0 && maxElements < maxNumElements)
|
||||||
maxNumElements = maxElements;
|
maxNumElements = maxElements;
|
||||||
const uint32 alignedMaxNumElements = Math::RoundUpToPowerOf2(maxNumElements);
|
const uint32 alignedMaxNumElements = Math::RoundUpToPowerOf2(maxNumElements);
|
||||||
@@ -96,7 +92,7 @@ void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuf
|
|||||||
Data data;
|
Data data;
|
||||||
data.CounterOffset = counterOffset;
|
data.CounterOffset = counterOffset;
|
||||||
data.NullItemKey = sortAscending ? MAX_float : -MAX_float;
|
data.NullItemKey = sortAscending ? MAX_float : -MAX_float;
|
||||||
data.NullItemValue = 0;
|
data.NullItemIndex = 0;
|
||||||
data.KeySign = sortAscending ? -1.0f : 1.0f;
|
data.KeySign = sortAscending ? -1.0f : 1.0f;
|
||||||
data.MaxIterations = maxIterations;
|
data.MaxIterations = maxIterations;
|
||||||
data.LoopK = 0;
|
data.LoopK = 0;
|
||||||
@@ -110,7 +106,8 @@ void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuf
|
|||||||
{
|
{
|
||||||
// Use pre-sort with smaller thread group size (eg. for small particle emitters sorting)
|
// Use pre-sort with smaller thread group size (eg. for small particle emitters sorting)
|
||||||
const int32 permutation = maxNumElements < 128 ? 1 : 0;
|
const int32 permutation = maxNumElements < 128 ? 1 : 0;
|
||||||
context->BindUA(0, sortingKeysBuffer->View());
|
context->BindUA(0, indicesBuffer->View());
|
||||||
|
context->BindUA(1, keysBuffer->View());
|
||||||
context->Dispatch(_preSortCS.Get(permutation), 1, 1, 1);
|
context->Dispatch(_preSortCS.Get(permutation), 1, 1, 1);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@@ -120,7 +117,8 @@ void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuf
|
|||||||
context->Dispatch(_indirectArgsCS, 1, 1, 1);
|
context->Dispatch(_indirectArgsCS, 1, 1, 1);
|
||||||
|
|
||||||
// Pre-Sort the buffer up to k = 2048 (this also pads the list with invalid indices that will drift to the end of the sorted list)
|
// Pre-Sort the buffer up to k = 2048 (this also pads the list with invalid indices that will drift to the end of the sorted list)
|
||||||
context->BindUA(0, sortingKeysBuffer->View());
|
context->BindUA(0, indicesBuffer->View());
|
||||||
|
context->BindUA(1, keysBuffer->View());
|
||||||
context->DispatchIndirect(_preSortCS.Get(0), _dispatchArgsBuffer, 0);
|
context->DispatchIndirect(_preSortCS.Get(0), _dispatchArgsBuffer, 0);
|
||||||
|
|
||||||
// We have already pre-sorted up through k = 2048 when first writing our list, so we continue sorting with k = 4096
|
// We have already pre-sorted up through k = 2048 when first writing our list, so we continue sorting with k = 4096
|
||||||
@@ -144,27 +142,4 @@ void BitonicSort::Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuf
|
|||||||
}
|
}
|
||||||
|
|
||||||
context->ResetUA();
|
context->ResetUA();
|
||||||
|
|
||||||
if (sortedIndicesBuffer)
|
|
||||||
{
|
|
||||||
// Copy indices to another buffer
|
|
||||||
#if !BUILD_RELEASE
|
|
||||||
switch (sortedIndicesBuffer->GetDescription().Format)
|
|
||||||
{
|
|
||||||
case PixelFormat::R32_UInt:
|
|
||||||
case PixelFormat::R16_UInt:
|
|
||||||
case PixelFormat::R8_UInt:
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
LOG(Warning, "Invalid format {0} of sortedIndicesBuffer for BitonicSort. It needs to be UInt type.", (int32)sortedIndicesBuffer->GetDescription().Format);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
context->BindSR(1, sortingKeysBuffer->View());
|
|
||||||
context->BindUA(0, sortedIndicesBuffer->View());
|
|
||||||
// TODO: use indirect dispatch to match the items count for copy
|
|
||||||
context->Dispatch(_copyIndicesCS, (alignedMaxNumElements + 1023) / 1024, 1, 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
context->ResetUA();
|
|
||||||
context->ResetSR();
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -26,16 +26,16 @@ private:
|
|||||||
public:
|
public:
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Sorts the specified buffer of index-key pairs.
|
/// Sorts the specified buffers of index-key pairs.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="context">The GPU context.</param>
|
/// <param name="context">The GPU context.</param>
|
||||||
/// <param name="sortingKeysBuffer">The sorting keys buffer. Used as a structured buffer of type Item (see above).</param>
|
/// <param name="indicesBuffer">The sorting indices buffer with an index for each item (sequence of: 0, 1, 2, 3...). After sorting represents actual items order based on their keys. Valid for uint value types - used as RWBuffer.</param>
|
||||||
|
/// <param name="keysBuffer">The sorting keys buffer with a sort value for each item (must match order of items in indicesBuffer). Valid for float value types - used as RWBuffer.</param>
|
||||||
/// <param name="countBuffer">The buffer that contains the items counter value.</param>
|
/// <param name="countBuffer">The buffer that contains the items counter value.</param>
|
||||||
/// <param name="counterOffset">The offset into counter buffer to find count for this list. Must be a multiple of 4 bytes.</param>
|
/// <param name="counterOffset">The offset into counter buffer to find count for this list. Must be a multiple of 4 bytes.</param>
|
||||||
/// <param name="sortAscending">True to sort in ascending order (smallest to largest), otherwise false to sort in descending order.</param>
|
/// <param name="sortAscending">True to sort in ascending order (smallest to largest), otherwise false to sort in descending order.</param>
|
||||||
/// <param name="sortedIndicesBuffer">The output buffer for sorted values extracted from the sorted sortingKeysBuffer after algorithm run. Valid for uint value types - used as RWBuffer.</param>
|
|
||||||
/// <param name="maxElements">Optional upper limit of elements to sort. Can be used to optimize indirect dispatches allocation. If zero, then it gets calculated based on the input item buffer size.</param>
|
/// <param name="maxElements">Optional upper limit of elements to sort. Can be used to optimize indirect dispatches allocation. If zero, then it gets calculated based on the input item buffer size.</param>
|
||||||
void Sort(GPUContext* context, GPUBuffer* sortingKeysBuffer, GPUBuffer* countBuffer, uint32 counterOffset, bool sortAscending, GPUBuffer* sortedIndicesBuffer, uint32 maxElements = 0);
|
void Sort(GPUContext* context, GPUBuffer* indicesBuffer, GPUBuffer* keysBuffer, GPUBuffer* countBuffer, uint32 counterOffset, bool sortAscending, int32 maxElements = 0);
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
|
|||||||
@@ -10,12 +10,12 @@
|
|||||||
struct Item
|
struct Item
|
||||||
{
|
{
|
||||||
float Key;
|
float Key;
|
||||||
uint Value;
|
uint Index;
|
||||||
};
|
};
|
||||||
|
|
||||||
META_CB_BEGIN(0, Data)
|
META_CB_BEGIN(0, Data)
|
||||||
float NullItemKey;
|
float NullItemKey;
|
||||||
uint NullItemValue;
|
uint NullItemIndex;
|
||||||
uint CounterOffset;
|
uint CounterOffset;
|
||||||
uint MaxIterations;
|
uint MaxIterations;
|
||||||
uint LoopK;
|
uint LoopK;
|
||||||
@@ -40,12 +40,12 @@ uint InsertOneBit(uint value, uint oneBitMask)
|
|||||||
// (effectively a negation) or leave the value alone. When the KeySign is
|
// (effectively a negation) or leave the value alone. When the KeySign is
|
||||||
// 1, we are sorting descending, so when A < B, they should swap. For an
|
// 1, we are sorting descending, so when A < B, they should swap. For an
|
||||||
// ascending sort, -A < -B should swap.
|
// ascending sort, -A < -B should swap.
|
||||||
bool ShouldSwap(Item a, Item b)
|
bool ShouldSwap(float a, float b)
|
||||||
{
|
{
|
||||||
//return (a ^ NullItem) < (b ^ NullItem);
|
//return (a ^ NullItem) < (b ^ NullItem);
|
||||||
|
|
||||||
//return (a.Key) < (b.Key);
|
//return (a) < (b);
|
||||||
return (a.Key * KeySign) < (b.Key * KeySign);
|
return (a * KeySign) < (b * KeySign);
|
||||||
//return asfloat(a) < asfloat(b);
|
//return asfloat(a) < asfloat(b);
|
||||||
//return (asfloat(a) * KeySign) < (asfloat(b) * KeySign);
|
//return (asfloat(a) * KeySign) < (asfloat(b) * KeySign);
|
||||||
}
|
}
|
||||||
@@ -93,7 +93,8 @@ void CS_IndirectArgs(uint groupIndex : SV_GroupIndex)
|
|||||||
|
|
||||||
#if defined(_CS_PreSort) || defined(_CS_InnerSort)
|
#if defined(_CS_PreSort) || defined(_CS_InnerSort)
|
||||||
|
|
||||||
RWStructuredBuffer<Item> SortBuffer : register(u0);
|
RWBuffer<uint> SortedIndices : register(u0);
|
||||||
|
RWBuffer<float> SortingKeys : register(u1);
|
||||||
|
|
||||||
groupshared Item SortData[THREAD_GROUP_SIZE * 2];
|
groupshared Item SortData[THREAD_GROUP_SIZE * 2];
|
||||||
|
|
||||||
@@ -103,12 +104,13 @@ void LoadItem(uint element, uint count)
|
|||||||
Item item;
|
Item item;
|
||||||
if (element < count)
|
if (element < count)
|
||||||
{
|
{
|
||||||
item = SortBuffer[element];
|
item.Key = SortingKeys[element];
|
||||||
|
item.Index = SortedIndices[element];
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
item.Key = NullItemKey;
|
item.Key = NullItemKey;
|
||||||
item.Value = NullItemValue;
|
item.Index = NullItemIndex;
|
||||||
}
|
}
|
||||||
SortData[element & (THREAD_GROUP_SIZE * 2 - 1)] = item;
|
SortData[element & (THREAD_GROUP_SIZE * 2 - 1)] = item;
|
||||||
}
|
}
|
||||||
@@ -117,7 +119,9 @@ void StoreItem(uint element, uint count)
|
|||||||
{
|
{
|
||||||
if (element < count)
|
if (element < count)
|
||||||
{
|
{
|
||||||
SortBuffer[element] = SortData[element & 2047];
|
Item item = SortData[element & ((THREAD_GROUP_SIZE * 2 - 1))];
|
||||||
|
SortingKeys[element] = item.Key;
|
||||||
|
SortedIndices[element] = item.Index;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -153,7 +157,7 @@ void CS_PreSort(uint3 groupID : SV_GroupID, uint groupIndex : SV_GroupIndex)
|
|||||||
Item a = SortData[index1];
|
Item a = SortData[index1];
|
||||||
Item b = SortData[index2];
|
Item b = SortData[index2];
|
||||||
|
|
||||||
if (ShouldSwap(a, b))
|
if (ShouldSwap(a.Key, b.Key))
|
||||||
{
|
{
|
||||||
// Swap the items
|
// Swap the items
|
||||||
SortData[index1] = b;
|
SortData[index1] = b;
|
||||||
@@ -197,7 +201,7 @@ void CS_InnerSort(uint3 groupID : SV_GroupID, uint groupIndex : SV_GroupIndex)
|
|||||||
Item a = SortData[index1];
|
Item a = SortData[index1];
|
||||||
Item b = SortData[index2];
|
Item b = SortData[index2];
|
||||||
|
|
||||||
if (ShouldSwap(a, b))
|
if (ShouldSwap(a.Key, b.Key))
|
||||||
{
|
{
|
||||||
// Swap the items
|
// Swap the items
|
||||||
SortData[index1] = b;
|
SortData[index1] = b;
|
||||||
@@ -215,7 +219,8 @@ void CS_InnerSort(uint3 groupID : SV_GroupID, uint groupIndex : SV_GroupIndex)
|
|||||||
|
|
||||||
#ifdef _CS_OuterSort
|
#ifdef _CS_OuterSort
|
||||||
|
|
||||||
RWStructuredBuffer<Item> SortBuffer : register(u0);
|
RWBuffer<uint> SortedIndices : register(u0);
|
||||||
|
RWBuffer<float> SortingKeys : register(u1);
|
||||||
|
|
||||||
META_CS(true, FEATURE_LEVEL_SM5)
|
META_CS(true, FEATURE_LEVEL_SM5)
|
||||||
[numthreads(1024, 1, 1)]
|
[numthreads(1024, 1, 1)]
|
||||||
@@ -230,35 +235,19 @@ void CS_OuterSort(uint3 dispatchThreadId : SV_DispatchThreadID)
|
|||||||
if (index2 >= count)
|
if (index2 >= count)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
Item a = SortBuffer[index1];
|
float aKey = SortingKeys[index1];
|
||||||
Item b = SortBuffer[index2];
|
float bKey = SortingKeys[index2];
|
||||||
|
|
||||||
if (ShouldSwap(a, b))
|
if (ShouldSwap(aKey, bKey))
|
||||||
{
|
{
|
||||||
// Swap the items
|
// Swap the items
|
||||||
SortBuffer[index1] = b;
|
SortingKeys[index1] = bKey;
|
||||||
SortBuffer[index2] = a;
|
SortingKeys[index2] = aKey;
|
||||||
|
uint aIndex = SortedIndices[index1];
|
||||||
|
uint bIndex = SortedIndices[index2];
|
||||||
|
SortedIndices[index1] = bIndex;
|
||||||
|
SortedIndices[index2] = aIndex;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef _CS_CopyIndices
|
|
||||||
|
|
||||||
StructuredBuffer<Item> SortBuffer : register(t1);
|
|
||||||
RWBuffer<uint> SortedIndices : register(u0);
|
|
||||||
|
|
||||||
META_CS(true, FEATURE_LEVEL_SM5)
|
|
||||||
[numthreads(1024, 1, 1)]
|
|
||||||
void CS_CopyIndices(uint3 dispatchThreadId : SV_DispatchThreadID)
|
|
||||||
{
|
|
||||||
const uint count = CounterBuffer.Load(CounterOffset);
|
|
||||||
uint index = dispatchThreadId.x;
|
|
||||||
if (index >= count)
|
|
||||||
return;
|
|
||||||
|
|
||||||
Item element = SortBuffer[index];
|
|
||||||
SortedIndices[index] = element.Value;
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|||||||
@@ -20,13 +20,9 @@ META_CB_END
|
|||||||
// Particles data buffer
|
// Particles data buffer
|
||||||
ByteAddressBuffer ParticlesData : register(t0);
|
ByteAddressBuffer ParticlesData : register(t0);
|
||||||
|
|
||||||
// Output sorting keys buffer (index + key)
|
// Sorting data (per-particle)
|
||||||
struct Item
|
RWBuffer<uint> SortedIndices : register(u0);
|
||||||
{
|
RWBuffer<float> SortingKeys : register(u1);
|
||||||
float Key;
|
|
||||||
uint Value;
|
|
||||||
};
|
|
||||||
RWStructuredBuffer<Item> SortingKeys : register(u0);
|
|
||||||
|
|
||||||
float GetParticleFloat(uint particleIndex, int offset)
|
float GetParticleFloat(uint particleIndex, int offset)
|
||||||
{
|
{
|
||||||
@@ -78,8 +74,6 @@ void CS_Sort(uint3 dispatchThreadId : SV_DispatchThreadID)
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Write sorting index-key pair
|
// Write sorting index-key pair
|
||||||
Item item;
|
SortedIndices[index] = index;
|
||||||
item.Key = sortKey;
|
SortingKeys[index] = sortKey;
|
||||||
item.Value = index;
|
|
||||||
SortingKeys[index] = item;
|
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user