FlaxEngine/Source/Engine/Particles/Particles.cpp

// Copyright (c) Wojciech Figat. All rights reserved.

#include "Particles.h"
#include "ParticleEffect.h"
#include "Engine/Content/Assets/Model.h"
#include "Engine/Core/Collections/Sorting.h"
#include "Engine/Engine/EngineService.h"
#include "Engine/Engine/Time.h"
#include "Engine/Engine/Engine.h"
#include "Engine/Graphics/GPUBuffer.h"
#include "Engine/Graphics/GPUPipelineStatePermutations.h"
#include "Engine/Graphics/RenderTask.h"
#include "Engine/Graphics/DynamicBuffer.h"
#include "Engine/Graphics/GPUContext.h"
#include "Engine/Graphics/GPUPass.h"
#include "Engine/Graphics/RenderTools.h"
#include "Engine/Graphics/Shaders/GPUVertexLayout.h"
#include "Engine/Profiler/ProfilerCPU.h"
#include "Engine/Profiler/ProfilerMemory.h"
#include "Engine/Renderer/DrawCall.h"
#include "Engine/Renderer/RenderList.h"
#include "Engine/Threading/TaskGraph.h"
#if COMPILE_WITH_GPU_PARTICLES
#include "Engine/Threading/Threading.h"
#include "Engine/Content/Assets/Shader.h"
#include "Engine/Profiler/ProfilerGPU.h"
#include "Engine/Renderer/Utils/BitonicSort.h"
#endif
#if USE_EDITOR
#include "Editor/Editor.h"
#endif

// Single vertex of the shared sprite quad created in SpriteParticleRenderer::Init.
// The four floats are consumed there as a Position element (R32G32_Float: X, Y)
// followed by a TexCoord element (R32G32_Float: U, V), so the field order must not change.
PACK_STRUCT(struct SpriteParticleVertex
    {
    // Quad corner position (the quad spans -0.5..+0.5 on both axes)
    float X;
    float Y;
    // Texture coordinates of the corner (0..1)
    float U;
    float V;
    });

class SpriteParticleRenderer
{
public:
    volatile int64 Ready = 0;
    GPUBuffer* VB = nullptr;
    GPUBuffer* IB = nullptr;
    const static int32 VertexCount = 4;
    const static int32 IndexCount = 6;

public:
    bool Init()
    {
        if (Platform::AtomicRead(&Ready))
            return false;
        ScopeLock lock(RenderContext::GPULocker);
        if (Platform::AtomicRead(&Ready))
            return false;
        VB = GPUDevice::Instance->CreateBuffer(TEXT("SpriteParticleRenderer.VB"));
        IB = GPUDevice::Instance->CreateBuffer(TEXT("SpriteParticleRenderer.IB"));
        SpriteParticleVertex vertexBuffer[] =
        {
            { -0.5f, -0.5f, 0.0f, 0.0f },
            { +0.5f, -0.5f, 1.0f, 0.0f },
            { +0.5f, +0.5f, 1.0f, 1.0f },
            { -0.5f, +0.5f, 0.0f, 1.0f },
        };
        uint16 indexBuffer[] = { 0, 1, 2, 0, 2, 3, };
        auto layout = GPUVertexLayout::Get({
            { VertexElement::Types::Position, 0, 0, 0, PixelFormat::R32G32_Float },
            { VertexElement::Types::TexCoord, 0, 0, 0, PixelFormat::R32G32_Float },
        });
        bool result = VB->Init(GPUBufferDescription::Vertex(layout, sizeof(SpriteParticleVertex), VertexCount, vertexBuffer)) ||
                      IB->Init(GPUBufferDescription::Index(sizeof(uint16), IndexCount, indexBuffer));
        Platform::AtomicStore(&Ready, 1);
        return result;
    }

    void Dispose()
    {
        SAFE_DELETE_GPU_RESOURCE(VB);
        SAFE_DELETE_GPU_RESOURCE(IB);
    }

    void SetupDrawCall(DrawCall& drawCall) const
    {
        drawCall.Geometry.IndexBuffer = IB;
        drawCall.Geometry.VertexBuffers[0] = VB;
        drawCall.Geometry.VertexBuffers[1] = nullptr;
        drawCall.Geometry.VertexBuffers[2] = nullptr;
        drawCall.Geometry.VertexBuffersOffsets[0] = 0;
        drawCall.Geometry.VertexBuffersOffsets[1] = 0;
        drawCall.Geometry.VertexBuffersOffsets[2] = 0;
        drawCall.Draw.StartIndex = 0;
        drawCall.Draw.IndicesCount = IndexCount;
    }
};

// Vertex of the ribbon geometry generated on CPU in DrawEmitterCPU (two identical vertices are written per ribbon point).
// Field order has to match the vertex layout returned by GetLayout.
PACK_STRUCT(struct RibbonParticleVertex {
    // Position of the particle within the ribbon order (loop counter used when the vertex was written)
    uint32 Order;
    // Index of the particle this vertex belongs to
    uint32 ParticleIndex;
    // Index of the previously emitted ribbon particle
    uint32 PrevParticleIndex;
    // Distance accumulated along the ribbon up to this particle
    float Distance;
    // TODO: pack into half/uint16 data

    // Gets the GPU vertex layout of this structure: the fields are exposed as TexCoord0..TexCoord3
    // (three R32_UInt elements followed by one R32_Float element).
    static GPUVertexLayout* GetLayout()
    {
        return GPUVertexLayout::Get({
            { VertexElement::Types::TexCoord0, 0, 0, 0, PixelFormat::R32_UInt },
            { VertexElement::Types::TexCoord1, 0, 0, 0, PixelFormat::R32_UInt },
            { VertexElement::Types::TexCoord2, 0, 0, 0, PixelFormat::R32_UInt },
            { VertexElement::Types::TexCoord3, 0, 0, 0, PixelFormat::R32_Float },
        });
    }
    });

// Entry of the particle buffers pool (see ParticleManagerImpl::Pool which keeps an array of these per emitter).
struct EmitterCache
{
    // Presumably the time when the buffer was last used (likely compared against Particles::ParticleBufferRecycleTimeout) - TODO confirm
    double LastTimeUsed;
    // The cached particle buffer
    ParticleBuffer* Buffer;
};

// Internal state of the particles manager (pulled into the file scope with the using-directive below).
namespace ParticleManagerImpl
{
    // Presumably guards access to Pool - TODO confirm against the pool usage
    CriticalSection PoolLocker;
    // Cached particle buffers grouped by the emitter they were created for
    Dictionary<ParticleEmitter*, Array<EmitterCache>> Pool;
    // Effects queued via Particles::UpdateEffect (entries are removed in Particles::OnEffectDestroy)
    Array<ParticleEffect*> UpdateList;
#if COMPILE_WITH_GPU_PARTICLES
    // Presumably guards access to GpuUpdateList - TODO confirm
    CriticalSection GpuUpdateListLocker;
    // Effects queued for the GPU particles update (entries are removed in Particles::OnEffectDestroy)
    Array<ParticleEffect*> GpuUpdateList;
    // Render task related to the GPU particles update (not used in the visible part of this file)
    RenderTask* GpuRenderTask = nullptr;
#endif
}

// Make the manager internals accessible without the namespace qualification
using namespace ParticleManagerImpl;

// Definitions of the Particles static members
TaskGraphSystem* Particles::System = nullptr;
ReadWriteLock Particles::SystemLocker;
// Buffer pooling is enabled by default
bool Particles::EnableParticleBufferPooling = true;
// Default recycle timeout (presumably in seconds - TODO confirm against the declaration in Particles.h)
float Particles::ParticleBufferRecycleTimeout = 10.0f;

// Shared quad geometry used by all sprite rendering modules
SpriteParticleRenderer SpriteRenderer;

// Engine service of the particles manager; Init and Dispose are implemented outside of this declaration.
class ParticleManagerService : public EngineService
{
public:
    // Registers the service under the "Particle Manager" name
    // (65 is the second EngineService constructor argument, presumably the service order - TODO confirm)
    ParticleManagerService()
        : EngineService(TEXT("Particle Manager"), 65)
    {
    }

    bool Init() override;
    void Dispose() override;
};

// Task graph system of the particles; the methods are implemented outside of this declaration.
class ParticlesSystem : public TaskGraphSystem
{
public:
    // Timing values shared with the jobs (presumably captured once per update in Execute - TODO confirm)
    float DeltaTime, UnscaledDeltaTime, Time, UnscaledTime;
    // Activity flag of the system (exact meaning is defined by the implementation - TODO confirm)
    bool Active;

    // Job entry point called with the index of the work item (presumably an index into UpdateList - TODO confirm)
    void Job(int32 index);
    void Execute(TaskGraph* graph) override;
    void PostExecute(TaskGraph* graph) override;
};

// Global instance of the particles manager service (constructed during static initialization)
ParticleManagerService ParticleManagerServiceInstance;

// Queues the given particle effect by appending it to UpdateList.
// No locking and no duplicates check is performed here.
void Particles::UpdateEffect(ParticleEffect* effect)
{
    // Tag the memory allocated by the list growth as particles memory
    PROFILE_MEM(Particles);
    UpdateList.Add(effect);
}

// Removes the destroyed effect from the pending update lists so it is not referenced afterwards.
void Particles::OnEffectDestroy(ParticleEffect* effect)
{
    UpdateList.Remove(effect);
#if COMPILE_WITH_GPU_PARTICLES
    // NOTE(review): GpuUpdateListLocker is not taken here - confirm the callers guarantee exclusive access
    GpuUpdateList.Remove(effect);
#endif
}

// Checks whether particles of the given buffer need to be sorted for the main view of the batch.
// Sorting is used only if the emitter graph contains any sort modules, the emitter gets drawn in the Forward pass
// of the main view and its bounds intersect the main view culling frustum (or the view has culling disabled).
bool EmitterUseSorting(RenderContextBatch& renderContextBatch, ParticleBuffer* buffer, DrawPass drawModes, const BoundingSphere& bounds)
{
    const RenderView& view = renderContextBatch.GetMainContext().View;

    // Nothing to sort without any sort modules
    if (!buffer->Emitter->Graph.SortModules.HasItems())
        return false;

    // Only the passes rendered by the main view matter
    const DrawPass activeModes = drawModes & view.Pass;
    if (!EnumHasAnyFlags(activeModes, DrawPass::Forward))
        return false;

    // Skip sorting of the emitters that are not visible in the main view
    if (view.IsCullingDisabled)
        return true;
    return view.CullingFrustum.Intersects(bounds);
}

// Draws a CPU-simulated emitter: optionally sorts the particles, uploads the CPU particles data to the GPU buffer,
// builds the ribbon geometry and submits draw calls for every render module enabled in renderModulesIndices.
// - renderContextBatch: rendering contexts; the main context provides the view and the render list (also used for temporary allocations).
// - buffer: particles data (CPU-side storage plus the GPU buffers that get updated here).
// - drawCall: draw call template; its geometry, material and particle fields are overwritten per module.
// - drawModes: draw passes allowed for this emitter (masked further by each module and its material).
// - staticFlags, bounds, sortOrder: forwarded to the render list when adding the draw calls.
// - renderModulesIndices: bit mask where bit N enables emitter->Graph.RenderModules[N].
void DrawEmitterCPU(RenderContextBatch& renderContextBatch, ParticleBuffer* buffer, DrawCall& drawCall, DrawPass drawModes, StaticFlags staticFlags, const BoundingSphere& bounds, uint32 renderModulesIndices, int8 sortOrder)
{
    // Skip if CPU buffer is empty
    if (buffer->CPU.Count == 0)
        return;
    const auto context = GPUDevice::Instance->GetMainContext();
    auto emitter = buffer->Emitter;

    // Check if need to perform any particles sorting
    // (CPU.Count is always non-zero here because of the early-out above)
    if (EmitterUseSorting(renderContextBatch, buffer, drawModes, bounds) && (buffer->CPU.Count != 0 || buffer->GPU.SortedIndices))
    {
        // Prepare sorting data
        if (!buffer->GPU.SortedIndices)
            buffer->AllocateSortBuffer();

        // Execute all sorting modules
        for (int32 moduleIndex = 0; moduleIndex < emitter->Graph.SortModules.Count(); moduleIndex++)
        {
            auto module = emitter->Graph.SortModules[moduleIndex];
            const int32 sortedIndicesOffset = module->SortedIndicesOffset;
            const auto sortMode = static_cast<ParticleSortMode>(module->Values[2].AsInt);
            const int32 stride = buffer->Stride;
            const int32 listSize = buffer->CPU.Count;
            const int32 indicesByteSize = listSize * buffer->GPU.SortedIndices->GetStride();
            // Temporary storage taken from the main render list: two keys arrays and two indices arrays
            // (the second one of each pair is passed to the radix sort below)
            RenderListAlloc sortingAllocs[4];
            auto* renderList = renderContextBatch.GetMainContext().List;
            uint32* sortingKeys[2] = { sortingAllocs[0].Init<uint32>(renderList, listSize), sortingAllocs[1].Init<uint32>(renderList, listSize) };
            void* sortingIndices[2] = { sortingAllocs[2].Init(renderList, indicesByteSize, GPU_SHADER_DATA_ALIGNMENT), sortingAllocs[3].Init(renderList, indicesByteSize, GPU_SHADER_DATA_ALIGNMENT) };
            uint32* sortedKeys = sortingKeys[0];
            // XOR with all bits set inverts the key bits, which reverses the resulting order; only CustomAscending keeps the keys as-is
            const uint32 sortKeyXor = sortMode != ParticleSortMode::CustomAscending ? MAX_uint32 : 0;
            switch (sortMode)
            {
            case ParticleSortMode::ViewDepth:
            {
                // Key is based on the W component of the particle position projected with the main view
                const int32 positionOffset = emitter->Graph.GetPositionAttributeOffset();
                // NOTE(review): this leaves the sort keys unwritten while the sort and the upload below still run - confirm this is intended
                if (positionOffset == -1)
                    break;
                const Matrix viewProjection = renderContextBatch.GetMainContext().View.ViewProjection();
                const byte* positionPtr = buffer->CPU.Buffer.Get() + positionOffset;
                if (emitter->SimulationSpace == ParticlesSimulationSpace::Local)
                {
                    // Local-space particles are moved to the world space first
                    for (int32 i = 0; i < buffer->CPU.Count; i++)
                    {
                        // TODO: use SIMD
                        sortedKeys[i] = RenderTools::ComputeDistanceSortKey(Matrix::TransformPosition(viewProjection, Matrix::TransformPosition(drawCall.World, *(const Float3*)positionPtr)).W) ^ sortKeyXor;
                        positionPtr += stride;
                    }
                }
                else
                {
                    for (int32 i = 0; i < buffer->CPU.Count; i++)
                    {
                        sortedKeys[i] = RenderTools::ComputeDistanceSortKey(Matrix::TransformPosition(viewProjection, *(const Float3*)positionPtr).W) ^ sortKeyXor;
                        positionPtr += stride;
                    }
                }
                break;
            }
            case ParticleSortMode::ViewDistance:
            {
                // Key is based on the squared distance between the particle and the main view position
                const int32 positionOffset = emitter->Graph.GetPositionAttributeOffset();
                if (positionOffset == -1)
                    break;
                const Float3 viewPosition = renderContextBatch.GetMainContext().View.Position;
                const byte* positionPtr = buffer->CPU.Buffer.Get() + positionOffset;
                if (emitter->SimulationSpace == ParticlesSimulationSpace::Local)
                {
                    for (int32 i = 0; i < buffer->CPU.Count; i++)
                    {
                        // TODO: use SIMD
                        sortedKeys[i] = RenderTools::ComputeDistanceSortKey((viewPosition - Float3::Transform(*(const Float3*)positionPtr, drawCall.World)).LengthSquared()) ^ sortKeyXor;
                        positionPtr += stride;
                    }
                }
                else
                {
                    for (int32 i = 0; i < buffer->CPU.Count; i++)
                    {
                        // TODO: use SIMD
                        sortedKeys[i] = RenderTools::ComputeDistanceSortKey((viewPosition - *(const Float3*)positionPtr).LengthSquared()) ^ sortKeyXor;
                        positionPtr += stride;
                    }
                }
                break;
            }
            case ParticleSortMode::CustomAscending:
            case ParticleSortMode::CustomDescending:
            {
                // Key is based on a float particle attribute selected by the sort module
                const int32 attributeIdx = module->Attributes[0];
                if (attributeIdx == -1)
                    break;
                const int32 attributeOffset = emitter->Graph.Layout.Attributes[attributeIdx].Offset;
                if (attributeOffset == -1)
                    break;
                const byte* attributePtr = buffer->CPU.Buffer.Get() + attributeOffset;
                for (int32 i = 0; i < buffer->CPU.Count; i++)
                {
                    sortedKeys[i] = RenderTools::ComputeDistanceSortKey(*(const float*)attributePtr) ^ sortKeyXor;
                    attributePtr += stride;
                }
                break;
            }
#if !BUILD_RELEASE
            default:
                CRASH;
#endif
            }

            // Generate sorting indices
            // (identity order 0..listSize-1 stored in the element format of the sorted indices buffer)
            void* sortedIndices = sortingIndices[0];
            switch (buffer->GPU.SortedIndices->GetFormat())
            {
            case PixelFormat::R16_UInt:
                for (int32 i = 0; i < listSize; i++)
                    ((uint16*)sortedIndices)[i] = (uint16)i;
                break;
            case PixelFormat::R32_UInt:
                for (int32 i = 0; i < listSize; i++)
                    ((uint32*)sortedIndices)[i] = i;
                break;
            }

            // Sort keys with indices
            // (the result is re-read from the typed pointer afterwards; presumably the sort may change which of the two arrays holds the output - TODO confirm)
            switch (buffer->GPU.SortedIndices->GetFormat())
            {
            case PixelFormat::R16_UInt:
            {
                uint16* sortedIndicesTyped = (uint16*)sortedIndices;
                Sorting::RadixSort(sortedKeys, sortedIndicesTyped, sortingKeys[1], (uint16*)sortingIndices[1], listSize);
                sortedIndices = sortedIndicesTyped;
                break;
            }
            case PixelFormat::R32_UInt:
            {
                uint32* sortedIndicesTyped = (uint32*)sortedIndices;
                Sorting::RadixSort(sortedKeys, sortedIndicesTyped, sortingKeys[1], (uint32*)sortingIndices[1], listSize);
                sortedIndices = sortedIndicesTyped;
                break;
            }
            }

            // Upload CPU particles indices
            // (each sort module writes to its own region of the shared buffer, starting at sortedIndicesOffset)
            {
                RenderContext::GPULocker.Lock();
                context->UpdateBuffer(buffer->GPU.SortedIndices, sortedIndices, indicesByteSize, sortedIndicesOffset);
                RenderContext::GPULocker.Unlock();
            }
        }
    }

    // Upload CPU particles data to GPU
    {
        RenderContext::GPULocker.Lock();
        context->UpdateBuffer(buffer->GPU.Buffer, buffer->CPU.Buffer.Get(), buffer->CPU.Count * buffer->Stride);
        RenderContext::GPULocker.Unlock();
    }

    // Check if need to setup ribbon modules
    // (per-ribbon results are stored in slots indexed by ribbonModuleIndex and read back by the drawing loop below)
    int32 ribbonModuleIndex = 0;
    int32 ribbonModulesDrawIndicesPos = 0;
    int32 ribbonModulesDrawIndicesStart[PARTICLE_EMITTER_MAX_RIBBONS] = {};
    int32 ribbonModulesDrawIndicesCount[PARTICLE_EMITTER_MAX_RIBBONS] = {};
    int32 ribbonModulesSegmentCount[PARTICLE_EMITTER_MAX_RIBBONS] = {};
    if (emitter->Graph.RibbonRenderingModules.HasItems())
    {
        // Prepare ribbon data
        // (dynamic buffers are created on first use and cleared on every next draw)
        if (!buffer->GPU.RibbonIndexBufferDynamic)
            buffer->GPU.RibbonIndexBufferDynamic = New<DynamicIndexBuffer>(0, (uint32)sizeof(uint16), TEXT("RibbonIndexBufferDynamic"));
        else
            buffer->GPU.RibbonIndexBufferDynamic->Clear();
        if (!buffer->GPU.RibbonVertexBufferDynamic)
            buffer->GPU.RibbonVertexBufferDynamic = New<DynamicVertexBuffer>(0, (uint32)sizeof(RibbonParticleVertex), TEXT("RibbonVertexBufferDynamic"), RibbonParticleVertex::GetLayout());
        else
            buffer->GPU.RibbonVertexBufferDynamic->Clear();
        // Both data arrays are used as raw bytes below: sizes and offsets are expressed in bytes
        auto& indexBuffer = buffer->GPU.RibbonIndexBufferDynamic->Data;
        auto& vertexBuffer = buffer->GPU.RibbonVertexBufferDynamic->Data;

        // Setup all ribbon modules
        for (int32 moduleIndex = 0; moduleIndex < emitter->Graph.RenderModules.Count(); moduleIndex++)
        {
            if ((renderModulesIndices & (1u << moduleIndex)) == 0)
                continue;
            auto module = emitter->Graph.RenderModules[moduleIndex];
            // 404 is the Ribbon Rendering module type; ribbons above the supported limit are ignored
            if (module->TypeID != 404 || ribbonModuleIndex >= PARTICLE_EMITTER_MAX_RIBBONS)
                continue;
            ribbonModulesDrawIndicesStart[ribbonModuleIndex] = ribbonModulesDrawIndicesPos;
            ribbonModulesDrawIndicesCount[ribbonModuleIndex] = 0;

            // Prepare particles buffer access
            auto positionOffset = emitter->Graph.GetPositionAttributeOffset();
            // Stops processing of all the remaining ribbon modules (not only this one)
            if (positionOffset == -1 || buffer->CPU.Count < 2 || buffer->CPU.RibbonOrder.IsEmpty())
                break;
            uint32 count = buffer->CPU.Count;
            ASSERT(buffer->CPU.RibbonOrder.Count() == emitter->Graph.RibbonRenderingModules.Count() * buffer->Capacity);
            int32* ribbonOrderData = buffer->CPU.RibbonOrder.Get() + module->RibbonOrderOffset;
            ParticleBufferCPUDataAccessor<Float3> positionData(buffer, emitter->Graph.Layout.GetAttributeOffset(module->Attributes[0]));

            // Write ribbon indices/vertices
            int32 indices = 0, segmentCount = 0;
            float totalDistance = 0.0f;
            int32 firstVertexIndex = vertexBuffer.Count();
            // NOTE(review): vertexPrev restarts at 0 for every ribbon module while the vertices are appended to the shared
            // vertex buffer at firstVertexIndex - verify the index values written for the 2nd and later ribbon modules
            uint32 idxPrev = ribbonOrderData[0], vertexPrev = 0;
            {
                uint32 idxThis = ribbonOrderData[0];

                // 2 vertices
                {
                    vertexBuffer.AddUninitialized(2 * sizeof(RibbonParticleVertex));
                    auto ptr = (RibbonParticleVertex*)(vertexBuffer.Get() + firstVertexIndex);

                    RibbonParticleVertex v = { 0, idxThis, idxThis, totalDistance };

                    *ptr++ = v;
                    *ptr++ = v;
                }

                idxPrev = idxThis;
            }
            for (uint32 i = 1; i < count; i++)
            {
                uint32 idxThis = ribbonOrderData[i];
                Float3 direction = positionData[idxThis] - positionData[idxPrev];
                const float distance = direction.Length();
                // Skip particles located (almost) at the same place as the last emitted ribbon point
                if (distance > 0.002f)
                {
                    totalDistance += distance;

                    // 2 vertices
                    {
                        auto idx = vertexBuffer.Count();
                        vertexBuffer.AddUninitialized(2 * sizeof(RibbonParticleVertex));
                        auto ptr = (RibbonParticleVertex*)(vertexBuffer.Get() + idx);

                        // TODO: this could be optimized by manually fetching per-particle data in vertex shader (2x less data to send and fetch)
                        RibbonParticleVertex v = { i, idxThis, idxPrev, totalDistance };

                        *ptr++ = v;
                        *ptr++ = v;
                    }

                    // 2 triangles
                    // (quad between the vertex pair of the previous ribbon point and the pair added above; values are narrowed to uint16 on store)
                    {
                        auto idx = indexBuffer.Count();
                        indexBuffer.AddUninitialized(6 * sizeof(uint16));
                        auto ptr = (uint16*)(indexBuffer.Get() + idx);

                        uint32 i0 = vertexPrev;
                        uint32 i1 = vertexPrev + 2;

                        *ptr++ = i0;
                        *ptr++ = i0 + 1;
                        *ptr++ = i1;

                        *ptr++ = i0 + 1;
                        *ptr++ = i1 + 1;
                        *ptr++ = i1;

                        indices += 6;
                    }

                    idxPrev = idxThis;
                    segmentCount++;
                    vertexPrev += 2;
                }
            }
            // NOTE(review): skipping here does not advance ribbonModuleIndex (the slot gets reused by the next ribbon module) and keeps
            // the 2 vertices appended above; the drawing loop below also skips a zero-count slot without advancing -
            // confirm both loops stay in sync when only some of the ribbon modules have no segments
            if (segmentCount == 0)
                continue;
            {
                // Fix first particle vertex data to have proper direction
                auto ptr0 = (RibbonParticleVertex*)(vertexBuffer.Get() + firstVertexIndex);
                auto ptr1 = ptr0 + 1;
                auto ptr2 = ptr1 + 1;
                ptr0->PrevParticleIndex = ptr1->PrevParticleIndex = ptr2->ParticleIndex;
            }

            // Setup ribbon data
            ribbonModulesSegmentCount[ribbonModuleIndex] = segmentCount;
            ribbonModulesDrawIndicesCount[ribbonModuleIndex] = indices;
            ribbonModulesDrawIndicesPos += indices;

            ribbonModuleIndex++;
        }

        if (ribbonModuleIndex != 0)
        {
            // Upload data to the GPU buffer
            RenderContext::GPULocker.Lock();
            buffer->GPU.RibbonIndexBufferDynamic->Flush(context);
            buffer->GPU.RibbonVertexBufferDynamic->Flush(context);
            RenderContext::GPULocker.Unlock();
        }
    }

    // Execute all rendering modules
    // (ribbonModuleIndex is restarted to walk the ribbon slots filled above in the same order)
    ribbonModuleIndex = 0;
    for (int32 moduleIndex = 0; moduleIndex < emitter->Graph.RenderModules.Count(); moduleIndex++)
    {
        if ((renderModulesIndices & (1u << moduleIndex)) == 0)
            continue;
        auto module = emitter->Graph.RenderModules[moduleIndex];
        drawCall.Particle.Module = module;

        switch (module->TypeID)
        {
        // Sprite Rendering
        case 400:
        {
            const auto material = (MaterialBase*)module->Assets[0].Get();
            // Modules without the draw modes value stored in the graph use the default passes
            const auto moduleDrawModes = module->Values.Count() > 3 ? (DrawPass)module->Values[3].AsInt : DrawPass::Default;
            auto dp = drawModes & moduleDrawModes & material->GetDrawModes();
            // Skip if there is no pass to draw in or the shared sprite quad is not available (Init returns true in that case)
            if (dp == DrawPass::None || SpriteRenderer.Init())
                break;
            drawCall.Material = material;

            // Submit draw call
            // (one quad instance per particle)
            SpriteRenderer.SetupDrawCall(drawCall);
            drawCall.InstanceCount = buffer->CPU.Count;
            renderContextBatch.GetMainContext().List->AddDrawCall(renderContextBatch, dp, staticFlags, ShadowsCastingMode::DynamicOnly, bounds, drawCall, false, sortOrder);

            break;
        }
        // Model Rendering
        case 403:
        {
            const auto model = (Model*)module->Assets[0].Get();
            const auto material = (MaterialBase*)module->Assets[1].Get();
            const auto moduleDrawModes = module->Values.Count() > 4 ? (DrawPass)module->Values[4].AsInt : DrawPass::Default;
            auto dp = drawModes & moduleDrawModes & material->GetDrawModes();
            if (dp == DrawPass::None)
                break;
            drawCall.Material = material;

            // TODO: model LOD picking for particles?
            int32 lodIndex = 0;
            ModelLOD& lod = model->LODs[lodIndex];
            for (int32 meshIndex = 0; meshIndex < lod.Meshes.Count(); meshIndex++)
            {
                Mesh& mesh = lod.Meshes[meshIndex];
                if (!mesh.IsInitialized())
                    continue;
                // TODO: include mesh entry transformation, visibility and shadows mode?

                // Submit draw call
                // (one mesh instance per particle)
                mesh.GetDrawCallGeometry(drawCall);
                drawCall.InstanceCount = buffer->CPU.Count;
                renderContextBatch.GetMainContext().List->AddDrawCall(renderContextBatch, dp, staticFlags, ShadowsCastingMode::DynamicOnly, bounds, drawCall, false, sortOrder);
            }

            break;
        }
        // Ribbon Rendering
        case 404:
        {
            // Nothing was generated for this ribbon slot (the slot index is not advanced in this case)
            if (ribbonModulesDrawIndicesCount[ribbonModuleIndex] == 0)
                break;
            const auto material = (MaterialBase*)module->Assets[0].Get();
            const auto moduleDrawModes = module->Values.Count() > 6 ? (DrawPass)module->Values[6].AsInt : DrawPass::Default;
            auto dp = drawModes & moduleDrawModes & material->GetDrawModes();
            if (dp == DrawPass::None)
                break;
            drawCall.Material = material;

            // Node properties
            float uvTilingDistance = module->Values[3].AsFloat;
            Float2 uvScale = module->Values[4].AsFloat2();
            Float2 uvOffset = module->Values[5].AsFloat2();

            ParticleBufferCPUDataAccessor<float> sortKeyData(buffer, emitter->Graph.Layout.GetAttributeOffset(module->Attributes[1]));
            int32* ribbonOrderData = buffer->CPU.RibbonOrder.Get() + module->RibbonOrderOffset;
            int32 count = buffer->CPU.Count;

            // Setup ribbon data
            auto& ribbon = drawCall.Particle.Ribbon;
            ribbon.UVTilingDistance = uvTilingDistance;
            ribbon.SegmentCount = ribbonModulesSegmentCount[ribbonModuleIndex];
            ribbon.UVScaleX = uvScale.X;
            ribbon.UVScaleY = uvScale.Y;
            ribbon.UVOffsetX = uvOffset.X;
            ribbon.UVOffsetY = uvOffset.Y;
            // Without the distance-based tiling, remap U with the sort key range between the first and the last particle in the ribbon order
            if (ribbon.SegmentCount != 0 && Math::IsZero(uvTilingDistance) && sortKeyData.IsValid())
            {
                float firstSortValue = sortKeyData[ribbonOrderData[0]];
                float lastSortValue = sortKeyData[ribbonOrderData[count - 1]];

                float sortUScale = lastSortValue - firstSortValue;
                float sortUOffset = firstSortValue;

                ribbon.UVScaleX *= sortUScale;
                ribbon.UVOffsetX += sortUOffset * uvScale.X;
            }

            // TODO: invert particles rendering order if camera is closer to the ribbon end than start

            // Submit draw call
            // (single instance that draws the indices range generated for this ribbon slot)
            drawCall.Geometry.IndexBuffer = buffer->GPU.RibbonIndexBufferDynamic->GetBuffer();
            drawCall.Geometry.VertexBuffers[0] = buffer->GPU.RibbonVertexBufferDynamic->GetBuffer();
            drawCall.Geometry.VertexBuffers[1] = nullptr;
            drawCall.Geometry.VertexBuffers[2] = nullptr;
            drawCall.Geometry.VertexBuffersOffsets[0] = 0;
            drawCall.Geometry.VertexBuffersOffsets[1] = 0;
            drawCall.Geometry.VertexBuffersOffsets[2] = 0;
            drawCall.Draw.StartIndex = ribbonModulesDrawIndicesStart[ribbonModuleIndex];
            drawCall.Draw.IndicesCount = ribbonModulesDrawIndicesCount[ribbonModuleIndex];
            drawCall.InstanceCount = 1;
            renderContextBatch.GetMainContext().List->AddDrawCall(renderContextBatch, dp, staticFlags, ShadowsCastingMode::DynamicOnly, bounds, drawCall, false, sortOrder);

            ribbonModuleIndex++;

            break;
        }
        // Volumetric Fog Rendering
        case 405:
        {
            const auto material = (MaterialBase*)module->Assets[0].Get();
            drawCall.Material = material;
            drawCall.InstanceCount = 1;

            auto positionOffset = emitter->Graph.Layout.GetAttributeOffset(module->Attributes[0]);
            int32 count = buffer->CPU.Count;
            // (count == 0 never gets here because of the early-out at the top of the function)
            if (positionOffset == -1 || count < 0)
                break;
            auto radiusOffset = emitter->Graph.Layout.GetAttributeOffset(module->Attributes[1]);
            ParticleBufferCPUDataAccessor<Float3> positionData(buffer, positionOffset);
            ParticleBufferCPUDataAccessor<float> radiusData(buffer, radiusOffset);
            // Particles without the radius attribute use the fixed radius of 100
            const bool hasRadius = radiusOffset != -1;
            for (int32 i = 0; i < count; i++)
            {
                // Submit draw call
                // TODO: use instancing for volumetric fog particles (combine it with instanced circle rasterization into 3d texture)
                drawCall.Particle.VolumetricFog.Position = positionData[i];
                if (emitter->SimulationSpace == ParticlesSimulationSpace::Local)
                    Float3::Transform(drawCall.Particle.VolumetricFog.Position, drawCall.World, drawCall.Particle.VolumetricFog.Position);
                drawCall.Particle.VolumetricFog.Radius = hasRadius ? radiusData[i] : 100.0f;
                drawCall.Particle.VolumetricFog.ParticleIndex = i;
                renderContextBatch.GetMainContext().List->VolumetricFogParticles.Add(drawCall);
            }
            break;
        }
        }
    }
}

#if COMPILE_WITH_GPU_PARTICLES

// Constant buffer data of the GPU particles sorting shader
// (field order and types are assumed to match the shader-side declaration - TODO confirm against Shaders/GPUParticlesSorting).
GPU_CB_STRUCT(GPUParticlesSortingData {
    // View position (presumably used by the view-distance sorting - TODO confirm)
    Float3 ViewPosition;
    // Presumably the byte offset of the particles counter inside the particles buffer - TODO confirm
    uint32 ParticleCounterOffset;
    // Presumably the size of a single particle in bytes - TODO confirm
    uint32 ParticleStride;
    // Presumably the maximum amount of particles in the buffer - TODO confirm
    uint32 ParticleCapacity;
    // Presumably the byte offset of the position attribute within a particle - TODO confirm
    uint32 PositionOffset;
    // Presumably the byte offset of the custom sort key attribute within a particle - TODO confirm
    uint32 CustomOffset;
    // Transformation applied to the particle positions (exact usage is defined by the shader - TODO confirm)
    Matrix PositionTransform;
    });

// Sorting shader asset, loaded on demand in DrawEmittersGPU when any GPU emitter needs sorting
AssetReference<Shader> GPUParticlesSorting;
// Constant buffer (slot 0) cached from the sorting shader; null until fetched in DrawEmittersGPU
GPUConstantBuffer* GPUParticlesSortingCB;
// The three permutations (0..2) of the CS_Sort compute shader cached from the sorting shader
GPUShaderProgramCS* GPUParticlesSortingCS[3];

// GPU emitters drawing is batched for efficiency
// (each entry describes a single GPU emitter draw that is processed later in DrawEmittersGPU)
struct GPUEmitterDraw
{
    // Particles buffer of the emitter to draw
    ParticleBuffer* Buffer;
    // Draw call template of the emitter
    DrawCall DrawCall;
    // Draw passes allowed for the emitter
    DrawPass DrawModes;
    StaticFlags StaticFlags;
    BoundingSphere Bounds;
    // Bit mask where bit N enables Emitter->Graph.RenderModules[N]
    uint32 RenderModulesIndices;
    // Size of the indirect draw arguments used by this emitter (summed over all draws to size the shared arguments buffer)
    uint32 IndirectArgsSize;
    int8 SortOrder;
    // True if the emitter needs particles sorting (the sorting shader is loaded once any draw requests it)
    bool Sorting;
};
// Draws queued for DrawEmittersGPU
Array<GPUEmitterDraw> GPUEmitterDraws;
// Shared buffer with the indirect draw arguments of all queued GPU emitter draws (created and resized on demand)
GPUBuffer* GPUIndirectArgsBuffer = nullptr;

#if COMPILE_WITH_DEV_ENV

// Called when the sorting shader asset is about to be reloaded: drops the cached compute shaders and
// the constant buffer so they are fetched again from the shader on the next use.
void OnShaderReloading(Asset* obj)
{
    for (GPUShaderProgramCS*& program : GPUParticlesSortingCS)
        program = nullptr;
    GPUParticlesSortingCB = nullptr;
}

#endif

void CleanupGPUParticlesSorting()
{
    GPUParticlesSorting = nullptr;
    GPUEmitterDraws.Resize(0);
    SAFE_DELETE_GPU_RESOURCE(GPUIndirectArgsBuffer);
}

// Draws all GPU emitters batched in GPUEmitterDraws (scheduled as a delayed draw by DrawEmitterGPU).
// Steps: prepare the sorting shader and the shared indirect arguments buffer, write default draw arguments and patch them
// with the particle counters stored in the particle buffers, optionally sort particles on the GPU, submit one indirect
// draw call per sprite module / model mesh and finally clear the batch.
// The three loops over render modules (default args, counter copies, draw submission) must visit modules in the same
// order and skip the same ones, because they advance indirectArgsOffset independently.
void DrawEmittersGPU(RenderContextBatch& renderContextBatch)
{
    PROFILE_GPU_CPU_NAMED("DrawEmittersGPU");
    ScopeReadLock systemScope(Particles::SystemLocker);
    GPUContext* context = GPUDevice::Instance->GetMainContext();

    // Count draws and sorting passes needed for resources allocation
    uint32 indirectArgsSize = 0;
    bool sorting = false;
    for (const GPUEmitterDraw& draw : GPUEmitterDraws)
    {
        indirectArgsSize += draw.IndirectArgsSize;
        sorting |= draw.Sorting;
    }

    // Prepare pipeline
    if (sorting && GPUParticlesSorting == nullptr)
    {
        // TODO: preload shader if platform supports GPU particles (eg. inside ParticleEmitter::load if it's GPU sim with any sort module)
        GPUParticlesSorting = Content::LoadAsyncInternal<Shader>(TEXT("Shaders/GPUParticlesSorting"));
#if COMPILE_WITH_DEV_ENV
        if (GPUParticlesSorting)
            GPUParticlesSorting.Get()->OnReloading.Bind<OnShaderReloading>();
#endif
    }
    if (GPUParticlesSorting == nullptr || !GPUParticlesSorting->IsLoaded())
    {
        // Skip sorting until shader is ready
        sorting = false;
    }
    else if (!GPUParticlesSortingCB)
    {
        // Cache shader programs (one permutation per sort mode) and the constant buffer
        const auto shader = GPUParticlesSorting->GetShader();
        const StringAnsiView CS_Sort("CS_Sort");
        GPUParticlesSortingCS[0] = shader->GetCS(CS_Sort, 0);
        GPUParticlesSortingCS[1] = shader->GetCS(CS_Sort, 1);
        GPUParticlesSortingCS[2] = shader->GetCS(CS_Sort, 2);
        GPUParticlesSortingCB = shader->GetCB(0);
        ASSERT_LOW_LAYER(GPUParticlesSortingCB);
    }
    // Grow the shared arguments buffer in power-of-two steps (it is never shrunk here)
    const uint32 indirectArgsCapacity = Math::RoundUpToPowerOf2(indirectArgsSize);
    if (GPUIndirectArgsBuffer == nullptr)
        GPUIndirectArgsBuffer = GPUDevice::Instance->CreateBuffer(TEXT("ParticleIndirectDrawArgsBuffer"));
    if (GPUIndirectArgsBuffer->GetSize() < indirectArgsCapacity)
        GPUIndirectArgsBuffer->Init(GPUBufferDescription::Argument(indirectArgsCapacity));

    // Build indirect arguments
    uint32 indirectArgsOffset = 0;
    {
        PROFILE_GPU_CPU_NAMED("Init Indirect Args");

        GPUMemoryPass pass(context);
        pass.Transition(GPUIndirectArgsBuffer, GPUResourceAccess::CopyWrite);
        for (GPUEmitterDraw& draw : GPUEmitterDraws)
            pass.Transition(draw.Buffer->GPU.Buffer, GPUResourceAccess::CopyRead);

        // Init default arguments
        // (staging memory comes from the main render list; indirectArgsSize is an upper bound since skipped modules write nothing)
        byte* indirectArgsMemory = (byte*)renderContextBatch.GetMainContext().List->Memory.Allocate(indirectArgsSize, GPU_SHADER_DATA_ALIGNMENT);
        for (GPUEmitterDraw& draw : GPUEmitterDraws)
        {
            ParticleEmitter* emitter = draw.Buffer->Emitter;
            for (int32 moduleIndex = 0; moduleIndex < emitter->Graph.RenderModules.Count(); moduleIndex++)
            {
                // NOTE(review): the shift assumes moduleIndex < 32 (Particles::DrawParticles builds the mask with that cap) - confirm emitters cannot have more render modules
                if ((draw.RenderModulesIndices & (1u << moduleIndex)) == 0)
                    continue;
                auto module = emitter->Graph.RenderModules.Get()[moduleIndex];
                switch (module->TypeID)
                {
                // Sprite Rendering
                case 400:
                {
                    const auto material = (MaterialBase*)module->Assets[0].Get();
                    const auto moduleDrawModes = module->Values.Count() > 3 ? (DrawPass)module->Values[3].AsInt : DrawPass::Default;
                    auto dp = draw.DrawModes & moduleDrawModes & material->GetDrawModes();
                    // SpriteRenderer.Init() returns false once the renderer is ready (presumably true on failure)
                    if (dp == DrawPass::None || SpriteRenderer.Init())
                        break;

                    // Draw sprite for each particle
                    // (second field is a placeholder that gets overwritten with the particle counter below)
                    GPUDrawIndexedIndirectArgs args = { SpriteParticleRenderer::IndexCount, 1, 0, 0, 0 };
                    Platform::MemoryCopy(indirectArgsMemory + indirectArgsOffset, &args, sizeof(args));
                    indirectArgsOffset += sizeof(args);
                    break;
                }
                // Model Rendering
                case 403:
                {
                    const auto model = (Model*)module->Assets[0].Get();
                    const auto material = (MaterialBase*)module->Assets[1].Get();
                    const auto moduleDrawModes = module->Values.Count() > 4 ? (DrawPass)module->Values[4].AsInt : DrawPass::Default;
                    auto dp = draw.DrawModes & moduleDrawModes & material->GetDrawModes();
                    if (dp == DrawPass::None)
                        break;
                    // TODO: model LOD picking for particles?
                    int32 lodIndex = 0;
                    ModelLOD& lod = model->LODs[lodIndex];
                    for (int32 meshIndex = 0; meshIndex < lod.Meshes.Count(); meshIndex++)
                    {
                        Mesh& mesh = lod.Meshes[meshIndex];
                        if (!mesh.IsInitialized())
                            continue;

                        // Draw mesh for each particle
                        GPUDrawIndexedIndirectArgs args = { (uint32)mesh.GetTriangleCount() * 3, 1, 0, 0, 0 };
                        Platform::MemoryCopy(indirectArgsMemory + indirectArgsOffset, &args, sizeof(args));
                        indirectArgsOffset += sizeof(args);
                    }
                    break;
                }
                }
            }
        }

        // Upload default arguments
        context->UpdateBuffer(GPUIndirectArgsBuffer, indirectArgsMemory, indirectArgsOffset);

        // Wait for whole buffer write end before submitting buffer copies
        pass.MemoryBarrier();

        // Copy particle counts into draw commands
        // (same traversal as above so the offsets match; 4 bytes read at ParticleCounterOffset of the particle buffer land at byte 4 of each args entry)
        indirectArgsOffset = 0;
        for (GPUEmitterDraw& draw : GPUEmitterDraws)
        {
            ParticleEmitter* emitter = draw.Buffer->Emitter;
            for (int32 moduleIndex = 0; moduleIndex < emitter->Graph.RenderModules.Count(); moduleIndex++)
            {
                if ((draw.RenderModulesIndices & (1u << moduleIndex)) == 0)
                    continue;
                auto module = emitter->Graph.RenderModules.Get()[moduleIndex];
                switch (module->TypeID)
                {
                // Sprite Rendering
                case 400:
                {
                    const auto material = (MaterialBase*)module->Assets[0].Get();
                    const auto moduleDrawModes = module->Values.Count() > 3 ? (DrawPass)module->Values[3].AsInt : DrawPass::Default;
                    auto dp = draw.DrawModes & moduleDrawModes & material->GetDrawModes();
                    if (dp == DrawPass::None || SpriteRenderer.Init())
                        break;

                    // Draw sprite for each particle
                    context->CopyBuffer(GPUIndirectArgsBuffer, draw.Buffer->GPU.Buffer, 4, indirectArgsOffset + 4, draw.Buffer->GPU.ParticleCounterOffset);
                    indirectArgsOffset += sizeof(GPUDrawIndexedIndirectArgs);
                    break;
                }
                // Model Rendering
                case 403:
                {
                    const auto model = (Model*)module->Assets[0].Get();
                    const auto material = (MaterialBase*)module->Assets[1].Get();
                    const auto moduleDrawModes = module->Values.Count() > 4 ? (DrawPass)module->Values[4].AsInt : DrawPass::Default;
                    auto dp = draw.DrawModes & moduleDrawModes & material->GetDrawModes();
                    if (dp == DrawPass::None)
                        break;
                    // TODO: model LOD picking for particles?
                    int32 lodIndex = 0;
                    ModelLOD& lod = model->LODs[lodIndex];
                    for (int32 meshIndex = 0; meshIndex < lod.Meshes.Count(); meshIndex++)
                    {
                        Mesh& mesh = lod.Meshes[meshIndex];
                        if (!mesh.IsInitialized())
                            continue;

                        // Draw mesh for each particle
                        context->CopyBuffer(GPUIndirectArgsBuffer, draw.Buffer->GPU.Buffer, 4, indirectArgsOffset + 4, draw.Buffer->GPU.ParticleCounterOffset);
                        indirectArgsOffset += sizeof(GPUDrawIndexedIndirectArgs);
                    }
                    break;
                }
                }
            }
        }
    }
    // Rewind so the draw submission below walks the arguments from the start again
    indirectArgsOffset = 0;

    // Sort particles
    if (sorting)
    {
        PROFILE_GPU_CPU_NAMED("Sort Particles");
        context->BindCB(0, GPUParticlesSortingCB);

        // Generate sort keys for each particle
        {
            PROFILE_GPU("Gen Sort Keys");

            GPUComputePass pass(context);
            for (const GPUEmitterDraw& draw : GPUEmitterDraws)
            {
                if (draw.Sorting)
                {
                    pass.Transition(draw.Buffer->GPU.Buffer, GPUResourceAccess::ShaderReadCompute);
                    pass.Transition(draw.Buffer->GPU.SortedIndices, GPUResourceAccess::UnorderedAccess);
                    pass.Transition(draw.Buffer->GPU.SortingKeys, GPUResourceAccess::UnorderedAccess);
                }
            }

            for (const GPUEmitterDraw& draw : GPUEmitterDraws)
            {
                if (!draw.Sorting)
                    continue;
                ASSERT(draw.Buffer->GPU.SortingKeys);
                ParticleEmitter* emitter = draw.Buffer->Emitter;
                for (int32 moduleIndex = 0; moduleIndex < emitter->Graph.SortModules.Count(); moduleIndex++)
                {
                    auto module = emitter->Graph.SortModules[moduleIndex];
                    // TODO: add support for module->SortedIndicesOffset (multiple sort modules)
                    const auto sortMode = (ParticleSortMode)module->Values[2].AsInt;
                    // Only the fields used by the selected permutation are filled below; the rest of the data is left unset
                    GPUParticlesSortingData data;
                    data.ParticleCounterOffset = draw.Buffer->GPU.ParticleCounterOffset;
                    data.ParticleStride = draw.Buffer->Stride;
                    data.ParticleCapacity = draw.Buffer->Capacity;
                    // NOTE(review): stays uninitialized when sortMode matches none of the cases below (then used as an array index) - confirm the value is always one of the handled modes
                    int32 permutationIndex;
                    switch (sortMode)
                    {
                    case ParticleSortMode::ViewDepth:
                    {
                        permutationIndex = 0;
                        data.PositionOffset = emitter->Graph.GetPositionAttributeOffset();
                        const Matrix viewProjection = renderContextBatch.GetMainContext().View.ViewProjection();
                        // Local-space particles need the emitter world matrix applied before the view-projection
                        if (emitter->SimulationSpace == ParticlesSimulationSpace::Local)
                            Matrix::Transpose(draw.DrawCall.World * viewProjection, data.PositionTransform);
                        else
                            Matrix::Transpose(viewProjection, data.PositionTransform);
                        break;
                    }
                    case ParticleSortMode::ViewDistance:
                    {
                        permutationIndex = 1;
                        data.PositionOffset = emitter->Graph.GetPositionAttributeOffset();
                        data.ViewPosition = renderContextBatch.GetMainContext().View.Position;
                        if (emitter->SimulationSpace == ParticlesSimulationSpace::Local)
                            Matrix::Transpose(draw.DrawCall.World, data.PositionTransform);
                        else
                            Matrix::Transpose(Matrix::Identity, data.PositionTransform);
                        break;
                    }
                    case ParticleSortMode::CustomAscending:
                    case ParticleSortMode::CustomDescending:
                    {
                        permutationIndex = 2;
                        int32 attributeIdx = module->Attributes[0];
                        // NOTE(review): with a missing attribute the dispatch below still runs with data.CustomOffset left unset - confirm this is intended
                        if (attributeIdx == -1)
                            break;
                        data.CustomOffset = emitter->Graph.Layout.Attributes[attributeIdx].Offset;
                        break;
                    }
                    }
                    context->UpdateCB(GPUParticlesSortingCB, &data);
                    context->BindSR(0, draw.Buffer->GPU.Buffer->View());
                    context->BindUA(0, draw.Buffer->GPU.SortedIndices->View());
                    context->BindUA(1, draw.Buffer->GPU.SortingKeys->View());
                    // One thread per particle slot, rounded up to whole groups of 1024 (presumably matches the group size declared in the shader)
                    const int32 threadGroupSize = 1024;
                    context->Dispatch(GPUParticlesSortingCS[permutationIndex], Math::DivideAndRoundUp(draw.Buffer->GPU.ParticlesCountMax, threadGroupSize), 1, 1);
                }
            }
            context->ResetUA();
        }

        // Run sorting
        // (emitters are split by ParticlesCountMax: up to the limit are sorted inside a single compute pass, larger ones are sorted one by one afterwards)
        constexpr int32 inplaceSortSizeLimit = 2048;
        {
            // Small emitters can be sorted in-place with a single independent dispatch (simultaneously)
            GPUComputePass pass(context);
            for (const GPUEmitterDraw& draw : GPUEmitterDraws)
            {
                if (!draw.Sorting || draw.Buffer->GPU.ParticlesCountMax > inplaceSortSizeLimit)
                    continue;
                ParticleEmitter* emitter = draw.Buffer->Emitter;
                for (int32 moduleIndex = 0; moduleIndex < emitter->Graph.SortModules.Count(); moduleIndex++)
                {
                    auto module = emitter->Graph.SortModules[moduleIndex];
                    // TODO: add support for module->SortedIndicesOffset (multiple sort modules)
                    const auto sortMode = (ParticleSortMode)module->Values[2].AsInt;
                    // Only the custom-ascending mode sorts in ascending order; every other mode sorts descending
                    bool sortAscending = sortMode == ParticleSortMode::CustomAscending;
                    BitonicSort::Instance()->Sort(context, draw.Buffer->GPU.SortedIndices, draw.Buffer->GPU.SortingKeys, draw.Buffer->GPU.Buffer, draw.Buffer->GPU.ParticleCounterOffset, sortAscending, draw.Buffer->GPU.ParticlesCountMax);
                }
            }
        }
        for (const GPUEmitterDraw& draw : GPUEmitterDraws)
        {
            if (!draw.Sorting || draw.Buffer->GPU.ParticlesCountMax <= inplaceSortSizeLimit)
                continue;
            ParticleEmitter* emitter = draw.Buffer->Emitter;
            for (int32 moduleIndex = 0; moduleIndex < emitter->Graph.SortModules.Count(); moduleIndex++)
            {
                auto module = emitter->Graph.SortModules[moduleIndex];
                // TODO: add support for module->SortedIndicesOffset (multiple sort modules)
                const auto sortMode = (ParticleSortMode)module->Values[2].AsInt;
                bool sortAscending = sortMode == ParticleSortMode::CustomAscending;
                BitonicSort::Instance()->Sort(context, draw.Buffer->GPU.SortedIndices, draw.Buffer->GPU.SortingKeys, draw.Buffer->GPU.Buffer, draw.Buffer->GPU.ParticleCounterOffset, sortAscending, draw.Buffer->GPU.ParticlesCountMax);
                // TODO: use args buffer from GPUIndirectArgsBuffer instead of internal from BitonicSort to get rid of UAV barrier (all sorting in parallel)
            }
        }
    }
    else
    {
        // Initialize with identity sort indices in case the buffer has been allocated
        // (reached when no draw wants sorting or the sorting shader is not loaded yet)
        for (const GPUEmitterDraw& draw : GPUEmitterDraws)
        {
            if (!draw.Sorting || !draw.Buffer->GPU.SortedIndices)
                continue;
            const int32 capacity = draw.Buffer->Capacity;
            const int32 capacityBytes = capacity * draw.Buffer->GPU.SortedIndices->GetStride();
            const int32 indicesBytes = draw.Buffer->GPU.SortedIndices->GetSize();
            RenderListAlloc sortedIndicesAlloc;
            auto* renderList = renderContextBatch.GetMainContext().List;
            void* indices = sortedIndicesAlloc.Init(renderList, indicesBytes, GPU_SHADER_DATA_ALIGNMENT);
            // Write 0..capacity-1 using the index width of the buffer format
            switch (draw.Buffer->GPU.SortedIndices->GetFormat())
            {
            case PixelFormat::R16_UInt:
                for (int32 i = 0; i < capacity; i++)
                    ((uint16*)indices)[i] = (uint16)i;
                break;
            case PixelFormat::R32_UInt:
                for (int32 i = 0; i < capacity; i++)
                    ((uint32*)indices)[i] = i;
                break;
            }
            // Replicate the identity range for every additional sort module
            for (int32 i = 1; i < draw.Buffer->Emitter->Graph.SortModules.Count(); i++)
                Platform::MemoryCopy((byte*)indices + i * capacityBytes, indices, capacityBytes);
            context->UpdateBuffer(draw.Buffer->GPU.SortedIndices, indices, indicesBytes, 0);
        }
    }

    // TODO: transition here SortedIndices into ShaderReadNonPixel and Buffer into ShaderReadGraphics to reduce barriers during particles rendering

    // Submit draw calls
    for (GPUEmitterDraw& draw : GPUEmitterDraws)
    {
        // Execute all rendering modules using indirect draw arguments
        ParticleEmitter* emitter = draw.Buffer->Emitter;
        for (int32 moduleIndex = 0; moduleIndex < emitter->Graph.RenderModules.Count(); moduleIndex++)
        {
            if ((draw.RenderModulesIndices & (1u << moduleIndex)) == 0)
                continue;
            auto module = emitter->Graph.RenderModules.Get()[moduleIndex];
            draw.DrawCall.Particle.Module = module;
            switch (module->TypeID)
            {
                // Sprite Rendering
            case 400:
            {
                const auto material = (MaterialBase*)module->Assets[0].Get();
                const auto moduleDrawModes = module->Values.Count() > 3 ? (DrawPass)module->Values[3].AsInt : DrawPass::Default;
                auto dp = draw.DrawModes & moduleDrawModes & material->GetDrawModes();
                if (dp == DrawPass::None || SpriteRenderer.Init())
                    break;
                draw.DrawCall.Material = material;

                // Submit draw call
                // (instance count is zeroed on the CPU side; the indirect arguments buffer and offset are provided instead)
                SpriteRenderer.SetupDrawCall(draw.DrawCall);
                draw.DrawCall.InstanceCount = 0;
                draw.DrawCall.Draw.IndirectArgsBuffer = GPUIndirectArgsBuffer;
                draw.DrawCall.Draw.IndirectArgsOffset = indirectArgsOffset;
                renderContextBatch.GetMainContext().List->AddDrawCall(renderContextBatch, dp, draw.StaticFlags, ShadowsCastingMode::DynamicOnly, draw.Bounds, draw.DrawCall, false, draw.SortOrder);
                indirectArgsOffset += sizeof(GPUDrawIndexedIndirectArgs);
                break;
            }
            // Model Rendering
            case 403:
            {
                const auto model = (Model*)module->Assets[0].Get();
                const auto material = (MaterialBase*)module->Assets[1].Get();
                const auto moduleDrawModes = module->Values.Count() > 4 ? (DrawPass)module->Values[4].AsInt : DrawPass::Default;
                auto dp = draw.DrawModes & moduleDrawModes & material->GetDrawModes();
                if (dp == DrawPass::None)
                    break;
                draw.DrawCall.Material = material;

                // TODO: model LOD picking for particles?
                int32 lodIndex = 0;
                ModelLOD& lod = model->LODs[lodIndex];
                for (int32 meshIndex = 0; meshIndex < lod.Meshes.Count(); meshIndex++)
                {
                    Mesh& mesh = lod.Meshes[meshIndex];
                    if (!mesh.IsInitialized())
                        continue;
                    // TODO: include mesh entry transformation, visibility and shadows mode?

                    // Execute draw call
                    mesh.GetDrawCallGeometry(draw.DrawCall);
                    draw.DrawCall.InstanceCount = 0;
                    draw.DrawCall.Draw.IndirectArgsBuffer = GPUIndirectArgsBuffer;
                    draw.DrawCall.Draw.IndirectArgsOffset = indirectArgsOffset;
                    renderContextBatch.GetMainContext().List->AddDrawCall(renderContextBatch, dp, draw.StaticFlags, ShadowsCastingMode::DynamicOnly, draw.Bounds, draw.DrawCall, false, draw.SortOrder);
                    indirectArgsOffset += sizeof(GPUDrawIndexedIndirectArgs);
                }
                break;
            }
            // Ribbon Rendering
            case 404:
            {
                // Not supported
                break;
            }
            // Volumetric Fog Rendering
            case 405:
            {
                // Not supported
                break;
            }
            }
        }
    }

    // Batch has been consumed (the next DrawEmitterGPU call schedules a new delayed draw)
    GPUEmitterDraws.Clear();
}

// Queues a GPU emitter for batched drawing (see DrawEmittersGPU).
// Computes the size of the indirect draw arguments needed by the enabled render modules, decides whether particles
// should be sorted (allocating the sort buffer on demand) and appends the request to GPUEmitterDraws.
// Emitters without any sprite or model module to draw are ignored.
void DrawEmitterGPU(RenderContextBatch& renderContextBatch, ParticleBuffer* buffer, DrawCall& drawCall, DrawPass drawModes, StaticFlags staticFlags, const BoundingSphere& bounds, uint32 renderModulesIndices, int8 sortOrder)
{
    // Measure the indirect arguments required by the enabled render modules
    ParticleEmitter* particleEmitter = buffer->Emitter;
    uint32 argsBytes = 0;
    for (int32 i = 0; i < particleEmitter->Graph.RenderModules.Count(); i++)
    {
        const bool enabled = (renderModulesIndices & (1u << i)) != 0;
        if (!enabled)
            continue;
        auto renderModule = particleEmitter->Graph.RenderModules.Get()[i];
        if (renderModule->TypeID == 400)
        {
            // Sprite Rendering: single draw
            argsBytes += sizeof(GPUDrawIndexedIndirectArgs);
        }
        else if (renderModule->TypeID == 403)
        {
            // Model Rendering: one draw per mesh of the first LOD
            // TODO: model LOD picking for particles?
            const auto model = (Model*)renderModule->Assets[0].Get();
            int32 firstLod = 0;
            ModelLOD& modelLod = model->LODs[firstLod];
            argsBytes += sizeof(GPUDrawIndexedIndirectArgs) * modelLod.Meshes.Count();
        }
    }
    if (argsBytes == 0)
        return;

    // Sorting is used only when the emitter wants it and there is anything to sort (or the sort buffer already exists)
    bool useSorting = false;
    if (EmitterUseSorting(renderContextBatch, buffer, drawModes, bounds))
        useSorting = buffer->GPU.ParticlesCountMax != 0 || buffer->GPU.SortedIndices;
    if (useSorting && !buffer->GPU.SortedIndices)
        buffer->AllocateSortBuffer();

    // When rendering in async, delay GPU particles drawing to be in sync by moving drawing into delayed callback post scene drawing to use GPUContext safely
    // Also, batch rendering all GPU emitters together for more efficient usage of GPU memory barriers and indirect arguments buffers allocation
    RenderContext::GPULocker.Lock();
    const bool startsNewBatch = GPUEmitterDraws.Count() == 0;
    if (startsNewBatch)
    {
        // Only the first queued emitter registers the callback that draws the whole batch
        renderContextBatch.GetMainContext().List->AddDelayedDraw([](RenderContextBatch& batch, int32 contextIndex)
        {
            DrawEmittersGPU(batch);
        });
    }
    GPUEmitterDraws.Add({ buffer, drawCall, drawModes, staticFlags, bounds, renderModulesIndices, argsBytes, sortOrder, useSorting });
    RenderContext::GPULocker.Unlock();
}

#endif

// Draws the particle effect into all views of the batch.
// Culls the effect against every view, draws particle lights into the main view, then for each emitter selects
// the render modules that have ready resources and matching draw passes and forwards them to the CPU or GPU drawing path.
// @param renderContextBatch The rendering contexts batch (the first context is the main view).
// @param effect The particle effect to draw.
void Particles::DrawParticles(RenderContextBatch& renderContextBatch, ParticleEffect* effect)
{
    PROFILE_CPU();
    PROFILE_MEM(Particles);

    // Drawing assumes that all views within a batch have the same Origin
    const Vector3& viewOrigin = renderContextBatch.GetMainContext().View.Origin;
    BoundingSphere bounds = effect->GetSphere();
    bounds.Center -= viewOrigin;

    // Cull particles against all views
    bool drawAnyView = false, drawMainView = false;
    DrawPass viewsDrawModes = DrawPass::None;
    for (int32 i = 0; i < renderContextBatch.Contexts.Count(); i++)
    {
        const RenderView& view = renderContextBatch.Contexts.Get()[i].View;
        const bool visible = (view.Pass & effect->DrawModes) != DrawPass::None && (view.IsCullingDisabled || view.CullingFrustum.Intersects(bounds));
        if (visible)
        {
            drawAnyView = true;
            drawMainView |= i == 0; // Context at index 0 is the main view
            viewsDrawModes |= view.Pass;
        }
    }
    if (drawAnyView == false)
        return;
    viewsDrawModes &= effect->DrawModes;

    // Setup
    ScopeReadLock systemScope(SystemLocker);
    Matrix worlds[2]; // Indexed by the emitter simulation space
    Matrix::Translation(-viewOrigin, worlds[0]); // World
    renderContextBatch.GetMainContext().View.GetWorldMatrix(effect->GetTransform(), worlds[1]); // Local
    const StaticFlags staticFlags = effect->GetStaticFlags();
    const int8 sortOrder = effect->SortOrder;

    // Draw lights (only into the main view)
    if (drawMainView && renderContextBatch.GetMainContext().View.Pass != DrawPass::Depth)
    {
        for (int32 emitterIndex = 0; emitterIndex < effect->Instance.Emitters.Count(); emitterIndex++)
        {
            auto& emitterData = effect->Instance.Emitters[emitterIndex];
            const auto buffer = emitterData.Buffer;
            if (!buffer || (buffer->Mode == ParticlesSimulationMode::CPU && buffer->CPU.Count == 0))
                continue;
            auto emitter = buffer->Emitter;
            if (!emitter || !emitter->IsLoaded())
                continue;

            buffer->Emitter->GraphExecutorCPU.Draw(buffer->Emitter, effect, emitterData, renderContextBatch.GetMainContext(), worlds[(int32)emitter->SimulationSpace]);
        }
    }

    // Setup a draw call common data
    DrawCall drawCall;
    drawCall.PerInstanceRandom = effect->GetPerInstanceRandom();
    drawCall.SetStencilValue(effect->GetLayer());
    drawCall.ObjectPosition = bounds.Center;
    drawCall.ObjectRadius = (float)bounds.Radius;

    // Draw all emitters
    for (int32 emitterIndex = 0; emitterIndex < effect->Instance.Emitters.Count(); emitterIndex++)
    {
        auto& emitterData = effect->Instance.Emitters[emitterIndex];
        const auto buffer = emitterData.Buffer;
        if (!buffer)
            continue;
        auto emitter = buffer->Emitter;
        if (!emitter || !emitter->IsLoaded())
            continue;

        drawCall.World = worlds[(int32)emitter->SimulationSpace];
        drawCall.Particle.Particles = buffer;

        // Check if need to render any module
        // (modules are tracked with a 32-bit mask thus only the first 32 render modules can be drawn)
        uint32 renderModulesIndices = 0;
        for (int32 moduleIndex = 0; moduleIndex < emitter->Graph.RenderModules.Count() && moduleIndex < 32; moduleIndex++)
        {
            auto module = emitter->Graph.RenderModules[moduleIndex];

            switch (module->TypeID)
            {
            // Sprite Rendering
            case 400:
            {
                const auto material = (MaterialBase*)module->Assets[0].Get();
                const auto moduleDrawModes = module->Values.Count() > 3 ? (DrawPass)module->Values[3].AsInt : DrawPass::Default;
                if (!material ||
                    !material->IsReady() ||
                    !material->IsParticle() ||
                    (viewsDrawModes & material->GetDrawModes() & moduleDrawModes) == DrawPass::None
                )
                    break;
                renderModulesIndices |= 1u << moduleIndex;
                break;
            }
            // Model Rendering
            case 403:
            {
                const auto model = (Model*)module->Assets[0].Get();
                const auto moduleDrawModes = module->Values.Count() > 4 ? (DrawPass)module->Values[4].AsInt : DrawPass::Default;
                if (!model ||
                    !model->IsLoaded() ||
                    !model->CanBeRendered())
                    break;
                const auto material = (MaterialBase*)module->Assets[1].Get();
                if (!material ||
                    !material->IsReady() ||
                    !material->IsParticle() ||
                    (viewsDrawModes & material->GetDrawModes() & moduleDrawModes) == DrawPass::None
                )
                    break;
                renderModulesIndices |= 1u << moduleIndex;
                break;
            }
            // Ribbon Rendering
            case 404:
            {
                const auto material = (MaterialBase*)module->Assets[0].Get();
                const auto moduleDrawModes = module->Values.Count() > 6 ? (DrawPass)module->Values[6].AsInt : DrawPass::Default;
                if (!material ||
                    !material->IsReady() ||
                    !material->IsParticle() ||
                    (viewsDrawModes & material->GetDrawModes() & moduleDrawModes) == DrawPass::None
                )
                    break;
                renderModulesIndices |= 1u << moduleIndex;
                break;
            }
            // Volumetric Fog Rendering
            case 405:
            {
                // Fog particles depend on the main view only (its Fog flag is checked here),
                // so the module is skipped when the effect is not visible in the main view
                const auto material = (MaterialBase*)module->Assets[0].Get();
                if (!material ||
                    !material->IsReady() ||
                    material->GetInfo().Domain != MaterialDomain::VolumeParticle ||
                    (renderContextBatch.GetMainContext().View.Flags & ViewFlags::Fog) == ViewFlags::None ||
                    !drawMainView
                )
                    break;
                renderModulesIndices |= 1u << moduleIndex;
                break;
            }
            }
        }
        if (renderModulesIndices == 0)
            continue;

        // Draw
        switch (buffer->Mode)
        {
        case ParticlesSimulationMode::CPU:
            DrawEmitterCPU(renderContextBatch, buffer, drawCall, viewsDrawModes, staticFlags, bounds, renderModulesIndices, sortOrder);
            break;
#if COMPILE_WITH_GPU_PARTICLES
        case ParticlesSimulationMode::GPU:
            DrawEmitterGPU(renderContextBatch, buffer, drawCall, viewsDrawModes, staticFlags, bounds, renderModulesIndices, sortOrder);
            break;
#endif
        }
    }
}

#if USE_EDITOR

// Draws the debug shapes of all loaded emitters of the particle effect (editor only).
void Particles::DebugDraw(ParticleEffect* effect)
{
    PROFILE_CPU_NAMED("Particles.DrawDebug");
    ScopeReadLock systemScope(SystemLocker);

    // Visit every emitter instance that has a buffer with a loaded emitter
    for (int32 emitterIndex = 0; emitterIndex < effect->Instance.Emitters.Count(); emitterIndex++)
    {
        auto& emitterData = effect->Instance.Emitters[emitterIndex];
        const auto buffer = emitterData.Buffer;
        if (buffer)
        {
            auto emitter = buffer->Emitter;
            if (emitter && emitter->IsLoaded())
                emitter->GraphExecutorCPU.DrawDebug(emitter, effect, emitterData);
        }
    }
}

#endif

#if COMPILE_WITH_GPU_PARTICLES

// Runs the GPU simulation for all particle effects queued in GpuUpdateList.
// Collects the emitters that can be simulated, sorts them and executes three stages over the whole set
// (PreSim, Sim, PostSim), each with its own batch of resource transitions. The queue is cleared afterwards.
void UpdateGPU(RenderTask* task, GPUContext* context)
{
    ScopeLock lock(GpuUpdateListLocker);
    if (GpuUpdateList.IsEmpty())
        return;
    PROFILE_CPU_NAMED("GPUParticles");
    PROFILE_GPU("GPU Particles");
    PROFILE_MEM(Particles);
    ScopeReadLock systemScope(Particles::SystemLocker);

    // Single emitter instance to simulate
    struct GPUSim
    {
        ParticleEffect* Effect;
        ParticleEmitter* Emitter;
        int32 EmitterIndex;
        ParticleEmitterInstance* Data;

        bool operator<(const GPUSim& other) const
        {
            // Larger effects go first: compare the particles count limit, then the emitter capacity
            const auto& gpu = Data->Buffer->GPU;
            const auto& otherGpu = other.Data->Buffer->GPU;
            if (gpu.ParticlesCountMax != otherGpu.ParticlesCountMax)
                return gpu.ParticlesCountMax > otherGpu.ParticlesCountMax;
            if (Emitter->Capacity != other.Emitter->Capacity)
                return Emitter->Capacity > other.Emitter->Capacity;

            // Keep instances of the same emitter next to each other (compute pipeline switches)
            return (uintptr)Emitter < (uintptr)other.Emitter;
        }
    };

    // Gather emitter tracks that are valid for the update
    Array<GPUSim, RendererAllocation> sims;
    sims.EnsureCapacity(Math::AlignUp(GpuUpdateList.Count(), 64)); // Preallocate with some slack
    for (ParticleEffect* effect : GpuUpdateList)
    {
        auto& instance = effect->Instance;
        const auto particleSystem = effect->ParticleSystem.Get();
        if (!particleSystem || !particleSystem->IsLoaded())
            continue;

        for (int32 trackIndex = 0; trackIndex < particleSystem->Tracks.Count(); trackIndex++)
        {
            const auto& track = particleSystem->Tracks[trackIndex];
            const bool activeEmitterTrack = track.Type == ParticleSystem::Track::Types::Emitter && !track.Disabled;
            if (!activeEmitterTrack)
                continue;

            const int32 emitterIndex = track.AsEmitter.Index;
            ParticleEmitter* emitter = particleSystem->Emitters[emitterIndex].Get();
            if (!emitter || !emitter->IsLoaded())
                continue;
            if (emitter->SimulationMode != ParticlesSimulationMode::GPU || instance.Emitters.Count() <= emitterIndex)
                continue;

            ParticleEmitterInstance& data = instance.Emitters[emitterIndex];
            if (!data.Buffer)
                continue;
            ASSERT(emitter->Capacity != 0 && emitter->Graph.Layout.Size != 0);

            if (emitter->GPU.CanSim(emitter, data))
            {
                sims.Add({ effect, emitter, emitterIndex, &data });
            }
            else if (data.Buffer->GPU.PendingClear)
            {
                // Emitters that are culled still might need to clear the particle counter (used for indirect draws)
                emitter->GPU.PreSim(context, emitter, effect, emitterIndex, data);
            }
        }
    }
    GpuUpdateList.Clear();

    // Sort particles by emitter type to reduce compute pipeline switches
    Sorting::QuickSort(sims);

    // Stage 1: buffers setup before the simulation
    {
        PROFILE_CPU_NAMED("PreSim");

        GPUMemoryPass pass(context);
        for (GPUSim& entry : sims)
        {
            auto& gpu = entry.Data->Buffer->GPU;
            if (gpu.PendingClear)
                pass.Transition(gpu.Buffer, GPUResourceAccess::CopyWrite);
            pass.Transition(gpu.BufferSecondary, GPUResourceAccess::CopyWrite);
        }

        for (GPUSim& entry : sims)
            entry.Emitter->GPU.PreSim(context, entry.Emitter, entry.Effect, entry.EmitterIndex, *entry.Data);
    }

    // Stage 2: simulation (reads the primary buffer, writes the secondary one)
    {
        PROFILE_GPU_CPU_NAMED("Sim");

        GPUComputePass pass(context);
        for (GPUSim& entry : sims)
        {
            auto& gpu = entry.Data->Buffer->GPU;
            pass.Transition(gpu.Buffer, GPUResourceAccess::ShaderReadCompute);
            pass.Transition(gpu.BufferSecondary, GPUResourceAccess::UnorderedAccess);
        }

        for (GPUSim& entry : sims)
            entry.Emitter->GPU.Sim(context, entry.Emitter, entry.Effect, entry.EmitterIndex, *entry.Data);
    }

    // Stage 3: buffers setup after the simulation
    {
        PROFILE_CPU_NAMED("PostSim");

        GPUMemoryPass pass(context);
        for (GPUSim& entry : sims)
        {
            if (!entry.Data->CustomData.HasItems())
                continue;
            auto& gpu = entry.Data->Buffer->GPU;
            pass.Transition(gpu.BufferSecondary, GPUResourceAccess::CopyRead);
            pass.Transition(gpu.Buffer, GPUResourceAccess::CopyWrite);
        }

        for (GPUSim& entry : sims)
            entry.Emitter->GPU.PostSim(context, entry.Emitter, entry.Effect, entry.EmitterIndex, *entry.Data);
    }

    // Leave the context in a clean state
    context->ResetSR();
    context->ResetUA();
    context->FlushState();
}

#endif

// Provides a particle buffer for the given (loaded) emitter.
// When pooling is enabled, a pooled buffer whose version matches the emitter graph version is reused (after being cleared);
// pooled buffers with a different version that are encountered on the way are destroyed.
// Otherwise a new buffer is created. Returns null if the new buffer fails to initialize.
ParticleBuffer* Particles::AcquireParticleBuffer(ParticleEmitter* emitter)
{
    PROFILE_CPU();
    PROFILE_MEM(Particles);
    ASSERT(emitter && emitter->IsLoaded());

    // Look for a reusable buffer in the pool
    ParticleBuffer* buffer = nullptr;
    if (emitter->EnablePooling && EnableParticleBufferPooling)
    {
        PoolLocker.Lock();
        if (const auto pooled = Pool.TryGet(emitter))
        {
            while (pooled->HasItems())
            {
                // Take the most recently added entry
                ParticleBuffer* candidate = pooled->Last().Buffer;
                pooled->RemoveLast();
                if (candidate->Version == emitter->Graph.Version)
                {
                    buffer = candidate;
                    break;
                }

                // Version mismatch with the emitter graph so this buffer cannot be reused
                Delete(candidate);
            }
        }
        PoolLocker.Unlock();
    }

    // Reused buffer only needs to be prepared
    if (buffer)
    {
        buffer->Clear();
        return buffer;
    }

    // Nothing to reuse so create a new buffer
    buffer = New<ParticleBuffer>();
    if (buffer->Init(emitter))
    {
        LOG(Error, "Failed to create particle buffer for emitter {0}", emitter->ToString());
        Delete(buffer);
        return nullptr;
    }
    return buffer;
}

// Releases a particle buffer that is no longer used by an effect.
// The buffer goes back to the per-emitter pool (stamped with the current time) when pooling is enabled, otherwise it is destroyed.
void Particles::RecycleParticleBuffer(ParticleBuffer* buffer)
{
    PROFILE_CPU();
    PROFILE_MEM(Particles);

    // Pooling disabled (per-emitter or globally) so just destroy the buffer
    if (!(buffer->Emitter->EnablePooling && EnableParticleBufferPooling))
    {
        Delete(buffer);
        return;
    }

    // Build the pool entry outside of the lock
    EmitterCache entry;
    entry.LastTimeUsed = Platform::GetTimeSeconds();
    entry.Buffer = buffer;

    // Return to pool
    PoolLocker.Lock();
    Pool[buffer->Emitter].Add(entry);
    PoolLocker.Unlock();
}

// Called when a particle emitter gets unloaded.
// Destroys all pooled buffers of that emitter and removes effects that contain it from the pending GPU update list.
void Particles::OnEmitterUnload(ParticleEmitter* emitter)
{
    PROFILE_CPU();

    // Destroy every pooled buffer that belongs to this emitter
    PoolLocker.Lock();
    if (const auto pooled = Pool.TryGet(emitter))
    {
        for (auto& cached : *pooled)
            Delete(cached.Buffer);
        pooled->Clear();
        Pool.Remove(emitter);
    }
    PoolLocker.Unlock();

#if COMPILE_WITH_GPU_PARTICLES
    // Drop effects using this emitter from the GPU update list (iterate backwards so removal keeps remaining indices valid)
    GpuUpdateListLocker.Lock();
    int32 i = GpuUpdateList.Count();
    while (i-- > 0)
    {
        if (GpuUpdateList[i]->Instance.ContainsEmitter(emitter))
            GpuUpdateList.RemoveAt(i);
    }
    GpuUpdateListLocker.Unlock();
#endif
}

// Creates the particles update system and registers it in the engine update graph.
// Always returns false.
bool ParticleManagerService::Init()
{
    PROFILE_MEM(Particles);

    // Create the system and publish it before configuring
    const auto system = New<ParticlesSystem>();
    Particles::System = system;
    system->Order = 10000;

    // Hook it up to the engine update graph
    Engine::UpdateGraph->AddSystem(system);
    return false;
}

// Releases all particle manager resources: pending update lists, the GPU render task (if created),
// all pooled particle buffers, the sprite renderer and the particles update system.
void ParticleManagerService::Dispose()
{
    // Forget any pending updates
    UpdateList.Clear();
#if COMPILE_WITH_GPU_PARTICLES
    GpuUpdateList.Clear();

    // Unregister and destroy the GPU simulation render task
    if (GpuRenderTask)
    {
        ScopeLock lock(RenderTask::TasksLocker);
        RenderTask::Tasks.Remove(GpuRenderTask);
        Delete(GpuRenderTask);
        GpuRenderTask = nullptr;
    }
    CleanupGPUParticlesSorting();
#endif

    // Destroy all pooled buffers
    PoolLocker.Lock();
    for (auto it = Pool.Begin(); it.IsNotEnd(); ++it)
    {
        for (auto& cached : it->Value)
            Delete(cached.Buffer);
        it->Value.Clear();
    }
    Pool.Clear();
    PoolLocker.Unlock();

    // Release rendering helpers and the update system
    SpriteRenderer.Dispose();
    SAFE_DELETE(Particles::System);
}

// Updates the simulation of a single particle effect (UpdateList[index]).
// Does nothing when the particle system asset or any emitter used by an enabled emitter track is missing or not loaded.
// Advances the effect time (handling looping and end of playback), updates every enabled emitter track
// (CPU or GPU simulation), refreshes the effect bounds when needed and registers the effect for the GPU update.
void ParticlesSystem::Job(int32 index)
{
    PROFILE_CPU_NAMED("Particles.Job");
    PROFILE_MEM(Particles);
    auto effect = UpdateList[index];
    auto& instance = effect->Instance;
    const auto particleSystem = effect->ParticleSystem.Get();
    if (!particleSystem || !particleSystem->IsLoaded())
        return;

    // Skip the whole effect update until all emitters used by enabled tracks are loaded
    bool anyEmitterNotReady = false;
    for (int32 j = 0; j < particleSystem->Tracks.Count(); j++)
    {
        const auto& track = particleSystem->Tracks[j];
        if (track.Type != ParticleSystem::Track::Types::Emitter || track.Disabled)
            continue;
        auto emitter = particleSystem->Emitters[track.AsEmitter.Index].Get();
        if (!emitter || !emitter->IsLoaded())
        {
            anyEmitterNotReady = true;
            break;
        }
    }
    if (anyEmitterNotReady)
        return;
#if COMPILE_WITH_PROFILER && TRACY_ENABLE
    const StringView particleSystemName(particleSystem->GetPath());
    ZoneName(*particleSystemName, particleSystemName.Length());
#endif

    // Prepare instance data
    instance.Sync(particleSystem);

    bool updateBounds = false;
    bool updateGpu = false;

    // Simulation delta time can be based on a time since last update or the current delta time
    bool useTimeScale = effect->UseTimeScale;
#if USE_EDITOR
    if (!Editor::IsPlayMode)
        useTimeScale = false;
#endif
    float dt = useTimeScale ? DeltaTime : UnscaledDeltaTime;
    float t = useTimeScale ? Time : UnscaledTime;
    const float lastUpdateTime = instance.LastUpdateTime;
    if (lastUpdateTime > 0 && t > lastUpdateTime)
    {
        // Use the time elapsed since the last update of this instance
        dt = t - lastUpdateTime;
    }
    else if (lastUpdateTime < 0)
    {
        // Update bounds after first system update
        updateBounds = true;
    }
    // TODO: if using fixed timestep quantize the dt and accumulate remaining part for the next update?
    //if (dt <= 1.0f / 240.0f)
    //    return;
    dt *= effect->SimulationSpeed;
    instance.Time += dt;
    const float fps = particleSystem->FramesPerSecond;
    const float duration = (float)particleSystem->DurationFrames / fps;
    if (instance.Time > duration)
    {
        if (effect->IsLooping)
        {
            // Loop
            // TODO: accumulate (duration - instance.Time) into next update dt
            instance.Time = 0;
            for (int32 j = 0; j < instance.Emitters.Count(); j++)
            {
                auto& e = instance.Emitters[j];
                e.Time = 0;
                for (auto& s : e.SpawnModulesData)
                {
                    s.NextSpawnTime = 0.0f;
                }
            }
        }
        else
        {
            // End
            instance.Time = duration;
            for (auto& emitterInstance : instance.Emitters)
            {
                if (emitterInstance.Buffer)
                {
                    Particles::RecycleParticleBuffer(emitterInstance.Buffer);
                    emitterInstance.Buffer = nullptr;
                }
            }
            // Stop playing effect.
            effect->Stop();
            return;
        }
    }
    instance.LastUpdateTime = t;

    // Update all emitter tracks
    for (int32 j = 0; j < particleSystem->Tracks.Count(); j++)
    {
        const auto& track = particleSystem->Tracks[j];
        if (track.Type != ParticleSystem::Track::Types::Emitter || track.Disabled)
            continue;
        auto emitter = particleSystem->Emitters[track.AsEmitter.Index].Get();
        auto& data = instance.Emitters[track.AsEmitter.Index];
        ASSERT(emitter && emitter->IsLoaded());
        if (emitter->Capacity == 0 || emitter->Graph.Layout.Size == 0)
            continue;
        PROFILE_CPU_ASSET(emitter);

        // Calculate new time position
        const float startTime = (float)track.AsEmitter.StartFrame / fps;
        const float durationTime = (float)track.AsEmitter.DurationFrames / fps;
        const bool canSpawn = startTime <= instance.Time && instance.Time <= startTime + durationTime;

        // Update instance data
        data.Sync(effect->Instance, particleSystem, track.AsEmitter.Index);
        if (!data.Buffer)
        {
            data.Buffer = Particles::AcquireParticleBuffer(emitter);

            // Buffer creation can fail (AcquireParticleBuffer logs the error and returns null) so skip this emitter for this update
            if (!data.Buffer)
                continue;
        }
        data.Time += dt;

        // Update particles simulation
        switch (emitter->SimulationMode)
        {
        case ParticlesSimulationMode::CPU:
            emitter->GraphExecutorCPU.Update(emitter, effect, data, dt, canSpawn);
            updateBounds |= emitter->UseAutoBounds;
            break;
#if COMPILE_WITH_GPU_PARTICLES
        case ParticlesSimulationMode::GPU:
            emitter->GPU.Update(emitter, effect, data, dt, canSpawn);
            updateGpu = true;
            break;
#endif
        default:
            break;
        }
    }

    // Update bounds if any of the emitters uses auto-bounds
    if (updateBounds)
    {
        effect->UpdateBounds();
    }

#if COMPILE_WITH_GPU_PARTICLES
    // Register for GPU update
    if (updateGpu)
    {
        ScopeLock lock(GpuUpdateListLocker);
        GpuUpdateList.Add(effect);
    }
#endif
}

// Starts the particles update: takes the read lock on particle assets, caches the update tick timings
// and dispatches one job per effect in the update list. Does nothing when the update list is empty.
void ParticlesSystem::Execute(TaskGraph* graph)
{
    if (UpdateList.IsEmpty())
        return;
    Active = true;

    // Ensure no particle assets can be reloaded/modified during async update (released in PostExecute)
    Particles::SystemLocker.ReadLock();

    // Cache timings used by the jobs
    const auto& tick = Time::Update;
    DeltaTime = tick.DeltaTime.GetTotalSeconds();
    UnscaledDeltaTime = tick.UnscaledDeltaTime.GetTotalSeconds();
    Time = tick.Time.GetTotalSeconds();
    UnscaledTime = tick.UnscaledTime.GetTotalSeconds();

    // Schedule one job per effect to update
    Function<void(int32)> updateJob;
    updateJob.Bind<ParticlesSystem, &ParticlesSystem::Job>(this);
    graph->DispatchJob(updateJob, UpdateList.Count());
}

// Finishes the particles update started by Execute: releases the assets read lock, clears the update list,
// creates or toggles the GPU simulation render task and destroys pooled buffers unused for longer than the recycle timeout.
void ParticlesSystem::PostExecute(TaskGraph* graph)
{
    if (!Active)
        return;
    PROFILE_CPU_NAMED("Particles.PostExecute");
    PROFILE_MEM(Particles);

    // Cleanup
    Particles::SystemLocker.ReadUnlock();
    Active = false;
    UpdateList.Clear();

#if COMPILE_WITH_GPU_PARTICLES
    if (GpuRenderTask)
    {
        // Run the existing GPU task only when there is anything to simulate
        ScopeLock lock(RenderTask::TasksLocker);
        GpuRenderTask->Enabled = GpuUpdateList.HasItems();
    }
    else if (GpuUpdateList.HasItems())
    {
        // Create GPU render task if missing but required
        GpuRenderTask = New<RenderTask>();
        GpuRenderTask->Order = -10000000;
        GpuRenderTask->Render.Bind(UpdateGPU);
        ScopeLock lock(RenderTask::TasksLocker);
        RenderTask::Tasks.Add(GpuRenderTask);
    }
#endif

    // Destroy pooled buffers that were not reused within the timeout
    const auto now = Platform::GetTimeSeconds();
    PoolLocker.Lock();
    for (auto it = Pool.Begin(); it.IsNotEnd(); ++it)
    {
        auto& cachedBuffers = it->Value;
        int32 j = 0;
        while (j < cachedBuffers.Count())
        {
            auto& cached = cachedBuffers[j];
            if (now - cached.LastTimeUsed >= Particles::ParticleBufferRecycleTimeout)
            {
                // Expired; the next entry moves into this slot so keep the index
                Delete(cached.Buffer);
                cachedBuffers.RemoveAt(j);
            }
            else
            {
                j++;
            }
        }

        // Drop pool entries that have no buffers left
        if (cachedBuffers.IsEmpty())
            Pool.Remove(it);
    }
    PoolLocker.Unlock();
}