Add new GPU Query API that is lightweight and supports occlusion queries

2026-01-16 10:40:30 +01:00
parent d2d7a871ce
commit 9ac231c403
31 changed files with 829 additions and 254 deletions
--- a/Source/Engine/Graphics/Enums.h
+++ b/Source/Engine/Graphics/Enums.h
@@ -349,6 +349,24 @@ API_ENUM(Attributes="Flags") enum class GPUResourceMapMode

 DECLARE_ENUM_OPERATORS(GPUResourceMapMode);

+/// <summary>
+/// GPU query types.
+/// </summary>
+enum class GPUQueryType
+{
+    /// <summary>
+    /// Measures duration of GPU commands execution. Returns time in microseconds (1/1000 ms).
+    /// </summary>
+    Timer = 0,
+
+    /// <summary>
+    /// Tests object visibility by counting number of pixel samples that are not culled (by depth or stencil tests).
+    /// </summary>
+    Occlusion = 1,
+
+    MAX // Amount of query types (not a valid query type itself); used to size per-type arrays and as the 'unset' value.
+};
+
 /// <summary>
 /// Primitives types.
 /// </summary>
--- a/Source/Engine/Graphics/GPUContext.h
+++ b/Source/Engine/Graphics/GPUContext.h
@@ -28,6 +28,7 @@ class GPUBufferView;
 class GPUVertexLayout;
 struct GPUPass;
 enum class GPUResourceAccess;
+enum class GPUQueryType;

 // Gets the GPU texture view. Checks if pointer is not null and texture has one or more mip levels loaded.
 #define GET_TEXTURE_VIEW_SAFE(t) (t && t->ResidentMipLevels() > 0 ? t->View() : nullptr)
@@ -554,6 +555,20 @@ public:
    /// <param name="offsetForArgs">The aligned byte offset for arguments.</param>
    API_FUNCTION() virtual void DrawIndexedInstancedIndirect(GPUBuffer* bufferForArgs, uint32 offsetForArgs) = 0;

+public:
+    /// <summary>
+    /// Begins the GPU query that will measure commands until EndQuery.
+    /// </summary>
+    /// <param name="type">Query type.</param>
+    /// <returns>Unique identifier of the query used to EndQuery and then GetQueryResult to read the query result data.</returns>
+    virtual uint64 BeginQuery(GPUQueryType type) = 0;
+
+    /// <summary>
+    /// Ends the GPU query. Use GPUDevice::GetQueryResult to read the results back.
+    /// </summary>
+    /// <param name="queryID">Query identifier returned by BeginQuery.</param>
+    virtual void EndQuery(uint64 queryID) = 0;
+
 public:
    /// <summary>
    /// Sets the rendering viewport and scissor rectangle.
--- a/Source/Engine/Graphics/GPUDevice.h
+++ b/Source/Engine/Graphics/GPUDevice.h
@@ -370,6 +370,16 @@ public:
    /// </summary>
    virtual void WaitForGPU() = 0;

+    /// <summary>
+    /// Reads the query result from the GPU.
+    /// </summary>
+    /// <remarks>GPU query results are short-lived: read the result in the frame it becomes ready, because queries are reused and the result might not be available in the next frame.</remarks>
+    /// <param name="queryID">Query identifier returned by GPUContext::BeginQuery.</param>
+    /// <param name="result">The output result data of the query. Valid only when function returns true.</param>
+    /// <param name="wait">True to wait for the GPU to finish processing the commands so the result data is ready. Otherwise, if the query is incomplete, the function returns false without a result.</param>
+    /// <returns>True if got a valid query result, otherwise false. If called with wait enabled, false means that the device failed to read back the query data.</returns>
+    virtual bool GetQueryResult(uint64 queryID, uint64& result, bool wait = false) = 0;
+
 public:
    void AddResource(GPUResource* resource);
    void RemoveResource(GPUResource* resource);
--- a/Source/Engine/Graphics/GPUResource.h
+++ b/Source/Engine/Graphics/GPUResource.h
@@ -32,7 +32,7 @@ API_ENUM() enum class GPUResourceType
    PipelineState,
    // GPU binding descriptor
    Descriptor,
-    // GPU timer query
+    // GPU timer or occlusion query
    Query,
    // GPU texture sampler
    Sampler,
--- a/Source/Engine/Graphics/Graphics.Build.cs
+++ b/Source/Engine/Graphics/Graphics.Build.cs
@@ -40,6 +40,7 @@ public abstract class GraphicsDeviceBaseModule : EngineModule
 public class Graphics : EngineModule
 {
    private static bool _logMissingVulkanSDK;
+    private static bool _logMissingWindowsSDK;

    /// <inheritdoc />
    public override void Setup(BuildOptions options)
@@ -59,7 +60,7 @@ public class Graphics : EngineModule
            if (windowsToolchain != null && windowsToolchain.SDK != Flax.Build.Platforms.WindowsPlatformSDK.v8_1)
                options.PrivateDependencies.Add("GraphicsDeviceDX12");
            else
-                Log.WarningOnce(string.Format("Building for {0} without Vulkan rendering backend (Vulkan SDK is missing)", options.Platform.Target), ref _logMissingVulkanSDK);
+                Log.WarningOnce(string.Format("Building for {0} without D3D12 rendering backend (Windows SDK is missing)", options.Platform.Target), ref _logMissingWindowsSDK);
            break;
        case TargetPlatform.UWP:
            options.PrivateDependencies.Add("GraphicsDeviceDX11");
--- a/Source/Engine/GraphicsDevice/DirectX/DX11/GPUContextDX11.cpp
+++ b/Source/Engine/GraphicsDevice/DirectX/DX11/GPUContextDX11.cpp
@@ -566,6 +566,81 @@ void GPUContextDX11::DrawIndexedInstancedIndirect(GPUBuffer* bufferForArgs, uint
    RENDER_STAT_DRAW_CALL(0, 0);
 }

+uint64 GPUContextDX11::BeginQuery(GPUQueryType type)
+{
+    // Allocate a pooled query (query objects are pooled separately for each query type)
+    uint16 queryIndex;
+    static_assert(ARRAY_COUNT(_device->_readyQueries) == (int32)GPUQueryType::MAX, "Invalid query types count");
+    if (_device->_readyQueries[(int32)type].HasItems())
+    {
+        // Use query from cached list
+        queryIndex = _device->_readyQueries[(int32)type].Pop();
+    }
+    else
+    {
+        // Add a new query (main query object has to match the type: occlusion counter or the end timestamp of a timer)
+        queryIndex = _device->_queries.Count();
+        auto& query = _device->_queries.AddOne();
+        query.Type = type;
+        D3D11_QUERY_DESC queryDesc;
+        queryDesc.Query = type == GPUQueryType::Occlusion ? D3D11_QUERY_OCCLUSION : D3D11_QUERY_TIMESTAMP;
+        queryDesc.MiscFlags = 0;
+        HRESULT hr = _device->GetDevice()->CreateQuery(&queryDesc, &query.Query);
+        LOG_DIRECTX_RESULT_WITH_RETURN(hr, 0);
+        if (type == GPUQueryType::Timer)
+        {
+            // Timer queries need an additional begin timestamp and a disjoint query that wraps both timestamps
+            hr = _device->GetDevice()->CreateQuery(&queryDesc, &query.TimerBeginQuery);
+            LOG_DIRECTX_RESULT_WITH_RETURN(hr, 0);
+            queryDesc.Query = D3D11_QUERY_TIMESTAMP_DISJOINT;
+            hr = _device->GetDevice()->CreateQuery(&queryDesc, &query.DisjointQuery);
+            LOG_DIRECTX_RESULT_WITH_RETURN(hr, 0);
+        }
+    }
+    static_assert(sizeof(GPUQueryDX11) == sizeof(uint64), "Invalid query size.");
+    GPUQueryDX11 q = {};
+    q.Type = (uint16)type;
+    q.Index = queryIndex;
+    q.Padding = 1; // Ensure Raw is never 0, even for the first query
+
+    // Begin query
+    {
+        auto& query = _device->_queries[queryIndex];
+        ASSERT_LOW_LAYER(query.State == GPUQueryDataDX11::Ready);
+        ASSERT_LOW_LAYER(query.Type == type);
+        query.State = GPUQueryDataDX11::Active;
+        auto context = _device->GetIM();
+        if (type == GPUQueryType::Timer)
+        {
+            context->Begin(query.DisjointQuery);
+            context->End(query.TimerBeginQuery); // Timestamp queries are recorded with End only (this one marks the start time)
+        }
+        else
+        {
+            context->Begin(query.Query);
+        }
+    }
+
+    return q.Raw;
+}
+
+void GPUContextDX11::EndQuery(uint64 queryID)
+{
+    if (!queryID) // Zero is never a valid query ID (BeginQuery returns 0 on failure)
+        return;
+
+    // Unpack the query ID and end the main query (for timers this records the end timestamp)
+    GPUQueryDX11 q;
+    q.Raw = queryID;
+    auto& query = _device->_queries[q.Index];
+    auto context = _device->GetIM();
+    context->End(query.Query);
+    if (q.Type == (uint16)GPUQueryType::Timer)
+    {
+        context->End(query.DisjointQuery); // Close the disjoint query that was begun in BeginQuery
+    }
+}
+
 void GPUContextDX11::SetViewport(const Viewport& viewport)
 {
    _context->RSSetViewports(1, (D3D11_VIEWPORT*)&viewport);
--- a/Source/Engine/GraphicsDevice/DirectX/DX11/GPUContextDX11.h
+++ b/Source/Engine/GraphicsDevice/DirectX/DX11/GPUContextDX11.h
@@ -154,6 +154,8 @@ public:
    void DrawIndexedInstanced(uint32 indicesCount, uint32 instanceCount, int32 startInstance, int32 startVertex, int32 startIndex) override;
    void DrawInstancedIndirect(GPUBuffer* bufferForArgs, uint32 offsetForArgs) override;
    void DrawIndexedInstancedIndirect(GPUBuffer* bufferForArgs, uint32 offsetForArgs) override;
+    uint64 BeginQuery(GPUQueryType type) override;
+    void EndQuery(uint64 queryID) override;
    void SetViewport(const Viewport& viewport) override;
    void SetScissor(const Rectangle& scissorRect) override;
    GPUPipelineState* GetState() const override;
--- a/Source/Engine/GraphicsDevice/DirectX/DX11/GPUDeviceDX11.cpp
+++ b/Source/Engine/GraphicsDevice/DirectX/DX11/GPUDeviceDX11.cpp
@@ -175,6 +175,15 @@ GPUVertexLayoutDX11::GPUVertexLayoutDX11(GPUDeviceDX11* device, const Elements&
    }
 }

+void GPUQueryDataDX11::Release() // Releases all D3D11 query objects of this entry and resets its result and state
+{
+    SAFE_RELEASE(Query);
+    SAFE_RELEASE(TimerBeginQuery);
+    SAFE_RELEASE(DisjointQuery);
+    Result = 0;
+    State = Ready;
+}
+
 GPUDevice* GPUDeviceDX11::Create()
 {
    // Configuration
@@ -801,6 +810,11 @@ void GPUDeviceDX11::Dispose()
    {
        SAFE_RELEASE(RasterizerStates[i]);
    }
+    for (auto& query : _queries)
+        query.Release();
+    _queries.Clear();
+    for (auto& e : _readyQueries)
+        e.Clear();

    // Clear DirectX stuff
    SAFE_DELETE(_mainContext);
@@ -877,6 +891,88 @@ void GPUDeviceDX11::DrawEnd()
        infoQueue->ClearStoredMessages();
    }
 #endif
+
+    // Auto-return finished queries back to the pool
+    auto* queries = _queries.Get();
+    int32 queriesCount = _queries.Count();
+    for (int32 i = 0; i < queriesCount; i++)
+    {
+        auto& query = queries[i];
+        if (query.State == GPUQueryDataDX11::Finished)
+        {
+            query.State = GPUQueryDataDX11::Ready;
+            query.Result = 0;
+            _readyQueries[(int32)query.Type].Push(i);
+        }
+    }
+}
+
+bool GPUDeviceDX11::GetQueryResult(uint64 queryID, uint64& result, bool wait)
+{
+    if (!queryID) // Zero is never a valid query ID
+        return false;
+
+    GPUQueryDX11 q;
+    q.Raw = queryID;
+    auto& query = _queries[q.Index];
+    if (query.State == GPUQueryDataDX11::Finished)
+    {
+        // Result was already read back (cached in the query until it gets recycled back to the pool)
+        result = query.Result;
+        return true;
+    }
+    auto context = GetIM();
+
+RETRY:
+    bool hasData;
+    if (q.Type == (uint16)GPUQueryType::Timer)
+    {
+        D3D11_QUERY_DATA_TIMESTAMP_DISJOINT disjointData; // Holds the timestamp frequency and the disjoint flag
+        hasData = context->GetData(query.DisjointQuery, &disjointData, sizeof(disjointData), 0) == S_OK;
+        if (hasData)
+        {
+            UINT64 timeBegin = 0, timeEnd = 0;
+            context->GetData(query.TimerBeginQuery, &timeBegin, sizeof(timeBegin), 0);
+            context->GetData(query.Query, &timeEnd, sizeof(timeEnd), 0);
+
+            if (disjointData.Disjoint == FALSE) // Timestamps can be trusted; convert the ticks delta into microseconds
+            {
+                result = timeEnd > timeBegin ? (timeEnd - timeBegin) * 1000000ull / disjointData.Frequency : 0;
+            }
+            else
+            {
+                result = 0;
+#if !BUILD_RELEASE
+                static bool LogOnce = true;
+                if (LogOnce)
+                {
+                    LogOnce = false;
+                    LOG(Warning, "Unreliable GPU timer query detected.");
+                }
+#endif
+            }
+        }
+    }
+    else
+    {
+        hasData = context->GetData(query.Query, &result, sizeof(uint64), 0) == S_OK; // Non-timer query: raw 64-bit query value
+    }
+
+    if (!hasData && wait)
+    {
+        // Data is not ready yet, so yield and poll the query again (loops until the data arrives)
+        Platform::Yield();
+        goto RETRY;
+    }
+
+    if (hasData)
+    {
+        // Query has valid data now (until auto-recycle back to pool)
+        query.State = GPUQueryDataDX11::Finished;
+        query.Result = result;
+    }
+
+    return hasData;
 }

 GPUTexture* GPUDeviceDX11::CreateTexture(const StringView& name)
--- a/Source/Engine/GraphicsDevice/DirectX/DX11/GPUDeviceDX11.h
+++ b/Source/Engine/GraphicsDevice/DirectX/DX11/GPUDeviceDX11.h
@@ -15,6 +15,38 @@ enum class StencilOperation : byte;
 class GPUContextDX11;
 class GPUSwapChainDX11;

+/// <summary>
+/// GPU query ID packed into 64-bits (the value returned by BeginQuery on the DirectX 11 backend).
+/// </summary>
+struct GPUQueryDX11
+{
+    union
+    {
+        struct
+        {
+            uint16 Type; // GPUQueryType value of the query
+            uint16 Index; // Index of the query data in the device queries array
+            uint32 Padding; // Set to non-zero by BeginQuery so that Raw is never 0
+        };
+        uint64 Raw;
+    };
+};
+
+/// <summary>
+/// GPU query data (reusable via pooling). Holds the D3D11 query objects, the pooling state and the cached result.
+/// </summary>
+struct GPUQueryDataDX11
+{
+    ID3D11Query* Query = nullptr; // Main query (ended in EndQuery; for timers it holds the end timestamp)
+    ID3D11Query* TimerBeginQuery = nullptr; // Begin timestamp (timer queries only)
+    ID3D11Query* DisjointQuery = nullptr; // Timestamp-disjoint query begun/ended around the timer (timer queries only)
+    uint64 Result = 0; // Result cached by GetQueryResult once the data was read back
+    enum States { Ready, Active, Finished } State = Ready; // Ready = free to use, Active = begun, Finished = result read back
+    GPUQueryType Type = GPUQueryType::MAX;
+
+    void Release();
+};
+
 /// <summary>
 /// Implementation of Graphics Device for DirectX 11 backend.
 /// </summary>
@@ -60,6 +92,8 @@ private:
    GPUContextDX11* _mainContext = nullptr;
    bool _allowTearing = false;
    GPUBuffer* _dummyVB = nullptr;
+    Array<GPUQueryDataDX11> _queries;
+    Array<uint16> _readyQueries[2]; // Timer and Occlusion

    // Static Samplers
    ID3D11SamplerState* _samplerLinearClamp = nullptr;
@@ -124,6 +158,7 @@ public:
    void Dispose() override;
    void WaitForGPU() override;
    void DrawEnd() override;
+    bool GetQueryResult(uint64 queryID, uint64& result, bool wait = false) override;
    GPUTexture* CreateTexture(const StringView& name) override;
    GPUShader* CreateShader(const StringView& name) override;
    GPUPipelineState* CreatePipelineState() override;
--- a/Source/Engine/GraphicsDevice/DirectX/DX12/GPUContextDX12.cpp
+++ b/Source/Engine/GraphicsDevice/DirectX/DX12/GPUContextDX12.cpp
@@ -1275,6 +1275,31 @@ void GPUContextDX12::DrawIndexedInstancedIndirect(GPUBuffer* bufferForArgs, uint
    RENDER_STAT_DRAW_CALL(0, 0);
 }

+uint64 GPUContextDX12::BeginQuery(GPUQueryType type)
+{
+    auto query = _device->AllocQuery(type);
+    if (query.Raw) // Zero ID means that the query allocation failed
+    {
+        auto heap = _device->QueryHeaps[query.Heap];
+        if (type == GPUQueryType::Timer) // Timer queries call End twice on different elements (secondary here, primary in EndQuery) to calculate duration between GPU time clocks
+            _commandList->EndQuery(heap->QueryHeap, heap->QueryType, query.SecondaryElement);
+        else
+            _commandList->BeginQuery(heap->QueryHeap, heap->QueryType, query.Element);
+    }
+    return query.Raw;
+}
+
+void GPUContextDX12::EndQuery(uint64 queryID)
+{
+    if (queryID) // Ignore invalid (zero) query IDs
+    {
+        GPUQueryDX12 query;
+        query.Raw = queryID;
+        auto heap = _device->QueryHeaps[query.Heap];
+        _commandList->EndQuery(heap->QueryHeap, heap->QueryType, query.Element); // Ends the primary element for every query type
+    }
+}
+
 void GPUContextDX12::SetViewport(const Viewport& viewport)
 {
    _commandList->RSSetViewports(1, (D3D12_VIEWPORT*)&viewport);
--- a/Source/Engine/GraphicsDevice/DirectX/DX12/GPUContextDX12.h
+++ b/Source/Engine/GraphicsDevice/DirectX/DX12/GPUContextDX12.h
@@ -197,6 +197,8 @@ public:
    void DrawIndexedInstanced(uint32 indicesCount, uint32 instanceCount, int32 startInstance, int32 startVertex, int32 startIndex) override;
    void DrawInstancedIndirect(GPUBuffer* bufferForArgs, uint32 offsetForArgs) override;
    void DrawIndexedInstancedIndirect(GPUBuffer* bufferForArgs, uint32 offsetForArgs) override;
+    uint64 BeginQuery(GPUQueryType type) override;
+    void EndQuery(uint64 queryID) override;
    void SetViewport(const Viewport& viewport) override;
    void SetScissor(const Rectangle& scissorRect) override;
    GPUPipelineState* GetState() const override;
--- a/Source/Engine/GraphicsDevice/DirectX/DX12/GPUDeviceDX12.cpp
+++ b/Source/Engine/GraphicsDevice/DirectX/DX12/GPUDeviceDX12.cpp
@@ -555,7 +555,6 @@ GPUDeviceDX12::GPUDeviceDX12(IDXGIFactory4* dxgiFactory, GPUAdapterDX* adapter)
    , _commandQueue(nullptr)
    , _mainContext(nullptr)
    , UploadBuffer(this)
-    , TimestampQueryHeap(this, D3D12_QUERY_HEAP_TYPE_TIMESTAMP, DX12_BACK_BUFFER_COUNT * 1024)
    , Heap_CBV_SRV_UAV(this, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV, 4 * 1024, false)
    , Heap_RTV(this, D3D12_DESCRIPTOR_HEAP_TYPE_RTV, 1 * 1024, false)
    , Heap_DSV(this, D3D12_DESCRIPTOR_HEAP_TYPE_DSV, 64, false)
@@ -883,9 +882,6 @@ bool GPUDeviceDX12::Init()
        VALIDATE_DIRECTX_CALL(_device->CreateRootSignature(0, signatureBlob->GetBufferPointer(), signatureBlob->GetBufferSize(), IID_PPV_ARGS(&_rootSignature)));
    }

-    if (TimestampQueryHeap.Init())
-        return true;
-
    // Cached command signatures
    {
        DrawIndirectCommandSignature = New<CommandSignatureDX12>(this, 1);
@@ -927,8 +923,9 @@ void GPUDeviceDX12::RenderEnd()
    // Base
    GPUDeviceDX::RenderEnd();

-    // Resolve the timestamp queries
-    TimestampQueryHeap.EndQueryBatchAndResolveQueryData(_mainContext);
+    // Resolve the queries
+    for (auto heap : QueryHeaps)
+        heap->EndQueryBatchAndResolveQueryData(_mainContext);
 }

 GPUDeviceDX12::~GPUDeviceDX12()
@@ -957,11 +954,47 @@ ID3D12CommandQueue* GPUDeviceDX12::GetCommandQueueDX12() const
    return _commandQueue->GetCommandQueue();
 }

+GPUQueryDX12 GPUDeviceDX12::AllocQuery(GPUQueryType type)
+{
+    // Find an existing query heap of the matching type that has enough free space
+    int32 heapIndex = 0;
+    int32 count = GPUQueryDX12::GetQueriesCount(type); // Amount of heap elements needed by a single query of this type
+    for (; heapIndex < QueryHeaps.Count(); heapIndex++)
+    {
+        auto heap = QueryHeaps[heapIndex];
+        if (heap->Type == type && heap->CanAlloc(count))
+            break;
+    }
+    if (heapIndex == QueryHeaps.Count())
+    {
+        // No heap found, so allocate a new query heap (it gets appended at heapIndex)
+        auto heap = New<QueryHeapDX12>();
+        int32 size = type == GPUQueryType::Occlusion ? 4096 : 1024;
+        if (heap->Init(this, type, size)) // Init returns true on failure
+        {
+            Delete(heap);
+            return {}; // Zero-initialized query (Raw == 0) marks the failure
+        }
+        QueryHeaps.Add(heap);
+    }
+
+    // Alloc query elements from the heap and pack them into the query ID
+    GPUQueryDX12 query = {};
+    {
+        static_assert(sizeof(GPUQueryDX12) == sizeof(uint64), "Invalid DX12 query size.");
+        query.Type = (uint16)type;
+        query.Heap = heapIndex;
+        auto heap = QueryHeaps[heapIndex];
+        heap->Alloc(query.Element);
+        if (count == 2)
+            heap->Alloc(query.SecondaryElement);
+    }
+    return query;
+}
+
 void GPUDeviceDX12::Dispose()
 {
    GPUDeviceLock lock(this);
-
-    // Check if has been disposed already
    if (_state == DeviceState::Disposed)
        return;

@@ -982,7 +1015,12 @@ void GPUDeviceDX12::Dispose()
    for (auto& srv : _nullSrv)
        srv.Release();
    _nullUav.Release();
-    TimestampQueryHeap.Destroy();
+    for (auto* heap : QueryHeaps)
+    {
+        heap->Destroy();
+        Delete(heap);
+    }
+    QueryHeaps.Clear();
    DX_SAFE_RELEASE_CHECK(_rootSignature, 0);
    Heap_CBV_SRV_UAV.ReleaseGPU();
    Heap_RTV.ReleaseGPU();
@@ -1012,6 +1050,28 @@ void GPUDeviceDX12::WaitForGPU()
    _commandQueue->WaitForGPU();
 }

+bool GPUDeviceDX12::GetQueryResult(uint64 queryID, uint64& result, bool wait)
+{
+    if (!queryID) // Invalid query (eg. allocation failed in BeginQuery); same early-out as in the DX11 backend
+        return false;
+    GPUQueryDX12 query;
+    query.Raw = queryID;
+    auto heap = QueryHeaps[query.Heap];
+    int32 count = GPUQueryDX12::GetQueriesCount((GPUQueryType)query.Type); // Timer queries use 2 heap elements, others just 1
+    if (!wait && (!heap->IsReady(query.Element) || (count == 2 && !heap->IsReady(query.SecondaryElement)))) // Non-blocking read needs all used elements ready
+        return false;
+    if (query.Type == (uint16)GPUQueryType::Timer) // Duration between begin (secondary) and end (primary) timestamps in microseconds
+    {
+        uint64 timestampFrequency = 1;
+        const uint64 timeBegin = *(uint64*)heap->Resolve(query.SecondaryElement);
+        const uint64 timeEnd = *(uint64*)heap->Resolve(query.Element, &timestampFrequency);
+        result = timeEnd > timeBegin && timestampFrequency != 0 ? (timeEnd - timeBegin) * 1000000ull / timestampFrequency : 0;
+    }
+    else
+        result = *(uint64*)heap->Resolve(query.Element); // Raw 64-bit value resolved for the query element
+    return true;
+}
+
 GPUTexture* GPUDeviceDX12::CreateTexture(const StringView& name)
 {
    PROFILE_MEM(GraphicsTextures);
--- a/Source/Engine/GraphicsDevice/DirectX/DX12/GPUDeviceDX12.h
+++ b/Source/Engine/GraphicsDevice/DirectX/DX12/GPUDeviceDX12.h
@@ -65,21 +65,13 @@ public:
    ~GPUDeviceDX12();

 public:
-    /// <summary>
-    /// Data uploading utility via pages.
-    /// </summary>
    UploadBufferDX12 UploadBuffer;
-
-    /// <summary>
-    /// The timestamp queries heap.
-    /// </summary>
-    QueryHeapDX12 TimestampQueryHeap;
-
    bool AllowTearing = false;
    CommandSignatureDX12* DispatchIndirectCommandSignature = nullptr;
    CommandSignatureDX12* DrawIndexedIndirectCommandSignature = nullptr;
    CommandSignatureDX12* DrawIndirectCommandSignature = nullptr;
    GPUBuffer* DummyVB = nullptr;
+    Array<QueryHeapDX12*, InlinedAllocation<8>> QueryHeaps;

    D3D12_CPU_DESCRIPTOR_HANDLE NullSRV(D3D12_SRV_DIMENSION dimension) const;
    D3D12_CPU_DESCRIPTOR_HANDLE NullUAV() const;
@@ -136,6 +128,8 @@ public:
        return _mainContext;
    }

+    GPUQueryDX12 AllocQuery(GPUQueryType type);
+
 public:

    DescriptorHeapPoolDX12 Heap_CBV_SRV_UAV;
@@ -185,6 +179,7 @@ public:
    void RenderEnd() override;
    void Dispose() final override;
    void WaitForGPU() override;
+    bool GetQueryResult(uint64 queryID, uint64& result, bool wait = false) override;
    GPUTexture* CreateTexture(const StringView& name) override;
    GPUShader* CreateShader(const StringView& name) override;
    GPUPipelineState* CreatePipelineState() override;
--- a/Source/Engine/GraphicsDevice/DirectX/DX12/GPUTimerQueryDX12.cpp
+++ b/Source/Engine/GraphicsDevice/DirectX/DX12/GPUTimerQueryDX12.cpp
@@ -20,9 +20,7 @@ void GPUTimerQueryDX12::OnReleaseGPU()
 void GPUTimerQueryDX12::Begin()
 {
    const auto context = _device->GetMainContextDX12();
-    auto& heap = _device->TimestampQueryHeap;
-    heap.EndQuery(context, _begin);
-
+    _query = context->BeginQuery(GPUQueryType::Timer);
    _hasResult = false;
    _endCalled = false;
 }
@@ -31,14 +29,8 @@ void GPUTimerQueryDX12::End()
 {
    if (_endCalled)
        return;
-
    const auto context = _device->GetMainContextDX12();
-    auto& heap = _device->TimestampQueryHeap;
-    heap.EndQuery(context, _end);
-
-    const auto queue = _device->GetCommandQueue()->GetCommandQueue();
-    VALIDATE_DIRECTX_CALL(queue->GetTimestampFrequency(&_gpuFrequency));
-
+    context->EndQuery(_query);
    _endCalled = true;
 }

@@ -48,33 +40,16 @@ bool GPUTimerQueryDX12::HasResult()
        return false;
    if (_hasResult)
        return true;
-
-    auto& heap = _device->TimestampQueryHeap;
-    return heap.IsReady(_end) && heap.IsReady(_begin);
+    uint64 result;
+    return _device->GetQueryResult(_query, result, false);
 }

 float GPUTimerQueryDX12::GetResult()
 {
    if (_hasResult)
-    {
        return _timeDelta;
-    }
-
-    const uint64 timeBegin = *(uint64*)_device->TimestampQueryHeap.ResolveQuery(_begin);
-    const uint64 timeEnd = *(uint64*)_device->TimestampQueryHeap.ResolveQuery(_end);
-
-    // Calculate event duration in milliseconds
-    if (timeEnd > timeBegin)
-    {
-        const uint64 delta = timeEnd - timeBegin;
-        const double frequency = double(_gpuFrequency);
-        _timeDelta = static_cast<float>((delta / frequency) * 1000.0);
-    }
-    else
-    {
-        _timeDelta = 0.0f;
-    }
-
+    uint64 result;
+    _timeDelta = _device->GetQueryResult(_query, result, true) ? (float)((double)result / 1000.0) : 0.0f;
    _hasResult = true;
    return _timeDelta;
 }
--- a/Source/Engine/GraphicsDevice/DirectX/DX12/GPUTimerQueryDX12.h
+++ b/Source/Engine/GraphicsDevice/DirectX/DX12/GPUTimerQueryDX12.h
@@ -17,9 +17,7 @@ private:
    bool _hasResult = false;
    bool _endCalled = false;
    float _timeDelta = 0.0f;
-    uint64 _gpuFrequency = 0;
-    QueryHeapDX12::ElementHandle _begin;
-    QueryHeapDX12::ElementHandle _end;
+    uint64 _query = 0;

 public:

--- a/Source/Engine/GraphicsDevice/DirectX/DX12/QueryHeapDX12.cpp
+++ b/Source/Engine/GraphicsDevice/DirectX/DX12/QueryHeapDX12.cpp
@@ -7,42 +7,34 @@
 #include "GPUContextDX12.h"
 #include "../RenderToolsDX.h"

-QueryHeapDX12::QueryHeapDX12(GPUDeviceDX12* device, const D3D12_QUERY_HEAP_TYPE& queryHeapType, int32 queryHeapCount)
-    : _device(device)
-    , _queryHeap(nullptr)
-    , _resultBuffer(nullptr)
-    , _queryHeapType(queryHeapType)
-    , _currentIndex(0)
-    , _queryHeapCount(queryHeapCount)
+bool QueryHeapDX12::Init(GPUDeviceDX12* device, GPUQueryType type, uint32 size)
 {
-    if (queryHeapType == D3D12_QUERY_HEAP_TYPE_OCCLUSION)
-    {
-        _resultSize = sizeof(uint64);
-        _queryType = D3D12_QUERY_TYPE_OCCLUSION;
-    }
-    else if (queryHeapType == D3D12_QUERY_HEAP_TYPE_TIMESTAMP)
-    {
-        _resultSize = sizeof(uint64);
-        _queryType = D3D12_QUERY_TYPE_TIMESTAMP;
-    }
-    else
-    {
-        MISSING_CODE("Not support D3D12 query heap type.");
-    }
-}
-
-bool QueryHeapDX12::Init()
-{
-    _resultData.Resize(_resultSize * _queryHeapCount);
-
    // Create the query heap
-    D3D12_QUERY_HEAP_DESC heapDesc;
-    heapDesc.Type = _queryHeapType;
+    Type = type;
+    _device = device;
+    _queryHeapCount = size;
+    D3D12_QUERY_HEAP_DESC heapDesc = {};
    heapDesc.Count = _queryHeapCount;
    heapDesc.NodeMask = 0;
-    HRESULT result = _device->GetDevice()->CreateQueryHeap(&heapDesc, IID_PPV_ARGS(&_queryHeap));
+    switch (type)
+    {
+    case GPUQueryType::Timer:
+        _resultSize = sizeof(uint64);
+        QueryType = D3D12_QUERY_TYPE_TIMESTAMP;
+        heapDesc.Type = D3D12_QUERY_HEAP_TYPE_TIMESTAMP;
+        break;
+    case GPUQueryType::Occlusion:
+        _resultSize = sizeof(uint64);
+        QueryType = D3D12_QUERY_TYPE_OCCLUSION;
+        heapDesc.Type = D3D12_QUERY_HEAP_TYPE_OCCLUSION;
+        break;
+    case GPUQueryType::MAX:
+        return true;
+    }
+    _resultData.Resize(_resultSize * _queryHeapCount);
+    HRESULT result = _device->GetDevice()->CreateQueryHeap(&heapDesc, IID_PPV_ARGS(&QueryHeap));
    LOG_DIRECTX_RESULT_WITH_RETURN(result, true);
-    DX_SET_DEBUG_NAME(_queryHeap, "Query Heap");
+    DX_SET_DEBUG_NAME(QueryHeap, "Query Heap");

    // Create the result buffer
    D3D12_HEAP_PROPERTIES heapProperties;
@@ -77,8 +69,8 @@ bool QueryHeapDX12::Init()
 void QueryHeapDX12::Destroy()
 {
    SAFE_RELEASE(_resultBuffer);
-    SAFE_RELEASE(_queryHeap);
-    _currentBatch.Clear();
+    SAFE_RELEASE(QueryHeap);
+    _currentBatch = QueryBatch();
    _resultData.SetCapacity(0);
 }

@@ -92,45 +84,36 @@ void QueryHeapDX12::EndQueryBatchAndResolveQueryData(GPUContextDX12* context)
    _currentBatch.Open = false;

    // Resolve the batch
-    const int32 offset = _currentBatch.Start * _resultSize;
-    context->GetCommandList()->ResolveQueryData(_queryHeap, _queryType, _currentBatch.Start, _currentBatch.Count, _resultBuffer, offset);
-    _currentBatch.Sync = _device->GetCommandQueue()->GetSyncPoint();
+    const uint32 offset = _currentBatch.Start * _resultSize;
+    context->GetCommandList()->ResolveQueryData(QueryHeap, QueryType, _currentBatch.Start, _currentBatch.Count, _resultBuffer, offset);
+    const auto queue = _device->GetCommandQueue();
+    _currentBatch.Sync = queue->GetSyncPoint();
+
+    // Get GPU clock frequency for timer queries
+    if (Type == GPUQueryType::Timer)
+    {
+        VALIDATE_DIRECTX_CALL(queue->GetCommandQueue()->GetTimestampFrequency(&_currentBatch.TimestampFrequency));
+    }

    // Begin a new query batch
    _batches.Add(_currentBatch);
    StartQueryBatch();
 }

-void QueryHeapDX12::AllocQuery(GPUContextDX12* context, ElementHandle& handle)
+bool QueryHeapDX12::CanAlloc(int32 count) const
+{
+    return _currentBatch.Open && _currentIndex + count <= GetQueryHeapCount(); // Needs an open batch and enough elements left before the end of the heap
+}
+
+void QueryHeapDX12::Alloc(ElementHandle& handle)
 {
    ASSERT(_currentBatch.Open);

-    // Check if need to start from the buffer head
-    if (_currentIndex >= GetQueryHeapCount())
-    {
-        // We're in the middle of a batch, but we're at the end of the heap so split the batch in two
-        EndQueryBatchAndResolveQueryData(context);
-    }
-
    // Allocate element into the current batch
    handle = _currentIndex++;
    _currentBatch.Count++;
 }

-void QueryHeapDX12::BeginQuery(GPUContextDX12* context, ElementHandle& handle)
-{
-    AllocQuery(context, handle);
-
-    context->GetCommandList()->BeginQuery(_queryHeap, _queryType, handle);
-}
-
-void QueryHeapDX12::EndQuery(GPUContextDX12* context, ElementHandle& handle)
-{
-    AllocQuery(context, handle);
-
-    context->GetCommandList()->EndQuery(_queryHeap, _queryType, handle);
-}
-
 bool QueryHeapDX12::IsReady(ElementHandle& handle)
 {
    // Current batch is not ready (not ended)
@@ -150,7 +133,7 @@ bool QueryHeapDX12::IsReady(ElementHandle& handle)
    return true;
 }

-void* QueryHeapDX12::ResolveQuery(ElementHandle& handle)
+void* QueryHeapDX12::Resolve(ElementHandle& handle, uint64* timestampFrequency)
 {
    // Prevent queries from the current batch
    ASSERT(!_currentBatch.ContainsElement(handle));
@@ -192,10 +175,15 @@ void* QueryHeapDX12::ResolveQuery(ElementHandle& handle)
            // All elements got its results so we can remove this batch
            _batches.RemoveAt(i);

+            // Cache timestamps frequency for later
+            _timestampFrequency = batch.TimestampFrequency;
+
            break;
        }
    }

+    if (timestampFrequency)
+        *timestampFrequency = _timestampFrequency;
    return _resultData.Get() + handle * _resultSize;
 }

@@ -204,7 +192,7 @@ void QueryHeapDX12::StartQueryBatch()
    ASSERT(!_currentBatch.Open);

    // Clear the current batch
-    _currentBatch.Clear();
+    _currentBatch = QueryBatch();

    // Loop active index on overflow
    if (_currentIndex >= GetQueryHeapCount())
--- a/Source/Engine/GraphicsDevice/DirectX/DX12/QueryHeapDX12.h
+++ b/Source/Engine/GraphicsDevice/DirectX/DX12/QueryHeapDX12.h
@@ -10,6 +10,31 @@ class GPUContextDX12;
 class GPUBuffer;

 #include "CommandQueueDX12.h"
+#include "Engine/Graphics/Enums.h"
+
+/// <summary>
+/// GPU query ID packed into 64-bits (the value returned by BeginQuery on the DirectX 12 backend).
+/// </summary>
+struct GPUQueryDX12
+{
+    union
+    {
+        struct
+        {
+            uint16 Type; // GPUQueryType value of the query
+            uint16 Heap; // Index of the query heap in the device QueryHeaps array
+            uint16 Element; // Primary heap element (ended in EndQuery)
+            uint16 SecondaryElement; // Secondary heap element (allocated only for queries that use 2 elements)
+        };
+        uint64 Raw;
+    };
+
+    static int32 GetQueriesCount(GPUQueryType type)
+    {
+        // Timer queries need two elements (begin and end timestamps) as duration is the difference between them
+        return type == GPUQueryType::Timer ? 2 : 1;
+    }
+};

 /// <summary>
 /// GPU queries heap for DirectX 12 backend.
@@ -17,14 +42,12 @@ class GPUBuffer;
 class QueryHeapDX12
 {
 public:
-
    /// <summary>
    /// The query element handle.
    /// </summary>
-    typedef int32 ElementHandle;
+    typedef uint16 ElementHandle;

 private:
-
    struct QueryBatch
    {
        /// <summary>
@@ -35,71 +58,54 @@ private:
        /// <summary>
        /// The first element in the batch (inclusive).
        /// </summary>
-        int32 Start = 0;
+        uint32 Start = 0;

        /// <summary>
        /// The amount of elements added to this batch.
        /// </summary>
-        int32 Count = 0;
+        uint32 Count = 0;
+
+        /// <summary>
+        /// The GPU clock frequency for timer queries.
+        /// </summary>
+        uint64 TimestampFrequency = 0;

        /// <summary>
        /// Is the batch still open for more begin/end queries.
        /// </summary>
        bool Open = false;

-        /// <summary>
-        /// Clears this batch.
-        /// </summary>
-        inline void Clear()
-        {
-            Sync = SyncPointDX12();
-            Start = 0;
-            Count = 0;
-            Open = false;
-        }
-
        /// <summary>
        /// Checks if this query batch contains a given element contains the element.
        /// </summary>
        /// <param name="elementIndex">The index of the element.</param>
        /// <returns>True if element is in this query, otherwise false.</returns>
-        bool ContainsElement(int32 elementIndex) const
+        bool ContainsElement(uint32 elementIndex) const
        {
            return elementIndex >= Start && elementIndex < Start + Count;
        }
    };

 private:
-
-    GPUDeviceDX12* _device;
-    ID3D12QueryHeap* _queryHeap;
-    ID3D12Resource* _resultBuffer;
-    D3D12_QUERY_TYPE _queryType;
-    D3D12_QUERY_HEAP_TYPE _queryHeapType;
-    int32 _currentIndex;
-    int32 _resultSize;
-    int32 _queryHeapCount;
+    GPUDeviceDX12* _device = nullptr;
+    ID3D12Resource* _resultBuffer = nullptr;
+    uint32 _currentIndex = 0;
+    uint32 _resultSize = 0;
+    uint32 _queryHeapCount = 0;
    QueryBatch _currentBatch;
    Array<QueryBatch> _batches;
    Array<byte> _resultData;
+    uint64 _timestampFrequency;

 public:
-
-    /// <summary>
-    /// Initializes a new instance of the <see cref="QueryHeapDX12"/> class.
-    /// </summary>
-    /// <param name="device">The device.</param>
-    /// <param name="queryHeapType">Type of the query heap.</param>
-    /// <param name="queryHeapCount">The query heap count.</param>
-    QueryHeapDX12(GPUDeviceDX12* device, const D3D12_QUERY_HEAP_TYPE& queryHeapType, int32 queryHeapCount);
-
-public:
-
    /// <summary>
    /// Initializes this instance.
    /// </summary>
+    /// <param name="device">The device.</param>
+    /// <param name="type">Type of the query heap.</param>
+    /// <param name="size">The size of the heap.</param>
    ///	<returns>True if failed, otherwise false.</returns>
-    bool Init();
+    bool Init(GPUDeviceDX12* device, GPUQueryType type, uint32 size);

    /// <summary>
    /// Destroys this instance.
@@ -107,12 +113,14 @@ public:
    void Destroy();

 public:
+    GPUQueryType Type;
+    ID3D12QueryHeap* QueryHeap = nullptr;
+    D3D12_QUERY_TYPE QueryType = D3D12_QUERY_TYPE_OCCLUSION;

    /// <summary>
    /// Gets the query heap capacity.
    /// </summary>
-    /// <returns>The queries count.</returns>
-    FORCE_INLINE int32 GetQueryHeapCount() const
+    FORCE_INLINE uint32 GetQueryHeapCount() const
    {
        return _queryHeapCount;
    }
@@ -120,8 +128,7 @@ public:
    /// <summary>
    /// Gets the size of the result value (in bytes).
    /// </summary>
-    /// <returns>The size of the query result value (in bytes).</returns>
-    FORCE_INLINE int32 GetResultSize() const
+    FORCE_INLINE uint32 GetResultSize() const
    {
        return _resultSize;
    }
@@ -129,40 +136,30 @@ public:
    /// <summary>
    /// Gets the result buffer (CPU readable via Map/Unmap).
    /// </summary>
-    /// <returns>The query results buffer.</returns>
    FORCE_INLINE ID3D12Resource* GetResultBuffer() const
    {
        return _resultBuffer;
    }

 public:
-
    /// <summary>
    /// Stops tracking the current batch of begin/end query calls that will be resolved together. This implicitly starts a new batch.
    /// </summary>
    /// <param name="context">The context.</param>
    void EndQueryBatchAndResolveQueryData(GPUContextDX12* context);

+    /// <summary>
+    /// Checks if can alloc a new query (without rolling the existing batch).
+    /// </summary>
+    /// <param name="count">How many elements to allocate?</param>
+    /// <returns>True if can alloc new query within the same batch.</returns>
+    bool CanAlloc(int32 count = 1) const;
+
    /// <summary>
    /// Allocates the query heap element.
    /// </summary>
-    /// <param name="context">The context.</param>
    /// <param name="handle">The result handle.</param>
-    void AllocQuery(GPUContextDX12* context, ElementHandle& handle);
-
-    /// <summary>
-    /// Calls BeginQuery on command list for the given query heap slot.
-    /// </summary>
-    /// <param name="context">The context.</param>
-    /// <param name="handle">The query handle.</param>
-    void BeginQuery(GPUContextDX12* context, ElementHandle& handle);
-
-    /// <summary>
-    /// Calls EndQuery on command list for the given query heap slot.
-    /// </summary>
-    /// <param name="context">The context.</param>
-    /// <param name="handle">The query handle.</param>
-    void EndQuery(GPUContextDX12* context, ElementHandle& handle);
+    void Alloc(ElementHandle& handle);

    /// <summary>
    /// Determines whether the specified query handle is ready to read data (command list has been executed by the GPU).
@@ -175,11 +172,11 @@ public:
    /// Resolves the query (or skips if already resolved).
    /// </summary>
    /// <param name="handle">The result handle.</param>
+    /// <param name="timestampFrequency">The optional pointer to GPU timestamps frequency value to store.</param>
    ///	<returns>The pointer to the resolved query data.</returns>
-    void* ResolveQuery(ElementHandle& handle);
+    void* Resolve(ElementHandle& handle, uint64* timestampFrequency = nullptr);

 private:
-
    /// <summary>
    /// Starts tracking a new batch of begin/end query calls that will be resolved together
    /// </summary>
--- a/Source/Engine/GraphicsDevice/Null/GPUContextNull.h
+++ b/Source/Engine/GraphicsDevice/Null/GPUContextNull.h
@@ -160,6 +160,15 @@ public:
    {
    }

+    uint64 BeginQuery(GPUQueryType type) override
+    {
+        return 0; // Nothing is recorded by this backend; every query gets the ID 0
+    }
+
+    void EndQuery(uint64 queryID) override // Intentionally empty: this backend does nothing with the ID
+    {
+    }
+
    void SetViewport(const Viewport& viewport) override
    {
    }
--- a/Source/Engine/GraphicsDevice/Null/GPUDeviceNull.cpp
+++ b/Source/Engine/GraphicsDevice/Null/GPUDeviceNull.cpp
@@ -144,6 +144,11 @@ void GPUDeviceNull::WaitForGPU()
 {
 }

+bool GPUDeviceNull::GetQueryResult(uint64 queryID, uint64& result, bool wait)
+{
+    return false; // No query data is ever available here; result is left untouched
+}
+
 GPUTexture* GPUDeviceNull::CreateTexture(const StringView& name)
 {
    PROFILE_MEM(GraphicsTextures);
--- a/Source/Engine/GraphicsDevice/Null/GPUDeviceNull.h
+++ b/Source/Engine/GraphicsDevice/Null/GPUDeviceNull.h
@@ -20,18 +20,15 @@ class GPUDeviceNull : public GPUDevice
    friend GPUSwapChainNull;

 private:
-
    GPUContextNull* _mainContext;
    GPUAdapterNull* _adapter;

 public:
-
    static GPUDevice* Create();
    GPUDeviceNull();
    ~GPUDeviceNull();

 public:
-
    // [GPUDevice]
    GPUContext* GetMainContext() override;
    GPUAdapter* GetAdapter() const override;
@@ -41,6 +38,7 @@ public:
    void Draw() override;
    void Dispose() override;
    void WaitForGPU() override;
+    bool GetQueryResult(uint64 queryID, uint64& result, bool wait = false) override;
    GPUTexture* CreateTexture(const StringView& name) override;
    GPUShader* CreateShader(const StringView& name) override;
    GPUPipelineState* CreatePipelineState() override;
--- a/Source/Engine/GraphicsDevice/Vulkan/CmdBufferVulkan.cpp
+++ b/Source/Engine/GraphicsDevice/Vulkan/CmdBufferVulkan.cpp
@@ -6,7 +6,7 @@
 #include "RenderToolsVulkan.h"
 #include "QueueVulkan.h"
 #include "GPUContextVulkan.h"
-#if VULKAN_USE_QUERIES
+#if VULKAN_USE_TIMER_QUERIES
 #include "GPUTimerQueryVulkan.h"
 #endif
 #include "DescriptorSetVulkan.h"
@@ -243,6 +243,7 @@ void CmdBufferPoolVulkan::RefreshFenceStatus(const CmdBufferVulkan* skipCmdBuffe

 CmdBufferManagerVulkan::CmdBufferManagerVulkan(GPUDeviceVulkan* device, GPUContextVulkan* context)
    : _device(device)
+    , _context(context)
    , _pool(device)
    , _queue(context->GetQueue())
    , _activeCmdBuffer(nullptr)
@@ -259,12 +260,28 @@ void CmdBufferManagerVulkan::SubmitActiveCmdBuffer(SemaphoreVulkan* signalSemaph
        if (_activeCmdBuffer->IsInsideRenderPass())
            _activeCmdBuffer->EndRenderPass();

-#if VULKAN_USE_QUERIES
-        // Pause all active queries
-        for (int32 i = 0; i < _queriesInProgress.Count(); i++)
+#if VULKAN_USE_TIMER_QUERIES && GPU_VULKAN_PAUSE_QUERIES
+        // Pause all active timer queries
+        auto queries = _activeTimerQueries.Get();
+#if GPU_VULKAN_QUERY_NEW
+        for (int32 i = 0; i < _activeTimerQueries.Count(); i++)
        {
-            _queriesInProgress.Get()[i]->Interrupt(_activeCmdBuffer);
+            GPUQueryVulkan query;
+            query.Raw = queries[i];
+
+            // End active query to get time from start until submission
+            auto pool = _device->QueryPools[query.PoolIndex];
+            vkCmdWriteTimestamp(_activeCmdBuffer->GetHandle(), VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, pool->GetHandle(), query.SecondQueryIndex);
+            pool->MarkQueryAsStarted(query.SecondQueryIndex);
+            // TODO: somehow handle ending this query properly by stopping split query instead
+            //_context->EndQuery(query.Raw);
+
+            // TODO: reimplement timer queries pause/resume to be more exact?
        }
+#else
+        for (int32 i = 0; i < _activeTimerQueries.Count(); i++)
+            queries[i]->Interrupt(_activeCmdBuffer); // Entries are query pointers here: index first, then dereference
+#endif
 #endif

        _activeCmdBuffer->End();
@@ -317,27 +334,37 @@ void CmdBufferManagerVulkan::PrepareForNewActiveCommandBuffer()

    _activeCmdBuffer->Begin();

-#if VULKAN_USE_QUERIES
-    // Resume any paused queries with the new command buffer
-    for (int32 i = 0; i < _queriesInProgress.Count(); i++)
+#if VULKAN_USE_TIMER_QUERIES && GPU_VULKAN_PAUSE_QUERIES
+    // Resume any paused timer queries with the new command buffer
+    auto queries = _activeTimerQueries.Get();
+#if GPU_VULKAN_QUERY_NEW
+    for (int32 i = 0; i < _activeTimerQueries.Count(); i++)
    {
-        _queriesInProgress.Get()[i]->Resume(_activeCmdBuffer);
+        GPUQueryVulkan query;
+        query.Raw = queries[i];
+        //_activeTimerQueries.Get()[i]->Resume(_activeCmdBuffer);
+    }
+#else
+    for (int32 i = 0; i < _activeTimerQueries.Count(); i++)
+    {
+        queries[i]->Resume(_activeCmdBuffer); // Entries are query pointers here: index first, then dereference
    }
 #endif
+#endif
 }

-void CmdBufferManagerVulkan::OnQueryBegin(GPUTimerQueryVulkan* query)
+#if VULKAN_USE_TIMER_QUERIES && GPU_VULKAN_PAUSE_QUERIES // Same condition as the declarations in CmdBufferVulkan.h
+
+void CmdBufferManagerVulkan::OnTimerQueryBegin(QueryType query)
 {
-#if VULKAN_USE_QUERIES
-    _queriesInProgress.Add(query);
-#endif
+    _activeTimerQueries.Add(query);
 }

-void CmdBufferManagerVulkan::OnQueryEnd(GPUTimerQueryVulkan* query)
+void CmdBufferManagerVulkan::OnTimerQueryEnd(QueryType query)
 {
-#if VULKAN_USE_QUERIES
-    _queriesInProgress.Remove(query);
-#endif
+    _activeTimerQueries.Remove(query);
 }

 #endif
+
+#endif
--- a/Source/Engine/GraphicsDevice/Vulkan/CmdBufferVulkan.h
+++ b/Source/Engine/GraphicsDevice/Vulkan/CmdBufferVulkan.h
@@ -168,10 +168,18 @@ class CmdBufferManagerVulkan
 {
 private:
    GPUDeviceVulkan* _device;
+    GPUContextVulkan* _context;
    CmdBufferPoolVulkan _pool;
    QueueVulkan* _queue;
    CmdBufferVulkan* _activeCmdBuffer;
-    Array<GPUTimerQueryVulkan*> _queriesInProgress;
+#if VULKAN_USE_TIMER_QUERIES && GPU_VULKAN_PAUSE_QUERIES
+#if GPU_VULKAN_QUERY_NEW
+    typedef uint64 QueryType;
+#else
+    typedef GPUTimerQueryVulkan* QueryType;
+#endif
+    Array<QueryType> _activeTimerQueries;
+#endif

 public:
    CmdBufferManagerVulkan(GPUDeviceVulkan* device, GPUContextVulkan* context);
@@ -192,11 +200,6 @@ public:
        return _activeCmdBuffer != nullptr;
    }

-    FORCE_INLINE bool HasQueriesInProgress() const
-    {
-        return _queriesInProgress.Count() != 0;
-    }
-
    FORCE_INLINE CmdBufferVulkan* GetCmdBuffer()
    {
        if (!_activeCmdBuffer)
@@ -207,14 +210,16 @@ public:
 public:
    void SubmitActiveCmdBuffer(SemaphoreVulkan* signalSemaphore = nullptr);
    void WaitForCmdBuffer(CmdBufferVulkan* cmdBuffer, float timeInSecondsToWait = 1.0f);
-    void RefreshFenceStatus(CmdBufferVulkan* skipCmdBuffer = nullptr)
+    void RefreshFenceStatus(const CmdBufferVulkan* skipCmdBuffer = nullptr)
    {
        _pool.RefreshFenceStatus(skipCmdBuffer);
    }
    void PrepareForNewActiveCommandBuffer();

-    void OnQueryBegin(GPUTimerQueryVulkan* query);
-    void OnQueryEnd(GPUTimerQueryVulkan* query);
+#if VULKAN_USE_TIMER_QUERIES && GPU_VULKAN_PAUSE_QUERIES
+    void OnTimerQueryBegin(QueryType query);
+    void OnTimerQueryEnd(QueryType query);
+#endif
 };

 #endif
--- a/Source/Engine/GraphicsDevice/Vulkan/Config.h
+++ b/Source/Engine/GraphicsDevice/Vulkan/Config.h
@@ -45,8 +45,14 @@
 #endif
 #endif

-#ifndef VULKAN_USE_QUERIES
-#define VULKAN_USE_QUERIES 1
+#ifndef VULKAN_USE_TIMER_QUERIES
+#define VULKAN_USE_TIMER_QUERIES 1
 #endif

+// Toggles GPUTimerQueryVulkan to use BeginQuery/EndQuery via GPUContext rather than the old custom implementation
+#define GPU_VULKAN_QUERY_NEW 1
+
+// Toggles pausing and resuming all GPU timer queries when command buffer is being flushed (for more exact timings)
+#define GPU_VULKAN_PAUSE_QUERIES 0
+
 #endif
--- a/Source/Engine/GraphicsDevice/Vulkan/GPUContextVulkan.cpp
+++ b/Source/Engine/GraphicsDevice/Vulkan/GPUContextVulkan.cpp
@@ -1300,6 +1300,72 @@ void GPUContextVulkan::DrawIndexedInstancedIndirect(GPUBuffer* bufferForArgs, ui
    RENDER_STAT_DRAW_CALL(0, 0);
 }

+uint64 GPUContextVulkan::BeginQuery(GPUQueryType type)
+{
+    // Timer queries need device timestamp support; the 0 returned here is the empty ID that EndQuery ignores
+    if (type == GPUQueryType::Timer && _device->PhysicalDeviceLimits.timestampComputeAndGraphics != VK_TRUE)
+        return 0;
+
+    auto poolIndex = _device->GetOrCreateQueryPool(type); // Allocate the first query slot from a pool of this type
+    auto pool = _device->QueryPools[poolIndex];
+    uint32 index = 0;
+    const auto cmdBuffer = _cmdBufferManager->GetCmdBuffer();
+    if (!pool->AcquireQuery(cmdBuffer, index))
+        return 0;
+    GPUQueryVulkan query;
+    query.PoolIndex = (uint16)poolIndex;
+    query.QueryIndex = (uint16)index;
+    query.SecondQueryIndex = 0;
+    query.Dummy = 1; // Ensure Raw is never 0, even for the first query
+
+    switch (type) // Begin query
+    {
+    case GPUQueryType::Timer:
+        // Timer queries need 2 slots (begin + end); without a second one, give the first back instead of aliasing it
+        if (!pool->AcquireQuery(cmdBuffer, index))
+        {
+            pool->ReleaseQuery(query.QueryIndex);
+            return 0;
+        }
+        query.SecondQueryIndex = (uint16)index;
+        vkCmdWriteTimestamp(cmdBuffer->GetHandle(), VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, pool->GetHandle(), query.QueryIndex);
+#if VULKAN_USE_TIMER_QUERIES && GPU_VULKAN_PAUSE_QUERIES
+        _cmdBufferManager->OnTimerQueryBegin(query.Raw);
+#endif
+        break;
+    case GPUQueryType::Occlusion:
+        vkCmdBeginQuery(cmdBuffer->GetHandle(), pool->GetHandle(), query.QueryIndex, 0);
+        break;
+    }
+    pool->MarkQueryAsStarted(query.QueryIndex);
+    return query.Raw;
+}
+
+void GPUContextVulkan::EndQuery(uint64 queryID)
+{
+    if (!queryID) // 0 is the empty ID that BeginQuery returns when no query was started
+        return;
+    GPUQueryVulkan query;
+    query.Raw = queryID;
+    auto pool = _device->QueryPools[query.PoolIndex];
+
+    // End query
+    const auto cmdBuffer = _cmdBufferManager->GetCmdBuffer();
+    switch (pool->Type)
+    {
+    case GPUQueryType::Timer:
+        vkCmdWriteTimestamp(cmdBuffer->GetHandle(), VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, pool->GetHandle(), query.SecondQueryIndex); // End timestamp goes into the second slot
+        pool->MarkQueryAsStarted(query.SecondQueryIndex);
+#if VULKAN_USE_TIMER_QUERIES && GPU_VULKAN_PAUSE_QUERIES
+        _cmdBufferManager->OnTimerQueryEnd(query.Raw);
+#endif
+        break;
+    case GPUQueryType::Occlusion:
+        vkCmdEndQuery(cmdBuffer->GetHandle(), pool->GetHandle(), query.QueryIndex);
+        break;
+    }
+}
+
 void GPUContextVulkan::SetViewport(const Viewport& viewport)
 {
    vkCmdSetViewport(_cmdBufferManager->GetCmdBuffer()->GetHandle(), 0, 1, (VkViewport*)&viewport);
--- a/Source/Engine/GraphicsDevice/Vulkan/GPUContextVulkan.h
+++ b/Source/Engine/GraphicsDevice/Vulkan/GPUContextVulkan.h
@@ -189,6 +189,8 @@ public:
    void DrawIndexedInstanced(uint32 indicesCount, uint32 instanceCount, int32 startInstance, int32 startVertex, int32 startIndex) override;
    void DrawInstancedIndirect(GPUBuffer* bufferForArgs, uint32 offsetForArgs) override;
    void DrawIndexedInstancedIndirect(GPUBuffer* bufferForArgs, uint32 offsetForArgs) override;
+    uint64 BeginQuery(GPUQueryType type) override;
+    void EndQuery(uint64 queryID) override;
    void SetViewport(const Viewport& viewport) override;
    void SetScissor(const Rectangle& scissorRect) override;
    GPUPipelineState* GetState() const override;
--- a/Source/Engine/GraphicsDevice/Vulkan/GPUDeviceVulkan.cpp
+++ b/Source/Engine/GraphicsDevice/Vulkan/GPUDeviceVulkan.cpp
@@ -627,14 +627,14 @@ RenderPassVulkan::~RenderPassVulkan()
    Device->DeferredDeletionQueue.EnqueueResource(DeferredDeletionQueueVulkan::Type::RenderPass, Handle);
 }

-QueryPoolVulkan::QueryPoolVulkan(GPUDeviceVulkan* device, int32 capacity, VkQueryType type)
+QueryPoolVulkan::QueryPoolVulkan(GPUDeviceVulkan* device, int32 capacity, GPUQueryType type)
    : _device(device)
    , _handle(VK_NULL_HANDLE)
-    , _type(type)
+    , Type(type)
 {
    VkQueryPoolCreateInfo createInfo;
    RenderToolsVulkan::ZeroStruct(createInfo, VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
-    createInfo.queryType = type;
+    createInfo.queryType = type == GPUQueryType::Occlusion ? VK_QUERY_TYPE_OCCLUSION : VK_QUERY_TYPE_TIMESTAMP;
    createInfo.queryCount = capacity;
    VALIDATE_VULKAN_RESULT(vkCreateQueryPool(device->Device, &createInfo, nullptr, &_handle));

@@ -667,7 +667,7 @@ void QueryPoolVulkan::Reset(CmdBufferVulkan* cmdBuffer)

 #endif

-BufferedQueryPoolVulkan::BufferedQueryPoolVulkan(GPUDeviceVulkan* device, int32 capacity, VkQueryType type)
+BufferedQueryPoolVulkan::BufferedQueryPoolVulkan(GPUDeviceVulkan* device, int32 capacity, GPUQueryType type)
    : QueryPoolVulkan(device, capacity, type)
    , _lastBeginIndex(0)
 {
@@ -720,6 +720,16 @@ void BufferedQueryPoolVulkan::ReleaseQuery(uint32 queryIndex)
            _lastBeginIndex = (uint32)queryIndex;
        }
    }
+    if (_usedQueryBits[word] == 0)
+    {
+        // Check if pool got empty and reset the pointer back to start
+        for (int32 wordIndex = 0; wordIndex < _usedQueryBits.Count(); wordIndex++)
+        {
+            if (_usedQueryBits[wordIndex])
+                return;
+        }
+        _lastBeginIndex = 0;
+    }
 }

 void BufferedQueryPoolVulkan::MarkQueryAsStarted(uint32 queryIndex)
@@ -729,7 +739,7 @@ void BufferedQueryPoolVulkan::MarkQueryAsStarted(uint32 queryIndex)
    _startedQueryBits[word] = _startedQueryBits[word] | bit;
 }

-bool BufferedQueryPoolVulkan::GetResults(GPUContextVulkan* context, uint32 index, uint64& result)
+bool BufferedQueryPoolVulkan::GetResults(uint32 index, uint64& result)
 {
    const uint64 bit = (uint64)(index % 64);
    const uint64 bitMask = (uint64)1 << bit;
@@ -1228,22 +1238,20 @@ GPUDeviceVulkan::~GPUDeviceVulkan()
    GPUDeviceVulkan::Dispose();
 }

-BufferedQueryPoolVulkan* GPUDeviceVulkan::FindAvailableQueryPool(VkQueryType queryType)
+int32 GPUDeviceVulkan::GetOrCreateQueryPool(GPUQueryType type)
 {
-    auto& pools = queryType == VK_QUERY_TYPE_OCCLUSION ? OcclusionQueryPools : TimestampQueryPools;
-
-    // Try to use pool with available space inside
-    for (int32 i = 0; i < pools.Count(); i++)
+    auto pools = QueryPools.Get();
+    for (int32 i = 0; i < QueryPools.Count(); i++)
    {
-        auto pool = pools.Get()[i];
-        if (pool->HasRoom())
-            return pool;
+        auto pool = pools[i];
+        if (pool->Type == type && pool->HasRoom())
+            return i;
    }

-    // Create new pool
-    const auto pool = New<BufferedQueryPoolVulkan>(this, queryType == VK_QUERY_TYPE_OCCLUSION ? 4096 : 1024, queryType);
-    pools.Add(pool);
-    return pool;
+    PROFILE_CPU_NAMED("Create Query Pool"); // No pool of this type has room, so a new one is created below
+    auto pool = New<BufferedQueryPoolVulkan>(this, type == GPUQueryType::Occlusion ? 4096 : 1024, type);
+    QueryPools.Add(pool);
+    return QueryPools.Count() - 1;
 }

 RenderPassVulkan* GPUDeviceVulkan::GetOrCreateRenderPass(RenderTargetLayoutVulkan& layout)
@@ -1752,6 +1760,10 @@ bool GPUDeviceVulkan::Init()
        limits.MaximumTexture3DSize = PhysicalDeviceLimits.maxImageDimension3D;
        limits.MaximumTextureCubeSize = PhysicalDeviceLimits.maxImageDimensionCube;
        limits.MaximumSamplerAnisotropy = PhysicalDeviceLimits.maxSamplerAnisotropy;
+        if (PhysicalDeviceLimits.timestampComputeAndGraphics != VK_TRUE)
+        {
+            LOG(Warning, "Timer Queries are unsupported by this device");
+        }

        for (int32 i = 0; i < static_cast<int32>(PixelFormat::MAX); i++)
        {
@@ -1982,6 +1994,16 @@ void GPUDeviceVulkan::DrawBegin()
    // Base
    GPUDevice::DrawBegin();

+    // Put back used queries to the pool
+    for (auto& query : QueriesToRelease)
+    {
+        auto pool = QueryPools[query.PoolIndex];
+        pool->ReleaseQuery(query.QueryIndex);
+        if (pool->Type == GPUQueryType::Timer)
+            pool->ReleaseQuery(query.SecondQueryIndex);
+    }
+    QueriesToRelease.Clear();
+
    // Flush resources
    DeferredDeletionQueue.ReleaseResources();
    DescriptorPoolsManager->GC();
@@ -2022,8 +2044,7 @@ void GPUDeviceVulkan::Dispose()
    _layouts.ClearDelete();
    HelperResources.Dispose();
    UploadBuffer.Dispose();
-    TimestampQueryPools.ClearDelete();
-    OcclusionQueryPools.ClearDelete();
+    QueryPools.ClearDelete();
    SAFE_DELETE_GPU_RESOURCE(UniformBufferUploader);
    Delete(DescriptorPoolsManager);
    SAFE_DELETE(MainContext);
@@ -2084,6 +2105,61 @@ void GPUDeviceVulkan::WaitForGPU()
    }
 }

+bool GPUDeviceVulkan::GetQueryResult(uint64 queryID, uint64& result, bool wait)
+{
+    if (!queryID) // 0 is the empty ID that BeginQuery returns when no query was started
+        return false;
+    GPUQueryVulkan query;
+    query.Raw = queryID;
+    auto pool = QueryPools[query.PoolIndex];
+
+RETRY:
+    bool hasData = false;
+    uint64 resultSecondary = 0;
+    switch (pool->Type)
+    {
+    case GPUQueryType::Timer:
+        hasData = pool->GetResults(query.QueryIndex, result) && pool->GetResults(query.SecondQueryIndex, resultSecondary);
+#if VULKAN_USE_TIMER_QUERIES && GPU_VULKAN_PAUSE_QUERIES
+        if (hasData)
+        {
+            // Check if dependent queries have completed (timer queries can be split when the active command buffer gets submitted)
+            // TODO: impl this
+        }
+#endif
+        if (hasData)
+        {
+            if (resultSecondary >= result)
+            {
+                // Convert GPU timestamps to nanoseconds and then to microseconds
+                double nanoseconds = double(resultSecondary - result) * double(PhysicalDeviceLimits.timestampPeriod);
+                result = (uint64)(nanoseconds * 0.001);
+            }
+            else
+                result = 0;
+        }
+        break;
+    case GPUQueryType::Occlusion:
+        hasData = pool->GetResults(query.QueryIndex, result);
+        break;
+    default:
+        return false; // Unknown query type: nothing to read and nothing worth waiting for
+    }
+    if (!hasData && wait)
+    {
+        // Wait until data is ready
+        Platform::Yield();
+        goto RETRY;
+    }
+    if (hasData)
+    {
+        // Auto-release query on the next frame
+        QueriesToRelease.Add(query);
+    }
+
+    return hasData;
+}
+
 GPUTexture* GPUDeviceVulkan::CreateTexture(const StringView& name)
 {
    PROFILE_MEM(GraphicsTextures);
--- a/Source/Engine/GraphicsDevice/Vulkan/GPUDeviceVulkan.h
+++ b/Source/Engine/GraphicsDevice/Vulkan/GPUDeviceVulkan.h
@@ -28,6 +28,24 @@ class GPUDeviceVulkan;
 class UniformBufferUploaderVulkan;
 class DescriptorPoolsManagerVulkan;

+/// <summary>
+/// GPU query ID packed into 64-bits.
+/// </summary>
+struct GPUQueryVulkan
+{
+    union
+    {
+        struct
+        {
+            uint16 PoolIndex; // Index into GPUDeviceVulkan::QueryPools
+            uint16 QueryIndex; // Slot inside the pool (the begin timestamp for timer queries)
+            uint16 SecondQueryIndex; // Second slot of a timer query (end timestamp); BeginQuery leaves it 0 for other types
+            uint16 Dummy; // Set to 1 by BeginQuery so that Raw is never 0
+        };
+        uint64 Raw; // The four fields above viewed as the single 64-bit ID handed to callers
+    };
+};
+
 class SemaphoreVulkan
 {
 private:
@@ -261,16 +279,17 @@ protected:
    GPUDeviceVulkan* _device;
    VkQueryPool _handle;

-    const VkQueryType _type;
 #if VULKAN_RESET_QUERY_POOLS
    Array<Range> _resetRanges;
 #endif

 public:
-    QueryPoolVulkan(GPUDeviceVulkan* device, int32 capacity, VkQueryType type);
+    QueryPoolVulkan(GPUDeviceVulkan* device, int32 capacity, GPUQueryType type);
    ~QueryPoolVulkan();

 public:
+    const GPUQueryType Type;
+
    inline VkQueryPool GetHandle() const
    {
        return _handle;
@@ -294,11 +313,11 @@ private:
    int32 _lastBeginIndex;

 public:
-    BufferedQueryPoolVulkan(GPUDeviceVulkan* device, int32 capacity, VkQueryType type);
+    BufferedQueryPoolVulkan(GPUDeviceVulkan* device, int32 capacity, GPUQueryType type);
    bool AcquireQuery(CmdBufferVulkan* cmdBuffer, uint32& resultIndex);
    void ReleaseQuery(uint32 queryIndex);
    void MarkQueryAsStarted(uint32 queryIndex);
-    bool GetResults(GPUContextVulkan* context, uint32 index, uint64& result);
+    bool GetResults(uint32 index, uint64& result);
    bool HasRoom() const;
 };

@@ -498,14 +517,13 @@ public:
    VkPhysicalDeviceFeatures PhysicalDeviceFeatures;
    VkPhysicalDeviceVulkan12Features PhysicalDeviceFeatures12;

-    Array<BufferedQueryPoolVulkan*> TimestampQueryPools;
-    Array<BufferedQueryPoolVulkan*> OcclusionQueryPools;
-
+    Array<BufferedQueryPoolVulkan*> QueryPools;
+    Array<GPUQueryVulkan> QueriesToRelease;
 #if VULKAN_RESET_QUERY_POOLS
    Array<QueryPoolVulkan*> QueriesToReset;
 #endif

-    BufferedQueryPoolVulkan* FindAvailableQueryPool(VkQueryType queryType);
+    int32 GetOrCreateQueryPool(GPUQueryType type);
    RenderPassVulkan* GetOrCreateRenderPass(RenderTargetLayoutVulkan& layout);
    FramebufferVulkan* GetOrCreateFramebuffer(FramebufferVulkan::Key& key, VkExtent2D& extent, uint32 layers);
    PipelineLayoutVulkan* GetOrCreateLayout(DescriptorSetLayoutInfoVulkan& key);
@@ -553,6 +571,7 @@ public:
    void DrawBegin() override;
    void Dispose() override;
    void WaitForGPU() override;
+    bool GetQueryResult(uint64 queryID, uint64& result, bool wait = false) override;
    GPUTexture* CreateTexture(const StringView& name) override;
    GPUShader* CreateShader(const StringView& name) override;
    GPUPipelineState* CreatePipelineState() override;
--- a/Source/Engine/GraphicsDevice/Vulkan/GPUTimerQueryVulkan.cpp
+++ b/Source/Engine/GraphicsDevice/Vulkan/GPUTimerQueryVulkan.cpp
@@ -11,6 +11,78 @@ GPUTimerQueryVulkan::GPUTimerQueryVulkan(GPUDeviceVulkan* device)
 {
 }

+#if !VULKAN_USE_TIMER_QUERIES
+
+void GPUTimerQueryVulkan::OnReleaseGPU() // Timer queries are compiled out in this branch: nothing to release
+{
+}
+
+void GPUTimerQueryVulkan::Begin() // No-op while timer queries are compiled out
+{
+}
+
+void GPUTimerQueryVulkan::End() // No-op while timer queries are compiled out
+{
+}
+
+bool GPUTimerQueryVulkan::HasResult()
+{
+    return true; // Always reports a result as available in this branch
+}
+
+float GPUTimerQueryVulkan::GetResult()
+{
+    return 0; // No measurement is taken in this branch, so the reported time is zero
+}
+
+#elif GPU_VULKAN_QUERY_NEW
+
+void GPUTimerQueryVulkan::OnReleaseGPU()
+{
+    _hasResult = _endCalled = false;
+    _timeDelta = 0.0f;
+    _query = 0; // Forget the ID too: EndQuery and GetQueryResult ignore 0, so a stale ID cannot be used later
+}
+
+void GPUTimerQueryVulkan::Begin()
+{
+    // Clear the state of any previous measurement, then open a new timer query on the main context
+    _endCalled = false;
+    _hasResult = false;
+    _query = _device->GetMainContext()->BeginQuery(GPUQueryType::Timer);
+}
+
+void GPUTimerQueryVulkan::End()
+{
+    // Only the first End after Begin closes the query; repeated calls are ignored
+    if (_endCalled)
+        return;
+    _endCalled = true;
+    _device->GetMainContext()->EndQuery(_query);
+}
+
+bool GPUTimerQueryVulkan::HasResult()
+{
+    if (!_endCalled || _hasResult)
+        return _endCalled; // Not ended yet -> false; value already cached -> true
+    uint64 result = 0; // Read once and cache: a successful GetQueryResult already queues the query for release
+    _hasResult = !_query || _device->GetQueryResult(_query, result, false); // An empty ID (no query started) resolves to 0 at once
+    _timeDelta = _hasResult ? (float)((double)result / 1000.0) : 0.0f;
+    return _hasResult;
+}
+
+float GPUTimerQueryVulkan::GetResult()
+{
+    // Waits for the GPU on the first call; the microsecond result is cached as milliseconds (0 when the query failed)
+    uint64 microseconds;
+    if (!_hasResult)
+        _timeDelta = _device->GetQueryResult(_query, microseconds, true) ? (float)((double)microseconds / 1000.0) : 0.0f;
+    _hasResult = true;
+    return _timeDelta;
+}
+
+#else
+
 void GPUTimerQueryVulkan::Interrupt(CmdBufferVulkan* cmdBuffer)
 {
    if (!_interrupted)
@@ -38,8 +110,7 @@ bool GPUTimerQueryVulkan::GetResult(Query& query)
 {
    if (query.Pool)
    {
-        const auto context = (GPUContextVulkan*)_device->GetMainContext();
-        if (query.Pool->GetResults(context, query.Index, query.Result))
+        if (query.Pool->GetResults(query.Index, query.Result))
        {
            // Release query
            query.Pool->ReleaseQuery(query.Index);
@@ -58,7 +129,7 @@ bool GPUTimerQueryVulkan::GetResult(Query& query)

 void GPUTimerQueryVulkan::WriteTimestamp(CmdBufferVulkan* cmdBuffer, Query& query, VkPipelineStageFlagBits stage) const
 {
-    auto pool = _device->FindAvailableQueryPool(VK_QUERY_TYPE_TIMESTAMP);
+    auto pool = _device->QueryPools[_device->GetOrCreateQueryPool(GPUQueryType::Timer)];
    uint32 index;
    if (pool->AcquireQuery(cmdBuffer, index))
    {
@@ -76,7 +147,6 @@ void GPUTimerQueryVulkan::WriteTimestamp(CmdBufferVulkan* cmdBuffer, Query& quer

 bool GPUTimerQueryVulkan::TryGetResult()
 {
-#if VULKAN_USE_QUERIES
    // Try get queries value (if not already)
    for (int32 i = 0; i < _queries.Count(); i++)
    {
@@ -115,20 +185,12 @@ bool GPUTimerQueryVulkan::TryGetResult()
            e.End.Pool->ReleaseQuery(e.End.Index);
    }
    _queries.Clear();
-#else
-    _timeDelta = 0.0f;
-    _hasResult = true;
-#endif
    return true;
 }

 bool GPUTimerQueryVulkan::UseQueries()
 {
-#if VULKAN_USE_QUERIES
    return _device->PhysicalDeviceLimits.timestampComputeAndGraphics == VK_TRUE;
-#else
-    return false;
-#endif
 }

 void GPUTimerQueryVulkan::OnReleaseGPU()
@@ -150,7 +212,6 @@ void GPUTimerQueryVulkan::OnReleaseGPU()

 void GPUTimerQueryVulkan::Begin()
 {
-#if VULKAN_USE_QUERIES
    if (UseQueries())
    {
        const auto context = (GPUContextVulkan*)_device->GetMainContext();
@@ -162,12 +223,11 @@ void GPUTimerQueryVulkan::Begin()
        _queryIndex = 0;
        _interrupted = false;
        WriteTimestamp(cmdBuffer, e.Begin, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT);
-        context->GetCmdBufferManager()->OnQueryBegin(this);
+        context->GetCmdBufferManager()->OnTimerQueryBegin(this);

        ASSERT(_queries.IsEmpty());
        _queries.Add(e);
    }
-#endif

    _hasResult = false;
    _endCalled = false;
@@ -178,7 +238,6 @@ void GPUTimerQueryVulkan::End()
    if (_endCalled)
        return;

-#if VULKAN_USE_QUERIES
    if (UseQueries())
    {
        const auto context = (GPUContextVulkan*)_device->GetMainContext();
@@ -188,9 +247,8 @@ void GPUTimerQueryVulkan::End()
        {
            WriteTimestamp(cmdBuffer, _queries[_queryIndex].End, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT);
        }
-        context->GetCmdBufferManager()->OnQueryEnd(this);
+        context->GetCmdBufferManager()->OnTimerQueryEnd(this);
    }
-#endif

    _endCalled = true;
 }
@@ -213,3 +271,5 @@ float GPUTimerQueryVulkan::GetResult()
 }

 #endif
+
+#endif
--- a/Source/Engine/GraphicsDevice/Vulkan/GPUTimerQueryVulkan.h
+++ b/Source/Engine/GraphicsDevice/Vulkan/GPUTimerQueryVulkan.h
@@ -13,6 +13,13 @@
 class GPUTimerQueryVulkan : public GPUResourceVulkan<GPUTimerQuery>
 {
 private:
+#if !VULKAN_USE_TIMER_QUERIES
+#elif GPU_VULKAN_QUERY_NEW
+    bool _hasResult = false;
+    bool _endCalled = false;
+    float _timeDelta = 0.0f;
+    uint64 _query = 0;
+#else
    struct Query
    {
        BufferedQueryPoolVulkan* Pool;
@@ -32,6 +39,7 @@ private:
    float _timeDelta = 0.0f;
    int32 _queryIndex;
    Array<QueryPair, InlinedAllocation<8>> _queries;
+#endif

 public:
    /// <summary>
@@ -40,6 +48,7 @@ public:
    /// <param name="device">The graphics device.</param>
    GPUTimerQueryVulkan(GPUDeviceVulkan* device);

+#if !GPU_VULKAN_QUERY_NEW
 public:
    /// <summary>
    /// Interrupts an in-progress query, allowing the command buffer to submitted. Interrupted queries must be resumed using Resume().
@@ -58,6 +67,7 @@ private:
    void WriteTimestamp(CmdBufferVulkan* cmdBuffer, Query& query, VkPipelineStageFlagBits stage) const;
    bool TryGetResult();
    bool UseQueries();
+#endif

 public:
    // [GPUTimerQuery]
--- a/Source/Engine/GraphicsDevice/Vulkan/Mac/MacVulkanPlatform.h
+++ b/Source/Engine/GraphicsDevice/Vulkan/Mac/MacVulkanPlatform.h
@@ -9,7 +9,7 @@
 #define VULKAN_BACK_BUFFERS_COUNT 3

 // General/Validation Error:0 VK_ERROR_INITIALIZATION_FAILED: Could not create MTLCounterSampleBuffer for query pool of type VK_QUERY_TYPE_TIMESTAMP. Reverting to emulated behavior. (Error code 0): Cannot allocate sample buffer
-#define VULKAN_USE_QUERIES 0
+#define VULKAN_USE_TIMER_QUERIES 0

 /// <summary>
 /// The implementation for the Vulkan API support for Mac platform.
--- a/Source/Engine/GraphicsDevice/Vulkan/iOS/iOSVulkanPlatform.h
+++ b/Source/Engine/GraphicsDevice/Vulkan/iOS/iOSVulkanPlatform.h
@@ -9,7 +9,7 @@
 #define VULKAN_BACK_BUFFERS_COUNT 3

 // General/Validation Error:0 VK_ERROR_INITIALIZATION_FAILED: Could not create MTLCounterSampleBuffer for query pool of type VK_QUERY_TYPE_TIMESTAMP. Reverting to emulated behavior. (Error code 0): Cannot allocate sample buffer
-#define VULKAN_USE_QUERIES 0
+#define VULKAN_USE_TIMER_QUERIES 0

 /// <summary>
 /// The implementation for the Vulkan API support for iOS platform.