From ad3c2be5109e9e864eb101eb955595d3fd1c0f49 Mon Sep 17 00:00:00 2001
From: Wojtek Figat <wojtek@figat.pl>
Date: Tue, 3 Mar 2026 21:35:42 +0100
Subject: [PATCH] Add timer and occlusion queries support to WebGPU

---
 .../DirectX/DX12/GPUDeviceDX12.cpp            |   1 +
 .../GraphicsDevice/Vulkan/GPUDeviceVulkan.cpp |   5 +
 .../WebGPU/GPUContextWebGPU.cpp               | 101 ++++++++-
 .../GraphicsDevice/WebGPU/GPUContextWebGPU.h  |   6 +
 .../GraphicsDevice/WebGPU/GPUDeviceWebGPU.cpp | 201 +++++++++++++++++-
 .../GraphicsDevice/WebGPU/GPUDeviceWebGPU.h   |  62 ++++++
 .../GraphicsDevice/WebGPU/IncludeWebGPU.h     |   2 +
 Source/Engine/Profiler/ProfilerGPU.cpp        |   1 +
 .../Engine/Renderer/AmbientOcclusionPass.cpp  |   4 +-
 9 files changed, 373 insertions(+), 10 deletions(-)
diff --git a/Source/Engine/GraphicsDevice/DirectX/DX12/GPUDeviceDX12.cpp b/Source/Engine/GraphicsDevice/DirectX/DX12/GPUDeviceDX12.cpp
index ad8d814f5..db0b0555b 100644
--- a/Source/Engine/GraphicsDevice/DirectX/DX12/GPUDeviceDX12.cpp
+++ b/Source/Engine/GraphicsDevice/DirectX/DX12/GPUDeviceDX12.cpp
@@ -985,6 +985,7 @@ GPUQueryDX12 GPUDeviceDX12::AllocQuery(GPUQueryType type)
     if (heapIndex == QueryHeaps.Count())
     {
         // Allocate a new query heap
+        PROFILE_MEM(GraphicsCommands);
         auto heap = New<QueryHeapDX12>();
         int32 size = type == GPUQueryType::Occlusion ? 4096 : 1024;
         if (heap->Init(this, type, size))
diff --git a/Source/Engine/GraphicsDevice/Vulkan/GPUDeviceVulkan.cpp b/Source/Engine/GraphicsDevice/Vulkan/GPUDeviceVulkan.cpp
index 6e1164a46..5266a704b 100644
--- a/Source/Engine/GraphicsDevice/Vulkan/GPUDeviceVulkan.cpp
+++ b/Source/Engine/GraphicsDevice/Vulkan/GPUDeviceVulkan.cpp
@@ -1245,6 +1245,7 @@ int32 GPUDeviceVulkan::GetOrCreateQueryPool(GPUQueryType type)
     }
 
     PROFILE_CPU_NAMED("Create Create Pool");
+    PROFILE_MEM(GraphicsCommands);
     auto pool = New<BufferedQueryPoolVulkan>(this, type == GPUQueryType::Occlusion ? 4096 : 1024, type);
     QueryPools.Add(pool);
     return QueryPools.Count() - 1;
@@ -1257,6 +1258,7 @@ RenderPassVulkan* GPUDeviceVulkan::GetOrCreateRenderPass(RenderTargetLayoutVulka
         return renderPass;
 
     PROFILE_CPU_NAMED("Create Render Pass");
+    PROFILE_MEM(GraphicsCommands);
     renderPass = New<RenderPassVulkan>(this, layout);
     _renderPasses.Add(layout, renderPass);
     return renderPass;
@@ -1269,6 +1271,7 @@ FramebufferVulkan* GPUDeviceVulkan::GetOrCreateFramebuffer(FramebufferVulkan::Ke
         return framebuffer;
 
     PROFILE_CPU_NAMED("Create Framebuffer");
+    PROFILE_MEM(GraphicsCommands);
     framebuffer = New<FramebufferVulkan>(this, key, extent, layers);
     _framebuffers.Add(key, framebuffer);
     return framebuffer;
@@ -1281,6 +1284,7 @@ PipelineLayoutVulkan* GPUDeviceVulkan::GetOrCreateLayout(DescriptorSetLayoutInfo
         return layout;
 
     PROFILE_CPU_NAMED("Create Pipeline Layout");
+    PROFILE_MEM(GraphicsCommands);
     layout = New<PipelineLayoutVulkan>(this, key);
     _layouts.Add(key, layout);
     return layout;
@@ -2237,6 +2241,7 @@ FenceVulkan* FenceManagerVulkan::AllocateFence(bool createSignaled)
     }
     else
     {
+        PROFILE_MEM(GraphicsCommands);
         fence = New<FenceVulkan>();
         fence->IsSignaled = createSignaled;
         VkFenceCreateInfo info;
diff --git a/Source/Engine/GraphicsDevice/WebGPU/GPUContextWebGPU.cpp b/Source/Engine/GraphicsDevice/WebGPU/GPUContextWebGPU.cpp
index b190d9d86..bf99dfb0c 100644
--- a/Source/Engine/GraphicsDevice/WebGPU/GPUContextWebGPU.cpp
+++ b/Source/Engine/GraphicsDevice/WebGPU/GPUContextWebGPU.cpp
@@ -72,6 +72,7 @@ void GPUContextWebGPU::FrameBegin()
     GPUContext::FrameBegin();
 
     // Setup
+    _usedQuerySets = 0;
     _renderPassDirty = false;
     _pipelineDirty = false;
     _bindGroupDirty = false;
@@ -424,12 +425,53 @@ void GPUContextWebGPU::DrawIndexedInstancedIndirect(GPUBuffer* bufferForArgs, ui
 
 uint64 GPUContextWebGPU::BeginQuery(GPUQueryType type)
 {
-    // TODO: impl timer/occlusion queries
-    return 0;
+    // Allocates a query from the device and returns its packed ID (0 when the device could not provide one, which is ignored by EndQuery)
+    auto query = _device->AllocateQuery(type);
+    if (query.Raw)
+    {
+        ASSERT_LOW_LAYER(query.Set < WEBGPU_MAX_QUERY_SETS);
+        auto set = _device->QuerySets[query.Set];
+        if (set->Type == GPUQueryType::Timer)
+        {
+            // Queue a timestamp write into the begin slot (recorded later by FlushRenderPass or FlushTimestamps)
+            WriteTimestamp(set, query.Index);
+        }
+        else if (_activeOcclusionQuerySet == query.Set && _renderPass)
+        {
+            // Begin occlusion query on the active set (possible only inside the render pass that has this set attached)
+            wgpuRenderPassEncoderBeginOcclusionQuery(_renderPass, query.Index);
+        }
+        else
+        {
+            // Set the next pending occlusion query set to use for the next pass (or frame)
+            // NOTE(review): nothing is recorded for this particular query in that case - confirm that callers tolerate it
+            _pendingOcclusionQuerySet = query.Set;
+        }
+
+        // Mark query set as used (to be resolved in Flush), one bit per query set
+        static_assert(sizeof(_usedQuerySets) * 8 >= WEBGPU_MAX_QUERY_SETS, "Not enough bits in flags of used queries set.");
+        _usedQuerySets |= 1u << query.Set;
+    }
+    return query.Raw;
 }
 
 void GPUContextWebGPU::EndQuery(uint64 queryID)
 {
+    // Zero is the invalid query ID returned by BeginQuery when no query could be allocated
+    if (queryID == 0)
+        return;
+
+    // Unpack the set and slot indices from the 64-bit query ID
+    GPUQueryWebGPU packed;
+    packed.Raw = queryID;
+    GPUQuerySetWebGPU* const querySet = _device->QuerySets[packed.Set];
+    const bool isTimer = querySet->Type == GPUQueryType::Timer;
+    const bool inActiveOcclusionPass = _activeOcclusionQuerySet == packed.Set && _renderPass;
+    // Timer queries put the end timestamp into the slot right after the begin one
+    if (isTimer)
+        WriteTimestamp(querySet, packed.Index + 1);
+    // Occlusion queries can be ended only on the render pass that uses their set
+    else if (inActiveOcclusionPass)
+        wgpuRenderPassEncoderEndOcclusionQuery(_renderPass);
 }
 
 void GPUContextWebGPU::SetViewport(const Viewport& viewport)
@@ -496,6 +538,18 @@ void GPUContextWebGPU::Flush()
     if (_renderPass)
         EndRenderPass();
 
+    // Flush pending actions
+    FlushTimestamps();
+    _pendingTimestampWrites.Clear();
+
+    // Resolve used queries
+    for (uint32 setIndex = 0; setIndex < _device->QuerySetsCount; setIndex++)
+    {
+        if (_usedQuerySets & (1u << setIndex))
+            _device->QuerySets[setIndex]->Resolve(Encoder);
+    }
+    _usedQuerySets = 0;
+
     // End commands recording
     WGPUCommandBufferDescriptor commandBufferDesc = WGPU_COMMAND_BUFFER_DESCRIPTOR_INIT;
     WGPUCommandBuffer commandBuffer = wgpuCommandEncoderFinish(Encoder, &commandBufferDesc);
@@ -724,6 +778,15 @@ void GPUContextWebGPU::CopySubresource(GPUResource* dstResource, uint32 dstSubre
     }
 }
 
+void GPUContextWebGPU::WriteTimestamp(GPUQuerySetWebGPU* set, uint32 index)
+{
+    WGPUPassTimestampWrites pending = WGPU_PASS_TIMESTAMP_WRITES_INIT; // Only queued here, gets attached to a pass by FlushRenderPass or FlushTimestamps
+    pending.endOfPassWriteIndex = 0; // makePassTimestampWrites doesn't pass undefined properly thus it has to be a valid query (index 0 is left as dummy)
+    pending.beginningOfPassWriteIndex = index;
+    pending.querySet = set->Set;
+    _pendingTimestampWrites.Add(pending);
+}
+
 bool GPUContextWebGPU::FindClear(const GPUTextureViewWebGPU* view, PendingClear& clear)
 {
     for (auto& e : _pendingClears)
@@ -928,6 +991,15 @@ void GPUContextWebGPU::FlushRenderPass()
     {
         _pipelineKey.DepthStencilFormat = WGPUTextureFormat_Undefined;
     }
+    if (_pendingOcclusionQuerySet != _activeOcclusionQuerySet)
+    {
+        _activeOcclusionQuerySet = _pendingOcclusionQuerySet;
+        renderPassDesc.occlusionQuerySet = _device->QuerySets[_activeOcclusionQuerySet]->Set;
+    }
+    FlushTimestamps(1);
+    if (_pendingTimestampWrites.HasItems())
+        renderPassDesc.timestampWrites = &_pendingTimestampWrites.Last();
+    _pendingTimestampWrites.Clear();
     ASSERT(attachmentSize.Packed != 0);
     _renderPass = wgpuCommandEncoderBeginRenderPass(Encoder, &renderPassDesc);
     ASSERT(_renderPass);
@@ -1100,4 +1172,29 @@ void GPUContextWebGPU::FlushBindGroup()
     }
 }
 
+void GPUContextWebGPU::FlushTimestamps(int32 skipLast)
+{
+    for (int32 i = 0; i < _pendingTimestampWrites.Count() - skipLast; i++) // skipLast keeps the newest writes queued (FlushRenderPass attaches the last one to the real pass)
+    {
+        // Timestamp writes can be passed only via a pass descriptor so drain the queued writes here with dummy render passes (one write per pass)
+        // Also, webgpu.h wrapper doesn't pass timestampWrites as array but just a single item...
+        WGPURenderPassDescriptor dummyDesc = WGPU_RENDER_PASS_DESCRIPTOR_INIT;
+        if (!_device->DefaultRenderTarget) // Lazily create the 1x1 target used by the dummy passes (released in GPUDeviceWebGPU::Dispose)
+        {
+            _device->DefaultRenderTarget = (GPUTextureWebGPU*)_device->CreateTexture(TEXT("DefaultRenderTarget"));
+            _device->DefaultRenderTarget->Init(GPUTextureDescription::New2D(1, 1, PixelFormat::R8G8B8A8_UNorm, GPUTextureFlags::RenderTarget)); // NOTE(review): result of Init is ignored - confirm it cannot fail here
+        }
+        WGPURenderPassColorAttachment dummyAttachment = WGPU_RENDER_PASS_COLOR_ATTACHMENT_INIT;
+        dummyAttachment.view = ((GPUTextureViewWebGPU*)_device->DefaultRenderTarget->View(0))->ViewRender;
+        dummyAttachment.loadOp = WGPULoadOp_Clear;
+        dummyAttachment.storeOp = WGPUStoreOp_Discard; // Contents of the dummy target are never used
+        dummyDesc.colorAttachmentCount = 1;
+        dummyDesc.colorAttachments = &dummyAttachment;
+        dummyDesc.timestampWrites = &_pendingTimestampWrites[i];
+        auto renderPass = wgpuCommandEncoderBeginRenderPass(Encoder, &dummyDesc);
+        wgpuRenderPassEncoderEnd(renderPass); // Pass contains no draws, it exists only to carry the timestamp write
+        wgpuRenderPassEncoderRelease(renderPass);
+    }
+}
+
 #endif
diff --git a/Source/Engine/GraphicsDevice/WebGPU/GPUContextWebGPU.h b/Source/Engine/GraphicsDevice/WebGPU/GPUContextWebGPU.h
index 29c4c8544..140d96618 100644
--- a/Source/Engine/GraphicsDevice/WebGPU/GPUContextWebGPU.h
+++ b/Source/Engine/GraphicsDevice/WebGPU/GPUContextWebGPU.h
@@ -43,6 +43,10 @@ private:
 
     GPUDeviceWebGPU* _device;
     uint32 _minUniformBufferOffsetAlignment;
+    int32 _activeOcclusionQuerySet = -1;
+    int32 _pendingOcclusionQuerySet = -1;
+    uint32 _usedQuerySets = 0;
+    Array<WGPUPassTimestampWrites> _pendingTimestampWrites;
 
     // State tracking
     uint32 _renderPassDirty : 1;
@@ -85,6 +89,7 @@ public:
     WGPUCommandEncoder Encoder = nullptr;
 
 private:
+    void WriteTimestamp(GPUQuerySetWebGPU* set, uint32 index);
     bool FindClear(const GPUTextureViewWebGPU* view, PendingClear& clear);
     void ManualClear(const PendingClear& clear);
     void OnDrawCall();
@@ -92,6 +97,7 @@ private:
     void EndRenderPass();
     void FlushRenderPass();
     void FlushBindGroup();
+    void FlushTimestamps(int32 skipLast = 0);
 
 public:
     // [GPUContext]
diff --git a/Source/Engine/GraphicsDevice/WebGPU/GPUDeviceWebGPU.cpp b/Source/Engine/GraphicsDevice/WebGPU/GPUDeviceWebGPU.cpp
index c8bbb1d1c..585a7a4fa 100644
--- a/Source/Engine/GraphicsDevice/WebGPU/GPUDeviceWebGPU.cpp
+++ b/Source/Engine/GraphicsDevice/WebGPU/GPUDeviceWebGPU.cpp
@@ -30,6 +30,132 @@ GPUVertexLayoutWebGPU::GPUVertexLayoutWebGPU(GPUDeviceWebGPU* device, const Elem
     SetElements(elements, explicitOffsets);
 }
 
+GPUQuerySetWebGPU::GPUQuerySetWebGPU(WGPUDevice device, GPUQueryType type, uint32 count)
+    : _device(device)
+    , _count(count)
+    , Type(type)
+{
+    // Timer queries use 2 items for begin/end timestamps so the set size has to be even for them
+    ASSERT_LOW_LAYER(count % 2 == 0 || type != GPUQueryType::Timer);
+    // Skip first items in every set: timer item 0 is a dummy (bug in makePassTimestampWrites that cannot pass undefined value properly) and
+    _index = 2; // a query at Set=0/Index=0 would pack into Raw=0 that is reserved for invalid queries (same start as the batch restart in Allocate)
+
+    // Create query set (timestamp queries for timers, occlusion queries otherwise)
+    WGPUQuerySetDescriptor desc = WGPU_QUERY_SET_DESCRIPTOR_INIT;
+    desc.type = type == GPUQueryType::Timer ? WGPUQueryType_Timestamp : WGPUQueryType_Occlusion;
+    desc.count = count;
+    Set = wgpuDeviceCreateQuerySet(device, &desc);
+    ASSERT(Set);
+
+    // Create buffer for queries data (one 64-bit value per query, target of the resolve command)
+    WGPUBufferDescriptor bufferDesc = WGPU_BUFFER_DESCRIPTOR_INIT;
+    bufferDesc.size = count * sizeof(uint64);
+    bufferDesc.usage = WGPUBufferUsage_QueryResolve | WGPUBufferUsage_CopySrc;
+    _queryBuffer = wgpuDeviceCreateBuffer(device, &bufferDesc);
+    ASSERT(_queryBuffer);
+
+    // Create buffer for reading copied queries data on CPU (same size, mappable)
+    bufferDesc.usage = WGPUBufferUsage_MapRead | WGPUBufferUsage_CopyDst;
+    _readBuffer = wgpuDeviceCreateBuffer(device, &bufferDesc);
+    ASSERT(_readBuffer);
+
+#if COMPILE_WITH_PROFILER
+    _memorySize = bufferDesc.size * 3; // Set + QueryBuffer + ReadBuffer
+    PROFILE_MEM_INC(GraphicsCommands, _memorySize);
+#endif
+}
+
+GPUQuerySetWebGPU::~GPUQuerySetWebGPU()
+{
+    PROFILE_MEM_DEC(GraphicsCommands, _memorySize); // Undo the memory stats added by the constructor
+    wgpuBufferDestroy(_readBuffer); // Each object is explicitly destroyed first and then its handle is released
+    wgpuBufferRelease(_readBuffer);
+    wgpuBufferDestroy(_queryBuffer);
+    wgpuBufferRelease(_queryBuffer);
+    wgpuQuerySetDestroy(Set);
+    wgpuQuerySetRelease(Set);
+}
+
+bool GPUQuerySetWebGPU::CanAllocate() const
+{
+    return _state == Mapped || (_state == Active && _index < _count); // Mapped set restarts its batch in Allocate so even a completely filled one can be reused
+}
+
+uint32 GPUQuerySetWebGPU::Allocate()
+{
+    // Results of the previous batch are mapped for reading: drop that mapping and start a new batch from the beginning
+    if (_state == Mapped)
+    {
+        wgpuBufferUnmap(_readBuffer);
+        _mapped = nullptr;
+        _index = 2;
+        _state = Active;
+    }
+    const uint32 slot = _index;
+    _index = slot + (Type == GPUQueryType::Timer ? 2 : 1); // Timer queries take two items (begin/end timestamps)
+    return slot;
+}
+
+void GPUQuerySetWebGPU::Resolve(WGPUCommandEncoder encoder)
+{
+    ASSERT(_index != 0 && _state == Active); // Only a set that is still collecting queries can be resolved
+    wgpuCommandEncoderResolveQuerySet(encoder, Set, 0, _index, _queryBuffer, 0); // Record writing results of the first _index queries into the query buffer
+    wgpuCommandEncoderCopyBufferToBuffer(encoder, _queryBuffer, 0, _readBuffer, 0, _index * sizeof(uint64)); // Record copying them into the buffer mapped later by Read
+    _state = Resolved; // Blocks further allocations from this set (see CanAllocate) until the results get mapped
+}
+
+bool GPUQuerySetWebGPU::Read(uint32 index, uint64& result, bool wait)
+{
+    if (_state == Resolved)
+    {
+        // Results are not on CPU yet: start mapping the buffer and report 'not ready' (false) until the callback moves the set into Mapped state
+        ASSERT(!wait); // TODO: impl wgpuBufferMapAsync with waiting (see GPUBufferWebGPU::Map)
+        WGPUBufferMapCallbackInfo callback = WGPU_BUFFER_MAP_CALLBACK_INFO_INIT;
+        callback.mode = WGPUCallbackMode_AllowSpontaneous;
+        callback.userdata1 = this;
+        callback.callback = [](WGPUMapAsyncStatus status, WGPUStringView message, WGPU_NULLABLE void* userdata1, WGPU_NULLABLE void* userdata2)
+        {
+            if (status == WGPUMapAsyncStatus_Success)
+            {
+                auto set = (GPUQuerySetWebGPU*)userdata1;
+                set->OnRead();
+            }
+#if !BUILD_RELEASE
+            else
+            {
+                LOG(Error, "Query Set map failed with status {}, {}", (uint32)status, WEBGPU_TO_STR(message));
+            }
+#endif
+        };
+        wgpuBufferMapAsync(_readBuffer, WGPUMapMode_Read, 0, _index * sizeof(uint64), callback);
+        _state = Mapping;
+    }
+    else if (_state == Mapped)
+    {
+        // Read the results from mapped buffer (the only path that returns true)
+        if (Type == GPUQueryType::Timer)
+        {
+            // Timestamps are in nanoseconds (result is in microseconds); compare before subtracting as unsigned difference would wrap around to a huge value
+            result = _mapped[index + 1] > _mapped[index] ? (_mapped[index + 1] - _mapped[index]) / 1000 : 0;
+        }
+        else
+        {
+            // Occlusion outputs number of fragment samples that pass all the tests (scissor, stencil, depth, etc.)
+            result = _mapped[index];
+        }
+        return true;
+    }
+    return false;
+}
+
+void GPUQuerySetWebGPU::OnRead()
+{
+    // Called by the map callback (started in Read) on success: get mapped buffer pointer so Read can return the results
+    ASSERT(_state == Mapping);
+    _state = Mapped;
+    _mapped = (const uint64*)wgpuBufferGetConstMappedRange(_readBuffer, 0, _index * sizeof(uint64));
+}
+
 GPUDataUploaderWebGPU::Allocation GPUDataUploaderWebGPU::Allocate(uint32 size, WGPUBufferUsage usage, uint32 alignment)
 {
     // Find a free buffer from the current frame
@@ -167,6 +293,7 @@ bool GPUDeviceWebGPU::Init()
     if (wgpuAdapterGetLimits(Adapter->Adapter, &limits) == WGPUStatus_Success)
     {
         MinUniformBufferOffsetAlignment = limits.minUniformBufferOffsetAlignment;
+        TimestampQuery = features.Contains(WGPUFeatureName_TimestampQuery);
         Limits.HasInstancing = true;
         Limits.HasDrawIndirect = true;
         Limits.HasDepthAsSRV = true;
@@ -174,11 +301,11 @@ bool GPUDeviceWebGPU::Init()
         Limits.HasDepthClip = features.Contains(WGPUFeatureName_DepthClipControl);
         Limits.HasReadOnlyDepth = true;
         Limits.MaximumSamplerAnisotropy = 4;
-        Limits.MaximumTexture1DSize = Math::Min<int32>(GPU_MAX_TEXTURE_SIZE, limits.maxTextureDimension1D);
-        Limits.MaximumTexture2DSize = Math::Min<int32>(GPU_MAX_TEXTURE_SIZE, limits.maxTextureDimension2D);
-        Limits.MaximumTexture3DSize = Math::Min<int32>(GPU_MAX_TEXTURE_SIZE, limits.maxTextureDimension3D);
-        Limits.MaximumMipLevelsCount = Math::Min<int32>(GPU_MAX_TEXTURE_MIP_LEVELS, (int32)log2(limits.maxTextureDimension2D));
-        Limits.MaximumTexture1DArraySize = Limits.MaximumTexture2DArraySize = Math::Min<int32>(GPU_MAX_TEXTURE_ARRAY_SIZE, limits.maxTextureArrayLayers);
+        Limits.MaximumTexture1DSize = limits.maxTextureDimension1D;
+        Limits.MaximumTexture2DSize = limits.maxTextureDimension2D;
+        Limits.MaximumTexture3DSize = limits.maxTextureDimension3D;
+        Limits.MaximumMipLevelsCount = (int32)log2(limits.maxTextureDimension2D);
+        Limits.MaximumTexture1DArraySize = Limits.MaximumTexture2DArraySize = limits.maxTextureArrayLayers;
         if (limits.maxTextureArrayLayers >= 6)
             Limits.MaximumTextureCubeSize = Limits.MaximumTexture2DSize;
 
@@ -624,7 +751,11 @@ void GPUDeviceWebGPU::Dispose()
     preDispose();
 
     // Clear device resources
+    for (int32 i = 0; i < QuerySetsCount; i++)
+        Delete(QuerySets[i]);
+    QuerySetsCount = 0;
     DataUploader.ReleaseGPU();
+    SAFE_DELETE_GPU_RESOURCE(DefaultRenderTarget);
     SAFE_DELETE_GPU_RESOURCES(DefaultTexture);
     SAFE_DELETE_GPU_RESOURCES(DefaultSamplers);
     SAFE_DELETE(_mainContext);
@@ -653,12 +784,68 @@ void GPUDeviceWebGPU::Dispose()
 
 void GPUDeviceWebGPU::WaitForGPU()
 {
+    // TODO: this could use onSubmittedWorkDone (assuming any submit has been already done); until then this is a no-op that returns without waiting
+}
+
+GPUQueryWebGPU GPUDeviceWebGPU::AllocateQuery(GPUQueryType type)
+{
+    // Returned query packs the set index and the slot index into 64 bits; a zero-initialized query means failure
+    static_assert(sizeof(GPUQueryWebGPU) == sizeof(uint64), "Invalid WebGPU query size.");
+
+    // Ignore if device doesn't support timer queries
+    const bool unsupported = type == GPUQueryType::Timer && !TimestampQuery;
+    if (unsupported)
+        return {};
+
+    // Look for an existing set of this type that accepts allocations
+    uint32 slot = 0;
+    while (slot < QuerySetsCount && !(QuerySets[slot]->Type == type && QuerySets[slot]->CanAllocate()))
+        slot++;
+
+    // Fail (and log it once) when a new set is needed but the fixed-size table is already full
+    const bool needsNewSet = slot == QuerySetsCount;
+    if (needsNewSet && slot == WEBGPU_MAX_QUERY_SETS)
+    {
+#if !BUILD_RELEASE
+        static bool SingleTimeLog = true;
+        if (SingleTimeLog)
+        {
+            SingleTimeLog = false;
+            LOG(Error, "Run out of the query sets capacity.");
+        }
+#endif
+        return {};
+    }
+
+    // Append a new set at the first unused table entry
+    if (needsNewSet)
+    {
+        PROFILE_MEM(GraphicsCommands);
+        const uint32 capacity = type == GPUQueryType::Occlusion ? 4096 : 1024;
+        auto created = New<GPUQuerySetWebGPU>(Device, type, capacity);
+        QuerySets[QuerySetsCount++] = created;
+    }
+
+    // Allocate query from the set
+    GPUQueryWebGPU result;
+    result.Set = slot;
+    result.Index = QuerySets[slot]->Allocate();
+    return result;
 }
 
 bool GPUDeviceWebGPU::GetQueryResult(uint64 queryID, uint64& result, bool wait)
 {
-    // TODO: impl queries
-    return false;
+    // Unpack the set and slot indices from the 64-bit query ID
+    GPUQueryWebGPU packed;
+    packed.Raw = queryID;
+    if (packed.Raw != 0)
+    {
+        // Valid query: its set decides whether the results are already readable
+        return QuerySets[packed.Set]->Read(packed.Index, result, wait);
+    }
+    // Invalid query completes right away with an empty result
+    result = 0;
+    return true;
 }
 
 GPUTexture* GPUDeviceWebGPU::CreateTexture(const StringView& name)
diff --git a/Source/Engine/GraphicsDevice/WebGPU/GPUDeviceWebGPU.h b/Source/Engine/GraphicsDevice/WebGPU/GPUDeviceWebGPU.h
index d87957775..5b56126ac 100644
--- a/Source/Engine/GraphicsDevice/WebGPU/GPUDeviceWebGPU.h
+++ b/Source/Engine/GraphicsDevice/WebGPU/GPUDeviceWebGPU.h
@@ -30,6 +30,62 @@ namespace GPUBindGroupsWebGPU
     };
 };
 
+/// <summary>
+/// GPU query ID packed into 64-bits (query set index and index of the query within that set). Raw value of zero is treated as an invalid query.
+/// </summary>
+struct GPUQueryWebGPU
+{
+    union
+    {
+        struct
+        {
+            uint32 Set; // Index of the query set in GPUDeviceWebGPU::QuerySets
+            uint32 Index; // Index of the query item inside the set (the begin timestamp item for timer queries)
+        };
+        uint64 Raw; // Both fields as a single value used as the query ID by GPUContext/GPUDevice API
+    };
+};
+
+/// <summary>
+/// Set of GPU queries allocated in batch with functionality to read results via a separate CPU buffer. Goes through states: Active (Allocate), Resolved (Resolve), Mapping and Mapped (Read).
+/// </summary>
+class GPUQuerySetWebGPU
+{
+private:
+    WGPUDevice _device; // Device handle passed to the constructor
+    uint32 _count; // Total number of query items in the set
+    uint32 _index = 0; // Next free query item (also the number of items that get resolved and mapped)
+    enum States
+    {
+        Active, // Collects new queries
+        Resolved, // Resolve and copy commands have been recorded by Resolve
+        Mapping, // Async mapping of the read buffer has been started by Read
+        Mapped, // Read buffer is mapped so results can be read (next Allocate restarts the batch)
+    } _state = Active;
+#if COMPILE_WITH_PROFILER
+    uint64 _memorySize; // Size reported to the memory profiler (set by the constructor)
+#endif
+    WGPUBuffer _queryBuffer; // Target of the query set resolve
+    WGPUBuffer _readBuffer; // Mappable copy of the query buffer
+    const uint64* _mapped = nullptr; // Mapped contents of the read buffer (valid only in Mapped state)
+
+public:
+    const GPUQueryType Type; // Type of all queries in this set
+    WGPUQuerySet Set; // WebGPU query set object
+
+public:
+    GPUQuerySetWebGPU(WGPUDevice device, GPUQueryType type, uint32 count);
+    ~GPUQuerySetWebGPU();
+
+    bool CanAllocate() const; // Checks if Allocate can be used in the current state
+    uint32 Allocate(); // Returns index of the new query item (timer queries use that item and the next one)
+    void Resolve(WGPUCommandEncoder encoder); // Records commands that make results of the allocated queries readable on CPU
+    bool Read(uint32 index, uint64& result, bool wait); // Returns true once the result has been read, otherwise false (results not mapped yet)
+
+private:
+    void OnRead(); // Called when mapping of the read buffer succeeded
+};
+
 /// <summary>
 /// Pool for uploading data to GPU buffers. It manages large buffers and suballocates for multiple small updates, minimizing the number of buffer creations and copies.
 /// </summary>
@@ -79,11 +135,17 @@ public:
     WGPUInstance WebGPUInstance;
     WGPUDevice Device = nullptr;
     WGPUQueue Queue = nullptr;
+    GPUTextureWebGPU* DefaultRenderTarget = nullptr;
     GPUSamplerWebGPU* DefaultSamplers[6] = {};
     GPUTextureWebGPU* DefaultTexture[10] = {};
     WGPUBuffer DefaultBuffer = nullptr;
     GPUDataUploaderWebGPU DataUploader;
     uint32 MinUniformBufferOffsetAlignment = 1;
+    bool TimestampQuery = false;
+    uint32 QuerySetsCount = 0;
+    GPUQuerySetWebGPU* QuerySets[WEBGPU_MAX_QUERY_SETS] = {};
+
+    GPUQueryWebGPU AllocateQuery(GPUQueryType type);
 
 public:
     // [GPUDeviceDX]
diff --git a/Source/Engine/GraphicsDevice/WebGPU/IncludeWebGPU.h b/Source/Engine/GraphicsDevice/WebGPU/IncludeWebGPU.h
index a7f3a4ec7..06c3b3d9f 100644
--- a/Source/Engine/GraphicsDevice/WebGPU/IncludeWebGPU.h
+++ b/Source/Engine/GraphicsDevice/WebGPU/IncludeWebGPU.h
@@ -23,4 +23,6 @@
 // Utiltiy macro to get WGPUStringView for a text constant
 #define WEBGPU_STR(str) { str, ARRAY_COUNT(str) - 1 }
 
+#define WEBGPU_MAX_QUERY_SETS 8
+
 #endif
diff --git a/Source/Engine/Profiler/ProfilerGPU.cpp b/Source/Engine/Profiler/ProfilerGPU.cpp
index 1ede560f7..2cf40afb2 100644
--- a/Source/Engine/Profiler/ProfilerGPU.cpp
+++ b/Source/Engine/Profiler/ProfilerGPU.cpp
@@ -398,6 +398,7 @@ void GraphicsDumping::Print()
     auto& draw = Items[0];
     {
         // The root item is always the drawing by engine
+        draw.Time = Math::Max(draw.Time, 0.000001f);
         if (draw.Count == 1)
             sb.AppendFormat(TEXT("  Frame time: {} ms ({} FPS)"), Utilities::RoundTo2DecimalPlaces(draw.Time), (int32)(1000.0f / draw.Time)).AppendLine();
         else
diff --git a/Source/Engine/Renderer/AmbientOcclusionPass.cpp b/Source/Engine/Renderer/AmbientOcclusionPass.cpp
index 437b3077d..07a22e8f0 100644
--- a/Source/Engine/Renderer/AmbientOcclusionPass.cpp
+++ b/Source/Engine/Renderer/AmbientOcclusionPass.cpp
@@ -230,6 +230,9 @@ void AmbientOcclusionPass::Render(RenderContext& renderContext)
         Math::Min(renderContext.Buffers->GetWidth(), renderContext.Buffers->GetHeight()) < 16 ||
         checkIfSkipPass())
         return;
+    auto device = GPUDevice::Instance;
+    if (device->Limits.MaximumTexture2DArraySize < 4)
+        return;
     PROFILE_GPU_CPU("Ambient Occlusion");
 
     settings = ASSAO_Settings();
@@ -270,7 +273,6 @@ void AmbientOcclusionPass::Render(RenderContext& renderContext)
     settings.SkipHalfPixels = true;
 
     // Cache data
-    auto device = GPUDevice::Instance;
     auto context = device->GetMainContext();
     int32 m_sizeX = renderContext.Buffers->GetWidth();
     int32 m_sizeY = renderContext.Buffers->GetHeight();