Add timer and occlusion queries support to WebGPU

This commit is contained in:
Wojtek Figat
2026-03-03 21:35:42 +01:00
parent 5641bf63e8
commit ad3c2be510
9 changed files with 373 additions and 10 deletions

View File

@@ -72,6 +72,7 @@ void GPUContextWebGPU::FrameBegin()
GPUContext::FrameBegin();
// Setup
_usedQuerySets = 0;
_renderPassDirty = false;
_pipelineDirty = false;
_bindGroupDirty = false;
@@ -424,12 +425,53 @@ void GPUContextWebGPU::DrawIndexedInstancedIndirect(GPUBuffer* bufferForArgs, ui
/// <summary>
/// Begins a new GPU query of the given type.
/// </summary>
/// <param name="type">The type of the query (timer or occlusion).</param>
/// <returns>The packed query ID (see GPUQueryWebGPU), or 0 if the query could not be allocated.</returns>
uint64 GPUContextWebGPU::BeginQuery(GPUQueryType type)
{
    auto query = _device->AllocateQuery(type);
    if (query.Raw)
    {
        ASSERT_LOW_LAYER(query.Set < WEBGPU_MAX_QUERY_SETS);
        auto set = _device->QuerySets[query.Set];
        if (set->Type == GPUQueryType::Timer)
        {
            // Queue the 'begin' timestamp write (flushed with the next render pass or a dummy pass)
            WriteTimestamp(set, query.Index);
        }
        else if (_renderPass && _activeOcclusionQuerySet == (int32)query.Set)
        {
            // Begin occlusion query on the set that is bound to the active render pass
            wgpuRenderPassEncoderBeginOcclusionQuery(_renderPass, query.Index);
        }
        else
        {
            // Set the next pending occlusion query set to use for the next pass (or frame)
            _pendingOcclusionQuerySet = (int32)query.Set;
        }

        // Mark query set as used (to be resolved on the frame end)
        static_assert(sizeof(_usedQuerySets) * 8 >= WEBGPU_MAX_QUERY_SETS, "Not enough bits in flags of used queries set.");
        _usedQuerySets |= 1u << query.Set;
    }
    return query.Raw;
}
/// <summary>
/// Ends the GPU query started with BeginQuery.
/// </summary>
/// <param name="queryID">The packed query ID returned by BeginQuery (0 is ignored).</param>
void GPUContextWebGPU::EndQuery(uint64 queryID)
{
    // Nothing to end for an invalid query
    if (!queryID)
        return;

    // Unpack the ID into set/index pair
    GPUQueryWebGPU query;
    query.Raw = queryID;
    const auto querySet = _device->QuerySets[query.Set];

    if (querySet->Type == GPUQueryType::Timer)
    {
        // The 'end' timestamp lives in the slot right after the 'begin' one
        WriteTimestamp(querySet, query.Index + 1);
        return;
    }

    // Occlusion query can be ended only inside a render pass that uses its set
    if (_renderPass && _activeOcclusionQuerySet == query.Set)
        wgpuRenderPassEncoderEndOcclusionQuery(_renderPass);
}
void GPUContextWebGPU::SetViewport(const Viewport& viewport)
@@ -496,6 +538,18 @@ void GPUContextWebGPU::Flush()
if (_renderPass)
EndRenderPass();
// Flush pending actions
FlushTimestamps();
_pendingTimestampWrites.Clear();
// Resolve used queries
for (uint32 setIndex = 0; setIndex < _device->QuerySetsCount; setIndex++)
{
if (_usedQuerySets & (1u << setIndex))
_device->QuerySets[setIndex]->Resolve(Encoder);
}
_usedQuerySets = 0;
// End commands recording
WGPUCommandBufferDescriptor commandBufferDesc = WGPU_COMMAND_BUFFER_DESCRIPTOR_INIT;
WGPUCommandBuffer commandBuffer = wgpuCommandEncoderFinish(Encoder, &commandBufferDesc);
@@ -724,6 +778,15 @@ void GPUContextWebGPU::CopySubresource(GPUResource* dstResource, uint32 dstSubre
}
}
/// <summary>
/// Queues a timestamp write into the given query set. Queued writes are attached to a render pass later (see FlushRenderPass and FlushTimestamps).
/// </summary>
/// <param name="set">The timer query set to write into.</param>
/// <param name="index">The index of the query slot that receives the timestamp.</param>
void GPUContextWebGPU::WriteTimestamp(GPUQuerySetWebGPU* set, uint32 index)
{
    WGPUPassTimestampWrites timestampWrite = WGPU_PASS_TIMESTAMP_WRITES_INIT;
    timestampWrite.querySet = set->Set;
    timestampWrite.beginningOfPassWriteIndex = index;
    // makePassTimestampWrites doesn't pass undefined properly thus it has to be a valid query (index 0 is left as dummy)
    timestampWrite.endOfPassWriteIndex = 0;
    _pendingTimestampWrites.Add(timestampWrite);
}
bool GPUContextWebGPU::FindClear(const GPUTextureViewWebGPU* view, PendingClear& clear)
{
for (auto& e : _pendingClears)
@@ -928,6 +991,15 @@ void GPUContextWebGPU::FlushRenderPass()
{
_pipelineKey.DepthStencilFormat = WGPUTextureFormat_Undefined;
}
if (_pendingOcclusionQuerySet != _activeOcclusionQuerySet)
{
_activeOcclusionQuerySet = _pendingOcclusionQuerySet;
renderPassDesc.occlusionQuerySet = _device->QuerySets[_activeOcclusionQuerySet]->Set;
}
FlushTimestamps(1);
if (_pendingTimestampWrites.HasItems())
renderPassDesc.timestampWrites = &_pendingTimestampWrites.Last();
_pendingTimestampWrites.Clear();
ASSERT(attachmentSize.Packed != 0);
_renderPass = wgpuCommandEncoderBeginRenderPass(Encoder, &renderPassDesc);
ASSERT(_renderPass);
@@ -1100,4 +1172,29 @@ void GPUContextWebGPU::FlushBindGroup()
}
}
/// <summary>
/// Drains the pending timestamp writes by recording one dummy render pass per write.
/// </summary>
/// <param name="skipLast">The number of writes at the end of the pending list to leave untouched (the caller attaches them to a real pass).</param>
void GPUContextWebGPU::FlushTimestamps(int32 skipLast)
{
    const int32 drainCount = _pendingTimestampWrites.Count() - skipLast;
    if (drainCount <= 0)
        return;

    // Timestamps can only be written at render pass boundaries, so each pending write gets its own dummy pass.
    // Also, webgpu.h wrapper doesn't pass timestampWrites as array but just a single item.
    if (!_device->DefaultRenderTarget)
    {
        // Lazily create a tiny render target used as the attachment of the dummy passes
        _device->DefaultRenderTarget = (GPUTextureWebGPU*)_device->CreateTexture(TEXT("DefaultRenderTarget"));
        _device->DefaultRenderTarget->Init(GPUTextureDescription::New2D(1, 1, PixelFormat::R8G8B8A8_UNorm, GPUTextureFlags::RenderTarget));
    }

    // The attachment and pass description are the same for every dummy pass (only the timestamp write differs)
    WGPURenderPassColorAttachment colorAttachment = WGPU_RENDER_PASS_COLOR_ATTACHMENT_INIT;
    colorAttachment.view = ((GPUTextureViewWebGPU*)_device->DefaultRenderTarget->View(0))->ViewRender;
    colorAttachment.loadOp = WGPULoadOp_Clear;
    colorAttachment.storeOp = WGPUStoreOp_Discard;
    WGPURenderPassDescriptor passDesc = WGPU_RENDER_PASS_DESCRIPTOR_INIT;
    passDesc.colorAttachmentCount = 1;
    passDesc.colorAttachments = &colorAttachment;

    for (int32 writeIndex = 0; writeIndex < drainCount; writeIndex++)
    {
        passDesc.timestampWrites = &_pendingTimestampWrites[writeIndex];
        auto dummyPass = wgpuCommandEncoderBeginRenderPass(Encoder, &passDesc);
        wgpuRenderPassEncoderEnd(dummyPass);
        wgpuRenderPassEncoderRelease(dummyPass);
    }
}
#endif

View File

@@ -43,6 +43,10 @@ private:
GPUDeviceWebGPU* _device;
uint32 _minUniformBufferOffsetAlignment;
int32 _activeOcclusionQuerySet = -1;
int32 _pendingOcclusionQuerySet = -1;
uint32 _usedQuerySets = 0;
Array<WGPUPassTimestampWrites> _pendingTimestampWrites;
// State tracking
uint32 _renderPassDirty : 1;
@@ -85,6 +89,7 @@ public:
WGPUCommandEncoder Encoder = nullptr;
private:
void WriteTimestamp(GPUQuerySetWebGPU* set, uint32 index);
bool FindClear(const GPUTextureViewWebGPU* view, PendingClear& clear);
void ManualClear(const PendingClear& clear);
void OnDrawCall();
@@ -92,6 +97,7 @@ private:
void EndRenderPass();
void FlushRenderPass();
void FlushBindGroup();
void FlushTimestamps(int32 skipLast = 0);
public:
// [GPUContext]

View File

@@ -30,6 +30,132 @@ GPUVertexLayoutWebGPU::GPUVertexLayoutWebGPU(GPUDeviceWebGPU* device, const Elem
SetElements(elements, explicitOffsets);
}
/// <summary>
/// Creates the query set together with the buffers used to resolve the queries and to read them back on CPU.
/// </summary>
/// <param name="device">The WebGPU device used to create the resources.</param>
/// <param name="type">The type of the queries stored in this set.</param>
/// <param name="count">The capacity of the set (number of query slots). Must be even for timer queries.</param>
GPUQuerySetWebGPU::GPUQuerySetWebGPU(WGPUDevice device, GPUQueryType type, uint32 count)
    : _device(device)
    , _count(count)
    , Type(type)
{
    // Timer queries use 2 items for begin/end timestamps
    ASSERT_LOW_LAYER(count % 2 == 0 || type != GPUQueryType::Timer);

    // Reserve the leading slots of the set:
    // - timer queries skip the first pair due to bug in makePassTimestampWrites that cannot pass undefined value properly (index 0 is used as a dummy write target)
    // - occlusion queries skip slot 0 so the packed ID of {Set=0, Index=0} never equals 0, which callers treat as an invalid query
    _index = type == GPUQueryType::Timer ? 2 : 1;

    // Create query set
    WGPUQuerySetDescriptor desc = WGPU_QUERY_SET_DESCRIPTOR_INIT;
    desc.type = type == GPUQueryType::Timer ? WGPUQueryType_Timestamp : WGPUQueryType_Occlusion;
    desc.count = count;
    Set = wgpuDeviceCreateQuerySet(device, &desc);
    ASSERT(Set);

    // Create buffer for queries data (each query resolves into a 64-bit value)
    WGPUBufferDescriptor bufferDesc = WGPU_BUFFER_DESCRIPTOR_INIT;
    bufferDesc.size = count * sizeof(uint64);
    bufferDesc.usage = WGPUBufferUsage_QueryResolve | WGPUBufferUsage_CopySrc;
    _queryBuffer = wgpuDeviceCreateBuffer(device, &bufferDesc);
    ASSERT(_queryBuffer);

    // Create buffer for reading copied queries data on CPU
    bufferDesc.usage = WGPUBufferUsage_MapRead | WGPUBufferUsage_CopyDst;
    _readBuffer = wgpuDeviceCreateBuffer(device, &bufferDesc);
    ASSERT(_readBuffer);

#if COMPILE_WITH_PROFILER
    _memorySize = bufferDesc.size * 3; // Set + QueryBuffer + ReadBuffer
    PROFILE_MEM_INC(GraphicsCommands, _memorySize);
#endif
}
/// <summary>
/// Destroys and releases the buffers and the query set owned by this object.
/// </summary>
GPUQuerySetWebGPU::~GPUQuerySetWebGPU()
{
    PROFILE_MEM_DEC(GraphicsCommands, _memorySize);

    // Buffers go first (readback buffer, then resolve buffer)
    WGPUBuffer ownedBuffers[] = { _readBuffer, _queryBuffer };
    for (WGPUBuffer buffer : ownedBuffers)
    {
        wgpuBufferDestroy(buffer);
        wgpuBufferRelease(buffer);
    }

    // Then the query set itself
    wgpuQuerySetDestroy(Set);
    wgpuQuerySetRelease(Set);
}
bool GPUQuerySetWebGPU::CanAllocate() const
{
return _index < _count && (_state == Active || _state == Mapped);
}
/// <summary>
/// Allocates the next query from the set. If the set is mapped for reading, it gets unmapped and restarted as a new batch.
/// </summary>
/// <returns>The index of the first slot of the allocated query.</returns>
uint32 GPUQuerySetWebGPU::Allocate()
{
    const bool recycle = _state == Mapped;
    if (recycle)
    {
        // Results of the previous batch are dropped and the set starts a new batch from the beginning
        wgpuBufferUnmap(_readBuffer);
        _mapped = nullptr;
        _index = 2;
        _state = Active;
    }

    // Timer queries take two slots (begin/end), occlusion queries take one
    const uint32 stride = Type == GPUQueryType::Timer ? 2 : 1;
    const uint32 allocated = _index;
    _index = allocated + stride;
    return allocated;
}
/// <summary>
/// Records commands that resolve all queries allocated so far and copy the results into the CPU-readable buffer.
/// </summary>
/// <param name="encoder">The command encoder to record the resolve and copy commands into.</param>
void GPUQuerySetWebGPU::Resolve(WGPUCommandEncoder encoder)
{
    ASSERT(_index != 0 && _state == Active);

    // Only the used range [0, _index) is resolved and copied
    const uint32 usedCount = _index;
    const auto usedBytes = usedCount * sizeof(uint64);
    wgpuCommandEncoderResolveQuerySet(encoder, Set, 0, usedCount, _queryBuffer, 0);
    wgpuCommandEncoderCopyBufferToBuffer(encoder, _queryBuffer, 0, _readBuffer, 0, usedBytes);

    // No more allocations until the results get read back
    _state = Resolved;
}
/// <summary>
/// Tries to read the result of the query. The first call after the resolve starts asynchronous mapping of the readback buffer; results are available once the mapping completes.
/// </summary>
/// <param name="index">The index of the query within the set (as returned by Allocate).</param>
/// <param name="result">The output value: duration in microseconds for timer queries, or the raw occlusion query value.</param>
/// <param name="wait">True if wait for the result (not supported yet, asserts when mapping has to be started).</param>
/// <returns>True if the result has been written, otherwise false (data is not ready yet).</returns>
bool GPUQuerySetWebGPU::Read(uint32 index, uint64& result, bool wait)
{
    if (_state == Resolved)
    {
        // Start mapping the buffer
        ASSERT(!wait); // TODO: impl wgpuBufferMapAsync with waiting (see GPUBufferWebGPU::Map)
        WGPUBufferMapCallbackInfo callback = WGPU_BUFFER_MAP_CALLBACK_INFO_INIT;
        callback.mode = WGPUCallbackMode_AllowSpontaneous;
        callback.userdata1 = this;
        callback.callback = [](WGPUMapAsyncStatus status, WGPUStringView message, WGPU_NULLABLE void* userdata1, WGPU_NULLABLE void* userdata2)
        {
            if (status == WGPUMapAsyncStatus_Success)
            {
                auto set = (GPUQuerySetWebGPU*)userdata1;
                set->OnRead();
            }
#if !BUILD_RELEASE
            else
            {
                LOG(Error, "Query Set map failed with status {}, {}", (uint32)status, WEBGPU_TO_STR(message));
            }
#endif
        };
        wgpuBufferMapAsync(_readBuffer, WGPUMapMode_Read, 0, _index * sizeof(uint64), callback);
        _state = Mapping;
    }
    else if (_state == Mapped)
    {
        // Read the results from mapped buffer
        if (Type == GPUQueryType::Timer)
        {
            // Timestamp calculates a difference between two queries (begin/end) in nanoseconds (result is in microseconds).
            // Values are unsigned so compare before subtracting, otherwise end < begin would wrap around into a huge duration.
            const uint64 begin = _mapped[index];
            const uint64 end = _mapped[index + 1];
            result = end > begin ? (end - begin) / 1000 : 0;
        }
        else
        {
            // Occlusion outputs number of fragment samples that pass all the tests (scissor, stencil, depth, etc.)
            result = _mapped[index];
        }
        return true;
    }
    return false;
}
void GPUQuerySetWebGPU::OnRead()
{
// Get mapped buffer pointer
ASSERT(_state == Mapping);
_state = Mapped;
_mapped = (const uint64*)wgpuBufferGetConstMappedRange(_readBuffer, 0, _index * sizeof(uint64));
}
GPUDataUploaderWebGPU::Allocation GPUDataUploaderWebGPU::Allocate(uint32 size, WGPUBufferUsage usage, uint32 alignment)
{
// Find a free buffer from the current frame
@@ -167,6 +293,7 @@ bool GPUDeviceWebGPU::Init()
if (wgpuAdapterGetLimits(Adapter->Adapter, &limits) == WGPUStatus_Success)
{
MinUniformBufferOffsetAlignment = limits.minUniformBufferOffsetAlignment;
TimestampQuery = features.Contains(WGPUFeatureName_TimestampQuery);
Limits.HasInstancing = true;
Limits.HasDrawIndirect = true;
Limits.HasDepthAsSRV = true;
@@ -174,11 +301,11 @@ bool GPUDeviceWebGPU::Init()
Limits.HasDepthClip = features.Contains(WGPUFeatureName_DepthClipControl);
Limits.HasReadOnlyDepth = true;
Limits.MaximumSamplerAnisotropy = 4;
Limits.MaximumTexture1DSize = Math::Min<int32>(GPU_MAX_TEXTURE_SIZE, limits.maxTextureDimension1D);
Limits.MaximumTexture2DSize = Math::Min<int32>(GPU_MAX_TEXTURE_SIZE, limits.maxTextureDimension2D);
Limits.MaximumTexture3DSize = Math::Min<int32>(GPU_MAX_TEXTURE_SIZE, limits.maxTextureDimension3D);
Limits.MaximumMipLevelsCount = Math::Min<int32>(GPU_MAX_TEXTURE_MIP_LEVELS, (int32)log2(limits.maxTextureDimension2D));
Limits.MaximumTexture1DArraySize = Limits.MaximumTexture2DArraySize = Math::Min<int32>(GPU_MAX_TEXTURE_ARRAY_SIZE, limits.maxTextureArrayLayers);
Limits.MaximumTexture1DSize = limits.maxTextureDimension1D;
Limits.MaximumTexture2DSize = limits.maxTextureDimension2D;
Limits.MaximumTexture3DSize = limits.maxTextureDimension3D;
Limits.MaximumMipLevelsCount = (int32)log2(limits.maxTextureDimension2D);
Limits.MaximumTexture1DArraySize = Limits.MaximumTexture2DArraySize = limits.maxTextureArrayLayers;
if (limits.maxTextureArrayLayers >= 6)
Limits.MaximumTextureCubeSize = Limits.MaximumTexture2DSize;
@@ -624,7 +751,11 @@ void GPUDeviceWebGPU::Dispose()
preDispose();
// Clear device resources
for (int32 i = 0; i < QuerySetsCount; i++)
Delete(QuerySets[i]);
QuerySetsCount = 0;
DataUploader.ReleaseGPU();
SAFE_DELETE_GPU_RESOURCE(DefaultRenderTarget);
SAFE_DELETE_GPU_RESOURCES(DefaultTexture);
SAFE_DELETE_GPU_RESOURCES(DefaultSamplers);
SAFE_DELETE(_mainContext);
@@ -653,12 +784,68 @@ void GPUDeviceWebGPU::Dispose()
// Currently a no-op: this backend does not block the CPU until the GPU finishes its work.
void GPUDeviceWebGPU::WaitForGPU()
{
// TODO: this could use onSubmittedWorkDone (assuming any submit has been already done)
}
/// <summary>
/// Allocates a new query of the given type from the first query set with free space (creates a new set if needed).
/// </summary>
/// <param name="type">The type of the query.</param>
/// <returns>The allocated query, or an empty (zero) query if timer queries are unsupported or all query sets are in use.</returns>
GPUQueryWebGPU GPUDeviceWebGPU::AllocateQuery(GPUQueryType type)
{
    // Timer queries need a device feature
    if (type == GPUQueryType::Timer && !TimestampQuery)
        return {};

    // Look for an existing set of a matching type that can take one more query
    int32 setIndex = 0;
    while (setIndex < QuerySetsCount)
    {
        const auto candidate = QuerySets[setIndex];
        if (candidate->Type == type && candidate->CanAllocate())
            break;
        setIndex++;
    }

    const bool needsNewSet = setIndex == QuerySetsCount;
    if (needsNewSet)
    {
        // All slots for sets are taken
        if (setIndex == WEBGPU_MAX_QUERY_SETS)
        {
#if !BUILD_RELEASE
            static bool SingleTimeLog = true;
            if (SingleTimeLog)
            {
                SingleTimeLog = false;
                LOG(Error, "Run out of the query sets capacity.");
            }
#endif
            return {};
        }

        // Create a new set and append it to the list (it lands at setIndex)
        PROFILE_MEM(GraphicsCommands);
        const uint32 capacity = type == GPUQueryType::Occlusion ? 4096 : 1024;
        QuerySets[QuerySetsCount++] = New<GPUQuerySetWebGPU>(Device, type, capacity);
    }

    // Pack the set index and the query index into the result
    static_assert(sizeof(GPUQueryWebGPU) == sizeof(uint64), "Invalid WebGPU query size.");
    GPUQueryWebGPU query;
    query.Set = setIndex;
    query.Index = QuerySets[setIndex]->Allocate();
    return query;
}
/// <summary>
/// Tries to get the result of the GPU query.
/// </summary>
/// <param name="queryID">The packed query ID returned by GPUContextWebGPU::BeginQuery.</param>
/// <param name="result">The output query result (0 for invalid queries).</param>
/// <param name="wait">True if wait for the result, passed down to the query set.</param>
/// <returns>True if the result has been written, otherwise false (data is not ready yet).</returns>
bool GPUDeviceWebGPU::GetQueryResult(uint64 queryID, uint64& result, bool wait)
{
    if (queryID == 0)
    {
        // Invalid query
        result = 0;
        return true;
    }

    // Unpack the ID into set/index pair
    GPUQueryWebGPU query;
    query.Raw = queryID;
    if (query.Set >= QuerySetsCount)
    {
        // The ID doesn't point at any existing query set (eg. stale ID after sets were released) so treat it as invalid
        result = 0;
        return true;
    }
    return QuerySets[query.Set]->Read(query.Index, result, wait);
}
GPUTexture* GPUDeviceWebGPU::CreateTexture(const StringView& name)

View File

@@ -30,6 +30,62 @@ namespace GPUBindGroupsWebGPU
};
};
/// <summary>
/// GPU query ID packed into 64-bits.
/// </summary>
struct GPUQueryWebGPU
{
union
{
struct
{
// Index of the query set (slot in GPUDeviceWebGPU::QuerySets)
uint32 Set;
// Index of the query within that set (value returned by GPUQuerySetWebGPU::Allocate)
uint32 Index;
};
// Both fields viewed as a single value; this is the query ID handed out to callers (0 is treated as an invalid query)
uint64 Raw;
};
};
/// <summary>
/// Set of GPU queries allocated in batch with functionality to read results via a separate CPU buffer.
/// </summary>
class GPUQuerySetWebGPU
{
private:
    WGPUDevice _device;
    // Capacity of the set (number of query slots)
    uint32 _count;
    // Index of the next free query slot
    uint32 _index = 0;
    // Lifetime of a batch: Active (allocating) -> Resolved (Resolve) -> Mapping (Read started the map) -> Mapped (OnRead), then back to Active on the next Allocate
    enum States
    {
        Active,
        Resolved,
        Mapping,
        Mapped,
    } _state = Active;
#if COMPILE_WITH_PROFILER
    uint64 _memorySize;
#endif
    // Destination of the query resolve (GPU-only)
    WGPUBuffer _queryBuffer;
    // CPU-readable copy of the resolved data
    WGPUBuffer _readBuffer;
    // Pointer to the mapped contents of the read buffer (valid only in Mapped state)
    const uint64* _mapped = nullptr;

public:
    const GPUQueryType Type;
    WGPUQuerySet Set;

public:
    GPUQuerySetWebGPU(WGPUDevice device, GPUQueryType type, uint32 count);
    ~GPUQuerySetWebGPU();

    // The object owns WebGPU handles released in the destructor, so copying it would release them twice
    GPUQuerySetWebGPU(const GPUQuerySetWebGPU&) = delete;
    GPUQuerySetWebGPU& operator=(const GPUQuerySetWebGPU&) = delete;

    // Checks if a new query can be allocated from this set
    bool CanAllocate() const;
    // Allocates a query and returns the index of its first slot
    uint32 Allocate();
    // Records resolve of the allocated queries and a copy of the results into the read buffer
    void Resolve(WGPUCommandEncoder encoder);
    // Tries to read the query result, returns true if the result was written
    bool Read(uint32 index, uint64& result, bool wait);

private:
    // Called when mapping of the read buffer succeeds
    void OnRead();
};
/// <summary>
/// Pool for uploading data to GPU buffers. It manages large buffers and suballocates for multiple small updates, minimizing the number of buffer creations and copies.
/// </summary>
@@ -79,11 +135,17 @@ public:
WGPUInstance WebGPUInstance;
WGPUDevice Device = nullptr;
WGPUQueue Queue = nullptr;
GPUTextureWebGPU* DefaultRenderTarget = nullptr;
GPUSamplerWebGPU* DefaultSamplers[6] = {};
GPUTextureWebGPU* DefaultTexture[10] = {};
WGPUBuffer DefaultBuffer = nullptr;
GPUDataUploaderWebGPU DataUploader;
uint32 MinUniformBufferOffsetAlignment = 1;
bool TimestampQuery = false;
uint32 QuerySetsCount = 0;
GPUQuerySetWebGPU* QuerySets[WEBGPU_MAX_QUERY_SETS] = {};
GPUQueryWebGPU AllocateQuery(GPUQueryType type);
public:
// [GPUDeviceDX]

View File

@@ -23,4 +23,6 @@
// Utility macro to get WGPUStringView for a text constant
#define WEBGPU_STR(str) { str, ARRAY_COUNT(str) - 1 }
#define WEBGPU_MAX_QUERY_SETS 8
#endif