Refactor WebGPU ASYNCIFY to use JSPI

Smaller build size and better performance. Also, link time goes down a lot.
This commit is contained in:
Wojtek Figat
2026-03-18 23:08:39 +01:00
parent 750fd1f941
commit a5ec8565e4
8 changed files with 130 additions and 28 deletions

View File

@@ -31,10 +31,10 @@ void* GPUBufferWebGPU::Map(GPUResourceMapMode mode)
userData.Call(status == WGPUMapAsyncStatus_Success, status, message);
};
wgpuBufferMapAsync(Buffer, mapMode, 0, _desc.Size, mapRequest.Info);
auto mapRequestResult = mapRequest.Wait();
auto mapRequestResult = mapRequest.Wait(_device->WebGPUInstance);
if (mapRequestResult == WGPUWaitStatus_TimedOut)
{
LOG(Error, "WebGPU buffer map request has timed out after {}s", mapRequest.Data.WaitTime);
LOG(Error, "WebGPU buffer map request has timed out after {}s", (int32)mapRequest.Data.WaitTime);
return nullptr;
}
if (mapRequestResult == WGPUWaitStatus_Error)

View File

@@ -618,10 +618,10 @@ bool GPUDeviceWebGPU::Init()
userData.Call(status == WGPURequestDeviceStatus_Success, status, message);
};
wgpuAdapterRequestDevice(Adapter->Adapter, &deviceDesc, deviceRequest.Info);
auto deviceRequestResult = deviceRequest.Wait();
auto deviceRequestResult = deviceRequest.Wait(WebGPUInstance);
if (deviceRequestResult == WGPUWaitStatus_TimedOut)
{
LOG(Fatal, "WebGPU device request has timed out after {}s", deviceRequest.Data.WaitTime);
LOG(Fatal, "WebGPU device request has timed out after {}s", (int32)deviceRequest.Data.WaitTime);
return true;
}
if (deviceRequestResult == WGPUWaitStatus_Error)
@@ -701,6 +701,11 @@ GPUDevice* CreateGPUDeviceWebGPU()
{
// Create instance
WGPUInstanceDescriptor instanceDesc = WGPU_INSTANCE_DESCRIPTOR_INIT;
#if !WEBGPU_ASYNCIFY && 0
WGPUInstanceFeatureName instanceFeatures[1] = { WGPUInstanceFeatureName_TimedWaitAny };
instanceDesc.requiredFeatureCount = 1;
instanceDesc.requiredFeatures = instanceFeatures;
#endif
WGPUInstance instance = wgpuCreateInstance(&instanceDesc);
if (!instance)
{
@@ -727,10 +732,10 @@ GPUDevice* CreateGPUDeviceWebGPU()
userData.Call(status == WGPURequestAdapterStatus_Success, status, message);
};
wgpuInstanceRequestAdapter(instance, &adapterOptions, adapterRequest.Info);
auto adapterRequestResult = adapterRequest.Wait();
auto adapterRequestResult = adapterRequest.Wait(instance);
if (adapterRequestResult == WGPUWaitStatus_TimedOut)
{
LOG(Fatal, "WebGPU adapter request has timed out after {}s", adapterRequest.Data.WaitTime);
LOG(Fatal, "WebGPU adapter request has timed out after {}s", (int32)adapterRequest.Data.WaitTime);
return nullptr;
}
if (adapterRequestResult == WGPUWaitStatus_Error)
@@ -796,7 +801,7 @@ void GPUDeviceWebGPU::Dispose()
void GPUDeviceWebGPU::WaitForGPU()
{
if (QueueSubmits == 0)
if (QueueSubmits == 0 || Engine::FatalError != FatalErrorType::None)
return;
QueueSubmits = 0;
AsyncCallbackWebGPU<WGPUQueueWorkDoneCallbackInfo> workDone(WGPU_QUEUE_WORK_DONE_CALLBACK_INFO_INIT);
@@ -806,10 +811,10 @@ void GPUDeviceWebGPU::WaitForGPU()
userData.Call(status == WGPUQueueWorkDoneStatus_Success, status, message);
};
wgpuQueueOnSubmittedWorkDone(Queue, workDone.Info);
auto workDoneResult = workDone.Wait();
auto workDoneResult = workDone.Wait(WebGPUInstance);
if (workDoneResult == WGPUWaitStatus_TimedOut)
{
LOG(Error, "WebGPU queue wait has timed out after {}s", workDone.Data.WaitTime);
LOG(Error, "WebGPU queue wait has timed out after {}s", (int32)workDone.Data.WaitTime);
return;
}
if (workDoneResult == WGPUWaitStatus_Error)

View File

@@ -9,15 +9,37 @@ using Flax.Build.Platforms;
/// </summary>
public class GraphicsDeviceWebGPU : GraphicsDeviceBaseModule
{
/// <summary>
/// Using ASYNCIFY leads to simple code by waiting on async WebGPU API callbacks with emscripten_sleep but doubles the code size and adds some overhead.
/// https://emscripten.org/docs/porting/asyncify.html <br/>
/// 0 - no async <br/>
/// 1 - via Asyncify (causes the WASM to be much larger) <br/>
/// 2 - via JSPI (experimental) <br/>
/// </summary>
public int WithAsyncify = 2;
/// <inheritdoc />
public override void Setup(BuildOptions options)
{
base.Setup(options);
var port = "--use-port=emdawnwebgpu:cpp_bindings=false";
options.CompileEnv.CustomArgs.Add(port);
options.LinkEnv.CustomArgs.Add("-sASYNCIFY");
options.OutputFiles.Add(port);
options.CompileEnv.CustomArgs.Add(port);
if (WithAsyncify == 2)
{
options.PrivateDefinitions.Add("WEBGPU_ASYNCIFY=2");
options.LinkEnv.CustomArgs.Add("-sJSPI");
options.LinkEnv.CustomArgs.Add("-sDEFAULT_LIBRARY_FUNCS_TO_INCLUDE=$getWasmTableEntry");
}
else if (WithAsyncify == 1)
{
options.PrivateDefinitions.Add("WEBGPU_ASYNCIFY");
options.LinkEnv.CustomArgs.Add("-sASYNCIFY");
options.LinkEnv.CustomArgs.Add("-sASYNCIFY_STACK_SIZE=8192");
//options.LinkEnv.CustomArgs.Add("-sASYNCIFY_ONLY=[\"main\",\"WebGPUAsyncWait(AsyncWaitParamsWebGPU)\"]"); // TODO: try indirect calls only to reduce the code size
options.LinkEnv.CustomArgs.Add("-sEXPORT_ALL"); // This bloats JS but otherwise dynamic calls don't work properly
}
options.PublicDefinitions.Add("GRAPHICS_API_WEBGPU");
options.PrivateIncludePaths.Add(Path.Combine(EmscriptenSdk.Instance.EmscriptenPath, "emscripten/cache/ports/emdawnwebgpu/emdawnwebgpu_pkg/webgpu/include"));
options.PrivateDependencies.Add("lz4");

View File

@@ -4,6 +4,36 @@
#include "RenderToolsWebGPU.h"
#include "Engine/Graphics/PixelFormat.h"
#include <emscripten/emscripten.h>
// Waits until the async WebGPU callback stores a non-zero result in params.Data->Result or the wait budget runs out.
// Returns WGPUWaitStatus_Success when Result is 1, WGPUWaitStatus_Error for any other non-zero Result, and
// WGPUWaitStatus_TimedOut (with params.Data->WaitTime set to the elapsed seconds) when no result has arrived.
// When built without WEBGPU_ASYNCIFY there is no way to yield to the browser here, so it always returns WGPUWaitStatus_Error.
WGPUWaitStatus WebGPUAsyncWait(AsyncWaitParamsWebGPU params)
{
#if 0
    // This needs WGPUInstanceFeatureName_TimedWaitAny which works only with ASYNCIFY enabled
    // NOTE(review): disabled path; 'future' is not declared in this function and would need to be passed in before enabling
    WGPUFutureWaitInfo futureWaitInfo;
    futureWaitInfo.future = future;
    futureWaitInfo.completed = WGPU_FALSE;
    uint64 timeoutNS = 5000000000ull; // Wait max 5 seconds
    return wgpuInstanceWaitAny(params.Instance, 1, &futureWaitInfo, timeoutNS);
#endif
#if WEBGPU_ASYNCIFY
    const auto startTime = Platform::GetTimeSeconds();
    int32 ticksLeft = 500; // 500 ticks of 10 ms each: wait max 5 seconds
    while (Platform::AtomicRead(&params.Data->Result) == 0 && ticksLeft-- > 0)
        emscripten_sleep(10);

    // Decide based on the result itself rather than the tick counter: when the callback fires during the very last
    // sleep the loop exits with ticksLeft == 0, which must not be reported as a timeout.
    const auto result = Platform::AtomicRead(&params.Data->Result);
    if (result == 0)
    {
        params.Data->WaitTime = Platform::GetTimeSeconds() - startTime;
        return WGPUWaitStatus_TimedOut;
    }
    return result == 1 ? WGPUWaitStatus_Success : WGPUWaitStatus_Error;
#else
    // Not possible to implement it here with stack preservation (need to go back with main thread to the browser)
    // Make GPU adapter/device requests register custom retry via emscripten_set_main_loop with coroutine or something like that to make it work without ASYNCIFY
    return WGPUWaitStatus_Error;
#endif
}
WGPUVertexFormat RenderToolsWebGPU::ToVertexFormat(PixelFormat format)
{

View File

@@ -6,7 +6,6 @@
#include "Engine/Core/Types/String.h"
#include "IncludeWebGPU.h"
#include <emscripten/emscripten.h>
enum class PixelFormat : unsigned;
@@ -29,6 +28,13 @@ struct AsyncCallbackDataWebGPU
}
};
/// <summary>
/// Arguments bundle passed by value to WebGPUAsyncWait.
/// </summary>
struct AsyncWaitParamsWebGPU
{
// WebGPU instance handle supplied by the caller of the wait.
WGPUInstance Instance;
// Callback state to wait on; WebGPUAsyncWait reads its Result and writes its WaitTime when the wait times out.
AsyncCallbackDataWebGPU* Data;
};
/// <summary>
/// Waits for the async WebGPU callback tracked by params.Data to report a result.
/// </summary>
/// <param name="params">The instance and the callback data to wait on.</param>
/// <returns>WGPUWaitStatus_Success when the callback result is 1, WGPUWaitStatus_TimedOut when no result arrives within the wait limit, otherwise WGPUWaitStatus_Error (always WGPUWaitStatus_Error when built without WEBGPU_ASYNCIFY).</returns>
WGPUWaitStatus WebGPUAsyncWait(AsyncWaitParamsWebGPU params);
/// <summary>
/// Helper utility to run WebGPU APIs that use async callback in sync by waiting on the spontaneous call back with an active-waiting loop.
/// </summary>
@@ -45,18 +51,9 @@ struct AsyncCallbackWebGPU
Info.userdata1 = &Data;
}
WGPUWaitStatus Wait()
FORCE_INLINE WGPUWaitStatus Wait(WGPUInstance instance)
{
auto startTime = Platform::GetTimeSeconds();
int32 ticksLeft = 500; // Wait max 5 second
while (Platform::AtomicRead(&Data.Result) == 0 && ticksLeft-- > 0)
emscripten_sleep(10);
if (ticksLeft <= 0)
{
Data.WaitTime = Platform::GetTimeSeconds() - startTime;
return WGPUWaitStatus_TimedOut;
}
return Data.Result == 1 ? WGPUWaitStatus_Success : WGPUWaitStatus_Error;
return WebGPUAsyncWait({ instance, &Data });
}
};