Refactor WebGPU ASYNCIFY to use JSPI

This yields a smaller build size and better performance. Link time also goes down a lot.
2026-03-18 23:08:39 +01:00
parent 750fd1f941
commit a5ec8565e4
8 changed files with 130 additions and 28 deletions
--- a/Source/Engine/GraphicsDevice/WebGPU/GPUBufferWebGPU.cpp
+++ b/Source/Engine/GraphicsDevice/WebGPU/GPUBufferWebGPU.cpp
@@ -31,10 +31,10 @@ void* GPUBufferWebGPU::Map(GPUResourceMapMode mode)
        userData.Call(status == WGPUMapAsyncStatus_Success, status, message);
    };
    wgpuBufferMapAsync(Buffer, mapMode, 0, _desc.Size, mapRequest.Info);
-    auto mapRequestResult = mapRequest.Wait();
+    auto mapRequestResult = mapRequest.Wait(_device->WebGPUInstance);
    if (mapRequestResult == WGPUWaitStatus_TimedOut)
    {
-        LOG(Error, "WebGPU buffer map request has timed out after {}s", mapRequest.Data.WaitTime);
+        LOG(Error, "WebGPU buffer map request has timed out after {}s", (int32)mapRequest.Data.WaitTime);
        return nullptr;
    }
    if (mapRequestResult == WGPUWaitStatus_Error)
--- a/Source/Engine/GraphicsDevice/WebGPU/GPUDeviceWebGPU.cpp
+++ b/Source/Engine/GraphicsDevice/WebGPU/GPUDeviceWebGPU.cpp
@@ -618,10 +618,10 @@ bool GPUDeviceWebGPU::Init()
        userData.Call(status == WGPURequestDeviceStatus_Success, status, message);
    };
    wgpuAdapterRequestDevice(Adapter->Adapter, &deviceDesc, deviceRequest.Info);
-    auto deviceRequestResult = deviceRequest.Wait();
+    auto deviceRequestResult = deviceRequest.Wait(WebGPUInstance);
    if (deviceRequestResult == WGPUWaitStatus_TimedOut)
    {
-        LOG(Fatal, "WebGPU device request has timed out after {}s", deviceRequest.Data.WaitTime);
+        LOG(Fatal, "WebGPU device request has timed out after {}s", (int32)deviceRequest.Data.WaitTime);
        return true;
    }
    if (deviceRequestResult == WGPUWaitStatus_Error)
@@ -701,6 +701,11 @@ GPUDevice* CreateGPUDeviceWebGPU()
 {
    // Create instance
    WGPUInstanceDescriptor instanceDesc = WGPU_INSTANCE_DESCRIPTOR_INIT;
+#if !WEBGPU_ASYNCIFY && 0
+    WGPUInstanceFeatureName instanceFeatures[1] = { WGPUInstanceFeatureName_TimedWaitAny };
+    instanceDesc.requiredFeatureCount = 1;
+    instanceDesc.requiredFeatures = instanceFeatures;
+#endif
    WGPUInstance instance = wgpuCreateInstance(&instanceDesc);
    if (!instance)
    {
@@ -727,10 +732,10 @@ GPUDevice* CreateGPUDeviceWebGPU()
        userData.Call(status == WGPURequestAdapterStatus_Success, status, message);
    };
    wgpuInstanceRequestAdapter(instance, &adapterOptions, adapterRequest.Info);
-    auto adapterRequestResult = adapterRequest.Wait();
+    auto adapterRequestResult = adapterRequest.Wait(instance);
    if (adapterRequestResult == WGPUWaitStatus_TimedOut)
    {
-        LOG(Fatal, "WebGPU adapter request has timed out after {}s", adapterRequest.Data.WaitTime);
+        LOG(Fatal, "WebGPU adapter request has timed out after {}s", (int32)adapterRequest.Data.WaitTime);
        return nullptr;
    }
    if (adapterRequestResult == WGPUWaitStatus_Error)
@@ -796,7 +801,7 @@ void GPUDeviceWebGPU::Dispose()

 void GPUDeviceWebGPU::WaitForGPU()
 {
-    if (QueueSubmits == 0)
+    if (QueueSubmits == 0 || Engine::FatalError != FatalErrorType::None)
        return;
    QueueSubmits = 0;
    AsyncCallbackWebGPU<WGPUQueueWorkDoneCallbackInfo> workDone(WGPU_QUEUE_WORK_DONE_CALLBACK_INFO_INIT);
@@ -806,10 +811,10 @@ void GPUDeviceWebGPU::WaitForGPU()
        userData.Call(status == WGPUQueueWorkDoneStatus_Success, status, message);
    };
    wgpuQueueOnSubmittedWorkDone(Queue, workDone.Info);
-    auto workDoneResult = workDone.Wait();
+    auto workDoneResult = workDone.Wait(WebGPUInstance);
    if (workDoneResult == WGPUWaitStatus_TimedOut)
    {
-        LOG(Error, "WebGPU queue wait has timed out after {}s", workDone.Data.WaitTime);
+        LOG(Error, "WebGPU queue wait has timed out after {}s", (int32)workDone.Data.WaitTime);
        return;
    }
    if (workDoneResult == WGPUWaitStatus_Error)
--- a/Source/Engine/GraphicsDevice/WebGPU/GraphicsDeviceWebGPU.Build.cs
+++ b/Source/Engine/GraphicsDevice/WebGPU/GraphicsDeviceWebGPU.Build.cs
@@ -9,15 +9,37 @@ using Flax.Build.Platforms;
 /// </summary>
 public class GraphicsDeviceWebGPU : GraphicsDeviceBaseModule
 {
+    /// <summary>
+    /// Using ASYNCIFY leads to simple code by waiting on async WebGPU API callbacks with emscripten_sleep but doubles the code size and adds some overhead.
+    /// https://emscripten.org/docs/porting/asyncify.html <br/>
+    /// 0 - no async <br/>
+    /// 1 - via Asyncify (causes the WASM to be much larger) <br/>
+    /// 2 - via JSPI (experimental) <br/>
+    /// </summary>
+    public int WithAsyncify = 2;
+
    /// <inheritdoc />
    public override void Setup(BuildOptions options)
    {
        base.Setup(options);

        var port = "--use-port=emdawnwebgpu:cpp_bindings=false";
-        options.CompileEnv.CustomArgs.Add(port);
-        options.LinkEnv.CustomArgs.Add("-sASYNCIFY");
        options.OutputFiles.Add(port);
+        options.CompileEnv.CustomArgs.Add(port);
+        if (WithAsyncify == 2)
+        {
+            options.PrivateDefinitions.Add("WEBGPU_ASYNCIFY=2");
+            options.LinkEnv.CustomArgs.Add("-sJSPI");
+            options.LinkEnv.CustomArgs.Add("-sDEFAULT_LIBRARY_FUNCS_TO_INCLUDE=$getWasmTableEntry");
+        }
+        else if (WithAsyncify == 1)
+        {
+            options.PrivateDefinitions.Add("WEBGPU_ASYNCIFY");
+            options.LinkEnv.CustomArgs.Add("-sASYNCIFY");
+            options.LinkEnv.CustomArgs.Add("-sASYNCIFY_STACK_SIZE=8192");
+            //options.LinkEnv.CustomArgs.Add("-sASYNCIFY_ONLY=[\"main\",\"WebGPUAsyncWait(AsyncWaitParamsWebGPU)\"]"); // TODO: try indirect calls only to reduce the code size
+            options.LinkEnv.CustomArgs.Add("-sEXPORT_ALL"); // This bloats JS but otherwise dynamic calls don't work properly
+        }
        options.PublicDefinitions.Add("GRAPHICS_API_WEBGPU");
        options.PrivateIncludePaths.Add(Path.Combine(EmscriptenSdk.Instance.EmscriptenPath, "emscripten/cache/ports/emdawnwebgpu/emdawnwebgpu_pkg/webgpu/include"));
        options.PrivateDependencies.Add("lz4");
--- a/Source/Engine/GraphicsDevice/WebGPU/RenderToolsWebGPU.cpp
+++ b/Source/Engine/GraphicsDevice/WebGPU/RenderToolsWebGPU.cpp
@@ -4,6 +4,36 @@

 #include "RenderToolsWebGPU.h"
 #include "Engine/Graphics/PixelFormat.h"
+#include <emscripten/emscripten.h>
+
+// Waits for the async callback to set params.Data->Result: 1 -> Success, other non-zero -> Error, still 0 after the wait -> TimedOut. Without WEBGPU_ASYNCIFY it always returns Error.
+WGPUWaitStatus WebGPUAsyncWait(AsyncWaitParamsWebGPU params)
+{
+#if 0
+    // This needs WGPUInstanceFeatureName_TimedWaitAny which works only with ASYNCIFY enabled
+    WGPUFutureWaitInfo futureWaitInfo;
+    futureWaitInfo.future = future;
+    futureWaitInfo.completed = WGPU_FALSE;
+    uint64 timeoutNS = 5000000000ull; // Wait max 5 seconds
+    return wgpuInstanceWaitAny(params.Instance, 1, &futureWaitInfo, timeoutNS);
+#endif
+#if WEBGPU_ASYNCIFY
+    auto startTime = Platform::GetTimeSeconds();
+    int32 ticksLeft = 500; // Wait max 5 seconds (500 sleeps of 10 ms each)
+    while (Platform::AtomicRead(&params.Data->Result) == 0 && ticksLeft-- > 0)
+        emscripten_sleep(10);
+    if (Platform::AtomicRead(&params.Data->Result) == 0) // Test the result, not the tick counter: a callback finishing during the last sleep leaves ticksLeft at 0 and must not be reported as a timeout
+    {
+        params.Data->WaitTime = Platform::GetTimeSeconds() - startTime;
+        return WGPUWaitStatus_TimedOut;
+    }
+    return params.Data->Result == 1 ? WGPUWaitStatus_Success : WGPUWaitStatus_Error;
+#else
+    // Not possible to implement it here with stack preservation (need to go back with main thread to the browser)
+    // Make GPU adapter/device requests register custom retry via emscripten_set_main_loop with coroutine or something like that to make it work without ASYNCIFY
+    return WGPUWaitStatus_Error;
+#endif
+}

 WGPUVertexFormat RenderToolsWebGPU::ToVertexFormat(PixelFormat format)
 {
--- a/Source/Engine/GraphicsDevice/WebGPU/RenderToolsWebGPU.h
+++ b/Source/Engine/GraphicsDevice/WebGPU/RenderToolsWebGPU.h
@@ -6,7 +6,6 @@

 #include "Engine/Core/Types/String.h"
 #include "IncludeWebGPU.h"
-#include <emscripten/emscripten.h>

 enum class PixelFormat : unsigned;

@@ -29,6 +28,13 @@ struct AsyncCallbackDataWebGPU
    }
 };

+struct AsyncWaitParamsWebGPU // Argument bundle passed by value to WebGPUAsyncWait
+{
+    WGPUInstance Instance; // WebGPU instance handle forwarded by AsyncCallbackWebGPU::Wait
+    AsyncCallbackDataWebGPU* Data; // Callback state; WebGPUAsyncWait polls its Result and writes WaitTime on timeout
+};
+WGPUWaitStatus WebGPUAsyncWait(AsyncWaitParamsWebGPU params);
+
 /// <summary>
 /// Helper utility to run WebGPU APIs that use async callback in sync by waiting on the spontaneous call back with an active-waiting loop.
 /// </summary>
@@ -45,18 +51,9 @@ struct AsyncCallbackWebGPU
        Info.userdata1 = &Data;
    }

-    WGPUWaitStatus Wait()
+    FORCE_INLINE WGPUWaitStatus Wait(WGPUInstance instance)
    {
-        auto startTime = Platform::GetTimeSeconds();
-        int32 ticksLeft = 500; // Wait max 5 second
-        while (Platform::AtomicRead(&Data.Result) == 0 && ticksLeft-- > 0)
-            emscripten_sleep(10);
-        if (ticksLeft <= 0)
-        {
-            Data.WaitTime = Platform::GetTimeSeconds() - startTime;
-            return WGPUWaitStatus_TimedOut;
-        }
-        return Data.Result == 1 ? WGPUWaitStatus_Success : WGPUWaitStatus_Error;
+        return WebGPUAsyncWait({ instance, &Data });
    }
 };