Refactor WebGPU ASYNCIFY to use JSPI

Smaller build size and better performance. Also, link time goes down a lot
This commit is contained in:
Wojtek Figat
2026-03-18 23:08:39 +01:00
parent 750fd1f941
commit a5ec8565e4
8 changed files with 130 additions and 28 deletions

View File

@@ -31,10 +31,10 @@ void* GPUBufferWebGPU::Map(GPUResourceMapMode mode)
userData.Call(status == WGPUMapAsyncStatus_Success, status, message);
};
wgpuBufferMapAsync(Buffer, mapMode, 0, _desc.Size, mapRequest.Info);
auto mapRequestResult = mapRequest.Wait();
auto mapRequestResult = mapRequest.Wait(_device->WebGPUInstance);
if (mapRequestResult == WGPUWaitStatus_TimedOut)
{
LOG(Error, "WebGPU buffer map request has timed out after {}s", mapRequest.Data.WaitTime);
LOG(Error, "WebGPU buffer map request has timed out after {}s", (int32)mapRequest.Data.WaitTime);
return nullptr;
}
if (mapRequestResult == WGPUWaitStatus_Error)

View File

@@ -618,10 +618,10 @@ bool GPUDeviceWebGPU::Init()
userData.Call(status == WGPURequestDeviceStatus_Success, status, message);
};
wgpuAdapterRequestDevice(Adapter->Adapter, &deviceDesc, deviceRequest.Info);
auto deviceRequestResult = deviceRequest.Wait();
auto deviceRequestResult = deviceRequest.Wait(WebGPUInstance);
if (deviceRequestResult == WGPUWaitStatus_TimedOut)
{
LOG(Fatal, "WebGPU device request has timed out after {}s", deviceRequest.Data.WaitTime);
LOG(Fatal, "WebGPU device request has timed out after {}s", (int32)deviceRequest.Data.WaitTime);
return true;
}
if (deviceRequestResult == WGPUWaitStatus_Error)
@@ -701,6 +701,11 @@ GPUDevice* CreateGPUDeviceWebGPU()
{
// Create instance
WGPUInstanceDescriptor instanceDesc = WGPU_INSTANCE_DESCRIPTOR_INIT;
#if !WEBGPU_ASYNCIFY && 0
WGPUInstanceFeatureName instanceFeatures[1] = { WGPUInstanceFeatureName_TimedWaitAny };
instanceDesc.requiredFeatureCount = 1;
instanceDesc.requiredFeatures = instanceFeatures;
#endif
WGPUInstance instance = wgpuCreateInstance(&instanceDesc);
if (!instance)
{
@@ -727,10 +732,10 @@ GPUDevice* CreateGPUDeviceWebGPU()
userData.Call(status == WGPURequestAdapterStatus_Success, status, message);
};
wgpuInstanceRequestAdapter(instance, &adapterOptions, adapterRequest.Info);
auto adapterRequestResult = adapterRequest.Wait();
auto adapterRequestResult = adapterRequest.Wait(instance);
if (adapterRequestResult == WGPUWaitStatus_TimedOut)
{
LOG(Fatal, "WebGPU adapter request has timed out after {}s", adapterRequest.Data.WaitTime);
LOG(Fatal, "WebGPU adapter request has timed out after {}s", (int32)adapterRequest.Data.WaitTime);
return nullptr;
}
if (adapterRequestResult == WGPUWaitStatus_Error)
@@ -796,7 +801,7 @@ void GPUDeviceWebGPU::Dispose()
void GPUDeviceWebGPU::WaitForGPU()
{
if (QueueSubmits == 0)
if (QueueSubmits == 0 || Engine::FatalError != FatalErrorType::None)
return;
QueueSubmits = 0;
AsyncCallbackWebGPU<WGPUQueueWorkDoneCallbackInfo> workDone(WGPU_QUEUE_WORK_DONE_CALLBACK_INFO_INIT);
@@ -806,10 +811,10 @@ void GPUDeviceWebGPU::WaitForGPU()
userData.Call(status == WGPUQueueWorkDoneStatus_Success, status, message);
};
wgpuQueueOnSubmittedWorkDone(Queue, workDone.Info);
auto workDoneResult = workDone.Wait();
auto workDoneResult = workDone.Wait(WebGPUInstance);
if (workDoneResult == WGPUWaitStatus_TimedOut)
{
LOG(Error, "WebGPU queue wait has timed out after {}s", workDone.Data.WaitTime);
LOG(Error, "WebGPU queue wait has timed out after {}s", (int32)workDone.Data.WaitTime);
return;
}
if (workDoneResult == WGPUWaitStatus_Error)

View File

@@ -9,15 +9,37 @@ using Flax.Build.Platforms;
/// </summary>
public class GraphicsDeviceWebGPU : GraphicsDeviceBaseModule
{
/// <summary>
/// Using ASYNCIFY keeps the code simple by waiting on async WebGPU API callbacks with emscripten_sleep, but it doubles the code size and adds some overhead.
/// https://emscripten.org/docs/porting/asyncify.html <br/>
/// 0 - no async <br/>
/// 1 - via Asyncify (causes the WASM to be much larger) <br/>
/// 2 - via JSPI (experimental) <br/>
/// </summary>
public int WithAsyncify = 2;
/// <inheritdoc />
public override void Setup(BuildOptions options)
{
base.Setup(options);
var port = "--use-port=emdawnwebgpu:cpp_bindings=false";
options.CompileEnv.CustomArgs.Add(port);
options.LinkEnv.CustomArgs.Add("-sASYNCIFY");
options.OutputFiles.Add(port);
options.CompileEnv.CustomArgs.Add(port);
if (WithAsyncify == 2)
{
options.PrivateDefinitions.Add("WEBGPU_ASYNCIFY=2");
options.LinkEnv.CustomArgs.Add("-sJSPI");
options.LinkEnv.CustomArgs.Add("-sDEFAULT_LIBRARY_FUNCS_TO_INCLUDE=$getWasmTableEntry");
}
else if (WithAsyncify == 1)
{
options.PrivateDefinitions.Add("WEBGPU_ASYNCIFY");
options.LinkEnv.CustomArgs.Add("-sASYNCIFY");
options.LinkEnv.CustomArgs.Add("-sASYNCIFY_STACK_SIZE=8192");
//options.LinkEnv.CustomArgs.Add("-sASYNCIFY_ONLY=[\"main\",\"WebGPUAsyncWait(AsyncWaitParamsWebGPU)\"]"); // TODO: try indirect calls only to reduce the code size
options.LinkEnv.CustomArgs.Add("-sEXPORT_ALL"); // This bloats JS but otherwise dynamic calls don't work properly
}
options.PublicDefinitions.Add("GRAPHICS_API_WEBGPU");
options.PrivateIncludePaths.Add(Path.Combine(EmscriptenSdk.Instance.EmscriptenPath, "emscripten/cache/ports/emdawnwebgpu/emdawnwebgpu_pkg/webgpu/include"));
options.PrivateDependencies.Add("lz4");

View File

@@ -4,6 +4,36 @@
#include "RenderToolsWebGPU.h"
#include "Engine/Graphics/PixelFormat.h"
#include <emscripten/emscripten.h>
/// <summary>
/// Waits synchronously until the async WebGPU callback tracked by params.Data reports a result or the wait budget runs out.
/// </summary>
/// <param name="params">The WebGPU instance (used only by the disabled wgpuInstanceWaitAny path) and the callback data to poll.</param>
/// <returns>Success when Result is 1, Error for any other non-zero Result, TimedOut when Result is still 0 after the polling budget. Always Error when WEBGPU_ASYNCIFY is not enabled.</returns>
WGPUWaitStatus WebGPUAsyncWait(AsyncWaitParamsWebGPU params)
{
#if 0
    // This needs WGPUInstanceFeatureName_TimedWaitAny which works only with ASYNCIFY enabled
    // NOTE: 'future' is not declared in this function; it has to be provided (e.g. via params) before this path can be enabled
    WGPUFutureWaitInfo futureWaitInfo;
    futureWaitInfo.future = future;
    futureWaitInfo.completed = WGPU_FALSE;
    uint64 timeoutNS = 5000000000ull; // Wait max 5 seconds
    return wgpuInstanceWaitAny(params.Instance, 1, &futureWaitInfo, timeoutNS);
#endif
#if WEBGPU_ASYNCIFY
    auto startTime = Platform::GetTimeSeconds();
    int32 ticksLeft = 500; // 500 ticks of 10 ms each: wait max 5 seconds
    while (Platform::AtomicRead(&params.Data->Result) == 0 && ticksLeft-- > 0)
        emscripten_sleep(10);

    // Decide on the result itself rather than on the tick counter: the callback can arrive during the very last sleep,
    // which leaves ticksLeft at 0 even though the request has completed (it must not be reported as a timeout).
    const auto result = Platform::AtomicRead(&params.Data->Result);
    if (result == 0)
    {
        params.Data->WaitTime = Platform::GetTimeSeconds() - startTime;
        return WGPUWaitStatus_TimedOut;
    }
    return result == 1 ? WGPUWaitStatus_Success : WGPUWaitStatus_Error;
#else
    // Not possible to implement it here with stack preservation (need to go back with main thread to the browser)
    // Make GPU adapter/device requests register custom retry via emscripten_set_main_loop with coroutine or something like that to make it work without ASYNCIFY
    return WGPUWaitStatus_Error;
#endif
}
WGPUVertexFormat RenderToolsWebGPU::ToVertexFormat(PixelFormat format)
{

View File

@@ -6,7 +6,6 @@
#include "Engine/Core/Types/String.h"
#include "IncludeWebGPU.h"
#include <emscripten/emscripten.h>
enum class PixelFormat : unsigned;
@@ -29,6 +28,13 @@ struct AsyncCallbackDataWebGPU
}
};
/// <summary>
/// Arguments bundle passed by value to WebGPUAsyncWait.
/// </summary>
struct AsyncWaitParamsWebGPU
{
// WebGPU instance handle forwarded to WebGPUAsyncWait.
WGPUInstance Instance;
// Callback state to wait on (WebGPUAsyncWait polls its Result and stores WaitTime on timeout).
AsyncCallbackDataWebGPU* Data;
};
/// <summary>
/// Waits for the async WebGPU callback described by the given params and returns the wait status.
/// </summary>
WGPUWaitStatus WebGPUAsyncWait(AsyncWaitParamsWebGPU params);
/// <summary>
/// Helper utility to run WebGPU APIs that use an async callback synchronously by waiting for the spontaneous callback with an active-waiting loop.
/// </summary>
@@ -45,18 +51,9 @@ struct AsyncCallbackWebGPU
Info.userdata1 = &Data;
}
WGPUWaitStatus Wait()
FORCE_INLINE WGPUWaitStatus Wait(WGPUInstance instance)
{
auto startTime = Platform::GetTimeSeconds();
int32 ticksLeft = 500; // Wait max 5 second
while (Platform::AtomicRead(&Data.Result) == 0 && ticksLeft-- > 0)
emscripten_sleep(10);
if (ticksLeft <= 0)
{
Data.WaitTime = Platform::GetTimeSeconds() - startTime;
return WGPUWaitStatus_TimedOut;
}
return Data.Result == 1 ? WGPUWaitStatus_Success : WGPUWaitStatus_Error;
return WebGPUAsyncWait({ instance, &Data });
}
};

View File

@@ -5,8 +5,33 @@
#include "Engine/Engine/Engine.h"
#include <emscripten/emscripten.h>
// Reference: https://github.com/kainino0x/webgpu-cross-platform-demo/blob/f5c69c6fccbb2584c1b6f9e559f9a41a38a9b5ad/main.cpp#L692-L704
// Reference: https://github.com/kainino0x/webgpu-cross-platform-demo/blob/c26ea3e29ed9f73f9b39bddf7964b482ce3c6964/main.cpp#L737-L758
#define WEB_LOOP_MODE 2 // 0 - default, 1 - Asyncify, 2 - JSPI
#if WEB_LOOP_MODE != 0
// Workaround for JSPI not working in emscripten_set_main_loop. Loosely based on this code:
// https://github.com/emscripten-core/emscripten/issues/22493#issuecomment-2330275282
// This code only works when JSPI is enabled.
// Per-frame callback type. If callback returns true, continues the loop.
typedef bool (*FrameCallback)();

// Runs the given frame callback once per browser animation frame until it returns false.
// The callback is awaited, so the next frame is only requested after the current one has fully completed
// (including any suspension inside of it).
// NOTE(review): the Asyncify variant (WEB_LOOP_MODE == 1) resolves the function via ccall using the literal name "callback"
// instead of the passed function pointer - confirm that a matching export exists before enabling that mode.
EM_JS(void, requestAnimationFrameLoopWithJSPI, (FrameCallback callback), {
#if WEB_LOOP_MODE == 2
    var callback = WebAssembly.promising(getWasmTableEntry(callback));
#elif WEB_LOOP_MODE == 1
    var callback = () => globalThis['Module']['ccall']("callback", "boolean", [], [], { async: true });
#endif
    async function tick() {
        // Start the frame callback. 'await' means we won't call
        // requestAnimationFrame again until it completes.
        var keepLooping = await callback();
        if (keepLooping) requestAnimationFrame(tick);
    }
    requestAnimationFrame(tick);
})
#endif
class PlatformMain
{
#if WEB_LOOP_MODE == 0
static void Loop()
{
// Tick engine
@@ -16,11 +41,30 @@ class PlatformMain
{
// Exit engine
Engine::OnExit();
emscripten_cancel_main_loop();
emscripten_force_exit(Engine::ExitCode);
return;
}
}
#else
// Single frame of the engine loop driven by requestAnimationFrame.
// Returns true to request another frame, false to stop looping (fatal error or engine exit).
static bool Loop()
{
    // Stop ticking once a fatal error has been reported
    const bool hasFatalError = Engine::FatalError != FatalErrorType::None;
    if (hasFatalError)
        return false;

    // Tick engine
    Engine::OnLoop();

    // Keep looping for as long as the engine does not request an exit
    if (!Engine::ShouldExit())
        return true;

    // Exit engine
    Engine::OnExit();
    emscripten_cancel_main_loop();
    emscripten_force_exit(Engine::ExitCode);
    return false;
}
#endif
public:
static int32 Main()
@@ -31,7 +75,11 @@ public:
return result;
// Setup main loop to be called by Emscripten
#if WEB_LOOP_MODE == 0
emscripten_set_main_loop(Loop, -1, false);
#else
requestAnimationFrameLoopWithJSPI(Loop);
#endif
emscripten_set_main_loop_timing(EM_TIMING_RAF, 1); // Run main loop on each animation frame (vsync)
// Run the first loop

View File

@@ -125,7 +125,6 @@ void WebPlatform::SetThreadAffinityMask(uint64 affinityMask)
// Sleeps for the given amount of milliseconds by forwarding to emscripten_thread_sleep.
void WebPlatform::Sleep(int32 milliseconds)
{
// Alternative based on emscripten_sleep, left disabled
//emscripten_sleep(milliseconds);
emscripten_thread_sleep(milliseconds);
}

View File

@@ -123,7 +123,7 @@ namespace Flax.Build.Platforms
if (options.CompileEnv.FavorSizeOrSpeed == FavorSizeOrSpeed.SmallCode)
args.Add("-Oz");
if (options.CompileEnv.FavorSizeOrSpeed == FavorSizeOrSpeed.FastCode)
args.Add("-O3");
args.Add(debugInformation ? "-O2" : "-O3");
else if (optimization && options.Configuration == TargetConfiguration.Release)
args.Add("-O3");
else if (optimization)
@@ -290,7 +290,9 @@ namespace Flax.Build.Platforms
{
args.Add(string.Format("-o \"{0}\"", outputFilePath.Replace('\\', '/')));
// Debug options
//args.Add("--minify=0");
//args.Add("-sASSERTIONS=2");
AddSharedArgs(args, options, options.LinkEnv.DebugInformation, options.LinkEnv.Optimization);
@@ -307,7 +309,6 @@ namespace Flax.Build.Platforms
initialMemory = Math.Max(initialMemory, 64); // Address Sanitizer needs more memory
args.Add($"-sINITIAL_MEMORY={initialMemory}MB");
args.Add("-sSTACK_SIZE=4MB");
args.Add("-sASYNCIFY_STACK_SIZE=8192");
args.Add("-sALLOW_MEMORY_GROWTH=1");
//args.Add("-sSAFE_HEAP=1");
args.Add("-sABORTING_MALLOC=0");