diff --git a/Source/Editor/Windows/AboutDialog.cs b/Source/Editor/Windows/AboutDialog.cs
index bb320fb87..d50908f28 100644
--- a/Source/Editor/Windows/AboutDialog.cs
+++ b/Source/Editor/Windows/AboutDialog.cs
@@ -131,6 +131,7 @@ namespace FlaxEditor.Windows
                 "LZ4 Library - Copyright (c) Yann Collet. All rights reserved.",
                 "fmt - www.fmtlib.net",
                 "minimp3 - www.github.com/lieff/minimp3",
+                "Tracy Profiler - www.github.com/wolfpld/tracy",
                 "Ogg and Vorbis - Xiph.org Foundation",
                 "OpenAL Soft - www.github.com/kcat/openal-soft",
                 "OpenFBX - www.github.com/nem0/OpenFBX",
diff --git a/Source/Engine/Content/Asset.cpp b/Source/Engine/Content/Asset.cpp
index 11b1bed39..7fcd2f405 100644
--- a/Source/Engine/Content/Asset.cpp
+++ b/Source/Engine/Content/Asset.cpp
@@ -357,7 +357,15 @@ bool Asset::onLoad(LoadAssetTask* task)
     Locker.Lock();
 
     // Load asset
-    const LoadResult result = loadAsset();
+    LoadResult result;
+    {
+#if TRACY_ENABLE
+        ZoneScoped;
+        const StringView name(GetPath());
+        ZoneName(*name, name.Length());
+#endif
+        result = loadAsset();
+    }
     const bool isLoaded = result == LoadResult::Ok;
     const bool failed = !isLoaded;
     _loadFailed = failed;
diff --git a/Source/Engine/Engine/Engine.cpp b/Source/Engine/Engine/Engine.cpp
index 012b3290c..24d691c56 100644
--- a/Source/Engine/Engine/Engine.cpp
+++ b/Source/Engine/Engine/Engine.cpp
@@ -166,6 +166,7 @@ int32 Engine::Main(const Char* cmdLine)
         }
     }
 #endif
 
+    // App paused logic
     if (Platform::GetIsPaused())
     {
@@ -202,6 +203,7 @@ int32 Engine::Main(const Char* cmdLine)
         {
             OnDraw();
             Time::OnEndDraw();
+            FrameMark;
             canDraw = false;
         }
diff --git a/Source/Engine/Platform/Base/PlatformBase.cpp b/Source/Engine/Platform/Base/PlatformBase.cpp
index 751f63dab..f6aa0e2d6 100644
--- a/Source/Engine/Platform/Base/PlatformBase.cpp
+++ b/Source/Engine/Platform/Base/PlatformBase.cpp
@@ -14,7 +14,7 @@
 #include "Engine/Core/Math/Rectangle.h"
 #include "Engine/Core/Utilities.h"
 #if COMPILE_WITH_PROFILER
-#include "Engine/Profiler/ProfilerMemory.h"
+#include "Engine/Profiler/ProfilerCPU.h"
 #endif
 #include "Engine/Threading/Threading.h"
 #include "Engine/Engine/CommandLine.h"
@@ -165,6 +165,44 @@ void PlatformBase::Exit()
 {
 }
 
+#if COMPILE_WITH_PROFILER
+
+void PlatformBase::OnMemoryAlloc(void* ptr, uint64 size)
+{
+    if (!ptr)
+        return;
+
+#if TRACY_ENABLE
+    // Track memory allocation in Tracy
+    //tracy::Profiler::MemAlloc(ptr, size, false);
+    tracy::Profiler::MemAllocCallstack(ptr, size, 12, false);
+#endif
+
+    // Register allocation during the current CPU event
+    auto thread = ProfilerCPU::GetCurrentThread();
+    if (thread != nullptr && thread->Buffer.GetCount() != 0)
+    {
+        auto& activeEvent = thread->Buffer.Last().Event();
+        if (activeEvent.End < ZeroTolerance)
+        {
+            activeEvent.NativeMemoryAllocation += (int32)size;
+        }
+    }
+}
+
+void PlatformBase::OnMemoryFree(void* ptr)
+{
+    if (!ptr)
+        return;
+
+#if TRACY_ENABLE
+    // Track memory free in Tracy
+    tracy::Profiler::MemFree(ptr, false);
+#endif
+}
+
+#endif
+
 void* PlatformBase::AllocatePages(uint64 numPages, uint64 pageSize)
 {
     // Fallback to the default memory allocation
@@ -460,15 +498,6 @@ Vector2 PlatformBase::GetVirtualDesktopSize()
     return Platform::GetVirtualDesktopBounds().Size;
 }
 
-#if COMPILE_WITH_PROFILER
-
-void PlatformBase::TrackAllocation(uint64 size)
-{
-    ProfilerMemory::OnAllocation((uint32)size, false);
-}
-
-#endif
-
 void PlatformBase::GetEnvironmentVariables(Dictionary<String, String>& result)
 {
     // Not supported
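The hooks above replace the old size-only TrackAllocation() (removed further down in the same file): Tracy needs the actual pointer so it can pair each MemAlloc with the later MemFree of the same address. A minimal sketch of the contract every platform allocator now has to follow (hypothetical backend for illustration only; Win32Platform.cpp below is the real instance):

    void* MyPlatform::Allocate(uint64 size, uint64 alignment)
    {
        void* ptr = ::malloc((size_t)size); // stand-in for the platform call; alignment ignored in this sketch
    #if COMPILE_WITH_PROFILER
        OnMemoryAlloc(ptr, size); // report the returned pointer, after allocating
    #endif
        return ptr;
    }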
diff --git a/Source/Engine/Platform/Base/PlatformBase.h b/Source/Engine/Platform/Base/PlatformBase.h
index 0d972f412..a1a857451 100644
--- a/Source/Engine/Platform/Base/PlatformBase.h
+++ b/Source/Engine/Platform/Base/PlatformBase.h
@@ -299,7 +299,8 @@ public:
     static void Prefetch(void const* ptr) = delete;
 
 #if COMPILE_WITH_PROFILER
-    static void TrackAllocation(uint64 size);
+    static void OnMemoryAlloc(void* ptr, uint64 size);
+    static void OnMemoryFree(void* ptr);
 #endif
 
     /// <summary>
diff --git a/Source/Engine/Platform/Base/ThreadBase.cpp b/Source/Engine/Platform/Base/ThreadBase.cpp
index 41f5f0a57..ddcba4f7a 100644
--- a/Source/Engine/Platform/Base/ThreadBase.cpp
+++ b/Source/Engine/Platform/Base/ThreadBase.cpp
@@ -4,6 +4,10 @@
 #include "Engine/Threading/IRunnable.h"
 #include "Engine/Threading/ThreadRegistry.h"
 #include "Engine/Core/Log.h"
+#if TRACY_ENABLE
+#include "Engine/Core/Math/Math.h"
+#include <ThirdParty/tracy/common/TracySystem.hpp>
+#endif
 
 Delegate<Thread*> ThreadBase::ThreadStarting;
 Delegate<Thread*> ThreadBase::ThreadExiting;
@@ -70,6 +74,13 @@ int32 ThreadBase::Run()
     ASSERT(_runnable);
     const auto thread = static_cast<Thread*>(this);
     _id = Platform::GetCurrentThreadID();
+#if TRACY_ENABLE
+    char threadName[100];
+    const int32 threadNameLength = Math::Min(ARRAY_COUNT(threadName) - 1, _name.Length());
+    StringUtils::ConvertUTF162ANSI(*_name, threadName, threadNameLength);
+    threadName[threadNameLength] = 0;
+    tracy::SetThreadName(threadName);
+#endif
     ThreadRegistry::Add(thread);
     ThreadStarting(thread);
     int32 exitCode = 1;
diff --git a/Source/Engine/Platform/Win32/Win32Platform.cpp b/Source/Engine/Platform/Win32/Win32Platform.cpp
index a342b3c2c..3bf261fa6 100644
--- a/Source/Engine/Platform/Win32/Win32Platform.cpp
+++ b/Source/Engine/Platform/Win32/Win32Platform.cpp
@@ -305,14 +305,18 @@ void Win32Platform::Prefetch(void const* ptr)
 
 void* Win32Platform::Allocate(uint64 size, uint64 alignment)
 {
+    void* ptr = _aligned_malloc((size_t)size, (size_t)alignment);
 #if COMPILE_WITH_PROFILER
-    TrackAllocation(size);
+    OnMemoryAlloc(ptr, size);
 #endif
-    return _aligned_malloc((size_t)size, (size_t)alignment);
+    return ptr;
 }
 
 void Win32Platform::Free(void* ptr)
 {
+#if COMPILE_WITH_PROFILER
+    OnMemoryFree(ptr);
+#endif
     _aligned_free(ptr);
 }
 
diff --git a/Source/Engine/Platform/Windows/WindowsPlatform.cpp b/Source/Engine/Platform/Windows/WindowsPlatform.cpp
index baf724421..00a5c0d0f 100644
--- a/Source/Engine/Platform/Windows/WindowsPlatform.cpp
+++ b/Source/Engine/Platform/Windows/WindowsPlatform.cpp
@@ -37,9 +37,30 @@ namespace
     int32 SystemDpi = 96;
 #if CRASH_LOG_ENABLE
     CriticalSection SymLocker;
+#if TRACY_ENABLE
+    bool SymInitialized = true;
+#else
     bool SymInitialized = false;
-    bool SymModulesDirty = true;
+#endif
     Array<String> SymbolsPath;
+
+    void OnSymbolsPathModified()
+    {
+        if (!SymInitialized)
+            return;
+        HANDLE process = GetCurrentProcess();
+        SymCleanup(process);
+        String symbolSearchPath;
+        for (auto& path : SymbolsPath)
+        {
+            symbolSearchPath += path;
+            symbolSearchPath += ";";
+        }
+        symbolSearchPath += Platform::GetWorkingDirectory();
+        SymInitializeW(process, *symbolSearchPath, TRUE);
+        //SymSetSearchPathW(process, *symbolSearchPath);
+        //SymRefreshModuleList(process);
+    }
 #endif
 }
 
@@ -378,6 +399,20 @@ void WindowsPlatform::PreInit(void* hInstance)
         Error(TEXT("OLE initalization failed!"));
         exit(-1);
     }
+
+#if CRASH_LOG_ENABLE
+    TCHAR buffer[MAX_PATH] = { 0 };
+    SymLocker.Lock();
+    if (::GetModuleFileNameW(::GetModuleHandleW(nullptr), buffer, MAX_PATH))
+        SymbolsPath.Add(StringUtils::GetDirectoryName(buffer));
+    if (::GetEnvironmentVariableW(TEXT("_NT_SYMBOL_PATH"), buffer, MAX_PATH))
+        SymbolsPath.Add(StringUtils::GetDirectoryName(buffer));
+    DWORD options = SymGetOptions();
+    options |= SYMOPT_LOAD_LINES | SYMOPT_FAIL_CRITICAL_ERRORS | SYMOPT_DEFERRED_LOADS | SYMOPT_EXACT_SYMBOLS;
+    SymSetOptions(options);
+    OnSymbolsPathModified();
+    SymLocker.Unlock();
+#endif
 }
 
 bool WindowsPlatform::IsWindows10()
@@ -604,11 +639,13 @@ void WindowsPlatform::Exit()
 {
 #if CRASH_LOG_ENABLE
     SymLocker.Lock();
+#if !TRACY_ENABLE
     if (SymInitialized)
     {
         SymInitialized = false;
         SymCleanup(GetCurrentProcess());
     }
+#endif
     SymbolsPath.Resize(0);
     SymLocker.Unlock();
 #endif
@@ -650,25 +687,20 @@ void WindowsPlatform::SetHighDpiAwarenessEnabled(bool enable)
     const HMODULE shCoreDll = LoadLibraryW(L"Shcore.dll");
     if (!shCoreDll)
         return;
-
     typedef enum _PROCESS_DPI_AWARENESS
     {
         PROCESS_DPI_UNAWARE = 0,
         PROCESS_SYSTEM_DPI_AWARE = 1,
         PROCESS_PER_MONITOR_DPI_AWARE = 2
     } PROCESS_DPI_AWARENESS;
-
     typedef HRESULT (STDAPICALLTYPE *SetProcessDpiAwarenessProc)(PROCESS_DPI_AWARENESS Value);
     const SetProcessDpiAwarenessProc setProcessDpiAwareness = (SetProcessDpiAwarenessProc)GetProcAddress(shCoreDll, "SetProcessDpiAwareness");
-
     if (setProcessDpiAwareness)
     {
         setProcessDpiAwareness(enable ? PROCESS_PER_MONITOR_DPI_AWARE : PROCESS_DPI_UNAWARE);
     }
-
     SystemDpi = CalculateDpi(shCoreDll);
-
-    FreeLibrary(shCoreDll);
+    ::FreeLibrary(shCoreDll);
 }
 
 BatteryInfo WindowsPlatform::GetBatteryInfo()
@@ -1108,10 +1140,9 @@ void* WindowsPlatform::LoadLibrary(const Char* filename)
     SymLocker.Lock();
     const auto folder = StringUtils::GetDirectoryName(filename);
     if (!SymbolsPath.Contains(folder))
-        SymbolsPath.Add(folder);
-    if (SymInitialized)
     {
-        SymModulesDirty = true;
+        SymbolsPath.Add(folder);
+        OnSymbolsPathModified();
     }
     SymLocker.Unlock();
 #endif
@@ -1131,46 +1162,16 @@ Array<PlatformBase::StackFrame> WindowsPlatform::GetStackFrames(int32 skipCount,
     if (!SymInitialized)
     {
         SymInitialized = true;
-
-        // Build search path
         String symbolSearchPath;
-        TCHAR ModulePath[MAX_PATH] = { 0 };
-        if (::GetModuleFileName(::GetModuleHandle(nullptr), ModulePath, MAX_PATH))
-        {
-            symbolSearchPath += StringUtils::GetDirectoryName(ModulePath);
-            symbolSearchPath += ";";
-        }
         for (auto& path : SymbolsPath)
         {
             symbolSearchPath += path;
             symbolSearchPath += ";";
         }
-        String _NT_SYMBOL_PATH;
-        if (!Platform::GetEnvironmentVariable(TEXT("_NT_SYMBOL_PATH"), _NT_SYMBOL_PATH))
-        {
-            symbolSearchPath += _NT_SYMBOL_PATH;
-            symbolSearchPath += ";";
-        }
         symbolSearchPath += Platform::GetWorkingDirectory();
-        symbolSearchPath += ";";
-
-        DWORD options = SymGetOptions();
-        options |= SYMOPT_LOAD_LINES;
-        options |= SYMOPT_FAIL_CRITICAL_ERRORS;
-        options |= SYMOPT_DEFERRED_LOADS;
-        options |= SYMOPT_EXACT_SYMBOLS;
-        SymSetOptions(options);
-
         SymInitializeW(process, *symbolSearchPath, TRUE);
     }
 
-    // Refresh modules if needed
-    if (SymModulesDirty)
-    {
-        SymModulesDirty = false;
-        SymRefreshModuleList(process);
-    }
-
     // Capture the context if missing
     /*EXCEPTION_POINTERS exceptionPointers;
     CONTEXT contextData;
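Two details in the WindowsPlatform.cpp change are easy to miss: with TRACY_ENABLE, SymInitialized starts out true because Tracy's own InitCallstack() (in TracyCallstack.cpp below) calls SymInitialize for the process, so the engine must neither initialize dbghelp a second time nor SymCleanup it at exit; and because dbghelp is not thread-safe, every symbol-path update stays serialized behind SymLocker. The locking pattern, as a sketch mirroring the LoadLibrary hunk above:

    SymLocker.Lock();
    SymbolsPath.Add(folder);
    OnSymbolsPathModified(); // SymCleanup + SymInitializeW with the rebuilt search path
    SymLocker.Unlock();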
diff --git a/Source/Engine/Profiler/Profiler.Build.cs b/Source/Engine/Profiler/Profiler.Build.cs
index aeda888e5..289781271 100644
--- a/Source/Engine/Profiler/Profiler.Build.cs
+++ b/Source/Engine/Profiler/Profiler.Build.cs
@@ -27,5 +27,13 @@ public class Profiler : EngineModule
         options.PrivateDependencies.Clear();
 
         options.PublicDefinitions.Add("COMPILE_WITH_PROFILER");
+
+        // Tracy profiling tools
+        switch (options.Platform.Target)
+        {
+            case TargetPlatform.Windows:
+                options.PublicDependencies.Add("tracy");
+                break;
+        }
     }
 }
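For now the tracy module is linked on Windows only; other targets keep the plain Flax profiler. The ProfilerCPU.h change below then pairs every existing Flax CPU scope with a Tracy zone, so current call sites light up in Tracy without edits. Usage sketch (hypothetical function, illustrating the macros as defined below):

    void DrawScene()
    {
        PROFILE_CPU(); // one Flax ScopeProfileBlockCPU plus one Tracy zone
        {
            PROFILE_CPU_NAMED("Culling"); // named sub-zone, mirrored into Tracy too
            // ... culling work ...
        }
    }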
diff --git a/Source/Engine/Profiler/ProfilerCPU.h b/Source/Engine/Profiler/ProfilerCPU.h
index 88ec738bf..d5cd55f07 100644
--- a/Source/Engine/Profiler/ProfilerCPU.h
+++ b/Source/Engine/Profiler/ProfilerCPU.h
@@ -8,6 +8,7 @@
 #include "Engine/Core/Collections/Array.h"
 #include "Engine/Core/Math/Math.h"
 #include "Engine/Scripting/ScriptingType.h"
+#include <ThirdParty/tracy/Tracy.h>
 
 #if COMPILE_WITH_PROFILER
 
@@ -393,12 +394,22 @@ struct TIsPODType
 };
 
 // Shortcut macros for profiling a single code block execution on CPU
-#define PROFILE_CPU_NAMED(name) ScopeProfileBlockCPU ProfileBlockCPU(TEXT(name))
+// Use ZoneTransient for Tracy for code that can be hot-reloaded (e.g. in Editor)
+
+#if USE_EDITOR
+#define PROFILE_CPU_NAMED(name) ZoneTransientN(___tracy_scoped_zone, name, true); ScopeProfileBlockCPU ProfileBlockCPU(TEXT(name))
+#else
+#define PROFILE_CPU_NAMED(name) ZoneNamedN(___tracy_scoped_zone, name, true); ScopeProfileBlockCPU ProfileBlockCPU(TEXT(name))
+#endif
+
 #if defined(_MSC_VER)
-#define PROFILE_CPU() ScopeProfileBlockCPU ProfileBlockCPU(TEXT(__FUNCTION__))
+#if USE_EDITOR
+#define PROFILE_CPU() ZoneTransient(___tracy_scoped_zone, true); ScopeProfileBlockCPU ProfileBlockCPU(TEXT(__FUNCTION__))
 #else
-#define PROFILE_CPU() \
+#define PROFILE_CPU() ZoneNamed(___tracy_scoped_zone, true); ScopeProfileBlockCPU ProfileBlockCPU(TEXT(__FUNCTION__))
+#endif
+#else
+#define PROFILE_CPU() ZoneTransient(___tracy_scoped_zone, true); \
     const char* _functionName = __FUNCTION__; \
     const int32 _functionNameLength = ARRAY_COUNT(__FUNCTION__); \
     Char _functionNameBuffer[_functionNameLength + 1]; \
diff --git a/Source/Engine/Profiler/ProfilerMemory.cpp b/Source/Engine/Profiler/ProfilerMemory.cpp
deleted file mode 100644
index ae8bff067..000000000
--- a/Source/Engine/Profiler/ProfilerMemory.cpp
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (c) 2012-2021 Wojciech Figat. All rights reserved.
-
-#if COMPILE_WITH_PROFILER
-
-#include "ProfilerMemory.h"
-#include "ProfilerCPU.h"
-
-void ProfilerMemory::OnAllocation(int32 bytes, bool isGC)
-{
-    // Register allocation during the current CPU event
-    auto thread = ProfilerCPU::GetCurrentThread();
-    if (thread != nullptr && thread->Buffer.GetCount() != 0)
-    {
-        auto& activeEvent = thread->Buffer.Last().Event();
-        if (activeEvent.End < ZeroTolerance)
-        {
-            if (isGC)
-                activeEvent.ManagedMemoryAllocation += bytes;
-            else
-                activeEvent.NativeMemoryAllocation += bytes;
-        }
-    }
-}
-
-#endif
diff --git a/Source/Engine/Profiler/ProfilerMemory.h b/Source/Engine/Profiler/ProfilerMemory.h
deleted file mode 100644
index a3d0098f7..000000000
--- a/Source/Engine/Profiler/ProfilerMemory.h
+++ /dev/null
@@ -1,24 +0,0 @@
-// Copyright (c) 2012-2021 Wojciech Figat. All rights reserved.
-
-#pragma once
-
-#include "Engine/Core/Types/BaseTypes.h"
-
-#if COMPILE_WITH_PROFILER
-
-/// <summary>
-/// Provides memory allocations measuring methods.
-/// </summary>
-class FLAXENGINE_API ProfilerMemory
-{
-public:
-
-    /// <summary>
-    /// Called on memory allocation.
-    /// </summary>
-    /// <param name="bytes">The allocated bytes count.</param>
-    /// <param name="isGC">True if allocation comes from the Garbage Collector, otherwise false.</param>
-    static void OnAllocation(int32 bytes, bool isGC);
-};
-
-#endif
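ProfilerMemory was only a thin forwarder, so its body now lives inline at both call sites: native allocations in PlatformBase::OnMemoryAlloc above, managed ones in MCore.Mono.cpp below. Both copies follow the same pattern — charge the bytes to the innermost CPU event that is still open (End < ZeroTolerance):

    auto thread = ProfilerCPU::GetCurrentThread();
    if (thread != nullptr && thread->Buffer.GetCount() != 0)
    {
        auto& activeEvent = thread->Buffer.Last().Event();
        if (activeEvent.End < ZeroTolerance)
            activeEvent.NativeMemoryAllocation += (int32)size; // ManagedMemoryAllocation for GC allocations
    }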
diff --git a/Source/Engine/Scripting/ManagedCLR/MCore.Mono.cpp b/Source/Engine/Scripting/ManagedCLR/MCore.Mono.cpp
index 289378ffa..95d66706f 100644
--- a/Source/Engine/Scripting/ManagedCLR/MCore.Mono.cpp
+++ b/Source/Engine/Scripting/ManagedCLR/MCore.Mono.cpp
@@ -15,7 +15,6 @@
 #include "Engine/Threading/Threading.h"
 #include "Engine/Platform/Thread.h"
 #include "Engine/Scripting/MException.h"
-#include "Engine/Profiler/ProfilerMemory.h"
 #include "Engine/Profiler/ProfilerCPU.h"
 #include
 #include
@@ -182,7 +181,16 @@ void OnGCAllocation(MonoProfiler* profiler, MonoObject* obj)
 #endif
 
 #if COMPILE_WITH_PROFILER
-    ProfilerMemory::OnAllocation(size, true);
+    // Register allocation during the current CPU event
+    auto thread = ProfilerCPU::GetCurrentThread();
+    if (thread != nullptr && thread->Buffer.GetCount() != 0)
+    {
+        auto& activeEvent = thread->Buffer.Last().Event();
+        if (activeEvent.End < ZeroTolerance)
+        {
+            activeEvent.ManagedMemoryAllocation += size;
+        }
+    }
 #endif
 }
diff --git a/Source/ThirdParty/tracy/LICENSE b/Source/ThirdParty/tracy/LICENSE
new file mode 100644
index 000000000..c2a76e56c
--- /dev/null
+++ b/Source/ThirdParty/tracy/LICENSE
@@ -0,0 +1,27 @@
+Tracy Profiler (https://github.com/wolfpld/tracy) is licensed under the
+3-clause BSD license.
+
+Copyright (c) 2017-2021, Bartosz Taudul
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+  * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the distribution.
+  * Neither the name of the <organization> nor the
+    names of its contributors may be used to endorse or promote products
+    derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
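The files that follow vendor the Tracy client itself. Tracy.h is the public macro surface: with TRACY_ENABLE undefined, every macro compiles away to nothing, so call sites carry no cost in non-profiling builds. Typical client-side usage (a sketch assuming TRACY_ENABLE and a hypothetical game loop):

    #include <ThirdParty/tracy/Tracy.h>

    void UpdateGame()
    {
        ZoneScopedN("UpdateGame");            // named zone on the Tracy timeline
        TracyPlot("Entities", (int64_t)1024); // plot a value over time
        TracyMessageL("update done");         // literal string message
    }
    // and once per frame, at the end of the main loop (as Engine.cpp does above):
    // FrameMark;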
diff --git a/Source/ThirdParty/tracy/Tracy.h b/Source/ThirdParty/tracy/Tracy.h
new file mode 100644
index 000000000..cb115740f
--- /dev/null
+++ b/Source/ThirdParty/tracy/Tracy.h
@@ -0,0 +1,252 @@
+#ifndef __TRACY_HPP__
+#define __TRACY_HPP__
+
+#include "common/TracySystem.hpp"
+
+#ifndef TRACY_ENABLE
+
+#define ZoneNamed(x,y)
+#define ZoneNamedN(x,y,z)
+#define ZoneNamedC(x,y,z)
+#define ZoneNamedNC(x,y,z,w)
+
+#define ZoneTransient(x,y)
+#define ZoneTransientN(x,y,z)
+
+#define ZoneScoped
+#define ZoneScopedN(x)
+#define ZoneScopedC(x)
+#define ZoneScopedNC(x,y)
+
+#define ZoneText(x,y)
+#define ZoneTextV(x,y,z)
+#define ZoneName(x,y)
+#define ZoneNameV(x,y,z)
+#define ZoneColor(x)
+#define ZoneColorV(x,y)
+#define ZoneValue(x)
+#define ZoneValueV(x,y)
+
+#define FrameMark
+#define FrameMarkNamed(x)
+
+#define TracyPlot(x,y)
+#define TracyPlotConfig(x,y)
+
+#define TracyMessage(x,y)
+#define TracyMessageL(x)
+#define TracyMessageC(x,y,z)
+#define TracyMessageLC(x,y)
+#define TracyAppInfo(x,y)
+
+#define TracyAlloc(x,y)
+#define TracyFree(x)
+#define TracySecureAlloc(x,y)
+#define TracySecureFree(x)
+
+#define TracyAllocN(x,y,z)
+#define TracyFreeN(x,y)
+#define TracySecureAllocN(x,y,z)
+#define TracySecureFreeN(x,y)
+
+#define ZoneNamedS(x,y,z)
+#define ZoneNamedNS(x,y,z,w)
+#define ZoneNamedCS(x,y,z,w)
+#define ZoneNamedNCS(x,y,z,w,a)
+
+#define ZoneTransientS(x,y,z)
+#define ZoneTransientNS(x,y,z,w)
+
+#define ZoneScopedS(x)
+#define ZoneScopedNS(x,y)
+#define ZoneScopedCS(x,y)
+#define ZoneScopedNCS(x,y,z)
+
+#define TracyAllocS(x,y,z)
+#define TracyFreeS(x,y)
+#define TracySecureAllocS(x,y,z)
+#define TracySecureFreeS(x,y)
+
+#define TracyAllocNS(x,y,z,w)
+#define TracyFreeNS(x,y,z)
+#define TracySecureAllocNS(x,y,z,w)
+#define TracySecureFreeNS(x,y,z)
+
+#define TracyMessageS(x,y,z)
+#define TracyMessageLS(x,y)
+#define TracyMessageCS(x,y,z,w)
+#define TracyMessageLCS(x,y,z)
+
+#define TracyParameterRegister(x)
+#define TracyParameterSetup(x,y,z,w)
+
+#else
+
+#include <string.h>
+
+#include "client/TracyCallstack.h"
+
+namespace tracy
+{
+class TRACY_API Profiler
+{
+public:
+    static void SendFrameMark( const char* name );
+    static void PlotData( const char* name, int64_t val );
+    static void PlotData( const char* name, float val );
+    static void PlotData( const char* name, double val );
+    static void ConfigurePlot( const char* name, PlotFormatType type );
+    static void Message( const char* txt, size_t size, int callstack );
+    static void Message( const char* txt, int callstack );
+    static void MessageColor( const char* txt, size_t size, uint32_t color, int callstack );
+    static void MessageColor( const char* txt, uint32_t color, int callstack );
+    static void MessageAppInfo( const char* txt, size_t size );
+    static void MemAlloc( const void* ptr, size_t size, bool secure );
+    static void MemFree( const void* ptr, bool secure );
+    static void MemAllocCallstack( const void* ptr, size_t size, int depth, bool secure );
+    static void MemFreeCallstack( const void* ptr, int depth, bool secure );
+    static void MemAllocNamed( const void* ptr, size_t size, bool secure, const char* name );
+    static void MemFreeNamed( const void* ptr, bool secure, const char* name );
+    static void MemAllocCallstackNamed( const void* ptr, size_t size, int depth, bool secure, const char* name );
+    static void MemFreeCallstackNamed( const void* ptr, int depth, bool secure, const char* name );
+    static void ParameterRegister( ParameterCallback cb );
+    static void ParameterSetup( uint32_t idx, const char* name, bool isBool, int32_t val );
+};
+}
+
+#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK
+# define ZoneNamed( varname, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active );
+# define ZoneNamedN( varname, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active );
+# define ZoneNamedC( varname, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active );
+# define ZoneNamedNC( varname, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active );
+
+# define ZoneTransient( varname, active ) tracy::ScopedZone varname( __LINE__, __FILE__, strlen( __FILE__ ), __FUNCTION__, strlen( __FUNCTION__ ), nullptr, 0, TRACY_CALLSTACK, active );
+# define ZoneTransientN( varname, name, active ) tracy::ScopedZone varname( __LINE__, __FILE__, strlen( __FILE__ ), __FUNCTION__, strlen( __FUNCTION__ ), name, strlen( name ), TRACY_CALLSTACK, active );
+#else
+# define ZoneNamed( varname, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), active );
+# define ZoneNamedN( varname, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), active );
+# define ZoneNamedC( varname, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), active );
+# define ZoneNamedNC( varname, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), active );
+
+# define ZoneTransient( varname, active ) tracy::ScopedZone varname( __LINE__, __FILE__, strlen( __FILE__ ), __FUNCTION__, strlen( __FUNCTION__ ), nullptr, 0, active );
+# define ZoneTransientN( varname, name, active ) tracy::ScopedZone varname( __LINE__, __FILE__, strlen( __FILE__ ), __FUNCTION__, strlen( __FUNCTION__ ), name, strlen( name ), active );
+#endif
+
+#define ZoneScoped ZoneNamed( ___tracy_scoped_zone, true )
+#define ZoneScopedN( name ) ZoneNamedN( ___tracy_scoped_zone, name, true )
+#define ZoneScopedC( color ) ZoneNamedC( ___tracy_scoped_zone, color, true )
+#define ZoneScopedNC( name, color ) ZoneNamedNC( ___tracy_scoped_zone, name, color, true )
+
+#define ZoneText( txt, size ) ___tracy_scoped_zone.Text( txt, size );
+#define ZoneTextV( varname, txt, size ) varname.Text( txt, size );
+#define ZoneName( txt, size ) ___tracy_scoped_zone.Name( txt, size );
+#define ZoneNameV( varname, txt, size ) varname.Name( txt, size );
+#define ZoneColor( color ) ___tracy_scoped_zone.Color( color );
+#define ZoneColorV( varname, color ) varname.Color( color );
+#define ZoneValue( value ) ___tracy_scoped_zone.Value( value );
+#define ZoneValueV( varname, value ) varname.Value( value );
+
+#define FrameMark tracy::Profiler::SendFrameMark( nullptr );
+#define FrameMarkNamed( name ) tracy::Profiler::SendFrameMark( name );
+
+#define TracyPlot( name, val ) tracy::Profiler::PlotData( name, val );
+#define TracyPlotConfig( name, type ) tracy::Profiler::ConfigurePlot( name, type );
+
+#define TracyAppInfo( txt, size ) tracy::Profiler::MessageAppInfo( txt, size );
+
+#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK
+# define TracyMessage( txt, size ) tracy::Profiler::Message( txt, size, TRACY_CALLSTACK );
+# define TracyMessageL( txt ) tracy::Profiler::Message( txt, TRACY_CALLSTACK );
+# define TracyMessageC( txt, size, color ) tracy::Profiler::MessageColor( txt, size, color, TRACY_CALLSTACK );
+# define TracyMessageLC( txt, color ) tracy::Profiler::MessageColor( txt, color, TRACY_CALLSTACK );
+
+# define TracyAlloc( ptr, size ) tracy::Profiler::MemAllocCallstack( ptr, size, TRACY_CALLSTACK, false );
+# define TracyFree( ptr ) tracy::Profiler::MemFreeCallstack( ptr, TRACY_CALLSTACK, false );
+# define TracySecureAlloc( ptr, size ) tracy::Profiler::MemAllocCallstack( ptr, size, TRACY_CALLSTACK, true );
+# define TracySecureFree( ptr ) tracy::Profiler::MemFreeCallstack( ptr, TRACY_CALLSTACK, true );
+
+# define TracyAllocN( ptr, size, name ) tracy::Profiler::MemAllocCallstackNamed( ptr, size, TRACY_CALLSTACK, false, name );
+# define TracyFreeN( ptr, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, TRACY_CALLSTACK, false, name );
+# define TracySecureAllocN( ptr, size, name ) tracy::Profiler::MemAllocCallstackNamed( ptr, size, TRACY_CALLSTACK, true, name );
+# define TracySecureFreeN( ptr, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, TRACY_CALLSTACK, true, name );
+#else
+# define TracyMessage( txt, size ) tracy::Profiler::Message( txt, size, 0 );
+# define TracyMessageL( txt ) tracy::Profiler::Message( txt, 0 );
+# define TracyMessageC( txt, size, color ) tracy::Profiler::MessageColor( txt, size, color, 0 );
+# define TracyMessageLC( txt, color ) tracy::Profiler::MessageColor( txt, color, 0 );
+
+# define TracyAlloc( ptr, size ) tracy::Profiler::MemAlloc( ptr, size, false );
+# define TracyFree( ptr ) tracy::Profiler::MemFree( ptr, false );
+# define TracySecureAlloc( ptr, size ) tracy::Profiler::MemAlloc( ptr, size, true );
+# define TracySecureFree( ptr ) tracy::Profiler::MemFree( ptr, true );
+
+# define TracyAllocN( ptr, size, name ) tracy::Profiler::MemAllocNamed( ptr, size, false, name );
+# define TracyFreeN( ptr, name ) tracy::Profiler::MemFreeNamed( ptr, false, name );
+# define TracySecureAllocN( ptr, size, name ) tracy::Profiler::MemAllocNamed( ptr, size, true, name );
+# define TracySecureFreeN( ptr, name ) tracy::Profiler::MemFreeNamed( ptr, true, name );
+#endif
+
+#ifdef TRACY_HAS_CALLSTACK
+# define ZoneNamedS( varname, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), depth, active );
+# define ZoneNamedNS( varname, name, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), depth, active );
+# define ZoneNamedCS( varname, color, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), depth, active );
+# define ZoneNamedNCS( varname, name, color, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), depth, active );
+
+# define ZoneTransientS( varname, depth, active ) tracy::ScopedZone varname( __LINE__, __FILE__, strlen( __FILE__ ), __FUNCTION__, strlen( __FUNCTION__ ), nullptr, 0, depth, active );
+# define ZoneTransientNS( varname, name, depth, active ) tracy::ScopedZone varname( __LINE__, __FILE__, strlen( __FILE__ ), __FUNCTION__, strlen( __FUNCTION__ ), name, strlen( name ), depth, active );
+
+# define ZoneScopedS( depth ) ZoneNamedS( ___tracy_scoped_zone, depth, true )
+# define ZoneScopedNS( name, depth ) ZoneNamedNS( ___tracy_scoped_zone, name, depth, true )
+# define ZoneScopedCS( color, depth ) ZoneNamedCS( ___tracy_scoped_zone, color, depth, true )
+# define ZoneScopedNCS( name, color, depth ) ZoneNamedNCS( ___tracy_scoped_zone, name, color, depth, true )
+
+# define TracyAllocS( ptr, size, depth ) tracy::Profiler::MemAllocCallstack( ptr, size, depth, false );
+# define TracyFreeS( ptr, depth ) tracy::Profiler::MemFreeCallstack( ptr, depth, false );
+# define TracySecureAllocS( ptr, size, depth ) tracy::Profiler::MemAllocCallstack( ptr, size, depth, true );
+# define TracySecureFreeS( ptr, depth ) tracy::Profiler::MemFreeCallstack( ptr, depth, true );
+
+# define TracyAllocNS( ptr, size, depth, name ) tracy::Profiler::MemAllocCallstackNamed( ptr, size, depth, false, name );
+# define TracyFreeNS( ptr, depth, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, depth, false, name );
+# define TracySecureAllocNS( ptr, size, depth, name ) tracy::Profiler::MemAllocCallstackNamed( ptr, size, depth, true, name );
+# define TracySecureFreeNS( ptr, depth, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, depth, true, name );
+
+# define TracyMessageS( txt, size, depth ) tracy::Profiler::Message( txt, size, depth );
+# define TracyMessageLS( txt, depth ) tracy::Profiler::Message( txt, depth );
+# define TracyMessageCS( txt, size, color, depth ) tracy::Profiler::MessageColor( txt, size, color, depth );
+# define TracyMessageLCS( txt, color, depth ) tracy::Profiler::MessageColor( txt, color, depth );
+#else
+# define ZoneNamedS( varname, depth, active ) ZoneNamed( varname, active )
+# define ZoneNamedNS( varname, name, depth, active ) ZoneNamedN( varname, name, active )
+# define ZoneNamedCS( varname, color, depth, active ) ZoneNamedC( varname, color, active )
+# define ZoneNamedNCS( varname, name, color, depth, active ) ZoneNamedNC( varname, name, color, active )
+
+# define ZoneTransientS( varname, depth, active ) ZoneTransient( varname, active )
+# define ZoneTransientNS( varname, name, depth, active ) ZoneTransientN( varname, name, active )
+
+# define ZoneScopedS( depth ) ZoneScoped
+# define ZoneScopedNS( name, depth ) ZoneScopedN( name )
+# define ZoneScopedCS( color, depth ) ZoneScopedC( color )
+# define ZoneScopedNCS( name, color, depth ) ZoneScopedNC( name, color )
+
+# define TracyAllocS( ptr, size, depth ) TracyAlloc( ptr, size )
+# define TracyFreeS( ptr, depth ) TracyFree( ptr )
+# define TracySecureAllocS( ptr, size, depth ) TracySecureAlloc( ptr, size )
+# define TracySecureFreeS( ptr, depth ) TracySecureFree( ptr )
+
+# define TracyAllocNS( ptr, size, depth, name ) TracyAllocN( ptr, size, name )
+# define TracyFreeNS( ptr, depth, name ) TracyFreeN( ptr, name )
+# define TracySecureAllocNS( ptr, size, depth, name ) TracySecureAllocN( ptr, size, name )
+# define TracySecureFreeNS( ptr, depth, name ) TracySecureFreeN( ptr, name )
+
+# define TracyMessageS( txt, size, depth ) TracyMessage( txt, size )
+# define TracyMessageLS( txt, depth ) TracyMessageL( txt )
+# define TracyMessageCS( txt, size, color, depth ) TracyMessageC( txt, size, color )
+# define TracyMessageLCS( txt, color, depth ) TracyMessageLC( txt, color )
+#endif
+
+#define TracyParameterRegister( cb ) tracy::Profiler::ParameterRegister( cb );
+#define TracyParameterSetup( idx, name, isBool, val ) tracy::Profiler::ParameterSetup( idx, name, isBool, val );
+
+#endif
+
+#endif
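Why the editor build prefers ZoneTransient (see the PROFILE_CPU macros earlier): ZoneNamed bakes a static constexpr SourceLocationData into the calling module, and that record dangles if the module is unloaded by a hot-reload, while ZoneTransient hands the file/function strings over at runtime and so survives reloads at a small per-zone cost. Abridged from the definitions above:

    // ZoneNamed: static source-location record, cheapest, tied to the module lifetime
    static constexpr tracy::SourceLocationData __loc { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 };
    tracy::ScopedZone zone1( &__loc, true );

    // ZoneTransient: strings are measured and copied at runtime, hot-reload safe
    tracy::ScopedZone zone2( __LINE__, __FILE__, strlen( __FILE__ ), __FUNCTION__, strlen( __FUNCTION__ ), nullptr, 0, true );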
diff --git a/Source/ThirdParty/tracy/TracyClient.cpp b/Source/ThirdParty/tracy/TracyClient.cpp
new file mode 100644
index 000000000..dd56765c1
--- /dev/null
+++ b/Source/ThirdParty/tracy/TracyClient.cpp
@@ -0,0 +1,53 @@
+//
+//          Tracy profiler
+//          ----------------
+//
+// For fast integration, compile and
+// link with this source file (and none
+// other) in your executable (or in the
+// main DLL / shared object on multi-DLL
+// projects).
+//
+
+// Define TRACY_ENABLE to enable profiler.
+
+#include "common/TracySystem.cpp"
+
+#ifdef TRACY_ENABLE
+
+#ifdef _MSC_VER
+#  pragma warning(push, 0)
+#endif
+
+#include
+#include "client/TracyProfiler.cpp"
+#include "client/TracyCallstack.cpp"
+#include "client/TracySysTime.cpp"
+#include "client/TracySysTrace.cpp"
+#include "common/TracySocket.cpp"
+#include "client/tracy_rpmalloc.cpp"
+
+#if TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 3 || TRACY_HAS_CALLSTACK == 4 || TRACY_HAS_CALLSTACK == 6
+#  include "libbacktrace/alloc.cpp"
+#  include "libbacktrace/dwarf.cpp"
+#  include "libbacktrace/fileline.cpp"
+#  include "libbacktrace/mmapio.cpp"
+#  include "libbacktrace/posix.cpp"
+#  include "libbacktrace/sort.cpp"
+#  include "libbacktrace/state.cpp"
+#  if TRACY_HAS_CALLSTACK == 4
+#    include "libbacktrace/macho.cpp"
+#  else
+#    include "libbacktrace/elf.cpp"
+#  endif
+#endif
+
+#ifdef _MSC_VER
+#  pragma comment(lib, "ws2_32.lib")
+#  pragma comment(lib, "dbghelp.lib")
+#  pragma comment(lib, "advapi32.lib")
+#  pragma comment(lib, "user32.lib")
+#  pragma warning(pop)
+#endif
+
+#endif
diff --git a/Source/ThirdParty/tracy/client/TracyArmCpuTable.hpp b/Source/ThirdParty/tracy/client/TracyArmCpuTable.hpp
new file mode 100644
index 000000000..ff7d976c8
--- /dev/null
+++ b/Source/ThirdParty/tracy/client/TracyArmCpuTable.hpp
@@ -0,0 +1,349 @@
+namespace tracy
+{
+
+#if defined __linux__ && defined __ARM_ARCH
+
+static const char* DecodeArmImplementer( uint32_t v )
+{
+    static char buf[16];
+    switch( v )
+    {
+    case 0x41: return "ARM";
+    case 0x42: return "Broadcom";
+    case 0x43: return "Cavium";
+    case 0x44: return "DEC";
+    case 0x46: return "Fujitsu";
+    case 0x48: return "HiSilicon";
+    case 0x49: return "Infineon";
+    case 0x4d: return "Motorola";
+    case 0x4e: return "Nvidia";
+    case 0x50: return "Applied Micro";
+    case 0x51: return "Qualcomm";
+    case 0x53: return "Samsung";
+    case 0x54: return "Texas Instruments";
+    case 0x56: return "Marvell";
+    case 0x61: return "Apple";
+    case 0x66: return "Faraday";
+    case 0x68: return "HXT";
+    case 0x69: return "Intel";
+    case 0xc0: return "Ampere Computing";
+    default: break;
+    }
+    sprintf( buf, "0x%x", v );
+    return buf;
+}
+
+static const char* DecodeArmPart( uint32_t impl, uint32_t part )
+{
+    static char buf[16];
+    switch( impl )
+    {
+    case 0x41:
+        switch( part )
+        {
+        case 0x810: return "810";
+        case 0x920: return "920";
+        case 0x922: return "922";
+        case 0x926: return "926";
+        case 0x940: return "940";
+        case 0x946: return "946";
+        case 0x966: return "966";
+        case 0xa20: return "1020";
+        case 0xa22: return "1022";
+        case 0xa26: return "1026";
+        case 0xb02: return "11 MPCore";
+        case 0xb36: return "1136";
+        case 0xb56: return "1156";
+        case 0xb76: return "1176";
+        case 0xc05: return " Cortex-A5";
+        case 0xc07: return " Cortex-A7";
+        case 0xc08: return " Cortex-A8";
+        case 0xc09: return " Cortex-A9";
+        case 0xc0c: return " Cortex-A12";
+        case 0xc0d: return " Rockchip RK3288";
+        case 0xc0f: return " Cortex-A15";
+        case 0xc0e: return " Cortex-A17";
+        case 0xc14: return " Cortex-R4";
+        case 0xc15: return " Cortex-R5";
+        case 0xc17: return " Cortex-R7";
+        case 0xc18: return " Cortex-R8";
+        case 0xc20: return " Cortex-M0";
+        case 0xc21: return " Cortex-M1";
+        case 0xc23: return " Cortex-M3";
+        case 0xc24: return " Cortex-M4";
+        case 0xc27: return " Cortex-M7";
+        case 0xc60: return " Cortex-M0+";
+        case 0xd00: return " AArch64 simulator";
+        case 0xd01: return " Cortex-A32";
+        case 0xd02: return " Cortex-A34";
+        case 0xd03: return " Cortex-A53";
+        case 0xd04: return " Cortex-A35";
+        case 0xd05: return " Cortex-A55";
+        case 0xd06: return " Cortex-A65";
+        case 0xd07: return " Cortex-A57";
+        case 0xd08: return " Cortex-A72";
+        case 0xd09: return " Cortex-A73";
+        case 0xd0a: return " Cortex-A75";
+        case 0xd0b: return " Cortex-A76";
+        case 0xd0c: return " Neoverse N1";
+        case 0xd0d: return " Cortex-A77";
+        case 0xd0e: return " Cortex-A76AE";
+        case 0xd0f: return " AEMv8";
+        case 0xd13: return " Cortex-R52";
+        case 0xd20: return " Cortex-M23";
+        case 0xd21: return " Cortex-M33";
+        case 0xd40: return " Zeus";
+        case 0xd41: return " Cortex-A78";
+        case 0xd43: return " Cortex-A65AE";
+        case 0xd44: return " Cortex-X1";
+        case 0xd4a: return " Neoverse E1";
+        default: break;
+        }
+    case 0x42:
+        switch( part )
+        {
+        case 0xf: return " Brahma B15";
+        case 0x100: return " Brahma B53";
+        case 0x516: return " ThunderX2";
+        default: break;
+        }
+    case 0x43:
+        switch( part )
+        {
+        case 0xa0: return " ThunderX";
+        case 0xa1: return " ThunderX 88XX";
+        case 0xa2: return " ThunderX 81XX";
+        case 0xa3: return " ThunderX 83XX";
+        case 0xaf: return " ThunderX2 99xx";
+        case 0xb0: return " OcteonTX2";
+        case 0xb1: return " OcteonTX2 T98";
+        case 0xb2: return " OcteonTX2 T96";
+        case 0xb3: return " OcteonTX2 F95";
+        case 0xb4: return " OcteonTX2 F95N";
+        case 0xb5: return " OcteonTX2 F95MM";
+        case 0xb8: return " ThunderX3 T110";
+        default: break;
+        }
+    case 0x44:
+        switch( part )
+        {
+        case 0xa10: return " SA110";
+        case 0xa11: return " SA1100";
+        default: break;
+        }
+    case 0x46:
+        switch( part )
+        {
+        case 0x1: return " A64FX";
+        default: break;
+        }
+    case 0x48:
+        switch( part )
+        {
+        case 0xd01: return " TSV100";
+        case 0xd40: return " Kirin 980";
+        default: break;
+        }
+    case 0x4e:
+        switch( part )
+        {
+        case 0x0: return " Denver";
+        case 0x3: return " Denver 2";
+        case 0x4: return " Carmel";
+        default: break;
+        }
+    case 0x50:
+        switch( part )
+        {
+        case 0x0: return " X-Gene";
+        default: break;
+        }
+    case 0x51:
+        switch( part )
+        {
+        case 0xf: return " Scorpion";
+        case 0x2d: return " Scorpion";
+        case 0x4d: return " Krait";
+        case 0x6f: return " Krait";
+        case 0x200: return " Kryo";
+        case 0x201: return " Kryo Silver (Snapdragon 821)";
+        case 0x205: return " Kryo Gold";
+        case 0x211: return " Kryo Silver (Snapdragon 820)";
+        case 0x800: return " Kryo 260 / 280 Gold";
+        case 0x801: return " Kryo 260 / 280 Silver";
+        case 0x802: return " Kryo 385 Gold";
+        case 0x803: return " Kryo 385 Silver";
+        case 0x804: return " Kryo 485 Gold";
+        case 0xc00: return " Falkor";
+        case 0xc01: return " Saphira";
+        default: break;
+        }
+    case 0x53:
+        switch( part )
+        {
+        case 0x1: return " Exynos M1/M2";
+        case 0x2: return " Exynos M3";
+        default: break;
+        }
+    case 0x56:
+        switch( part )
+        {
+        case 0x131: return " Feroceon 88FR131";
+        case 0x581: return " PJ4 / PJ4B";
+        case 0x584: return " PJ4B-MP / PJ4C";
+        default: break;
+        }
+    case 0x61:
+        switch( part )
+        {
+        case 0x1: return " Cyclone";
+        case 0x2: return " Typhoon";
+        case 0x3: return " Typhoon/Capri";
+        case 0x4: return " Twister";
+        case 0x5: return " Twister/Elba/Malta";
+        case 0x6: return " Hurricane";
+        case 0x7: return " Hurricane/Myst";
+        default: break;
+        }
+    case 0x66:
+        switch( part )
+        {
+        case 0x526: return " FA526";
+        case 0x626: return " FA626";
+        default: break;
+        }
+    case 0x68:
+        switch( part )
+        {
+        case 0x0: return " Phecda";
+        default: break;
+        }
+    default: break;
+    }
+    sprintf( buf, " 0x%x", part );
+    return buf;
+}
+
+#elif defined __APPLE__ && TARGET_OS_IPHONE == 1
+
+static const char* DecodeIosDevice( const char* id )
+{
+    static const char* DeviceTable[] = {
+        "i386", "32-bit simulator",
+        "x86_64", "64-bit simulator",
+        "iPhone1,1", "iPhone",
+        "iPhone1,2", "iPhone 3G",
+        "iPhone2,1", "iPhone 3GS",
+        "iPhone3,1", "iPhone 4 (GSM)",
+        "iPhone3,2", "iPhone 4 (GSM)",
+        "iPhone3,3", "iPhone 4 (CDMA)",
+        "iPhone4,1", "iPhone 4S",
+        "iPhone5,1", "iPhone 5 (A1428)",
+        "iPhone5,2", "iPhone 5 (A1429)",
+        "iPhone5,3", "iPhone 5c (A1456/A1532)",
+        "iPhone5,4", "iPhone 5c (A1507/A1516/1526/A1529)",
+        "iPhone6,1", "iPhone 5s (A1433/A1533)",
+        "iPhone6,2", "iPhone 5s (A1457/A1518/A1528/A1530)",
+        "iPhone7,1", "iPhone 6 Plus",
+        "iPhone7,2", "iPhone 6",
+        "iPhone8,1", "iPhone 6S",
+        "iPhone8,2", "iPhone 6S Plus",
+        "iPhone8,4", "iPhone SE",
+        "iPhone9,1", "iPhone 7 (CDMA)",
+        "iPhone9,2", "iPhone 7 Plus (CDMA)",
+        "iPhone9,3", "iPhone 7 (GSM)",
+        "iPhone9,4", "iPhone 7 Plus (GSM)",
+        "iPhone10,1", "iPhone 8 (CDMA)",
+        "iPhone10,2", "iPhone 8 Plus (CDMA)",
+        "iPhone10,3", "iPhone X (CDMA)",
+        "iPhone10,4", "iPhone 8 (GSM)",
+        "iPhone10,5", "iPhone 8 Plus (GSM)",
+        "iPhone10,6", "iPhone X (GSM)",
+        "iPhone11,2", "iPhone XS",
+        "iPhone11,4", "iPhone XS Max",
+        "iPhone11,6", "iPhone XS Max China",
+        "iPhone11,8", "iPhone XR",
+        "iPhone12,1", "iPhone 11",
+        "iPhone12,3", "iPhone 11 Pro",
+        "iPhone12,5", "iPhone 11 Pro Max",
+        "iPhone12,8", "iPhone SE 2nd Gen",
+        "iPad1,1", "iPad (A1219/A1337)",
+        "iPad2,1", "iPad 2 (A1395)",
+        "iPad2,2", "iPad 2 (A1396)",
+        "iPad2,3", "iPad 2 (A1397)",
+        "iPad2,4", "iPad 2 (A1395)",
+        "iPad2,5", "iPad Mini (A1432)",
+        "iPad2,6", "iPad Mini (A1454)",
+        "iPad2,7", "iPad Mini (A1455)",
+        "iPad3,1", "iPad 3 (A1416)",
+        "iPad3,2", "iPad 3 (A1403)",
+        "iPad3,3", "iPad 3 (A1430)",
+        "iPad3,4", "iPad 4 (A1458)",
+        "iPad3,5", "iPad 4 (A1459)",
+        "iPad3,6", "iPad 4 (A1460)",
+        "iPad4,1", "iPad Air (A1474)",
+        "iPad4,2", "iPad Air (A1475)",
+        "iPad4,3", "iPad Air (A1476)",
+        "iPad4,4", "iPad Mini 2 (A1489)",
+        "iPad4,5", "iPad Mini 2 (A1490)",
+        "iPad4,6", "iPad Mini 2 (A1491)",
+        "iPad4,7", "iPad Mini 3 (A1599)",
+        "iPad4,8", "iPad Mini 3 (A1600)",
+        "iPad4,9", "iPad Mini 3 (A1601)",
+        "iPad5,1", "iPad Mini 4 (A1538)",
+        "iPad5,2", "iPad Mini 4 (A1550)",
+        "iPad5,3", "iPad Air 2 (A1566)",
+        "iPad5,4", "iPad Air 2 (A1567)",
+        "iPad6,3", "iPad Pro 9.7\" (A1673)",
+        "iPad6,4", "iPad Pro 9.7\" (A1674)",
+        "iPad6,5", "iPad Pro 9.7\" (A1675)",
+        "iPad6,7", "iPad Pro 12.9\" (A1584)",
+        "iPad6,8", "iPad Pro 12.9\" (A1652)",
+        "iPad6,11", "iPad 5th gen (A1822)",
+        "iPad6,12", "iPad 5th gen (A1823)",
+        "iPad7,1", "iPad Pro 12.9\" 2nd gen (A1670)",
+        "iPad7,2", "iPad Pro 12.9\" 2nd gen (A1671/A1821)",
+        "iPad7,3", "iPad Pro 10.5\" (A1701)",
+        "iPad7,4", "iPad Pro 10.5\" (A1709)",
+        "iPad7,5", "iPad 6th gen (A1893)",
+        "iPad7,6", "iPad 6th gen (A1954)",
+        "iPad7,11", "iPad 7th gen 10.2\" (Wifi)",
+        "iPad7,12", "iPad 7th gen 10.2\" (Wifi+Cellular)",
+        "iPad8,1", "iPad Pro 11\" (A1980)",
+        "iPad8,2", "iPad Pro 11\" (A1980)",
+        "iPad8,3", "iPad Pro 11\" (A1934/A1979/A2013)",
+        "iPad8,4", "iPad Pro 11\" (A1934/A1979/A2013)",
+        "iPad8,5", "iPad Pro 12.9\" 3rd gen (A1876)",
+        "iPad8,6", "iPad Pro 12.9\" 3rd gen (A1876)",
+        "iPad8,7", "iPad Pro 12.9\" 3rd gen (A1895/A1983/A2014)",
+        "iPad8,8", "iPad Pro 12.9\" 3rd gen (A1895/A1983/A2014)",
+        "iPad8,9", "iPad Pro 11\" 2nd gen (Wifi)",
+        "iPad8,10", "iPad Pro 11\" 2nd gen (Wifi+Cellular)",
+        "iPad8,11", "iPad Pro 12.9\" 4th gen (Wifi)",
+        "iPad8,12", "iPad Pro 12.9\" 4th gen (Wifi+Cellular)",
+        "iPad11,1", "iPad Mini 5th gen (A2133)",
+        "iPad11,2", "iPad Mini 5th gen (A2124/A2125/A2126)",
+        "iPad11,3", "iPad Air 3rd gen (A2152)",
+        "iPad11,4", "iPad Air 3rd gen (A2123/A2153/A2154)",
+        "iPod1,1", "iPod Touch",
+        "iPod2,1", "iPod Touch 2nd gen",
+        "iPod3,1", "iPod Touch 3rd gen",
+        "iPod4,1", "iPod Touch 4th gen",
+        "iPod5,1", "iPod Touch 5th gen",
+        "iPod7,1", "iPod Touch 6th gen",
+        "iPod9,1", "iPod Touch 7th gen",
+        nullptr
+    };
+
+    auto ptr = DeviceTable;
+    while( *ptr )
+    {
+        if( strcmp( ptr[0], id ) == 0 ) return ptr[1];
+        ptr += 2;
+    }
+    return id;
+}
+
+#endif
+
+}
diff --git a/Source/ThirdParty/tracy/client/TracyCallstack.cpp b/Source/ThirdParty/tracy/client/TracyCallstack.cpp
new file mode 100644
index 000000000..10698cb19
--- /dev/null
+++ b/Source/ThirdParty/tracy/client/TracyCallstack.cpp
@@ -0,0 +1,768 @@
+#include <limits.h>
+#include <stdio.h>
+#include <string.h>
+#include "TracyCallstack.hpp"
+#include "TracyFastVector.hpp"
+#include "../common/TracyAlloc.hpp"
+
+#ifdef TRACY_HAS_CALLSTACK
+
+#if TRACY_HAS_CALLSTACK == 1
+#  ifndef NOMINMAX
+#    define NOMINMAX
+#  endif
+#  include <windows.h>
+#  include <psapi.h>
+#  ifdef _MSC_VER
+#    pragma warning( push )
+#    pragma warning( disable : 4091 )
+#  endif
+#  include <dbghelp.h>
+#  ifdef _MSC_VER
+#    pragma warning( pop )
+#  endif
+#elif TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 3 || TRACY_HAS_CALLSTACK == 4 || TRACY_HAS_CALLSTACK == 6
+#  include "../libbacktrace/backtrace.hpp"
+#  include <dlfcn.h>
+#  include <cxxabi.h>
+#elif TRACY_HAS_CALLSTACK == 5
+#  include <dlfcn.h>
+#  include <cxxabi.h>
+#endif
+
+#ifdef TRACY_DBGHELP_LOCK
+#  include "TracyProfiler.hpp"
+
+#  define DBGHELP_INIT TracyConcat( TRACY_DBGHELP_LOCK, Init() )
+#  define DBGHELP_LOCK TracyConcat( TRACY_DBGHELP_LOCK, Lock() );
+#  define DBGHELP_UNLOCK TracyConcat( TRACY_DBGHELP_LOCK, Unlock() );
+
+extern "C"
+{
+    void DBGHELP_INIT;
+    void DBGHELP_LOCK;
+    void DBGHELP_UNLOCK;
+};
+#endif
+
+namespace tracy
+{
+
+static inline char* CopyString( const char* src, size_t sz )
+{
+    assert( strlen( src ) == sz );
+    auto dst = (char*)tracy_malloc( sz + 1 );
+    memcpy( dst, src, sz );
+    dst[sz] = '\0';
+    return dst;
+}
+
+static inline char* CopyString( const char* src )
+{
+    const auto sz = strlen( src );
+    auto dst = (char*)tracy_malloc( sz + 1 );
+    memcpy( dst, src, sz );
+    dst[sz] = '\0';
+    return dst;
+}
+
+
+#if TRACY_HAS_CALLSTACK == 1
+
+enum { MaxCbTrace = 16 };
+enum { MaxNameSize = 8*1024 };
+
+int cb_num;
+CallstackEntry cb_data[MaxCbTrace];
+
+extern "C"
+{
+    typedef unsigned long (__stdcall *t_RtlWalkFrameChain)( void**, unsigned long, unsigned long );
+    t_RtlWalkFrameChain RtlWalkFrameChain = 0;
+}
+
+#if defined __MINGW32__ && API_VERSION_NUMBER < 12
+extern "C" {
+// Actual required API_VERSION_NUMBER is unknown because it is undocumented. These functions are not present in at least v11.
+DWORD IMAGEAPI SymAddrIncludeInlineTrace(HANDLE hProcess, DWORD64 Address);
+BOOL IMAGEAPI SymQueryInlineTrace(HANDLE hProcess, DWORD64 StartAddress, DWORD StartContext, DWORD64 StartRetAddress,
+    DWORD64 CurAddress, LPDWORD CurContext, LPDWORD CurFrameIndex);
+BOOL IMAGEAPI SymFromInlineContext(HANDLE hProcess, DWORD64 Address, ULONG InlineContext, PDWORD64 Displacement,
+    PSYMBOL_INFO Symbol);
+BOOL IMAGEAPI SymGetLineFromInlineContext(HANDLE hProcess, DWORD64 qwAddr, ULONG InlineContext,
+    DWORD64 qwModuleBaseAddress, PDWORD pdwDisplacement, PIMAGEHLP_LINE64 Line64);
+};
+#endif
+
+#ifndef __CYGWIN__
+struct ModuleCache
+{
+    uint64_t start;
+    uint64_t end;
+    char* name;
+};
+
+static FastVector<ModuleCache>* s_modCache;
+#endif
+
+void InitCallstack()
+{
+    RtlWalkFrameChain = (t_RtlWalkFrameChain)GetProcAddress( GetModuleHandleA( "ntdll.dll" ), "RtlWalkFrameChain" );
+
+#ifdef TRACY_DBGHELP_LOCK
+    DBGHELP_INIT;
+    DBGHELP_LOCK;
+#endif
+
+    //SymInitialize( GetCurrentProcess(), "C:\\Flax\\FlaxEngine\\Binaries\\Editor\\Win64\\Debug;C:\\Flax\\FlaxEngine\\Cache\\Projects", true );
+    SymInitialize( GetCurrentProcess(), nullptr, true );
+    SymSetOptions( SYMOPT_LOAD_LINES );
+
+#ifndef __CYGWIN__
+    HMODULE mod[1024];
+    DWORD needed;
+    HANDLE proc = GetCurrentProcess();
+
+    s_modCache = (FastVector<ModuleCache>*)tracy_malloc( sizeof( FastVector<ModuleCache> ) );
+    new(s_modCache) FastVector<ModuleCache>( 512 );
+
+    if( EnumProcessModules( proc, mod, sizeof( mod ), &needed ) != 0 )
+    {
+        const auto sz = needed / sizeof( HMODULE );
+        for( size_t i=0; i<sz; i++ )
+        {
+            MODULEINFO info;
+            if( GetModuleInformation( proc, mod[i], &info, sizeof( info ) ) != 0 )
+            {
+                const auto base = uint64_t( info.lpBaseOfDll );
+                char name[1024];
+                const auto res = GetModuleFileNameA( mod[i], name, 1021 );
+                if( res > 0 )
+                {
+                    auto ptr = name + res;
+                    while( ptr > name && *ptr != '\\' && *ptr != '/' ) ptr--;
+                    if( ptr > name ) ptr++;
+                    const auto namelen = name + res - ptr;
+                    auto cache = s_modCache->push_next();
+                    cache->start = base;
+                    cache->end = base + info.SizeOfImage;
+                    cache->name = (char*)tracy_malloc( namelen+3 );
+                    cache->name[0] = '[';
+                    memcpy( cache->name+1, ptr, namelen );
+                    cache->name[namelen+1] = ']';
+                    cache->name[namelen+2] = '\0';
+                }
+            }
+        }
+    }
+#endif
+
+#ifdef TRACY_DBGHELP_LOCK
+    DBGHELP_UNLOCK;
+#endif
+}
+
+TRACY_API uintptr_t* CallTrace( int depth )
+{
+    auto trace = (uintptr_t*)tracy_malloc( ( 1 + depth ) * sizeof( uintptr_t ) );
+    const auto num = RtlWalkFrameChain( (void**)( trace + 1 ), depth, 0 );
+    *trace = num;
+    return trace;
+}
+
+const char* DecodeCallstackPtrFast( uint64_t ptr )
+{
+    static char ret[MaxNameSize];
+    const auto proc = GetCurrentProcess();
+
+    char buf[sizeof( SYMBOL_INFO ) + MaxNameSize];
+    auto si = (SYMBOL_INFO*)buf;
+    si->SizeOfStruct = sizeof( SYMBOL_INFO );
+    si->MaxNameLen = MaxNameSize;
+
+#ifdef TRACY_DBGHELP_LOCK
+    DBGHELP_LOCK;
+#endif
+    if( SymFromAddr( proc, ptr, nullptr, si ) == 0 )
+    {
+        *ret = '\0';
+    }
+    else
+    {
+        memcpy( ret, si->Name, si->NameLen );
+        ret[si->NameLen] = '\0';
+    }
+#ifdef TRACY_DBGHELP_LOCK
+    DBGHELP_UNLOCK;
+#endif
+    return ret;
+}
+
+static const char* GetModuleName( uint64_t addr )
+{
+    if( ( addr & 0x8000000000000000 ) != 0 ) return "[kernel]";
+
+#ifndef __CYGWIN__
+    for( auto& v : *s_modCache )
+    {
+        if( addr >= v.start && addr < v.end )
+        {
+            return v.name;
+        }
+    }
+
+    HMODULE mod[1024];
+    DWORD needed;
+    HANDLE proc = GetCurrentProcess();
+
+    if( EnumProcessModules( proc, mod, sizeof( mod ), &needed ) != 0 )
+    {
+        const auto sz = needed / sizeof( HMODULE );
+        for( size_t i=0; i<sz; i++ )
+        {
+            MODULEINFO info;
+            if( GetModuleInformation( proc, mod[i], &info, sizeof( info ) ) != 0 )
+            {
+                const auto base = uint64_t( info.lpBaseOfDll );
+                if( addr >= base && addr < base + info.SizeOfImage )
+                {
+                    char name[1024];
+                    const auto res = GetModuleFileNameA( mod[i], name, 1021 );
+                    if( res > 0 )
+                    {
+                        auto ptr = name + res;
+                        while( ptr > name && *ptr != '\\' && *ptr != '/' ) ptr--;
+                        if( ptr > name ) ptr++;
+                        const auto namelen = name + res - ptr;
+                        auto cache = s_modCache->push_next();
+                        cache->start = base;
+                        cache->end = base + info.SizeOfImage;
+                        cache->name = (char*)tracy_malloc( namelen+3 );
+                        cache->name[0] = '[';
+                        memcpy( cache->name+1, ptr, namelen );
+                        cache->name[namelen+1] = ']';
+                        cache->name[namelen+2] = '\0';
+                        return cache->name;
+                    }
+                }
+            }
+        }
+    }
+#endif
+
+    return "[unknown]";
+}
+
+CallstackSymbolData DecodeSymbolAddress( uint64_t ptr )
+{
+    CallstackSymbolData sym;
+    IMAGEHLP_LINE64 line;
+    DWORD displacement = 0;
+    line.SizeOfStruct = sizeof(IMAGEHLP_LINE64);
+#ifdef TRACY_DBGHELP_LOCK
+    DBGHELP_LOCK;
+#endif
+    const auto res = SymGetLineFromAddr64( GetCurrentProcess(), ptr, &displacement, &line );
+#ifdef TRACY_DBGHELP_LOCK
+    DBGHELP_UNLOCK;
+#endif
+    if( res == 0 )
+    {
+        sym.file = "[unknown]";
+        sym.line = 0;
+    }
+    else
+    {
+        sym.file = line.FileName;
+        sym.line = line.LineNumber;
+    }
+    sym.needFree = false;
+    return sym;
+}
+
+CallstackSymbolData DecodeCodeAddress( uint64_t ptr )
+{
+    CallstackSymbolData sym;
+    const auto proc = GetCurrentProcess();
+    bool done = false;
+
+    IMAGEHLP_LINE64 line;
+    DWORD displacement = 0;
+    line.SizeOfStruct = sizeof(IMAGEHLP_LINE64);
+
+#ifdef TRACY_DBGHELP_LOCK
+    DBGHELP_LOCK;
+#endif
+#ifndef __CYGWIN__
+    DWORD inlineNum = SymAddrIncludeInlineTrace( proc, ptr );
+    DWORD ctx = 0;
+    DWORD idx;
+    BOOL doInline = FALSE;
+    if( inlineNum != 0 ) doInline = SymQueryInlineTrace( proc, ptr, 0, ptr, ptr, &ctx, &idx );
+    if( doInline )
+    {
+        if( SymGetLineFromInlineContext( proc, ptr, ctx, 0, &displacement, &line ) != 0 )
+        {
+            sym.file = line.FileName;
+            sym.line = line.LineNumber;
+            done = true;
+        }
+    }
+#endif
+    if( !done )
+    {
+        if( SymGetLineFromAddr64( proc, ptr, &displacement, &line ) == 0 )
+        {
+            sym.file = "[unknown]";
+            sym.line = 0;
+        }
+        else
+        {
+            sym.file = line.FileName;
+            sym.line = line.LineNumber;
+        }
+    }
+#ifdef TRACY_DBGHELP_LOCK
+    DBGHELP_UNLOCK;
+#endif
+    sym.needFree = false;
+    return sym;
+}
+
+CallstackEntryData DecodeCallstackPtr( uint64_t ptr )
+{
+    int write;
+    const auto proc = GetCurrentProcess();
+#ifdef TRACY_DBGHELP_LOCK
+    DBGHELP_LOCK;
+#endif
+#ifndef __CYGWIN__
+    DWORD inlineNum = SymAddrIncludeInlineTrace( proc, ptr );
+    if( inlineNum > MaxCbTrace - 1 ) inlineNum = MaxCbTrace - 1;
+    DWORD ctx = 0;
+    DWORD idx;
+    BOOL doInline = FALSE;
+    if( inlineNum != 0 ) doInline = SymQueryInlineTrace( proc, ptr, 0, ptr, ptr, &ctx, &idx );
+    if( doInline )
+    {
+        write = inlineNum;
+        cb_num = 1 + inlineNum;
+    }
+    else
+#endif
+    {
+        write = 0;
+        cb_num = 1;
+    }
+
+    char buf[sizeof( SYMBOL_INFO ) + MaxNameSize];
+    auto si = (SYMBOL_INFO*)buf;
+    si->SizeOfStruct = sizeof( SYMBOL_INFO );
+    si->MaxNameLen = MaxNameSize;
+
+    const auto moduleName = GetModuleName( ptr );
+    const auto symValid = SymFromAddr( proc, ptr, nullptr, si ) != 0;
+
+    IMAGEHLP_LINE64 line;
+    DWORD displacement = 0;
+    line.SizeOfStruct = sizeof(IMAGEHLP_LINE64);
+
+    {
+        const char* filename;
+        if( SymGetLineFromAddr64( proc, ptr, &displacement, &line ) == 0 )
+        {
+            filename = "[unknown]";
+            cb_data[write].line = 0;
+        }
+        else
+        {
+            filename = line.FileName;
+            cb_data[write].line = line.LineNumber;
+        }
+
+        cb_data[write].name = symValid ? CopyString( si->Name, si->NameLen ) : CopyString( moduleName );
+        cb_data[write].file = CopyString( filename );
+        if( symValid )
+        {
+            cb_data[write].symLen = si->Size;
+            cb_data[write].symAddr = si->Address;
+        }
+        else
+        {
+            cb_data[write].symLen = 0;
+            cb_data[write].symAddr = 0;
+        }
+    }
+
+#ifndef __CYGWIN__
+    if( doInline )
+    {
+        for( DWORD i=0; i<inlineNum; i++ )
+        {
+            auto& cb = cb_data[i+1];
+            const auto symInlineValid = SymFromInlineContext( proc, ptr, ctx, nullptr, si ) != 0;
+            const char* filename;
+            if( SymGetLineFromInlineContext( proc, ptr, ctx, 0, &displacement, &line ) == 0 )
+            {
+                filename = "[unknown]";
+                cb.line = 0;
+            }
+            else
+            {
+                filename = line.FileName;
+                cb.line = line.LineNumber;
+            }
+
+            cb.name = symInlineValid ? CopyString( si->Name, si->NameLen ) : CopyString( moduleName );
+            cb.file = CopyString( filename );
+            if( symInlineValid )
+            {
+                cb.symLen = si->Size;
+                cb.symAddr = si->Address;
+            }
+            else
+            {
+                cb.symLen = 0;
+                cb.symAddr = 0;
+            }
+
+            ctx++;
+        }
+    }
+#endif
+#ifdef TRACY_DBGHELP_LOCK
+    DBGHELP_UNLOCK;
+#endif
+
+    return { cb_data, uint8_t( cb_num ), moduleName };
+}
+
+#elif TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 3 || TRACY_HAS_CALLSTACK == 4 || TRACY_HAS_CALLSTACK == 6
+
+enum { MaxCbTrace = 16 };
+
+struct backtrace_state* cb_bts;
+int cb_num;
+CallstackEntry cb_data[MaxCbTrace];
+int cb_fixup;
+
+void InitCallstack()
+{
+    cb_bts = backtrace_create_state( nullptr, 0, nullptr, nullptr );
+}
+
+static int FastCallstackDataCb( void* data, uintptr_t pc, uintptr_t lowaddr, const char* fn, int lineno, const char* function )
+{
+    if( function )
+    {
+        strcpy( (char*)data, function );
+    }
+    else
+    {
+        const char* symname = nullptr;
+        auto vptr = (void*)pc;
+        Dl_info dlinfo;
+        if( dladdr( vptr, &dlinfo ) )
+        {
+            symname = dlinfo.dli_sname;
+        }
+        if( symname )
+        {
+            strcpy( (char*)data, symname );
+        }
+        else
+        {
+            *(char*)data = '\0';
+        }
+    }
+    return 1;
+}
+
+static void FastCallstackErrorCb( void* data, const char* /*msg*/, int /*errnum*/ )
+{
+    *(char*)data = '\0';
+}
+
+const char* DecodeCallstackPtrFast( uint64_t ptr )
+{
+    static char ret[1024];
+    backtrace_pcinfo( cb_bts, ptr, FastCallstackDataCb, FastCallstackErrorCb, ret );
+    return ret;
+}
+
+static int SymbolAddressDataCb( void* data, uintptr_t pc, uintptr_t lowaddr, const char* fn, int lineno, const char* function )
+{
+    auto& sym = *(CallstackSymbolData*)data;
+    if( !fn )
+    {
+        sym.file = "[unknown]";
+        sym.line = 0;
+        sym.needFree = false;
+    }
+    else
+    {
+        sym.file = CopyString( fn );
+        sym.line = lineno;
+        sym.needFree = true;
+    }
+
+    return 1;
+}
+
+static void SymbolAddressErrorCb( void* data, const char* /*msg*/, int /*errnum*/ )
+{
+    auto& sym = *(CallstackSymbolData*)data;
+    sym.file = "[unknown]";
+    sym.line = 0;
+    sym.needFree = false;
+}
+
+CallstackSymbolData DecodeSymbolAddress( uint64_t ptr )
+{
+    CallstackSymbolData sym;
+    backtrace_pcinfo( cb_bts, ptr, SymbolAddressDataCb, SymbolAddressErrorCb, &sym );
+    return sym;
+}
+
+CallstackSymbolData DecodeCodeAddress( uint64_t ptr )
+{
+    return DecodeSymbolAddress( ptr );
+}
+
+static int CallstackDataCb( void* /*data*/, uintptr_t pc, uintptr_t lowaddr, const char* fn, int lineno, const char* function )
+{
+    enum { DemangleBufLen = 64*1024 };
+    char demangled[DemangleBufLen];
+
+    cb_data[cb_num].symLen = 0;
+    cb_data[cb_num].symAddr = (uint64_t)lowaddr;
+
+    if( !fn && !function )
+    {
+        const char* symname = nullptr;
+        auto vptr = (void*)pc;
+        ptrdiff_t symoff = 0;
+
+        Dl_info dlinfo;
+        if( dladdr( vptr, &dlinfo ) )
+        {
+            symname = dlinfo.dli_sname;
+            symoff = (char*)pc - (char*)dlinfo.dli_saddr;
+
+            if( symname && symname[0] == '_' )
+            {
+                size_t len = DemangleBufLen;
+                int status;
+                abi::__cxa_demangle( symname, demangled, &len, &status );
+                if( status == 0 )
+                {
+                    symname = demangled;
+                }
+            }
+        }
+
+        if( !symname ) symname = "[unknown]";
+
+        if( symoff == 0 )
+        {
+            cb_data[cb_num].name = CopyString( symname );
+        }
+        else
+        {
+            char buf[32];
+            const auto offlen = sprintf( buf, " + %td", symoff );
+            const auto namelen = strlen( symname );
+            auto name = (char*)tracy_malloc( namelen + offlen + 1 );
+            memcpy( name, symname, namelen );
+            memcpy( name + namelen, buf, offlen );
+            name[namelen + offlen] = '\0';
+            cb_data[cb_num].name = name;
+        }
+
+        cb_data[cb_num].file = CopyString( "[unknown]" );
+        cb_data[cb_num].line = 0;
+    }
+    else
+    {
+        if( !fn ) fn = "[unknown]";
+        if( !function )
+        {
+            function = "[unknown]";
+        }
+        else
+        {
+            if( function[0] == '_' )
+            {
+                size_t len = DemangleBufLen;
+                int status;
+                abi::__cxa_demangle( function, demangled, &len, &status );
+                if( status == 0 )
+                {
+                    function = demangled;
+                }
+            }
+        }
+
+        cb_data[cb_num].name = CopyString( function );
+        cb_data[cb_num].file = CopyString( fn );
+        cb_data[cb_num].line = lineno;
+    }
+
+    if( ++cb_num >= MaxCbTrace )
+    {
+        return 1;
+    }
+    else
+    {
+        return 0;
+    }
+}
+
+static void CallstackErrorCb( void* /*data*/, const char* /*msg*/, int /*errnum*/ )
+{
+    for( int i=0; i<cb_num; i++ )
+    {
+        tracy_free( (void*)cb_data[i].name );
+        tracy_free( (void*)cb_data[i].file );
+    }
+
+    cb_data[0].name = CopyString( "[error]" );
+    cb_data[0].file = CopyString( "[error]" );
+    cb_data[0].line = 0;
+    cb_data[0].symLen = 0;
+    cb_data[0].symAddr = 0;
+
+    cb_num = 1;
+}
+
+void SymInfoCallback( void* /*data*/, uintptr_t /*pc*/, const char* /*symname*/, uintptr_t symval, uintptr_t symsize )
+{
+    cb_data[cb_num-1].symLen = (uint32_t)symsize;
+    cb_data[cb_num-1].symAddr = (uint64_t)symval;
+}
+
+void SymInfoError( void* /*data*/, const char* /*msg*/, int /*errnum*/ )
+{
+    cb_data[cb_num-1].symLen = 0;
+    cb_data[cb_num-1].symAddr = 0;
+}
+
+CallstackEntryData DecodeCallstackPtr( uint64_t ptr )
+{
+    cb_num = 0;
+    backtrace_pcinfo( cb_bts, ptr, CallstackDataCb, CallstackErrorCb, nullptr );
+    assert( cb_num > 0 );
+
+    backtrace_syminfo( cb_bts, ptr, SymInfoCallback, SymInfoError, nullptr );
+
+    const char* symloc = nullptr;
+    Dl_info dlinfo;
+    if( dladdr( (void*)ptr, &dlinfo ) ) symloc = dlinfo.dli_fname;
+
+    return { cb_data, uint8_t( cb_num ), symloc ? symloc : "[unknown]" };
+}
+
+#elif TRACY_HAS_CALLSTACK == 5
+
+void InitCallstack()
+{
+}
+
+const char* DecodeCallstackPtrFast( uint64_t ptr )
+{
+    static char ret[1024];
+    auto vptr = (void*)ptr;
+    const char* symname = nullptr;
+    Dl_info dlinfo;
+    if( dladdr( vptr, &dlinfo ) && dlinfo.dli_sname )
+    {
+        symname = dlinfo.dli_sname;
+    }
+    if( symname )
+    {
+        strcpy( ret, symname );
+    }
+    else
+    {
+        *ret = '\0';
+    }
+    return ret;
+}
+
+CallstackSymbolData DecodeSymbolAddress( uint64_t ptr )
+{
+    const char* symloc = nullptr;
+    Dl_info dlinfo;
+    if( dladdr( (void*)ptr, &dlinfo ) ) symloc = dlinfo.dli_fname;
+    if( !symloc ) symloc = "[unknown]";
+    return CallstackSymbolData { symloc, 0, false };
+}
+
+CallstackSymbolData DecodeCodeAddress( uint64_t ptr )
+{
+    return DecodeSymbolAddress( ptr );
+}
+
+CallstackEntryData DecodeCallstackPtr( uint64_t ptr )
+{
+    static CallstackEntry cb;
+    cb.line = 0;
+
+    char* demangled = nullptr;
+    const char* symname = nullptr;
+    const char* symloc = nullptr;
+    auto vptr = (void*)ptr;
+    ptrdiff_t symoff = 0;
+    void* symaddr = nullptr;
+
+    Dl_info dlinfo;
+    if( dladdr( vptr, &dlinfo ) )
+    {
+        symloc = dlinfo.dli_fname;
+        symname = dlinfo.dli_sname;
+        symoff = (char*)ptr - (char*)dlinfo.dli_saddr;
+        symaddr = dlinfo.dli_saddr;
+
+        if( symname && symname[0] == '_' )
+        {
+            size_t len = 0;
+            int status;
+            demangled = abi::__cxa_demangle( symname, nullptr, &len, &status );
+            if( status == 0 )
+            {
+                symname = demangled;
+            }
+        }
+    }
+
+    if( !symname ) symname = "[unknown]";
+    if( !symloc ) symloc = "[unknown]";
+
+    if( symoff == 0 )
+    {
+        cb.name = CopyString( symname );
+    }
+    else
+    {
+        char buf[32];
+        const auto offlen = sprintf( buf, " + %td", symoff );
+        const auto namelen = strlen( symname );
+        auto name = (char*)tracy_malloc( namelen + offlen + 1 );
+        memcpy( name, symname, namelen );
+        memcpy( name + namelen, buf, offlen );
+        name[namelen + offlen] = '\0';
+        cb.name = name;
+    }
+
+    cb.file = CopyString( "[unknown]" );
+    cb.symLen = 0;
+    cb.symAddr = (uint64_t)symaddr;
+
+    if( demangled ) free( demangled );
+
+    return { &cb, 1, symloc };
+}
+
+#endif
+
+}
+
+#endif
diff --git a/Source/ThirdParty/tracy/client/TracyCallstack.h b/Source/ThirdParty/tracy/client/TracyCallstack.h
new file mode 100644
index 000000000..87d8ce721
--- /dev/null
+++ b/Source/ThirdParty/tracy/client/TracyCallstack.h
@@ -0,0 +1,28 @@
+#ifndef __TRACYCALLSTACK_H__
+#define __TRACYCALLSTACK_H__
+
+#if !defined _WIN32 && !defined __CYGWIN__
+#  include <sys/param.h>
+#endif
+
+#if defined _WIN32 || defined __CYGWIN__
+#  define TRACY_HAS_CALLSTACK 1
+#elif defined __ANDROID__
+#  if !defined __arm__ || __ANDROID_API__ >= 21
+#    define TRACY_HAS_CALLSTACK 2
+#  else
+#    define TRACY_HAS_CALLSTACK 5
+#  endif
+#elif defined __linux
+#  if defined _GNU_SOURCE && defined __GLIBC__
+#    define TRACY_HAS_CALLSTACK 3
+#  else
+#    define TRACY_HAS_CALLSTACK 2
+#  endif
+#elif defined __APPLE__
+#  define TRACY_HAS_CALLSTACK 4
+#elif defined BSD
+#  define TRACY_HAS_CALLSTACK 6
+#endif
+
+#endif
"../common/TracyAlloc.hpp" + +namespace tracy +{ + +struct CallstackSymbolData +{ + const char* file; + uint32_t line; + bool needFree; +}; + +struct CallstackEntry +{ + const char* name; + const char* file; + uint32_t line; + uint32_t symLen; + uint64_t symAddr; +}; + +struct CallstackEntryData +{ + const CallstackEntry* data; + uint8_t size; + const char* imageName; +}; + +CallstackSymbolData DecodeSymbolAddress( uint64_t ptr ); +CallstackSymbolData DecodeCodeAddress( uint64_t ptr ); +const char* DecodeCallstackPtrFast( uint64_t ptr ); +CallstackEntryData DecodeCallstackPtr( uint64_t ptr ); +void InitCallstack(); + +#if TRACY_HAS_CALLSTACK == 1 + +TRACY_API uintptr_t* CallTrace( int depth ); + +static tracy_force_inline void* Callstack( int depth ) +{ + assert( depth >= 1 && depth < 63 ); + return CallTrace( depth ); +} + +#elif TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 5 + +struct BacktraceState +{ + void** current; + void** end; +}; + +static _Unwind_Reason_Code tracy_unwind_callback( struct _Unwind_Context* ctx, void* arg ) +{ + auto state = (BacktraceState*)arg; + uintptr_t pc = _Unwind_GetIP( ctx ); + if( pc ) + { + if( state->current == state->end ) return _URC_END_OF_STACK; + *state->current++ = (void*)pc; + } + return _URC_NO_REASON; +} + +static tracy_force_inline void* Callstack( int depth ) +{ + assert( depth >= 1 && depth < 63 ); + + auto trace = (uintptr_t*)tracy_malloc( ( 1 + depth ) * sizeof( uintptr_t ) ); + BacktraceState state = { (void**)(trace+1), (void**)(trace+1+depth) }; + _Unwind_Backtrace( tracy_unwind_callback, &state ); + + *trace = (uintptr_t*)state.current - trace + 1; + + return trace; +} + +#elif TRACY_HAS_CALLSTACK == 3 || TRACY_HAS_CALLSTACK == 4 || TRACY_HAS_CALLSTACK == 6 + +static tracy_force_inline void* Callstack( int depth ) +{ + assert( depth >= 1 ); + + auto trace = (uintptr_t*)tracy_malloc( ( 1 + (size_t)depth ) * sizeof( uintptr_t ) ); + const auto num = (size_t)backtrace( (void**)(trace+1), depth ); + *trace = num; + + return trace; +} + +#endif + +} + +#endif + +#endif diff --git a/Source/ThirdParty/tracy/client/TracyFastVector.hpp b/Source/ThirdParty/tracy/client/TracyFastVector.hpp new file mode 100644 index 000000000..fc4108016 --- /dev/null +++ b/Source/ThirdParty/tracy/client/TracyFastVector.hpp @@ -0,0 +1,117 @@ +#ifndef __TRACYFASTVECTOR_HPP__ +#define __TRACYFASTVECTOR_HPP__ + +#include +#include + +#include "../common/TracyAlloc.hpp" + +namespace tracy +{ + +template +class FastVector +{ +public: + using iterator = T*; + using const_iterator = const T*; + + FastVector( size_t capacity ) + : m_ptr( (T*)tracy_malloc( sizeof( T ) * capacity ) ) + , m_write( m_ptr ) + , m_end( m_ptr + capacity ) + { + assert( capacity != 0 ); + } + + FastVector( const FastVector& ) = delete; + FastVector( FastVector&& ) = delete; + + ~FastVector() + { + tracy_free( m_ptr ); + } + + FastVector& operator=( const FastVector& ) = delete; + FastVector& operator=( FastVector&& ) = delete; + + bool empty() const { return m_ptr == m_write; } + size_t size() const { return m_write - m_ptr; } + + T* data() { return m_ptr; } + const T* data() const { return m_ptr; }; + + T* begin() { return m_ptr; } + const T* begin() const { return m_ptr; } + T* end() { return m_write; } + const T* end() const { return m_write; } + + T& front() { assert( !empty() ); return m_ptr[0]; } + const T& front() const { assert( !empty() ); return m_ptr[0]; } + + T& back() { assert( !empty() ); return m_write[-1]; } + const T& back() const { assert( !empty() ); return m_write[-1]; } 
+ + T& operator[]( size_t idx ) { return m_ptr[idx]; } + const T& operator[]( size_t idx ) const { return m_ptr[idx]; } + + T* push_next() + { + if( m_write == m_end ) AllocMore(); + return m_write++; + } + + T* prepare_next() + { + if( m_write == m_end ) AllocMore(); + return m_write; + } + + void commit_next() + { + m_write++; + } + + void clear() + { + m_write = m_ptr; + } + + void swap( FastVector& vec ) + { + const auto ptr1 = m_ptr; + const auto ptr2 = vec.m_ptr; + const auto write1 = m_write; + const auto write2 = vec.m_write; + const auto end1 = m_end; + const auto end2 = vec.m_end; + + m_ptr = ptr2; + vec.m_ptr = ptr1; + m_write = write2; + vec.m_write = write1; + m_end = end2; + vec.m_end = end1; + } + +private: + tracy_no_inline void AllocMore() + { + const auto cap = size_t( m_end - m_ptr ) * 2; + const auto size = size_t( m_write - m_ptr ); + T* ptr = (T*)tracy_malloc( sizeof( T ) * cap ); + memcpy( ptr, m_ptr, size * sizeof( T ) ); + tracy_free( m_ptr ); + m_ptr = ptr; + m_write = m_ptr + size; + m_end = m_ptr + cap; + } + + T* m_ptr; + T* m_write; + T* m_end; +}; + +} + +#endif diff --git a/Source/ThirdParty/tracy/client/TracyLock.hpp b/Source/ThirdParty/tracy/client/TracyLock.hpp new file mode 100644 index 000000000..e513cdc5d --- /dev/null +++ b/Source/ThirdParty/tracy/client/TracyLock.hpp @@ -0,0 +1,548 @@ +#ifndef __TRACYLOCK_HPP__ +#define __TRACYLOCK_HPP__ + +#include +#include + +#include "../common/TracySystem.hpp" +#include "../common/TracyAlign.hpp" +#include "TracyProfiler.hpp" + +namespace tracy +{ + +class LockableCtx +{ +public: + tracy_force_inline LockableCtx( const SourceLocationData* srcloc ) + : m_id( GetLockCounter().fetch_add( 1, std::memory_order_relaxed ) ) +#ifdef TRACY_ON_DEMAND + , m_lockCount( 0 ) + , m_active( false ) +#endif + { + assert( m_id != std::numeric_limits::max() ); + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockAnnounce ); + MemWrite( &item->lockAnnounce.id, m_id ); + MemWrite( &item->lockAnnounce.time, Profiler::GetTime() ); + MemWrite( &item->lockAnnounce.lckloc, (uint64_t)srcloc ); + MemWrite( &item->lockAnnounce.type, LockType::Lockable ); +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + Profiler::QueueSerialFinish(); + } + + LockableCtx( const LockableCtx& ) = delete; + LockableCtx& operator=( const LockableCtx& ) = delete; + + tracy_force_inline ~LockableCtx() + { + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockTerminate ); + MemWrite( &item->lockTerminate.id, m_id ); + MemWrite( &item->lockTerminate.time, Profiler::GetTime() ); +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + Profiler::QueueSerialFinish(); + } + + tracy_force_inline bool BeforeLock() + { +#ifdef TRACY_ON_DEMAND + bool queue = false; + const auto locks = m_lockCount.fetch_add( 1, std::memory_order_relaxed ); + const auto active = m_active.load( std::memory_order_relaxed ); + if( locks == 0 || active ) + { + const bool connected = GetProfiler().IsConnected(); + if( active != connected ) m_active.store( connected, std::memory_order_relaxed ); + if( connected ) queue = true; + } + if( !queue ) return false; +#endif + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockWait ); + MemWrite( &item->lockWait.thread, GetThreadHandle() ); + MemWrite( &item->lockWait.id, m_id ); + MemWrite( &item->lockWait.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + return true; + } + + tracy_force_inline void AfterLock() + { + 
auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockObtain ); + MemWrite( &item->lockObtain.thread, GetThreadHandle() ); + MemWrite( &item->lockObtain.id, m_id ); + MemWrite( &item->lockObtain.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline void AfterUnlock() + { +#ifdef TRACY_ON_DEMAND + m_lockCount.fetch_sub( 1, std::memory_order_relaxed ); + if( !m_active.load( std::memory_order_relaxed ) ) return; + if( !GetProfiler().IsConnected() ) + { + m_active.store( false, std::memory_order_relaxed ); + return; + } +#endif + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockRelease ); + MemWrite( &item->lockRelease.thread, GetThreadHandle() ); + MemWrite( &item->lockRelease.id, m_id ); + MemWrite( &item->lockRelease.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline void AfterTryLock( bool acquired ) + { +#ifdef TRACY_ON_DEMAND + if( !acquired ) return; + + bool queue = false; + const auto locks = m_lockCount.fetch_add( 1, std::memory_order_relaxed ); + const auto active = m_active.load( std::memory_order_relaxed ); + if( locks == 0 || active ) + { + const bool connected = GetProfiler().IsConnected(); + if( active != connected ) m_active.store( connected, std::memory_order_relaxed ); + if( connected ) queue = true; + } + if( !queue ) return; +#endif + + if( acquired ) + { + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockObtain ); + MemWrite( &item->lockObtain.thread, GetThreadHandle() ); + MemWrite( &item->lockObtain.id, m_id ); + MemWrite( &item->lockObtain.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + } + } + + tracy_force_inline void Mark( const SourceLocationData* srcloc ) + { +#ifdef TRACY_ON_DEMAND + const auto active = m_active.load( std::memory_order_relaxed ); + if( !active ) return; + const auto connected = GetProfiler().IsConnected(); + if( !connected ) + { + if( active ) m_active.store( false, std::memory_order_relaxed ); + return; + } +#endif + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockMark ); + MemWrite( &item->lockMark.thread, GetThreadHandle() ); + MemWrite( &item->lockMark.id, m_id ); + MemWrite( &item->lockMark.srcloc, (uint64_t)srcloc ); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline void CustomName( const char* name, size_t size ) + { + assert( size < std::numeric_limits::max() ); + auto ptr = (char*)tracy_malloc( size ); + memcpy( ptr, name, size ); + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockName ); + MemWrite( &item->lockNameFat.id, m_id ); + MemWrite( &item->lockNameFat.name, (uint64_t)ptr ); + MemWrite( &item->lockNameFat.size, (uint16_t)size ); +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + Profiler::QueueSerialFinish(); + } + +private: + uint32_t m_id; + +#ifdef TRACY_ON_DEMAND + std::atomic m_lockCount; + std::atomic m_active; +#endif +}; + +template +class Lockable +{ +public: + tracy_force_inline Lockable( const SourceLocationData* srcloc ) + : m_ctx( srcloc ) + { + } + + Lockable( const Lockable& ) = delete; + Lockable& operator=( const Lockable& ) = delete; + + tracy_force_inline void lock() + { + const auto runAfter = m_ctx.BeforeLock(); + m_lockable.lock(); + if( runAfter ) m_ctx.AfterLock(); + } + + tracy_force_inline void unlock() + { + m_lockable.unlock(); + m_ctx.AfterUnlock(); + } + + tracy_force_inline bool try_lock() + { + const auto acquired = 
m_lockable.try_lock(); + m_ctx.AfterTryLock( acquired ); + return acquired; + } + + tracy_force_inline void Mark( const SourceLocationData* srcloc ) + { + m_ctx.Mark( srcloc ); + } + + tracy_force_inline void CustomName( const char* name, size_t size ) + { + m_ctx.CustomName( name, size ); + } + +private: + T m_lockable; + LockableCtx m_ctx; +}; + + +class SharedLockableCtx +{ +public: + tracy_force_inline SharedLockableCtx( const SourceLocationData* srcloc ) + : m_id( GetLockCounter().fetch_add( 1, std::memory_order_relaxed ) ) +#ifdef TRACY_ON_DEMAND + , m_lockCount( 0 ) + , m_active( false ) +#endif + { + assert( m_id != std::numeric_limits::max() ); + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockAnnounce ); + MemWrite( &item->lockAnnounce.id, m_id ); + MemWrite( &item->lockAnnounce.time, Profiler::GetTime() ); + MemWrite( &item->lockAnnounce.lckloc, (uint64_t)srcloc ); + MemWrite( &item->lockAnnounce.type, LockType::SharedLockable ); +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + Profiler::QueueSerialFinish(); + } + + SharedLockableCtx( const SharedLockableCtx& ) = delete; + SharedLockableCtx& operator=( const SharedLockableCtx& ) = delete; + + tracy_force_inline ~SharedLockableCtx() + { + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockTerminate ); + MemWrite( &item->lockTerminate.id, m_id ); + MemWrite( &item->lockTerminate.time, Profiler::GetTime() ); +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + Profiler::QueueSerialFinish(); + } + + tracy_force_inline bool BeforeLock() + { +#ifdef TRACY_ON_DEMAND + bool queue = false; + const auto locks = m_lockCount.fetch_add( 1, std::memory_order_relaxed ); + const auto active = m_active.load( std::memory_order_relaxed ); + if( locks == 0 || active ) + { + const bool connected = GetProfiler().IsConnected(); + if( active != connected ) m_active.store( connected, std::memory_order_relaxed ); + if( connected ) queue = true; + } + if( !queue ) return false; +#endif + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockWait ); + MemWrite( &item->lockWait.thread, GetThreadHandle() ); + MemWrite( &item->lockWait.id, m_id ); + MemWrite( &item->lockWait.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + return true; + } + + tracy_force_inline void AfterLock() + { + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockObtain ); + MemWrite( &item->lockObtain.thread, GetThreadHandle() ); + MemWrite( &item->lockObtain.id, m_id ); + MemWrite( &item->lockObtain.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline void AfterUnlock() + { +#ifdef TRACY_ON_DEMAND + m_lockCount.fetch_sub( 1, std::memory_order_relaxed ); + if( !m_active.load( std::memory_order_relaxed ) ) return; + if( !GetProfiler().IsConnected() ) + { + m_active.store( false, std::memory_order_relaxed ); + return; + } +#endif + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockRelease ); + MemWrite( &item->lockRelease.thread, GetThreadHandle() ); + MemWrite( &item->lockRelease.id, m_id ); + MemWrite( &item->lockRelease.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline void AfterTryLock( bool acquired ) + { +#ifdef TRACY_ON_DEMAND + if( !acquired ) return; + + bool queue = false; + const auto locks = m_lockCount.fetch_add( 1, std::memory_order_relaxed ); + const auto active = m_active.load( 
std::memory_order_relaxed ); + if( locks == 0 || active ) + { + const bool connected = GetProfiler().IsConnected(); + if( active != connected ) m_active.store( connected, std::memory_order_relaxed ); + if( connected ) queue = true; + } + if( !queue ) return; +#endif + + if( acquired ) + { + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockObtain ); + MemWrite( &item->lockObtain.thread, GetThreadHandle() ); + MemWrite( &item->lockObtain.id, m_id ); + MemWrite( &item->lockObtain.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + } + } + + tracy_force_inline bool BeforeLockShared() + { +#ifdef TRACY_ON_DEMAND + bool queue = false; + const auto locks = m_lockCount.fetch_add( 1, std::memory_order_relaxed ); + const auto active = m_active.load( std::memory_order_relaxed ); + if( locks == 0 || active ) + { + const bool connected = GetProfiler().IsConnected(); + if( active != connected ) m_active.store( connected, std::memory_order_relaxed ); + if( connected ) queue = true; + } + if( !queue ) return false; +#endif + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockSharedWait ); + MemWrite( &item->lockWait.thread, GetThreadHandle() ); + MemWrite( &item->lockWait.id, m_id ); + MemWrite( &item->lockWait.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + return true; + } + + tracy_force_inline void AfterLockShared() + { + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockSharedObtain ); + MemWrite( &item->lockObtain.thread, GetThreadHandle() ); + MemWrite( &item->lockObtain.id, m_id ); + MemWrite( &item->lockObtain.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline void AfterUnlockShared() + { +#ifdef TRACY_ON_DEMAND + m_lockCount.fetch_sub( 1, std::memory_order_relaxed ); + if( !m_active.load( std::memory_order_relaxed ) ) return; + if( !GetProfiler().IsConnected() ) + { + m_active.store( false, std::memory_order_relaxed ); + return; + } +#endif + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockSharedRelease ); + MemWrite( &item->lockRelease.thread, GetThreadHandle() ); + MemWrite( &item->lockRelease.id, m_id ); + MemWrite( &item->lockRelease.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline void AfterTryLockShared( bool acquired ) + { +#ifdef TRACY_ON_DEMAND + if( !acquired ) return; + + bool queue = false; + const auto locks = m_lockCount.fetch_add( 1, std::memory_order_relaxed ); + const auto active = m_active.load( std::memory_order_relaxed ); + if( locks == 0 || active ) + { + const bool connected = GetProfiler().IsConnected(); + if( active != connected ) m_active.store( connected, std::memory_order_relaxed ); + if( connected ) queue = true; + } + if( !queue ) return; +#endif + + if( acquired ) + { + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockSharedObtain ); + MemWrite( &item->lockObtain.thread, GetThreadHandle() ); + MemWrite( &item->lockObtain.id, m_id ); + MemWrite( &item->lockObtain.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + } + } + + tracy_force_inline void Mark( const SourceLocationData* srcloc ) + { +#ifdef TRACY_ON_DEMAND + const auto active = m_active.load( std::memory_order_relaxed ); + if( !active ) return; + const auto connected = GetProfiler().IsConnected(); + if( !connected ) + { + if( active ) m_active.store( false, std::memory_order_relaxed ); + return; + } +#endif + + auto item = 
Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockMark ); + MemWrite( &item->lockMark.thread, GetThreadHandle() ); + MemWrite( &item->lockMark.id, m_id ); + MemWrite( &item->lockMark.srcloc, (uint64_t)srcloc ); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline void CustomName( const char* name, size_t size ) + { + assert( size < std::numeric_limits::max() ); + auto ptr = (char*)tracy_malloc( size ); + memcpy( ptr, name, size ); + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockName ); + MemWrite( &item->lockNameFat.id, m_id ); + MemWrite( &item->lockNameFat.name, (uint64_t)ptr ); + MemWrite( &item->lockNameFat.size, (uint16_t)size ); +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + Profiler::QueueSerialFinish(); + } + +private: + uint32_t m_id; + +#ifdef TRACY_ON_DEMAND + std::atomic m_lockCount; + std::atomic m_active; +#endif +}; + +template +class SharedLockable +{ +public: + tracy_force_inline SharedLockable( const SourceLocationData* srcloc ) + : m_ctx( srcloc ) + { + } + + SharedLockable( const SharedLockable& ) = delete; + SharedLockable& operator=( const SharedLockable& ) = delete; + + tracy_force_inline void lock() + { + const auto runAfter = m_ctx.BeforeLock(); + m_lockable.lock(); + if( runAfter ) m_ctx.AfterLock(); + } + + tracy_force_inline void unlock() + { + m_lockable.unlock(); + m_ctx.AfterUnlock(); + } + + tracy_force_inline bool try_lock() + { + const auto acquired = m_lockable.try_lock(); + m_ctx.AfterTryLock( acquired ); + return acquired; + } + + tracy_force_inline void lock_shared() + { + const auto runAfter = m_ctx.BeforeLockShared(); + m_lockable.lock_shared(); + if( runAfter ) m_ctx.AfterLockShared(); + } + + tracy_force_inline void unlock_shared() + { + m_lockable.unlock_shared(); + m_ctx.AfterUnlockShared(); + } + + tracy_force_inline bool try_lock_shared() + { + const auto acquired = m_lockable.try_lock_shared(); + m_ctx.AfterTryLockShared( acquired ); + return acquired; + } + + tracy_force_inline void Mark( const SourceLocationData* srcloc ) + { + m_ctx.Mark( srcloc ); + } + + tracy_force_inline void CustomName( const char* name, size_t size ) + { + m_ctx.CustomName( name, size ); + } + +private: + T m_lockable; + SharedLockableCtx m_ctx; +}; + + +} + +#endif diff --git a/Source/ThirdParty/tracy/client/TracyProfiler.cpp b/Source/ThirdParty/tracy/client/TracyProfiler.cpp new file mode 100644 index 000000000..b8783a0eb --- /dev/null +++ b/Source/ThirdParty/tracy/client/TracyProfiler.cpp @@ -0,0 +1,3573 @@ +#ifdef TRACY_ENABLE + +#ifdef _WIN32 +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include +# include +# include +# include +# include +#else +# include +# include +#endif + +#ifdef __CYGWIN__ +# include +# include +# include +#endif + +#ifdef _GNU_SOURCE +# include +#endif + +#ifdef __linux__ +# include +# include +# include +# include +# include +#endif + +#if defined __APPLE__ || defined BSD +# include +# include +#endif + +#if defined __APPLE__ +# include "TargetConditionals.h" +# include +#endif + +#ifdef __ANDROID__ +# include +# include +# include +# include +# include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../common/TracyAlign.hpp" +#include "../common/TracySocket.hpp" +#include "../common/TracySystem.hpp" +#include "tracy_rpmalloc.hpp" +#include "TracyCallstack.hpp" +#include "TracyScoped.hpp" +#include "TracyProfiler.hpp" +#include "TracyThread.hpp" +#include "TracyArmCpuTable.hpp" 
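// (Editor's note: usage sketch for the Lockable/SharedLockable wrappers from
// TracyLock.hpp above; the names are hypothetical, and the field order of
// SourceLocationData { name, function, file, line, color } is assumed. In
// client code the TracyLockable macro normally generates this boilerplate.)
//
//     static constexpr tracy::SourceLocationData assetsLockSrc
//         { "AssetsLock", "AssetsLock", "Content.cpp", 10, 0 };
//     static tracy::Lockable<std::mutex> assetsLock( &assetsLockSrc );
//
//     void Touch()
//     {
//         assetsLock.lock();    // emits LockWait, then LockObtain when held
//         // ... mutate shared state ...
//         assetsLock.unlock();  // emits LockRelease
//     }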
+#include "TracySysTrace.hpp" + +#ifdef TRACY_PORT +# ifndef TRACY_DATA_PORT +# define TRACY_DATA_PORT TRACY_PORT +# endif +# ifndef TRACY_BROADCAST_PORT +# define TRACY_BROADCAST_PORT TRACY_PORT +# endif +#endif + +#ifdef __APPLE__ +# define TRACY_DELAYED_INIT +#else +# ifdef __GNUC__ +# define init_order( val ) __attribute__ ((init_priority(val))) +# else +# define init_order(x) +# endif +#endif + +#if defined _WIN32 || defined __CYGWIN__ +# include +extern "C" typedef LONG (WINAPI *t_RtlGetVersion)( PRTL_OSVERSIONINFOW ); +extern "C" typedef BOOL (WINAPI *t_GetLogicalProcessorInformationEx)( LOGICAL_PROCESSOR_RELATIONSHIP, PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, PDWORD ); +#else +# include +# include +#endif +#if defined __linux__ +# include +# include +#endif + +#if !defined _WIN32 && !defined __CYGWIN__ && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) +# include +#endif + +#if !( ( ( defined _WIN32 || defined __CYGWIN__ ) && _WIN32_WINNT >= _WIN32_WINNT_VISTA ) || defined __linux__ ) +# include +#endif + +namespace tracy +{ + +namespace +{ +# if ( defined _WIN32 || defined __CYGWIN__ ) && _WIN32_WINNT >= _WIN32_WINNT_VISTA + BOOL CALLBACK InitOnceCallback( PINIT_ONCE /*initOnce*/, PVOID /*Parameter*/, PVOID* /*Context*/) + { + rpmalloc_initialize(); + return TRUE; + } + INIT_ONCE InitOnce = INIT_ONCE_STATIC_INIT; +# elif defined __linux__ + void InitOnceCallback() + { + rpmalloc_initialize(); + } + pthread_once_t once_control = PTHREAD_ONCE_INIT; +# else + void InitOnceCallback() + { + rpmalloc_initialize(); + } + std::once_flag once_flag; +# endif +} + +struct RPMallocInit +{ + RPMallocInit() + { +# if ( defined _WIN32 || defined __CYGWIN__ ) && _WIN32_WINNT >= _WIN32_WINNT_VISTA + InitOnceExecuteOnce( &InitOnce, InitOnceCallback, nullptr, nullptr ); +# elif defined __linux__ + pthread_once( &once_control, InitOnceCallback ); +# else + std::call_once( once_flag, InitOnceCallback ); +# endif + rpmalloc_thread_initialize(); + } +}; + +#ifndef TRACY_DELAYED_INIT + +struct InitTimeWrapper +{ + int64_t val; +}; + +struct ProducerWrapper +{ + tracy::moodycamel::ConcurrentQueue::ExplicitProducer* ptr; +}; + +struct ThreadHandleWrapper +{ + uint64_t val; +}; +#endif + + +#if defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 +static inline void CpuId( uint32_t* regs, uint32_t leaf ) +{ + memset(regs, 0, sizeof(uint32_t) * 4); +#if defined _WIN32 || defined __CYGWIN__ + __cpuidex( (int*)regs, leaf, 0 ); +#else + __get_cpuid( leaf, regs, regs+1, regs+2, regs+3 ); +#endif +} + +static void InitFailure( const char* msg ) +{ +#if defined _WIN32 || defined __CYGWIN__ + bool hasConsole = false; + bool reopen = false; + const auto attached = AttachConsole( ATTACH_PARENT_PROCESS ); + if( attached ) + { + hasConsole = true; + reopen = true; + } + else + { + const auto err = GetLastError(); + if( err == ERROR_ACCESS_DENIED ) + { + hasConsole = true; + } + } + if( hasConsole ) + { + fprintf( stderr, "Tracy Profiler initialization failure: %s\n", msg ); + if( reopen ) + { + freopen( "CONOUT$", "w", stderr ); + fprintf( stderr, "Tracy Profiler initialization failure: %s\n", msg ); + } + } + else + { + MessageBoxA( nullptr, msg, "Tracy Profiler initialization failure", MB_ICONSTOP ); + } +#else + fprintf( stderr, "Tracy Profiler initialization failure: %s\n", msg ); +#endif + exit( 0 ); +} + +static int64_t SetupHwTimer() +{ +#if !defined TRACY_TIMER_QPC && !defined TRACY_TIMER_FALLBACK + uint32_t regs[4]; + CpuId( regs, 1 ); + if( !( regs[3] & ( 1 
<< 4 ) ) ) InitFailure( "CPU doesn't support RDTSC instruction." ); + CpuId( regs, 0x80000007 ); + if( !( regs[3] & ( 1 << 8 ) ) ) + { + const char* noCheck = getenv( "TRACY_NO_INVARIANT_CHECK" ); + if( !noCheck || noCheck[0] != '1' ) + { +#if defined _WIN32 || defined __CYGWIN__ + InitFailure( "CPU doesn't support invariant TSC.\nDefine TRACY_NO_INVARIANT_CHECK=1 to ignore this error, *if you know what you are doing*.\nAlternatively you may rebuild the application with the TRACY_TIMER_QPC or TRACY_TIMER_FALLBACK define to use lower resolution timer." ); +#else + InitFailure( "CPU doesn't support invariant TSC.\nDefine TRACY_NO_INVARIANT_CHECK=1 to ignore this error, *if you know what you are doing*.\nAlternatively you may rebuild the application with the TRACY_TIMER_FALLBACK define to use lower resolution timer." ); +#endif + } + } +#endif + + return Profiler::GetTime(); +} +#else +static int64_t SetupHwTimer() +{ + return Profiler::GetTime(); +} +#endif + +static const char* GetProcessName() +{ + const char* processName = "unknown"; +#ifdef _WIN32 + static char buf[_MAX_PATH]; + GetModuleFileNameA( nullptr, buf, _MAX_PATH ); + const char* ptr = buf; + while( *ptr != '\0' ) ptr++; + while( ptr > buf && *ptr != '\\' && *ptr != '/' ) ptr--; + if( ptr > buf ) ptr++; + processName = ptr; +#elif defined __ANDROID__ +# if __ANDROID_API__ >= 21 + auto buf = getprogname(); + if( buf ) processName = buf; +# endif +#elif defined _GNU_SOURCE || defined __CYGWIN__ + if( program_invocation_short_name ) processName = program_invocation_short_name; +#elif defined __APPLE__ || defined BSD + auto buf = getprogname(); + if( buf ) processName = buf; +#endif + return processName; +} + +static const char* GetProcessExecutablePath() +{ +#ifdef _WIN32 + static char buf[_MAX_PATH]; + GetModuleFileNameA( nullptr, buf, _MAX_PATH ); + return buf; +#elif defined __ANDROID__ + return nullptr; +#elif defined _GNU_SOURCE || defined __CYGWIN__ + return program_invocation_name; +#elif defined __APPLE__ + static char buf[1024]; + uint32_t size = 1024; + _NSGetExecutablePath( buf, &size ); + return buf; +#elif defined __DragonFly__ + static char buf[1024]; + readlink( "/proc/curproc/file", buf, 1024 ); + return buf; +#elif defined __FreeBSD__ + static char buf[1024]; + int mib[4]; + mib[0] = CTL_KERN; + mib[1] = KERN_PROC; + mib[2] = KERN_PROC_PATHNAME; + mib[3] = -1; + size_t cb = 1024; + sysctl( mib, 4, buf, &cb, nullptr, 0 ); + return buf; +#elif defined __NetBSD__ + static char buf[1024]; + readlink( "/proc/curproc/exe", buf, 1024 ); + return buf; +#else + return nullptr; +#endif +} + +#if defined __linux__ && defined __ARM_ARCH +static uint32_t GetHex( char*& ptr, int skip ) +{ + uint32_t ret; + ptr += skip; + char* end; + if( ptr[0] == '0' && ptr[1] == 'x' ) + { + ptr += 2; + ret = strtol( ptr, &end, 16 ); + } + else + { + ret = strtol( ptr, &end, 10 ); + } + ptr = end; + return ret; +} +#endif + +static const char* GetHostInfo() +{ + static char buf[1024]; + auto ptr = buf; +#if defined _WIN32 || defined __CYGWIN__ + t_RtlGetVersion RtlGetVersion = (t_RtlGetVersion)GetProcAddress( GetModuleHandleA( "ntdll.dll" ), "RtlGetVersion" ); + if( !RtlGetVersion ) + { +# ifdef __CYGWIN__ + ptr += sprintf( ptr, "OS: Windows (Cygwin)\n" ); +# elif defined __MINGW32__ + ptr += sprintf( ptr, "OS: Windows (MingW)\n" ); +# else + ptr += sprintf( ptr, "OS: Windows\n" ); +# endif + } + else + { + RTL_OSVERSIONINFOW ver = { sizeof( RTL_OSVERSIONINFOW ) }; + RtlGetVersion( &ver ); + +# ifdef __CYGWIN__ + ptr += sprintf( ptr, "OS: 
Windows %i.%i.%i (Cygwin)\n", ver.dwMajorVersion, ver.dwMinorVersion, ver.dwBuildNumber ); +# elif defined __MINGW32__ + ptr += sprintf( ptr, "OS: Windows %i.%i.%i (MingW)\n", (int)ver.dwMajorVersion, (int)ver.dwMinorVersion, (int)ver.dwBuildNumber ); +# else + ptr += sprintf( ptr, "OS: Windows %i.%i.%i\n", ver.dwMajorVersion, ver.dwMinorVersion, ver.dwBuildNumber ); +# endif + } +#elif defined __linux__ + struct utsname utsName; + uname( &utsName ); +# if defined __ANDROID__ + ptr += sprintf( ptr, "OS: Linux %s (Android)\n", utsName.release ); +# else + ptr += sprintf( ptr, "OS: Linux %s\n", utsName.release ); +# endif +#elif defined __APPLE__ +# if TARGET_OS_IPHONE == 1 + ptr += sprintf( ptr, "OS: Darwin (iOS)\n" ); +# elif TARGET_OS_MAC == 1 + ptr += sprintf( ptr, "OS: Darwin (OSX)\n" ); +# else + ptr += sprintf( ptr, "OS: Darwin (unknown)\n" ); +# endif +#elif defined __DragonFly__ + ptr += sprintf( ptr, "OS: BSD (DragonFly)\n" ); +#elif defined __FreeBSD__ + ptr += sprintf( ptr, "OS: BSD (FreeBSD)\n" ); +#elif defined __NetBSD__ + ptr += sprintf( ptr, "OS: BSD (NetBSD)\n" ); +#elif defined __OpenBSD__ + ptr += sprintf( ptr, "OS: BSD (OpenBSD)\n" ); +#else + ptr += sprintf( ptr, "OS: unknown\n" ); +#endif + +#if defined _MSC_VER +# if defined __clang__ + ptr += sprintf( ptr, "Compiler: MSVC clang-cl %i.%i.%i\n", __clang_major__, __clang_minor__, __clang_patchlevel__ ); +# else + ptr += sprintf( ptr, "Compiler: MSVC %i\n", _MSC_VER ); +# endif +#elif defined __clang__ + ptr += sprintf( ptr, "Compiler: clang %i.%i.%i\n", __clang_major__, __clang_minor__, __clang_patchlevel__ ); +#elif defined __GNUC__ + ptr += sprintf( ptr, "Compiler: gcc %i.%i\n", __GNUC__, __GNUC_MINOR__ ); +#else + ptr += sprintf( ptr, "Compiler: unknown\n" ); +#endif + +#if defined _WIN32 || defined __CYGWIN__ +# ifndef __CYGWIN__ + InitWinSock(); +# endif + char hostname[512]; + gethostname( hostname, 512 ); + + DWORD userSz = UNLEN+1; + char user[UNLEN+1]; + GetUserNameA( user, &userSz ); + + ptr += sprintf( ptr, "User: %s@%s\n", user, hostname ); +#else + char hostname[_POSIX_HOST_NAME_MAX]{}; + char user[_POSIX_LOGIN_NAME_MAX]{}; + + gethostname( hostname, _POSIX_HOST_NAME_MAX ); +# if defined __ANDROID__ + const auto login = getlogin(); + if( login ) + { + strcpy( user, login ); + } + else + { + memcpy( user, "(?)", 4 ); + } +# else + getlogin_r( user, _POSIX_LOGIN_NAME_MAX ); +# endif + + ptr += sprintf( ptr, "User: %s@%s\n", user, hostname ); +#endif + +#if defined __i386 || defined _M_IX86 + ptr += sprintf( ptr, "Arch: x86\n" ); +#elif defined __x86_64__ || defined _M_X64 + ptr += sprintf( ptr, "Arch: x64\n" ); +#elif defined __aarch64__ + ptr += sprintf( ptr, "Arch: ARM64\n" ); +#elif defined __ARM_ARCH + ptr += sprintf( ptr, "Arch: ARM\n" ); +#else + ptr += sprintf( ptr, "Arch: unknown\n" ); +#endif + +#if defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 + uint32_t regs[4]; + char cpuModel[4*4*3]; + auto modelPtr = cpuModel; + for( uint32_t i=0x80000002; i<0x80000005; ++i ) + { + CpuId( regs, i ); + memcpy( modelPtr, regs, sizeof( regs ) ); modelPtr += sizeof( regs ); + } + + ptr += sprintf( ptr, "CPU: %s\n", cpuModel ); +#elif defined __linux__ && defined __ARM_ARCH + bool cpuFound = false; + FILE* fcpuinfo = fopen( "/proc/cpuinfo", "rb" ); + if( fcpuinfo ) + { + enum { BufSize = 4*1024 }; + char buf[BufSize]; + const auto sz = fread( buf, 1, BufSize, fcpuinfo ); + fclose( fcpuinfo ); + const auto end = buf + sz; + auto cptr = buf; + + uint32_t impl = 0; + uint32_t var = 0; + 
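// (Editor's note, not upstream:) the loop below scans /proc/cpuinfo for the
// "CPU implementer/variant/part/revision" fields of the first core and maps
// them to a readable name via DecodeArmImplementer/DecodeArmPart.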
uint32_t part = 0; + uint32_t rev = 0; + + while( end - cptr > 20 ) + { + while( end - cptr > 20 && memcmp( cptr, "CPU ", 4 ) != 0 ) + { + cptr += 4; + while( end - cptr > 20 && *cptr != '\n' ) cptr++; + cptr++; + } + if( end - cptr <= 20 ) break; + cptr += 4; + if( memcmp( cptr, "implementer\t: ", 14 ) == 0 ) + { + if( impl != 0 ) break; + impl = GetHex( cptr, 14 ); + } + else if( memcmp( cptr, "variant\t: ", 10 ) == 0 ) var = GetHex( cptr, 10 ); + else if( memcmp( cptr, "part\t: ", 7 ) == 0 ) part = GetHex( cptr, 7 ); + else if( memcmp( cptr, "revision\t: ", 11 ) == 0 ) rev = GetHex( cptr, 11 ); + while( *cptr != '\n' && *cptr != '\0' ) cptr++; + cptr++; + } + + if( impl != 0 || var != 0 || part != 0 || rev != 0 ) + { + cpuFound = true; + ptr += sprintf( ptr, "CPU: %s%s r%ip%i\n", DecodeArmImplementer( impl ), DecodeArmPart( impl, part ), var, rev ); + } + } + if( !cpuFound ) + { + ptr += sprintf( ptr, "CPU: unknown\n" ); + } +#elif defined __APPLE__ && TARGET_OS_IPHONE == 1 + { + size_t sz; + sysctlbyname( "hw.machine", nullptr, &sz, nullptr, 0 ); + auto str = (char*)tracy_malloc( sz ); + sysctlbyname( "hw.machine", str, &sz, nullptr, 0 ); + ptr += sprintf( ptr, "Device: %s\n", DecodeIosDevice( str ) ); + tracy_free( str ); + } +#else + ptr += sprintf( ptr, "CPU: unknown\n" ); +#endif + + ptr += sprintf( ptr, "CPU cores: %i\n", std::thread::hardware_concurrency() ); + +#if defined _WIN32 || defined __CYGWIN__ + MEMORYSTATUSEX statex; + statex.dwLength = sizeof( statex ); + GlobalMemoryStatusEx( &statex ); +# ifdef _MSC_VER + ptr += sprintf( ptr, "RAM: %I64u MB\n", statex.ullTotalPhys / 1024 / 1024 ); +# else + ptr += sprintf( ptr, "RAM: %llu MB\n", statex.ullTotalPhys / 1024 / 1024 ); +# endif +#elif defined __linux__ + struct sysinfo sysInfo; + sysinfo( &sysInfo ); + ptr += sprintf( ptr, "RAM: %lu MB\n", sysInfo.totalram / 1024 / 1024 ); +#elif defined __APPLE__ + size_t memSize; + size_t sz = sizeof( memSize ); + sysctlbyname( "hw.memsize", &memSize, &sz, nullptr, 0 ); + ptr += sprintf( ptr, "RAM: %zu MB\n", memSize / 1024 / 1024 ); +#elif defined BSD + size_t memSize; + size_t sz = sizeof( memSize ); + sysctlbyname( "hw.physmem", &memSize, &sz, nullptr, 0 ); + ptr += sprintf( ptr, "RAM: %zu MB\n", memSize / 1024 / 1024 ); +#else + ptr += sprintf( ptr, "RAM: unknown\n" ); +#endif + + return buf; +} + +static uint64_t GetPid() +{ +#if defined _WIN32 || defined __CYGWIN__ + return uint64_t( GetCurrentProcessId() ); +#else + return uint64_t( getpid() ); +#endif +} + +void Profiler::AckServerQuery() +{ + QueueItem item; + MemWrite( &item.hdr.type, QueueType::AckServerQueryNoop ); + NeedDataSize( QueueDataSize[(int)QueueType::AckServerQueryNoop] ); + AppendDataUnsafe( &item, QueueDataSize[(int)QueueType::AckServerQueryNoop] ); +} + +void Profiler::AckSourceCodeNotAvailable() +{ + QueueItem item; + MemWrite( &item.hdr.type, QueueType::AckSourceCodeNotAvailable ); + NeedDataSize( QueueDataSize[(int)QueueType::AckSourceCodeNotAvailable] ); + AppendDataUnsafe( &item, QueueDataSize[(int)QueueType::AckSourceCodeNotAvailable] ); +} + +static BroadcastMessage& GetBroadcastMessage( const char* procname, size_t pnsz, int& len, int port ) +{ + static BroadcastMessage msg; + + msg.broadcastVersion = BroadcastVersion; + msg.protocolVersion = ProtocolVersion; + msg.listenPort = port; + + memcpy( msg.programName, procname, pnsz ); + memset( msg.programName + pnsz, 0, WelcomeMessageProgramNameSize - pnsz ); + + len = int( offsetof( BroadcastMessage, programName ) + pnsz + 1 ); + return msg; +} + +#if 
defined _WIN32 || defined __CYGWIN__ +static DWORD s_profilerThreadId = 0; +static char s_crashText[1024]; + +LONG WINAPI CrashFilter( PEXCEPTION_POINTERS pExp ) +{ + if( !GetProfiler().IsConnected() ) return EXCEPTION_CONTINUE_SEARCH; + + const unsigned ec = pExp->ExceptionRecord->ExceptionCode; + auto msgPtr = s_crashText; + switch( ec ) + { + case EXCEPTION_ACCESS_VIOLATION: + msgPtr += sprintf( msgPtr, "Exception EXCEPTION_ACCESS_VIOLATION (0x%x). ", ec ); + switch( pExp->ExceptionRecord->ExceptionInformation[0] ) + { + case 0: + msgPtr += sprintf( msgPtr, "Read violation at address 0x%" PRIxPTR ".", pExp->ExceptionRecord->ExceptionInformation[1] ); + break; + case 1: + msgPtr += sprintf( msgPtr, "Write violation at address 0x%" PRIxPTR ".", pExp->ExceptionRecord->ExceptionInformation[1] ); + break; + case 8: + msgPtr += sprintf( msgPtr, "DEP violation at address 0x%" PRIxPTR ".", pExp->ExceptionRecord->ExceptionInformation[1] ); + break; + default: + break; + } + break; + case EXCEPTION_ARRAY_BOUNDS_EXCEEDED: + msgPtr += sprintf( msgPtr, "Exception EXCEPTION_ARRAY_BOUNDS_EXCEEDED (0x%x). ", ec ); + break; + case EXCEPTION_DATATYPE_MISALIGNMENT: + msgPtr += sprintf( msgPtr, "Exception EXCEPTION_DATATYPE_MISALIGNMENT (0x%x). ", ec ); + break; + case EXCEPTION_FLT_DIVIDE_BY_ZERO: + msgPtr += sprintf( msgPtr, "Exception EXCEPTION_FLT_DIVIDE_BY_ZERO (0x%x). ", ec ); + break; + case EXCEPTION_ILLEGAL_INSTRUCTION: + msgPtr += sprintf( msgPtr, "Exception EXCEPTION_ILLEGAL_INSTRUCTION (0x%x). ", ec ); + break; + case EXCEPTION_IN_PAGE_ERROR: + msgPtr += sprintf( msgPtr, "Exception EXCEPTION_IN_PAGE_ERROR (0x%x). ", ec ); + break; + case EXCEPTION_INT_DIVIDE_BY_ZERO: + msgPtr += sprintf( msgPtr, "Exception EXCEPTION_INT_DIVIDE_BY_ZERO (0x%x). ", ec ); + break; + case EXCEPTION_PRIV_INSTRUCTION: + msgPtr += sprintf( msgPtr, "Exception EXCEPTION_PRIV_INSTRUCTION (0x%x). ", ec ); + break; + case EXCEPTION_STACK_OVERFLOW: + msgPtr += sprintf( msgPtr, "Exception EXCEPTION_STACK_OVERFLOW (0x%x). 
", ec ); + break; + default: + return EXCEPTION_CONTINUE_SEARCH; + } + + { + GetProfiler().SendCallstack( 60, "KiUserExceptionDispatcher" ); + + TracyLfqPrepare( QueueType::CrashReport ); + item->crashReport.time = Profiler::GetTime(); + item->crashReport.text = (uint64_t)s_crashText; + TracyLfqCommit; + } + + HANDLE h = CreateToolhelp32Snapshot( TH32CS_SNAPTHREAD, 0 ); + if( h == INVALID_HANDLE_VALUE ) return EXCEPTION_CONTINUE_SEARCH; + + THREADENTRY32 te = { sizeof( te ) }; + if( !Thread32First( h, &te ) ) + { + CloseHandle( h ); + return EXCEPTION_CONTINUE_SEARCH; + } + + const auto pid = GetCurrentProcessId(); + const auto tid = GetCurrentThreadId(); + + do + { + if( te.th32OwnerProcessID == pid && te.th32ThreadID != tid && te.th32ThreadID != s_profilerThreadId ) + { + HANDLE th = OpenThread( THREAD_SUSPEND_RESUME, FALSE, te.th32ThreadID ); + if( th != INVALID_HANDLE_VALUE ) + { + SuspendThread( th ); + CloseHandle( th ); + } + } + } + while( Thread32Next( h, &te ) ); + CloseHandle( h ); + + { + TracyLfqPrepare( QueueType::Crash ); + TracyLfqCommit; + } + + std::this_thread::sleep_for( std::chrono::milliseconds( 500 ) ); + GetProfiler().RequestShutdown(); + while( !GetProfiler().HasShutdownFinished() ) { std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); }; + + TerminateProcess( GetCurrentProcess(), 1 ); + + return EXCEPTION_CONTINUE_SEARCH; +} +#endif + +#ifdef __linux__ +static long s_profilerTid = 0; +static char s_crashText[1024]; +static std::atomic s_alreadyCrashed( false ); + +static void ThreadFreezer( int /*signal*/ ) +{ + for(;;) sleep( 1000 ); +} + +static inline void HexPrint( char*& ptr, uint64_t val ) +{ + if( val == 0 ) + { + *ptr++ = '0'; + return; + } + + static const char HexTable[16] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' }; + char buf[16]; + auto bptr = buf; + + do + { + *bptr++ = HexTable[val%16]; + val /= 16; + } + while( val > 0 ); + + do + { + *ptr++ = *--bptr; + } + while( bptr != buf ); +} + +static void CrashHandler( int signal, siginfo_t* info, void* /*ucontext*/ ) +{ + bool expected = false; + if( !s_alreadyCrashed.compare_exchange_strong( expected, true ) ) ThreadFreezer( signal ); + + struct sigaction act = {}; + act.sa_handler = SIG_DFL; + sigaction( SIGABRT, &act, nullptr ); + + auto msgPtr = s_crashText; + switch( signal ) + { + case SIGILL: + strcpy( msgPtr, "Illegal Instruction.\n" ); + while( *msgPtr ) msgPtr++; + switch( info->si_code ) + { + case ILL_ILLOPC: + strcpy( msgPtr, "Illegal opcode.\n" ); + break; + case ILL_ILLOPN: + strcpy( msgPtr, "Illegal operand.\n" ); + break; + case ILL_ILLADR: + strcpy( msgPtr, "Illegal addressing mode.\n" ); + break; + case ILL_ILLTRP: + strcpy( msgPtr, "Illegal trap.\n" ); + break; + case ILL_PRVOPC: + strcpy( msgPtr, "Privileged opcode.\n" ); + break; + case ILL_PRVREG: + strcpy( msgPtr, "Privileged register.\n" ); + break; + case ILL_COPROC: + strcpy( msgPtr, "Coprocessor error.\n" ); + break; + case ILL_BADSTK: + strcpy( msgPtr, "Internal stack error.\n" ); + break; + default: + break; + } + break; + case SIGFPE: + strcpy( msgPtr, "Floating-point exception.\n" ); + while( *msgPtr ) msgPtr++; + switch( info->si_code ) + { + case FPE_INTDIV: + strcpy( msgPtr, "Integer divide by zero.\n" ); + break; + case FPE_INTOVF: + strcpy( msgPtr, "Integer overflow.\n" ); + break; + case FPE_FLTDIV: + strcpy( msgPtr, "Floating-point divide by zero.\n" ); + break; + case FPE_FLTOVF: + strcpy( msgPtr, "Floating-point overflow.\n" ); + break; + case FPE_FLTUND: + 
strcpy( msgPtr, "Floating-point underflow.\n" ); + break; + case FPE_FLTRES: + strcpy( msgPtr, "Floating-point inexact result.\n" ); + break; + case FPE_FLTINV: + strcpy( msgPtr, "Floating-point invalid operation.\n" ); + break; + case FPE_FLTSUB: + strcpy( msgPtr, "Subscript out of range.\n" ); + break; + default: + break; + } + break; + case SIGSEGV: + strcpy( msgPtr, "Invalid memory reference.\n" ); + while( *msgPtr ) msgPtr++; + switch( info->si_code ) + { + case SEGV_MAPERR: + strcpy( msgPtr, "Address not mapped to object.\n" ); + break; + case SEGV_ACCERR: + strcpy( msgPtr, "Invalid permissions for mapped object.\n" ); + break; +# ifdef SEGV_BNDERR + case SEGV_BNDERR: + strcpy( msgPtr, "Failed address bound checks.\n" ); + break; +# endif +# ifdef SEGV_PKUERR + case SEGV_PKUERR: + strcpy( msgPtr, "Access was denied by memory protection keys.\n" ); + break; +# endif + default: + break; + } + break; + case SIGPIPE: + strcpy( msgPtr, "Broken pipe.\n" ); + while( *msgPtr ) msgPtr++; + break; + case SIGBUS: + strcpy( msgPtr, "Bus error.\n" ); + while( *msgPtr ) msgPtr++; + switch( info->si_code ) + { + case BUS_ADRALN: + strcpy( msgPtr, "Invalid address alignment.\n" ); + break; + case BUS_ADRERR: + strcpy( msgPtr, "Nonexistent physical address.\n" ); + break; + case BUS_OBJERR: + strcpy( msgPtr, "Object-specific hardware error.\n" ); + break; +# ifdef BUS_MCEERR_AR + case BUS_MCEERR_AR: + strcpy( msgPtr, "Hardware memory error consumed on a machine check; action required.\n" ); + break; +# endif +# ifdef BUS_MCEERR_AO + case BUS_MCEERR_AO: + strcpy( msgPtr, "Hardware memory error detected in process but not consumed; action optional.\n" ); + break; +# endif + default: + break; + } + break; + case SIGABRT: + strcpy( msgPtr, "Abort signal from abort().\n" ); + break; + default: + abort(); + } + while( *msgPtr ) msgPtr++; + + if( signal != SIGPIPE ) + { + strcpy( msgPtr, "Fault address: 0x" ); + while( *msgPtr ) msgPtr++; + HexPrint( msgPtr, uint64_t( info->si_addr ) ); + *msgPtr++ = '\n'; + } + + { + GetProfiler().SendCallstack( 60, "__kernel_rt_sigreturn" ); + + TracyLfqPrepare( QueueType::CrashReport ); + item->crashReport.time = Profiler::GetTime(); + item->crashReport.text = (uint64_t)s_crashText; + TracyLfqCommit; + } + + DIR* dp = opendir( "/proc/self/task" ); + if( !dp ) abort(); + + const auto selfTid = syscall( SYS_gettid ); + + struct dirent* ep; + while( ( ep = readdir( dp ) ) != nullptr ) + { + if( ep->d_name[0] == '.' 
) continue; + int tid = atoi( ep->d_name ); + if( tid != selfTid && tid != s_profilerTid ) + { + syscall( SYS_tkill, tid, SIGPWR ); + } + } + closedir( dp ); + + { + TracyLfqPrepare( QueueType::Crash ); + TracyLfqCommit; + } + + std::this_thread::sleep_for( std::chrono::milliseconds( 500 ) ); + GetProfiler().RequestShutdown(); + while( !GetProfiler().HasShutdownFinished() ) { std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); }; + + abort(); +} +#endif + + +enum { QueuePrealloc = 256 * 1024 }; + +static Profiler* s_instance = nullptr; +static Thread* s_thread; + +#ifdef TRACY_HAS_SYSTEM_TRACING +static Thread* s_sysTraceThread = nullptr; +#endif + +TRACY_API bool ProfilerAvailable() { return s_instance != nullptr; } + +TRACY_API int64_t GetFrequencyQpc() +{ +#if defined _WIN32 || defined __CYGWIN__ + LARGE_INTEGER t; + QueryPerformanceFrequency( &t ); + return t.QuadPart; +#else + return 0; +#endif +} + +#ifdef TRACY_DELAYED_INIT +struct ThreadNameData; +TRACY_API moodycamel::ConcurrentQueue& GetQueue(); +TRACY_API void InitRPMallocThread(); + +void InitRPMallocThread() +{ + RPMallocInit rpinit; + rpmalloc_thread_initialize(); +} + +struct ProfilerData +{ + int64_t initTime = SetupHwTimer(); + RPMallocInit rpmalloc_init; + moodycamel::ConcurrentQueue queue; + Profiler profiler; + std::atomic lockCounter { 0 }; + std::atomic gpuCtxCounter { 0 }; + std::atomic threadNameData { nullptr }; +}; + +struct ProducerWrapper +{ + ProducerWrapper( ProfilerData& data ) : detail( data.queue ), ptr( data.queue.get_explicit_producer( detail ) ) {} + moodycamel::ProducerToken detail; + tracy::moodycamel::ConcurrentQueue::ExplicitProducer* ptr; +}; + +struct ProfilerThreadData +{ + ProfilerThreadData( ProfilerData& data ) : token( data ), gpuCtx( { nullptr } ) {} + RPMallocInit rpmalloc_init; + ProducerWrapper token; + GpuCtxWrapper gpuCtx; +# ifdef TRACY_ON_DEMAND + LuaZoneState luaZoneState; +# endif +}; + +# ifdef TRACY_MANUAL_LIFETIME +ProfilerData* s_profilerData = nullptr; +TRACY_API void StartupProfiler() +{ + s_profilerData = new ProfilerData; + s_profilerData->profiler.SpawnWorkerThreads(); +} +static ProfilerData& GetProfilerData() +{ + assert(s_profilerData); + return *s_profilerData; +} +TRACY_API void ShutdownProfiler() +{ + delete s_profilerData; + s_profilerData = nullptr; + rpmalloc_finalize(); +} +# else +static std::atomic profilerDataLock { 0 }; +static std::atomic profilerData { nullptr }; + +static ProfilerData& GetProfilerData() +{ + auto ptr = profilerData.load( std::memory_order_acquire ); + if( !ptr ) + { + int expected = 0; + while( !profilerDataLock.compare_exchange_strong( expected, 1, std::memory_order_release, std::memory_order_relaxed ) ) { expected = 0; } + ptr = profilerData.load( std::memory_order_acquire ); + if( !ptr ) + { + ptr = (ProfilerData*)malloc( sizeof( ProfilerData ) ); + new (ptr) ProfilerData(); + profilerData.store( ptr, std::memory_order_release ); + } + profilerDataLock.store( 0, std::memory_order_release ); + } + return *ptr; +} +# endif + +static ProfilerThreadData& GetProfilerThreadData() +{ + thread_local ProfilerThreadData data( GetProfilerData() ); + return data; +} + +TRACY_API moodycamel::ConcurrentQueue::ExplicitProducer* GetToken() { return GetProfilerThreadData().token.ptr; } +TRACY_API Profiler& GetProfiler() { return GetProfilerData().profiler; } +TRACY_API moodycamel::ConcurrentQueue& GetQueue() { return GetProfilerData().queue; } +TRACY_API int64_t GetInitTime() { return GetProfilerData().initTime; } +TRACY_API std::atomic& 
GetLockCounter() { return GetProfilerData().lockCounter; } +TRACY_API std::atomic& GetGpuCtxCounter() { return GetProfilerData().gpuCtxCounter; } +TRACY_API GpuCtxWrapper& GetGpuCtx() { return GetProfilerThreadData().gpuCtx; } +TRACY_API uint64_t GetThreadHandle() { return detail::GetThreadHandleImpl(); } +std::atomic& GetThreadNameData() { return GetProfilerData().threadNameData; } + +# ifdef TRACY_ON_DEMAND +TRACY_API LuaZoneState& GetLuaZoneState() { return GetProfilerThreadData().luaZoneState; } +# endif + +# ifndef TRACY_MANUAL_LIFETIME +namespace +{ + const auto& __profiler_init = GetProfiler(); +} +# endif + +#else +TRACY_API void InitRPMallocThread() +{ + rpmalloc_thread_initialize(); +} + +// MSVC static initialization order solution. gcc/clang uses init_order() to avoid all this. + +// 1a. But s_queue is needed for initialization of variables in point 2. +extern moodycamel::ConcurrentQueue s_queue; + +thread_local RPMallocInit init_order(106) s_rpmalloc_thread_init; + +// 2. If these variables would be in the .CRT$XCB section, they would be initialized only in main thread. +thread_local moodycamel::ProducerToken init_order(107) s_token_detail( s_queue ); +thread_local ProducerWrapper init_order(108) s_token { s_queue.get_explicit_producer( s_token_detail ) }; +thread_local ThreadHandleWrapper init_order(104) s_threadHandle { detail::GetThreadHandleImpl() }; + +# ifdef _MSC_VER +// 1. Initialize these static variables before all other variables. +# pragma warning( disable : 4075 ) +# pragma init_seg( ".CRT$XCB" ) +# endif + +static InitTimeWrapper init_order(101) s_initTime { SetupHwTimer() }; +static RPMallocInit init_order(102) s_rpmalloc_init; +moodycamel::ConcurrentQueue init_order(103) s_queue( QueuePrealloc ); +std::atomic init_order(104) s_lockCounter( 0 ); +std::atomic init_order(104) s_gpuCtxCounter( 0 ); + +thread_local GpuCtxWrapper init_order(104) s_gpuCtx { nullptr }; + +struct ThreadNameData; +static std::atomic init_order(104) s_threadNameDataInstance( nullptr ); +std::atomic& s_threadNameData = s_threadNameDataInstance; + +# ifdef TRACY_ON_DEMAND +thread_local LuaZoneState init_order(104) s_luaZoneState { 0, false }; +# endif + +static Profiler init_order(105) s_profiler; + +TRACY_API moodycamel::ConcurrentQueue::ExplicitProducer* GetToken() { return s_token.ptr; } +TRACY_API Profiler& GetProfiler() { return s_profiler; } +TRACY_API moodycamel::ConcurrentQueue& GetQueue() { return s_queue; } +TRACY_API int64_t GetInitTime() { return s_initTime.val; } +TRACY_API std::atomic& GetLockCounter() { return s_lockCounter; } +TRACY_API std::atomic& GetGpuCtxCounter() { return s_gpuCtxCounter; } +TRACY_API GpuCtxWrapper& GetGpuCtx() { return s_gpuCtx; } +# ifdef __CYGWIN__ +// Hackfix for cygwin reporting memory frees without matching allocations. WTF? 
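// (Editor's note, not upstream:) the handle is therefore recomputed on every
// call here, instead of returning the cached thread_local s_threadHandle used
// on other platforms below, trading a little speed for memory events that
// pair up correctly.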
+TRACY_API uint64_t GetThreadHandle() { return detail::GetThreadHandleImpl(); } +# else +TRACY_API uint64_t GetThreadHandle() { return s_threadHandle.val; } +# endif + +std::atomic& GetThreadNameData() { return s_threadNameData; } + +# ifdef TRACY_ON_DEMAND +TRACY_API LuaZoneState& GetLuaZoneState() { return s_luaZoneState; } +# endif +#endif + +Profiler::Profiler() + : m_timeBegin( 0 ) + , m_mainThread( detail::GetThreadHandleImpl() ) + , m_epoch( std::chrono::duration_cast( std::chrono::system_clock::now().time_since_epoch() ).count() ) + , m_shutdown( false ) + , m_shutdownManual( false ) + , m_shutdownFinished( false ) + , m_sock( nullptr ) + , m_broadcast( nullptr ) + , m_noExit( false ) + , m_userPort( 0 ) + , m_zoneId( 1 ) + , m_samplingPeriod( 0 ) + , m_stream( LZ4_createStream() ) + , m_buffer( (char*)tracy_malloc( TargetFrameSize*3 ) ) + , m_bufferOffset( 0 ) + , m_bufferStart( 0 ) + , m_lz4Buf( (char*)tracy_malloc( LZ4Size + sizeof( lz4sz_t ) ) ) + , m_serialQueue( 1024*1024 ) + , m_serialDequeue( 1024*1024 ) + , m_frameCount( 0 ) + , m_isConnected( false ) +#ifdef TRACY_ON_DEMAND + , m_connectionId( 0 ) + , m_deferredQueue( 64*1024 ) +#endif + , m_paramCallback( nullptr ) + , m_queryData( nullptr ) +{ + assert( !s_instance ); + s_instance = this; + +#ifndef TRACY_DELAYED_INIT +# ifdef _MSC_VER + // 3. But these variables need to be initialized in main thread within the .CRT$XCB section. Do it here. + s_token_detail = moodycamel::ProducerToken( s_queue ); + s_token = ProducerWrapper { s_queue.get_explicit_producer( s_token_detail ) }; + s_threadHandle = ThreadHandleWrapper { m_mainThread }; +# endif +#endif + + CalibrateTimer(); + CalibrateDelay(); + ReportTopology(); + +#ifndef TRACY_NO_EXIT + const char* noExitEnv = getenv( "TRACY_NO_EXIT" ); + if( noExitEnv && noExitEnv[0] == '1' ) + { + m_noExit = true; + } +#endif + + const char* userPort = getenv( "TRACY_PORT" ); + if( userPort ) + { + m_userPort = atoi( userPort ); + } + +#if !defined(TRACY_DELAYED_INIT) || !defined(TRACY_MANUAL_LIFETIME) + SpawnWorkerThreads(); +#endif +} + +void Profiler::SpawnWorkerThreads() +{ + s_thread = (Thread*)tracy_malloc( sizeof( Thread ) ); + new(s_thread) Thread( LaunchWorker, this ); + +#ifdef TRACY_HAS_SYSTEM_TRACING + if( SysTraceStart( m_samplingPeriod ) ) + { + s_sysTraceThread = (Thread*)tracy_malloc( sizeof( Thread ) ); + new(s_sysTraceThread) Thread( SysTraceWorker, nullptr ); + std::this_thread::sleep_for( std::chrono::milliseconds( 1 ) ); + } +#endif + +#if defined _WIN32 || defined __CYGWIN__ + s_profilerThreadId = GetThreadId( s_thread->Handle() ); + AddVectoredExceptionHandler( 1, CrashFilter ); +#endif + +#ifdef __linux__ + struct sigaction threadFreezer = {}; + threadFreezer.sa_handler = ThreadFreezer; + sigaction( SIGPWR, &threadFreezer, nullptr ); + + struct sigaction crashHandler = {}; + crashHandler.sa_sigaction = CrashHandler; + crashHandler.sa_flags = SA_SIGINFO; + sigaction( SIGILL, &crashHandler, nullptr ); + sigaction( SIGFPE, &crashHandler, nullptr ); + sigaction( SIGSEGV, &crashHandler, nullptr ); + sigaction( SIGPIPE, &crashHandler, nullptr ); + sigaction( SIGBUS, &crashHandler, nullptr ); + sigaction( SIGABRT, &crashHandler, nullptr ); +#endif + +#ifdef TRACY_HAS_CALLSTACK + InitCallstack(); +#endif + + m_timeBegin.store( GetTime(), std::memory_order_relaxed ); +} + +Profiler::~Profiler() +{ + m_shutdown.store( true, std::memory_order_relaxed ); + +#ifdef TRACY_HAS_SYSTEM_TRACING + if( s_sysTraceThread ) + { + SysTraceStop(); + s_sysTraceThread->~Thread(); + 
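// (Editor's note, not upstream:) worker threads are allocated with
// tracy_malloc + placement new in SpawnWorkerThreads, so teardown is an
// explicit destructor call followed by tracy_free rather than delete.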
tracy_free( s_sysTraceThread ); + } +#endif + + s_thread->~Thread(); + tracy_free( s_thread ); + + tracy_free( m_lz4Buf ); + tracy_free( m_buffer ); + LZ4_freeStream( (LZ4_stream_t*)m_stream ); + + if( m_sock ) + { + m_sock->~Socket(); + tracy_free( m_sock ); + } + + if( m_broadcast ) + { + m_broadcast->~UdpBroadcast(); + tracy_free( m_broadcast ); + } + + assert( s_instance ); + s_instance = nullptr; +} + +bool Profiler::ShouldExit() +{ + return s_instance->m_shutdown.load( std::memory_order_relaxed ); +} + +void Profiler::Worker() +{ +#ifdef __linux__ + s_profilerTid = syscall( SYS_gettid ); +#endif + + ThreadExitHandler threadExitHandler; + + SetThreadName( "Tracy Profiler" ); + +#ifdef TRACY_DATA_PORT + const bool dataPortSearch = false; + auto dataPort = m_userPort != 0 ? m_userPort : TRACY_DATA_PORT; +#else + const bool dataPortSearch = m_userPort == 0; + auto dataPort = m_userPort != 0 ? m_userPort : 8086; +#endif +#ifdef TRACY_BROADCAST_PORT + const auto broadcastPort = TRACY_BROADCAST_PORT; +#else + const auto broadcastPort = 8086; +#endif + + while( m_timeBegin.load( std::memory_order_relaxed ) == 0 ) std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + + rpmalloc_thread_initialize(); + + m_exectime = 0; + const auto execname = GetProcessExecutablePath(); + if( execname ) + { + struct stat st; + if( stat( execname, &st ) == 0 ) + { + m_exectime = (uint64_t)st.st_mtime; + } + } + + const auto procname = GetProcessName(); + const auto pnsz = std::min( strlen( procname ), WelcomeMessageProgramNameSize - 1 ); + + const auto hostinfo = GetHostInfo(); + const auto hisz = std::min( strlen( hostinfo ), WelcomeMessageHostInfoSize - 1 ); + + const uint64_t pid = GetPid(); + +#ifdef TRACY_ON_DEMAND + uint8_t onDemand = 1; +#else + uint8_t onDemand = 0; +#endif + +#ifdef __APPLE__ + uint8_t isApple = 1; +#else + uint8_t isApple = 0; +#endif + +#if defined __i386 || defined _M_IX86 + uint8_t cpuArch = CpuArchX86; +#elif defined __x86_64__ || defined _M_X64 + uint8_t cpuArch = CpuArchX64; +#elif defined __aarch64__ + uint8_t cpuArch = CpuArchArm64; +#elif defined __ARM_ARCH + uint8_t cpuArch = CpuArchArm32; +#else + uint8_t cpuArch = CpuArchUnknown; +#endif + +#ifdef TRACY_NO_CODE_TRANSFER + uint8_t codeTransfer = 0; +#else + uint8_t codeTransfer = 1; +#endif + +#if defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 + uint32_t regs[4]; + char manufacturer[12]; + CpuId( regs, 0 ); + memcpy( manufacturer, regs+1, 4 ); + memcpy( manufacturer+4, regs+3, 4 ); + memcpy( manufacturer+8, regs+2, 4 ); + + CpuId( regs, 1 ); + uint32_t cpuId = ( regs[0] & 0xFFF ) | ( ( regs[0] & 0xFFF0000 ) >> 4 ); +#else + const char manufacturer[12] = {}; + uint32_t cpuId = 0; +#endif + + WelcomeMessage welcome; + MemWrite( &welcome.timerMul, m_timerMul ); + MemWrite( &welcome.initBegin, GetInitTime() ); + MemWrite( &welcome.initEnd, m_timeBegin.load( std::memory_order_relaxed ) ); + MemWrite( &welcome.delay, m_delay ); + MemWrite( &welcome.resolution, m_resolution ); + MemWrite( &welcome.epoch, m_epoch ); + MemWrite( &welcome.exectime, m_exectime ); + MemWrite( &welcome.pid, pid ); + MemWrite( &welcome.samplingPeriod, m_samplingPeriod ); + MemWrite( &welcome.onDemand, onDemand ); + MemWrite( &welcome.isApple, isApple ); + MemWrite( &welcome.cpuArch, cpuArch ); + MemWrite( &welcome.codeTransfer, codeTransfer ); + memcpy( welcome.cpuManufacturer, manufacturer, 12 ); + MemWrite( &welcome.cpuId, cpuId ); + memcpy( welcome.programName, procname, pnsz ); + memset( welcome.programName + 
pnsz, 0, WelcomeMessageProgramNameSize - pnsz ); + memcpy( welcome.hostInfo, hostinfo, hisz ); + memset( welcome.hostInfo + hisz, 0, WelcomeMessageHostInfoSize - hisz ); + + moodycamel::ConsumerToken token( GetQueue() ); + + ListenSocket listen; + bool isListening = false; + if( !dataPortSearch ) + { + isListening = listen.Listen( dataPort, 4 ); + } + else + { + for( uint32_t i=0; i<20; i++ ) + { + if( listen.Listen( dataPort+i, 4 ) ) + { + dataPort += i; + isListening = true; + break; + } + } + } + if( !isListening ) + { + for(;;) + { + if( ShouldExit() ) + { + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; + } + + ClearQueues( token ); + std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + } + } + +#ifndef TRACY_NO_BROADCAST + m_broadcast = (UdpBroadcast*)tracy_malloc( sizeof( UdpBroadcast ) ); + new(m_broadcast) UdpBroadcast(); +# ifdef TRACY_ONLY_LOCALHOST + const char* addr = "127.255.255.255"; +# else + const char* addr = "255.255.255.255"; +# endif + if( !m_broadcast->Open( addr, broadcastPort ) ) + { + m_broadcast->~UdpBroadcast(); + tracy_free( m_broadcast ); + m_broadcast = nullptr; + } +#endif + + int broadcastLen = 0; + auto& broadcastMsg = GetBroadcastMessage( procname, pnsz, broadcastLen, dataPort ); + uint64_t lastBroadcast = 0; + + // Connections loop. + // Each iteration of the loop handles whole connection. Multiple iterations will only + // happen in the on-demand mode or when handshake fails. + for(;;) + { + // Wait for incoming connection + for(;;) + { +#ifndef TRACY_NO_EXIT + if( !m_noExit && ShouldExit() ) + { + if( m_broadcast ) + { + broadcastMsg.activeTime = -1; + m_broadcast->Send( broadcastPort, &broadcastMsg, broadcastLen ); + } + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; + } +#endif + m_sock = listen.Accept(); + if( m_sock ) break; +#ifndef TRACY_ON_DEMAND + ProcessSysTime(); +#endif + + if( m_broadcast ) + { + const auto t = std::chrono::high_resolution_clock::now().time_since_epoch().count(); + if( t - lastBroadcast > 3000000000 ) // 3s + { + lastBroadcast = t; + const auto ts = std::chrono::duration_cast( std::chrono::system_clock::now().time_since_epoch() ).count(); + broadcastMsg.activeTime = int32_t( ts - m_epoch ); + assert( broadcastMsg.activeTime >= 0 ); + m_broadcast->Send( broadcastPort, &broadcastMsg, broadcastLen ); + } + } + } + + if( m_broadcast ) + { + lastBroadcast = 0; + broadcastMsg.activeTime = -1; + m_broadcast->Send( broadcastPort, &broadcastMsg, broadcastLen ); + } + + // Handshake + { + char shibboleth[HandshakeShibbolethSize]; + auto res = m_sock->ReadRaw( shibboleth, HandshakeShibbolethSize, 2000 ); + if( !res || memcmp( shibboleth, HandshakeShibboleth, HandshakeShibbolethSize ) != 0 ) + { + m_sock->~Socket(); + tracy_free( m_sock ); + m_sock = nullptr; + continue; + } + + uint32_t protocolVersion; + res = m_sock->ReadRaw( &protocolVersion, sizeof( protocolVersion ), 2000 ); + if( !res ) + { + m_sock->~Socket(); + tracy_free( m_sock ); + m_sock = nullptr; + continue; + } + + if( protocolVersion != ProtocolVersion ) + { + HandshakeStatus status = HandshakeProtocolMismatch; + m_sock->Send( &status, sizeof( status ) ); + m_sock->~Socket(); + tracy_free( m_sock ); + m_sock = nullptr; + continue; + } + } + +#ifdef TRACY_ON_DEMAND + const auto currentTime = GetTime(); + ClearQueues( token ); + m_connectionId.fetch_add( 1, std::memory_order_release ); +#endif + m_isConnected.store( true, std::memory_order_release ); + + HandshakeStatus handshake = HandshakeWelcome; + 
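+        // Editorial recap, grounded in the code above and below (not upstream
+        // commentary): the handshake as seen from this client is
+        //
+        //     server -> client : HandshakeShibboleth bytes (identity check, 2 s timeout)
+        //     server -> client : uint32 protocolVersion
+        //     client -> server : HandshakeStatus (HandshakeWelcome here; a version
+        //                        mismatch answers HandshakeProtocolMismatch above)
+        //     client -> server : WelcomeMessage (timer params, epoch, pid, program name, ...)
+        //
+        // Any failure closes the socket and returns to the accept loop.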
m_sock->Send( &handshake, sizeof( handshake ) ); + + LZ4_resetStream( (LZ4_stream_t*)m_stream ); + m_sock->Send( &welcome, sizeof( welcome ) ); + + m_threadCtx = 0; + m_refTimeSerial = 0; + m_refTimeCtx = 0; + m_refTimeGpu = 0; + +#ifdef TRACY_ON_DEMAND + OnDemandPayloadMessage onDemand; + onDemand.frames = m_frameCount.load( std::memory_order_relaxed ); + onDemand.currentTime = currentTime; + + m_sock->Send( &onDemand, sizeof( onDemand ) ); + + m_deferredLock.lock(); + for( auto& item : m_deferredQueue ) + { + uint64_t ptr; + uint16_t size; + const auto idx = MemRead( &item.hdr.idx ); + switch( (QueueType)idx ) + { + case QueueType::MessageAppInfo: + ptr = MemRead( &item.messageFat.text ); + size = MemRead( &item.messageFat.size ); + SendSingleString( (const char*)ptr, size ); + break; + case QueueType::LockName: + ptr = MemRead( &item.lockNameFat.name ); + size = MemRead( &item.lockNameFat.size ); + SendSingleString( (const char*)ptr, size ); + break; + case QueueType::GpuContextName: + ptr = MemRead( &item.gpuContextNameFat.ptr ); + size = MemRead( &item.gpuContextNameFat.size ); + SendSingleString( (const char*)ptr, size ); + break; + default: + break; + } + AppendData( &item, QueueDataSize[idx] ); + } + m_deferredLock.unlock(); +#endif + + // Main communications loop + int keepAlive = 0; + for(;;) + { + ProcessSysTime(); + const auto status = Dequeue( token ); + const auto serialStatus = DequeueSerial(); + if( status == DequeueStatus::ConnectionLost || serialStatus == DequeueStatus::ConnectionLost ) + { + break; + } + else if( status == DequeueStatus::QueueEmpty && serialStatus == DequeueStatus::QueueEmpty ) + { + if( ShouldExit() ) break; + if( m_bufferOffset != m_bufferStart ) + { + if( !CommitData() ) break; + } + if( keepAlive == 500 ) + { + QueueItem ka; + ka.hdr.type = QueueType::KeepAlive; + AppendData( &ka, QueueDataSize[ka.hdr.idx] ); + if( !CommitData() ) break; + + keepAlive = 0; + } + else + { + keepAlive++; + std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + } + } + else + { + keepAlive = 0; + } + + bool connActive = true; + while( m_sock->HasData() && connActive ) + { + connActive = HandleServerQuery(); + } + if( !connActive ) break; + } + if( ShouldExit() ) break; + + m_isConnected.store( false, std::memory_order_release ); +#ifdef TRACY_ON_DEMAND + m_bufferOffset = 0; + m_bufferStart = 0; +#endif + + m_sock->~Socket(); + tracy_free( m_sock ); + m_sock = nullptr; + +#ifndef TRACY_ON_DEMAND + // Client is no longer available here. Accept incoming connections, but reject handshake. + for(;;) + { + if( ShouldExit() ) + { + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; + } + + ClearQueues( token ); + + m_sock = listen.Accept(); + if( m_sock ) + { + char shibboleth[HandshakeShibbolethSize]; + auto res = m_sock->ReadRaw( shibboleth, HandshakeShibbolethSize, 1000 ); + if( !res || memcmp( shibboleth, HandshakeShibboleth, HandshakeShibbolethSize ) != 0 ) + { + m_sock->~Socket(); + tracy_free( m_sock ); + m_sock = nullptr; + continue; + } + + uint32_t protocolVersion; + res = m_sock->ReadRaw( &protocolVersion, sizeof( protocolVersion ), 1000 ); + if( !res ) + { + m_sock->~Socket(); + tracy_free( m_sock ); + m_sock = nullptr; + continue; + } + + HandshakeStatus status = HandshakeNotAvailable; + m_sock->Send( &status, sizeof( status ) ); + m_sock->~Socket(); + tracy_free( m_sock ); + } + } +#endif + } + // End of connections loop + + // Client is exiting. Send items remaining in queues. 
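+    // Editorial outline of the drain sequence implemented below (derived from
+    // the code itself, not upstream commentary):
+    //   1. Pump Dequeue()/DequeueSerial() until both report QueueEmpty,
+    //      committing buffered data along the way.
+    //   2. Send a one-byte QueueType::Terminate notice to the server.
+    //   3. Keep servicing HandleServerQuery() and flushing via CommitData()
+    //      until the connection drops, then signal m_shutdownFinished.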
+ for(;;) + { + const auto status = Dequeue( token ); + const auto serialStatus = DequeueSerial(); + if( status == DequeueStatus::ConnectionLost || serialStatus == DequeueStatus::ConnectionLost ) + { + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; + } + else if( status == DequeueStatus::QueueEmpty && serialStatus == DequeueStatus::QueueEmpty ) + { + if( m_bufferOffset != m_bufferStart ) CommitData(); + break; + } + + while( m_sock->HasData() ) + { + if( !HandleServerQuery() ) + { + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; + } + } + } + + // Send client termination notice to the server + QueueItem terminate; + MemWrite( &terminate.hdr.type, QueueType::Terminate ); + if( !SendData( (const char*)&terminate, 1 ) ) + { + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; + } + // Handle remaining server queries + for(;;) + { + if( m_sock->HasData() ) + { + while( m_sock->HasData() ) + { + if( !HandleServerQuery() ) + { + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; + } + } + while( Dequeue( token ) == DequeueStatus::DataDequeued ) {} + while( DequeueSerial() == DequeueStatus::DataDequeued ) {} + if( m_bufferOffset != m_bufferStart ) + { + if( !CommitData() ) + { + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; + } + } + } + else + { + if( m_bufferOffset != m_bufferStart ) CommitData(); + std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + } + } +} + +static void FreeAssociatedMemory( const QueueItem& item ) +{ + if( item.hdr.idx >= (int)QueueType::Terminate ) return; + + uint64_t ptr; + switch( item.hdr.type ) + { + case QueueType::ZoneText: + case QueueType::ZoneName: + ptr = MemRead( &item.zoneTextFat.text ); + tracy_free( (void*)ptr ); + break; + case QueueType::MessageColor: + case QueueType::MessageColorCallstack: + ptr = MemRead( &item.messageColorFat.text ); + tracy_free( (void*)ptr ); + break; + case QueueType::Message: + case QueueType::MessageCallstack: +#ifndef TRACY_ON_DEMAND + case QueueType::MessageAppInfo: +#endif + ptr = MemRead( &item.messageFat.text ); + tracy_free( (void*)ptr ); + break; + case QueueType::ZoneBeginAllocSrcLoc: + case QueueType::ZoneBeginAllocSrcLocCallstack: + ptr = MemRead( &item.zoneBegin.srcloc ); + tracy_free( (void*)ptr ); + break; + case QueueType::GpuZoneBeginAllocSrcLoc: + case QueueType::GpuZoneBeginAllocSrcLocCallstack: + case QueueType::GpuZoneBeginAllocSrcLocSerial: + case QueueType::GpuZoneBeginAllocSrcLocCallstackSerial: + ptr = MemRead( &item.gpuZoneBegin.srcloc ); + tracy_free( (void*)ptr ); + break; + case QueueType::CallstackSerial: + case QueueType::Callstack: + ptr = MemRead( &item.callstackFat.ptr ); + tracy_free( (void*)ptr ); + break; + case QueueType::CallstackAlloc: + ptr = MemRead( &item.callstackAllocFat.nativePtr ); + tracy_free( (void*)ptr ); + ptr = MemRead( &item.callstackAllocFat.ptr ); + tracy_free( (void*)ptr ); + break; + case QueueType::CallstackSample: + ptr = MemRead( &item.callstackSampleFat.ptr ); + tracy_free( (void*)ptr ); + break; + case QueueType::FrameImage: + ptr = MemRead( &item.frameImageFat.image ); + tracy_free( (void*)ptr ); + break; +#ifndef TRACY_ON_DEMAND + case QueueType::LockName: + ptr = MemRead( &item.lockNameFat.name ); + tracy_free( (void*)ptr ); + break; + case QueueType::GpuContextName: + ptr = MemRead( &item.gpuContextNameFat.ptr ); + tracy_free( (void*)ptr ); + break; +#endif +#ifdef TRACY_ON_DEMAND + case QueueType::MessageAppInfo: + case 
QueueType::GpuContextName: + // Don't free memory associated with deferred messages. + break; +#endif + default: + break; + } +} + +void Profiler::ClearQueues( moodycamel::ConsumerToken& token ) +{ + for(;;) + { + const auto sz = GetQueue().try_dequeue_bulk_single( token, [](const uint64_t&){}, []( QueueItem* item, size_t sz ) { assert( sz > 0 ); while( sz-- > 0 ) FreeAssociatedMemory( *item++ ); } ); + if( sz == 0 ) break; + } + + ClearSerial(); +} + +void Profiler::ClearSerial() +{ + bool lockHeld = true; + while( !m_serialLock.try_lock() ) + { + if( m_shutdownManual.load( std::memory_order_relaxed ) ) + { + lockHeld = false; + break; + } + } + for( auto& v : m_serialQueue ) FreeAssociatedMemory( v ); + m_serialQueue.clear(); + if( lockHeld ) + { + m_serialLock.unlock(); + } + + for( auto& v : m_serialDequeue ) FreeAssociatedMemory( v ); + m_serialDequeue.clear(); +} + +Profiler::DequeueStatus Profiler::Dequeue( moodycamel::ConsumerToken& token ) +{ + bool connectionLost = false; + const auto sz = GetQueue().try_dequeue_bulk_single( token, + [this, &connectionLost] ( const uint64_t& threadId ) + { + if( threadId != m_threadCtx ) + { + QueueItem item; + MemWrite( &item.hdr.type, QueueType::ThreadContext ); + MemWrite( &item.threadCtx.thread, threadId ); + if( !AppendData( &item, QueueDataSize[(int)QueueType::ThreadContext] ) ) connectionLost = true; + m_threadCtx = threadId; + m_refTimeThread = 0; + } + }, + [this, &connectionLost] ( QueueItem* item, size_t sz ) + { + if( connectionLost ) return; + assert( sz > 0 ); + int64_t refThread = m_refTimeThread; + int64_t refCtx = m_refTimeCtx; + int64_t refGpu = m_refTimeGpu; + while( sz-- > 0 ) + { + uint64_t ptr; + uint16_t size; + auto idx = MemRead( &item->hdr.idx ); + if( idx < (int)QueueType::Terminate ) + { + switch( (QueueType)idx ) + { + case QueueType::ZoneText: + case QueueType::ZoneName: + ptr = MemRead( &item->zoneTextFat.text ); + size = MemRead( &item->zoneTextFat.size ); + SendSingleString( (const char*)ptr, size ); + tracy_free( (void*)ptr ); + break; + case QueueType::Message: + case QueueType::MessageCallstack: + ptr = MemRead( &item->messageFat.text ); + size = MemRead( &item->messageFat.size ); + SendSingleString( (const char*)ptr, size ); + tracy_free( (void*)ptr ); + break; + case QueueType::MessageColor: + case QueueType::MessageColorCallstack: + ptr = MemRead( &item->messageColorFat.text ); + size = MemRead( &item->messageColorFat.size ); + SendSingleString( (const char*)ptr, size ); + tracy_free( (void*)ptr ); + break; + case QueueType::MessageAppInfo: + ptr = MemRead( &item->messageFat.text ); + size = MemRead( &item->messageFat.size ); + SendSingleString( (const char*)ptr, size ); +#ifndef TRACY_ON_DEMAND + tracy_free( (void*)ptr ); +#endif + break; + case QueueType::ZoneBeginAllocSrcLoc: + case QueueType::ZoneBeginAllocSrcLocCallstack: + { + int64_t t = MemRead( &item->zoneBegin.time ); + int64_t dt = t - refThread; + refThread = t; + MemWrite( &item->zoneBegin.time, dt ); + ptr = MemRead( &item->zoneBegin.srcloc ); + SendSourceLocationPayload( ptr ); + tracy_free( (void*)ptr ); + break; + } + case QueueType::Callstack: + ptr = MemRead( &item->callstackFat.ptr ); + SendCallstackPayload( ptr ); + tracy_free( (void*)ptr ); + break; + case QueueType::CallstackAlloc: + ptr = MemRead( &item->callstackAllocFat.nativePtr ); + if( ptr != 0 ) + { + CutCallstack( (void*)ptr, "lua_pcall" ); + SendCallstackPayload( ptr ); + tracy_free( (void*)ptr ); + } + ptr = MemRead( &item->callstackAllocFat.ptr ); + SendCallstackAlloc( ptr 
); + tracy_free( (void*)ptr ); + break; + case QueueType::CallstackSample: + { + ptr = MemRead( &item->callstackSampleFat.ptr ); + SendCallstackPayload64( ptr ); + tracy_free( (void*)ptr ); + int64_t t = MemRead( &item->callstackSampleFat.time ); + int64_t dt = t - refCtx; + refCtx = t; + MemWrite( &item->callstackSampleFat.time, dt ); + break; + } + case QueueType::FrameImage: + { + ptr = MemRead( &item->frameImageFat.image ); + const auto w = MemRead( &item->frameImageFat.w ); + const auto h = MemRead( &item->frameImageFat.h ); + const auto csz = size_t( w * h / 2 ); + SendLongString( ptr, (const char*)ptr, csz, QueueType::FrameImageData ); + tracy_free( (void*)ptr ); + break; + } + case QueueType::ZoneBegin: + case QueueType::ZoneBeginCallstack: + { + int64_t t = MemRead( &item->zoneBegin.time ); + int64_t dt = t - refThread; + refThread = t; + MemWrite( &item->zoneBegin.time, dt ); + break; + } + case QueueType::ZoneEnd: + { + int64_t t = MemRead( &item->zoneEnd.time ); + int64_t dt = t - refThread; + refThread = t; + MemWrite( &item->zoneEnd.time, dt ); + break; + } + case QueueType::GpuZoneBegin: + case QueueType::GpuZoneBeginCallstack: + { + int64_t t = MemRead( &item->gpuZoneBegin.cpuTime ); + int64_t dt = t - refThread; + refThread = t; + MemWrite( &item->gpuZoneBegin.cpuTime, dt ); + break; + } + case QueueType::GpuZoneBeginAllocSrcLoc: + case QueueType::GpuZoneBeginAllocSrcLocCallstack: + { + int64_t t = MemRead( &item->gpuZoneBegin.cpuTime ); + int64_t dt = t - refThread; + refThread = t; + MemWrite( &item->gpuZoneBegin.cpuTime, dt ); + ptr = MemRead( &item->gpuZoneBegin.srcloc ); + SendSourceLocationPayload( ptr ); + tracy_free( (void*)ptr ); + break; + } + case QueueType::GpuZoneEnd: + { + int64_t t = MemRead( &item->gpuZoneEnd.cpuTime ); + int64_t dt = t - refThread; + refThread = t; + MemWrite( &item->gpuZoneEnd.cpuTime, dt ); + break; + } + case QueueType::GpuContextName: + ptr = MemRead( &item->gpuContextNameFat.ptr ); + size = MemRead( &item->gpuContextNameFat.size ); + SendSingleString( (const char*)ptr, size ); +#ifndef TRACY_ON_DEMAND + tracy_free( (void*)ptr ); +#endif + break; + case QueueType::PlotData: + { + int64_t t = MemRead( &item->plotData.time ); + int64_t dt = t - refThread; + refThread = t; + MemWrite( &item->plotData.time, dt ); + break; + } + case QueueType::ContextSwitch: + { + int64_t t = MemRead( &item->contextSwitch.time ); + int64_t dt = t - refCtx; + refCtx = t; + MemWrite( &item->contextSwitch.time, dt ); + break; + } + case QueueType::ThreadWakeup: + { + int64_t t = MemRead( &item->threadWakeup.time ); + int64_t dt = t - refCtx; + refCtx = t; + MemWrite( &item->threadWakeup.time, dt ); + break; + } + case QueueType::GpuTime: + { + int64_t t = MemRead( &item->gpuTime.gpuTime ); + int64_t dt = t - refGpu; + refGpu = t; + MemWrite( &item->gpuTime.gpuTime, dt ); + break; + } + default: + assert( false ); + break; + } + } + if( !AppendData( item++, QueueDataSize[idx] ) ) + { + connectionLost = true; + m_refTimeThread = refThread; + m_refTimeCtx = refCtx; + m_refTimeGpu = refGpu; + return; + } + } + m_refTimeThread = refThread; + m_refTimeCtx = refCtx; + m_refTimeGpu = refGpu; + } + ); + if( connectionLost ) return DequeueStatus::ConnectionLost; + return sz > 0 ? 
DequeueStatus::DataDequeued : DequeueStatus::QueueEmpty; +} + +Profiler::DequeueStatus Profiler::DequeueContextSwitches( tracy::moodycamel::ConsumerToken& token, int64_t& timeStop ) +{ + const auto sz = GetQueue().try_dequeue_bulk_single( token, [] ( const uint64_t& ) {}, + [this, &timeStop] ( QueueItem* item, size_t sz ) + { + assert( sz > 0 ); + int64_t refCtx = m_refTimeCtx; + while( sz-- > 0 ) + { + FreeAssociatedMemory( *item ); + if( timeStop < 0 ) return; + const auto idx = MemRead( &item->hdr.idx ); + if( idx == (uint8_t)QueueType::ContextSwitch ) + { + const auto csTime = MemRead( &item->contextSwitch.time ); + if( csTime > timeStop ) + { + timeStop = -1; + m_refTimeCtx = refCtx; + return; + } + int64_t dt = csTime - refCtx; + refCtx = csTime; + MemWrite( &item->contextSwitch.time, dt ); + if( !AppendData( item, QueueDataSize[(int)QueueType::ContextSwitch] ) ) + { + timeStop = -2; + m_refTimeCtx = refCtx; + return; + } + } + else if( idx == (uint8_t)QueueType::ThreadWakeup ) + { + const auto csTime = MemRead( &item->threadWakeup.time ); + if( csTime > timeStop ) + { + timeStop = -1; + m_refTimeCtx = refCtx; + return; + } + int64_t dt = csTime - refCtx; + refCtx = csTime; + MemWrite( &item->threadWakeup.time, dt ); + if( !AppendData( item, QueueDataSize[(int)QueueType::ThreadWakeup] ) ) + { + timeStop = -2; + m_refTimeCtx = refCtx; + return; + } + } + item++; + } + m_refTimeCtx = refCtx; + } + ); + + if( timeStop == -2 ) return DequeueStatus::ConnectionLost; + return ( timeStop == -1 || sz > 0 ) ? DequeueStatus::DataDequeued : DequeueStatus::QueueEmpty; +} + +Profiler::DequeueStatus Profiler::DequeueSerial() +{ + { + bool lockHeld = true; + while( !m_serialLock.try_lock() ) + { + if( m_shutdownManual.load( std::memory_order_relaxed ) ) + { + lockHeld = false; + break; + } + } + if( !m_serialQueue.empty() ) m_serialQueue.swap( m_serialDequeue ); + if( lockHeld ) + { + m_serialLock.unlock(); + } + } + + const auto sz = m_serialDequeue.size(); + if( sz > 0 ) + { + int64_t refSerial = m_refTimeSerial; + int64_t refGpu = m_refTimeGpu; + auto item = m_serialDequeue.data(); + auto end = item + sz; + while( item != end ) + { + uint64_t ptr; + auto idx = MemRead( &item->hdr.idx ); + if( idx < (int)QueueType::Terminate ) + { + switch( (QueueType)idx ) + { + case QueueType::CallstackSerial: + ptr = MemRead( &item->callstackFat.ptr ); + SendCallstackPayload( ptr ); + tracy_free( (void*)ptr ); + break; + case QueueType::LockWait: + case QueueType::LockSharedWait: + { + int64_t t = MemRead( &item->lockWait.time ); + int64_t dt = t - refSerial; + refSerial = t; + MemWrite( &item->lockWait.time, dt ); + break; + } + case QueueType::LockObtain: + case QueueType::LockSharedObtain: + { + int64_t t = MemRead( &item->lockObtain.time ); + int64_t dt = t - refSerial; + refSerial = t; + MemWrite( &item->lockObtain.time, dt ); + break; + } + case QueueType::LockRelease: + case QueueType::LockSharedRelease: + { + int64_t t = MemRead( &item->lockRelease.time ); + int64_t dt = t - refSerial; + refSerial = t; + MemWrite( &item->lockRelease.time, dt ); + break; + } + case QueueType::LockName: + { + ptr = MemRead( &item->lockNameFat.name ); + uint16_t size = MemRead( &item->lockNameFat.size ); + SendSingleString( (const char*)ptr, size ); +#ifndef TRACY_ON_DEMAND + tracy_free( (void*)ptr ); +#endif + break; + } + case QueueType::MemAlloc: + case QueueType::MemAllocNamed: + case QueueType::MemAllocCallstack: + case QueueType::MemAllocCallstackNamed: + { + int64_t t = MemRead( &item->memAlloc.time ); + int64_t dt 
= t - refSerial; + refSerial = t; + MemWrite( &item->memAlloc.time, dt ); + break; + } + case QueueType::MemFree: + case QueueType::MemFreeNamed: + case QueueType::MemFreeCallstack: + case QueueType::MemFreeCallstackNamed: + { + int64_t t = MemRead( &item->memFree.time ); + int64_t dt = t - refSerial; + refSerial = t; + MemWrite( &item->memFree.time, dt ); + break; + } + case QueueType::GpuZoneBeginSerial: + case QueueType::GpuZoneBeginCallstackSerial: + { + int64_t t = MemRead( &item->gpuZoneBegin.cpuTime ); + int64_t dt = t - refSerial; + refSerial = t; + MemWrite( &item->gpuZoneBegin.cpuTime, dt ); + break; + } + case QueueType::GpuZoneBeginAllocSrcLocSerial: + case QueueType::GpuZoneBeginAllocSrcLocCallstackSerial: + { + int64_t t = MemRead( &item->gpuZoneBegin.cpuTime ); + int64_t dt = t - refSerial; + refSerial = t; + MemWrite( &item->gpuZoneBegin.cpuTime, dt ); + ptr = MemRead( &item->gpuZoneBegin.srcloc ); + SendSourceLocationPayload( ptr ); + tracy_free( (void*)ptr ); + break; + } + case QueueType::GpuZoneEndSerial: + { + int64_t t = MemRead( &item->gpuZoneEnd.cpuTime ); + int64_t dt = t - refSerial; + refSerial = t; + MemWrite( &item->gpuZoneEnd.cpuTime, dt ); + break; + } + case QueueType::GpuTime: + { + int64_t t = MemRead( &item->gpuTime.gpuTime ); + int64_t dt = t - refGpu; + refGpu = t; + MemWrite( &item->gpuTime.gpuTime, dt ); + break; + } + case QueueType::GpuContextName: + { + ptr = MemRead( &item->gpuContextNameFat.ptr ); + uint16_t size = MemRead( &item->gpuContextNameFat.size ); + SendSingleString( (const char*)ptr, size ); +#ifndef TRACY_ON_DEMAND + tracy_free( (void*)ptr ); +#endif + break; + } + default: + assert( false ); + break; + } + } + if( !AppendData( item, QueueDataSize[idx] ) ) return DequeueStatus::ConnectionLost; + item++; + } + m_refTimeSerial = refSerial; + m_refTimeGpu = refGpu; + m_serialDequeue.clear(); + } + else + { + return DequeueStatus::QueueEmpty; + } + return DequeueStatus::DataDequeued; +} + +bool Profiler::CommitData() +{ + bool ret = SendData( m_buffer + m_bufferStart, m_bufferOffset - m_bufferStart ); + if( m_bufferOffset > TargetFrameSize * 2 ) m_bufferOffset = 0; + m_bufferStart = m_bufferOffset; + return ret; +} + +bool Profiler::SendData( const char* data, size_t len ) +{ + const lz4sz_t lz4sz = LZ4_compress_fast_continue( (LZ4_stream_t*)m_stream, data, m_lz4Buf + sizeof( lz4sz_t ), (int)len, LZ4Size, 1 ); + memcpy( m_lz4Buf, &lz4sz, sizeof( lz4sz ) ); + return m_sock->Send( m_lz4Buf, lz4sz + sizeof( lz4sz_t ) ) != -1; +} + +void Profiler::SendString( uint64_t str, const char* ptr, size_t len, QueueType type ) +{ + assert( type == QueueType::StringData || + type == QueueType::ThreadName || + type == QueueType::PlotName || + type == QueueType::FrameName || + type == QueueType::ExternalName || + type == QueueType::ExternalThreadName ); + + QueueItem item; + MemWrite( &item.hdr.type, type ); + MemWrite( &item.stringTransfer.ptr, str ); + + assert( len <= std::numeric_limits::max() ); + auto l16 = uint16_t( len ); + + NeedDataSize( QueueDataSize[(int)type] + sizeof( l16 ) + l16 ); + + AppendDataUnsafe( &item, QueueDataSize[(int)type] ); + AppendDataUnsafe( &l16, sizeof( l16 ) ); + AppendDataUnsafe( ptr, l16 ); +} + +void Profiler::SendSingleString( const char* ptr, size_t len ) +{ + QueueItem item; + MemWrite( &item.hdr.type, QueueType::SingleStringData ); + + assert( len <= std::numeric_limits::max() ); + auto l16 = uint16_t( len ); + + NeedDataSize( QueueDataSize[(int)QueueType::SingleStringData] + sizeof( l16 ) + l16 ); + + 
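+    // Editorial sketch of the wire layout produced below (not upstream text).
+    // NeedDataSize() above has already flushed the buffer through CommitData()
+    // if this record would not fit; the record itself is
+    //
+    //     [ QueueItem header (SingleStringData) ][ uint16 len ][ len bytes of text ]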
AppendDataUnsafe( &item, QueueDataSize[(int)QueueType::SingleStringData] ); + AppendDataUnsafe( &l16, sizeof( l16 ) ); + AppendDataUnsafe( ptr, l16 ); +} + +void Profiler::SendSecondString( const char* ptr, size_t len ) +{ + QueueItem item; + MemWrite( &item.hdr.type, QueueType::SecondStringData ); + + assert( len <= std::numeric_limits::max() ); + auto l16 = uint16_t( len ); + + NeedDataSize( QueueDataSize[(int)QueueType::SecondStringData] + sizeof( l16 ) + l16 ); + + AppendDataUnsafe( &item, QueueDataSize[(int)QueueType::SecondStringData] ); + AppendDataUnsafe( &l16, sizeof( l16 ) ); + AppendDataUnsafe( ptr, l16 ); +} + +void Profiler::SendLongString( uint64_t str, const char* ptr, size_t len, QueueType type ) +{ + assert( type == QueueType::FrameImageData || + type == QueueType::SymbolCode || + type == QueueType::SourceCode ); + + QueueItem item; + MemWrite( &item.hdr.type, type ); + MemWrite( &item.stringTransfer.ptr, str ); + + assert( len <= std::numeric_limits::max() ); + assert( QueueDataSize[(int)type] + sizeof( uint32_t ) + len <= TargetFrameSize ); + auto l32 = uint32_t( len ); + + NeedDataSize( QueueDataSize[(int)type] + sizeof( l32 ) + l32 ); + + AppendDataUnsafe( &item, QueueDataSize[(int)type] ); + AppendDataUnsafe( &l32, sizeof( l32 ) ); + AppendDataUnsafe( ptr, l32 ); +} + +void Profiler::SendSourceLocation( uint64_t ptr ) +{ + auto srcloc = (const SourceLocationData*)ptr; + QueueItem item; + MemWrite( &item.hdr.type, QueueType::SourceLocation ); + MemWrite( &item.srcloc.name, (uint64_t)srcloc->name ); + MemWrite( &item.srcloc.file, (uint64_t)srcloc->file ); + MemWrite( &item.srcloc.function, (uint64_t)srcloc->function ); + MemWrite( &item.srcloc.line, srcloc->line ); + MemWrite( &item.srcloc.r, uint8_t( ( srcloc->color ) & 0xFF ) ); + MemWrite( &item.srcloc.g, uint8_t( ( srcloc->color >> 8 ) & 0xFF ) ); + MemWrite( &item.srcloc.b, uint8_t( ( srcloc->color >> 16 ) & 0xFF ) ); + AppendData( &item, QueueDataSize[(int)QueueType::SourceLocation] ); +} + +void Profiler::SendSourceLocationPayload( uint64_t _ptr ) +{ + auto ptr = (const char*)_ptr; + + QueueItem item; + MemWrite( &item.hdr.type, QueueType::SourceLocationPayload ); + MemWrite( &item.stringTransfer.ptr, _ptr ); + + uint16_t len; + memcpy( &len, ptr, sizeof( len ) ); + assert( len > 2 ); + len -= 2; + ptr += 2; + + NeedDataSize( QueueDataSize[(int)QueueType::SourceLocationPayload] + sizeof( len ) + len ); + + AppendDataUnsafe( &item, QueueDataSize[(int)QueueType::SourceLocationPayload] ); + AppendDataUnsafe( &len, sizeof( len ) ); + AppendDataUnsafe( ptr, len ); +} + +void Profiler::SendCallstackPayload( uint64_t _ptr ) +{ + auto ptr = (uintptr_t*)_ptr; + + QueueItem item; + MemWrite( &item.hdr.type, QueueType::CallstackPayload ); + MemWrite( &item.stringTransfer.ptr, _ptr ); + + const auto sz = *ptr++; + const auto len = sz * sizeof( uint64_t ); + const auto l16 = uint16_t( len ); + + NeedDataSize( QueueDataSize[(int)QueueType::CallstackPayload] + sizeof( l16 ) + l16 ); + + AppendDataUnsafe( &item, QueueDataSize[(int)QueueType::CallstackPayload] ); + AppendDataUnsafe( &l16, sizeof( l16 ) ); + + if( compile_time_condition::value ) + { + AppendDataUnsafe( ptr, sizeof( uint64_t ) * sz ); + } + else + { + for( uintptr_t i=0; iRead( &payload, sizeof( payload ), 10 ) ) return false; + + uint8_t type; + uint64_t ptr; + uint32_t extra; + memcpy( &type, &payload.type, sizeof( payload.type ) ); + memcpy( &ptr, &payload.ptr, sizeof( payload.ptr ) ); + memcpy( &extra, &payload.extra, sizeof( payload.extra ) ); + + switch( type 
) + { + case ServerQueryString: + SendString( ptr, (const char*)ptr, QueueType::StringData ); + break; + case ServerQueryThreadString: + if( ptr == m_mainThread ) + { + SendString( ptr, "Main thread", 11, QueueType::ThreadName ); + } + else + { + SendString( ptr, GetThreadName( ptr ), QueueType::ThreadName ); + } + break; + case ServerQuerySourceLocation: + SendSourceLocation( ptr ); + break; + case ServerQueryPlotName: + SendString( ptr, (const char*)ptr, QueueType::PlotName ); + break; + case ServerQueryTerminate: + return false; + case ServerQueryCallstackFrame: + SendCallstackFrame( ptr ); + break; + case ServerQueryFrameName: + SendString( ptr, (const char*)ptr, QueueType::FrameName ); + break; + case ServerQueryDisconnect: + HandleDisconnect(); + return false; +#ifdef TRACY_HAS_SYSTEM_TRACING + case ServerQueryExternalName: + SysTraceSendExternalName( ptr ); + break; +#endif + case ServerQueryParameter: + HandleParameter( ptr ); + break; + case ServerQuerySymbol: + HandleSymbolQuery( ptr ); + break; +#ifndef TRACY_NO_CODE_TRANSFER + case ServerQuerySymbolCode: + HandleSymbolCodeQuery( ptr, extra ); + break; +#endif + case ServerQueryCodeLocation: + SendCodeLocation( ptr ); + break; + case ServerQuerySourceCode: + HandleSourceCodeQuery(); + break; + case ServerQueryDataTransfer: + assert( !m_queryData ); + m_queryDataPtr = m_queryData = (char*)tracy_malloc( ptr + 11 ); + AckServerQuery(); + break; + case ServerQueryDataTransferPart: + memcpy( m_queryDataPtr, &ptr, 8 ); + memcpy( m_queryDataPtr+8, &extra, 4 ); + m_queryDataPtr += 12; + AckServerQuery(); + break; + default: + assert( false ); + break; + } + + return true; +} + +void Profiler::HandleDisconnect() +{ + moodycamel::ConsumerToken token( GetQueue() ); + +#ifdef TRACY_HAS_SYSTEM_TRACING + if( s_sysTraceThread ) + { + auto timestamp = GetTime(); + for(;;) + { + const auto status = DequeueContextSwitches( token, timestamp ); + if( status == DequeueStatus::ConnectionLost ) + { + return; + } + else if( status == DequeueStatus::QueueEmpty ) + { + if( m_bufferOffset != m_bufferStart ) + { + if( !CommitData() ) return; + } + } + if( timestamp < 0 ) + { + if( m_bufferOffset != m_bufferStart ) + { + if( !CommitData() ) return; + } + break; + } + ClearSerial(); + if( m_sock->HasData() ) + { + while( m_sock->HasData() ) + { + if( !HandleServerQuery() ) return; + } + if( m_bufferOffset != m_bufferStart ) + { + if( !CommitData() ) return; + } + } + else + { + if( m_bufferOffset != m_bufferStart ) + { + if( !CommitData() ) return; + } + std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + } + } + } +#endif + + QueueItem terminate; + MemWrite( &terminate.hdr.type, QueueType::Terminate ); + if( !SendData( (const char*)&terminate, 1 ) ) return; + for(;;) + { + ClearQueues( token ); + if( m_sock->HasData() ) + { + while( m_sock->HasData() ) + { + if( !HandleServerQuery() ) return; + } + if( m_bufferOffset != m_bufferStart ) + { + if( !CommitData() ) return; + } + } + else + { + if( m_bufferOffset != m_bufferStart ) + { + if( !CommitData() ) return; + } + std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + } + } +} + +void Profiler::CalibrateTimer() +{ +#ifdef TRACY_HW_TIMER + std::atomic_signal_fence( std::memory_order_acq_rel ); + const auto t0 = std::chrono::high_resolution_clock::now(); + const auto r0 = GetTime(); + std::atomic_signal_fence( std::memory_order_acq_rel ); + std::this_thread::sleep_for( std::chrono::milliseconds( 200 ) ); + std::atomic_signal_fence( std::memory_order_acq_rel ); + const auto t1 = 
std::chrono::high_resolution_clock::now(); + const auto r1 = GetTime(); + std::atomic_signal_fence( std::memory_order_acq_rel ); + + const auto dt = std::chrono::duration_cast( t1 - t0 ).count(); + const auto dr = r1 - r0; + + m_timerMul = double( dt ) / double( dr ); +#else + m_timerMul = 1.; +#endif +} + +void Profiler::CalibrateDelay() +{ + constexpr int Iterations = 50000; + + auto mindiff = std::numeric_limits::max(); + for( int i=0; i 0 && dti < mindiff ) mindiff = dti; + } + m_resolution = mindiff; + +#ifdef TRACY_DELAYED_INIT + m_delay = m_resolution; +#else + constexpr int Events = Iterations * 2; // start + end + static_assert( Events < QueuePrealloc, "Delay calibration loop will allocate memory in queue" ); + + static const tracy::SourceLocationData __tracy_source_location { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; + const auto t0 = GetTime(); + for( int i=0; izoneBegin.time, Profiler::GetTime() ); + MemWrite( &item->zoneBegin.srcloc, (uint64_t)&__tracy_source_location ); + TracyLfqCommit; + } + { + TracyLfqPrepare( QueueType::ZoneEnd ); + MemWrite( &item->zoneEnd.time, GetTime() ); + TracyLfqCommit; + } + } + const auto t1 = GetTime(); + const auto dt = t1 - t0; + m_delay = dt / Events; + + moodycamel::ConsumerToken token( GetQueue() ); + int left = Events; + while( left != 0 ) + { + const auto sz = GetQueue().try_dequeue_bulk_single( token, [](const uint64_t&){}, [](QueueItem* item, size_t sz){} ); + assert( sz > 0 ); + left -= (int)sz; + } + assert( GetQueue().size_approx() == 0 ); +#endif +} + +void Profiler::ReportTopology() +{ +#ifndef TRACY_DELAYED_INIT + struct CpuData + { + uint32_t package; + uint32_t core; + uint32_t thread; + }; + +#if defined _WIN32 || defined __CYGWIN__ + t_GetLogicalProcessorInformationEx _GetLogicalProcessorInformationEx = (t_GetLogicalProcessorInformationEx)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "GetLogicalProcessorInformationEx" ); + if( !_GetLogicalProcessorInformationEx ) return; + + DWORD psz = 0; + _GetLogicalProcessorInformationEx( RelationProcessorPackage, nullptr, &psz ); + auto packageInfo = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)tracy_malloc( psz ); + auto res = _GetLogicalProcessorInformationEx( RelationProcessorPackage, packageInfo, &psz ); + assert( res ); + + DWORD csz = 0; + _GetLogicalProcessorInformationEx( RelationProcessorCore, nullptr, &csz ); + auto coreInfo = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)tracy_malloc( csz ); + res = _GetLogicalProcessorInformationEx( RelationProcessorCore, coreInfo, &csz ); + assert( res ); + + SYSTEM_INFO sysinfo; + GetSystemInfo( &sysinfo ); + const uint32_t numcpus = sysinfo.dwNumberOfProcessors; + + auto cpuData = (CpuData*)tracy_malloc( sizeof( CpuData ) * numcpus ); + for( uint32_t i=0; iRelationship == RelationProcessorPackage ); + // FIXME account for GroupCount + auto mask = ptr->Processor.GroupMask[0].Mask; + int core = 0; + while( mask != 0 ) + { + if( mask & 1 ) cpuData[core].package = idx; + core++; + mask >>= 1; + } + ptr = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)(((char*)ptr) + ptr->Size); + idx++; + } + + idx = 0; + ptr = coreInfo; + while( (char*)ptr < ((char*)coreInfo) + csz ) + { + assert( ptr->Relationship == RelationProcessorCore ); + // FIXME account for GroupCount + auto mask = ptr->Processor.GroupMask[0].Mask; + int core = 0; + while( mask != 0 ) + { + if( mask & 1 ) cpuData[core].core = idx; + core++; + mask >>= 1; + } + ptr = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)(((char*)ptr) + ptr->Size); + idx++; + } + + for( uint32_t i=0; 
icpuTopology.package, data.package ); + MemWrite( &item->cpuTopology.core, data.core ); + MemWrite( &item->cpuTopology.thread, data.thread ); + +#ifdef TRACY_ON_DEMAND + DeferItem( *item ); +#endif + + TracyLfqCommit; + } + + tracy_free( cpuData ); + tracy_free( coreInfo ); + tracy_free( packageInfo ); +#elif defined __linux__ + const int numcpus = std::thread::hardware_concurrency(); + auto cpuData = (CpuData*)tracy_malloc( sizeof( CpuData ) * numcpus ); + memset( cpuData, 0, sizeof( CpuData ) * numcpus ); + + const char* basePath = "/sys/devices/system/cpu/cpu"; + for( int i=0; icpuTopology.package, data.package ); + MemWrite( &item->cpuTopology.core, data.core ); + MemWrite( &item->cpuTopology.thread, data.thread ); + +#ifdef TRACY_ON_DEMAND + DeferItem( *item ); +#endif + + TracyLfqCommit; + } + + tracy_free( cpuData ); +#endif +#endif +} + +void Profiler::SendFrameMark( const char* name ) +{ + if( !name ) GetProfiler().m_frameCount.fetch_add( 1, std::memory_order_relaxed ); +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + TracyLfqPrepare( QueueType::FrameMarkMsg ); + MemWrite( &item->frameMark.time, GetTime() ); + MemWrite( &item->frameMark.name, uint64_t( name ) ); + TracyLfqCommit; +} + +void Profiler::SendFrameMark( const char* name, QueueType type ) +{ + assert( type == QueueType::FrameMarkMsgStart || type == QueueType::FrameMarkMsgEnd ); +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + auto item = QueueSerial(); + MemWrite( &item->hdr.type, type ); + MemWrite( &item->frameMark.time, GetTime() ); + MemWrite( &item->frameMark.name, uint64_t( name ) ); + QueueSerialFinish(); +} + +void Profiler::PlotData( const char* name, int64_t val ) +{ +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + TracyLfqPrepare( QueueType::PlotData ); + MemWrite( &item->plotData.name, (uint64_t)name ); + MemWrite( &item->plotData.time, GetTime() ); + MemWrite( &item->plotData.type, PlotDataType::Int ); + MemWrite( &item->plotData.data.i, val ); + TracyLfqCommit; +} + +void Profiler::PlotData( const char* name, float val ) +{ +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + TracyLfqPrepare( QueueType::PlotData ); + MemWrite( &item->plotData.name, (uint64_t)name ); + MemWrite( &item->plotData.time, GetTime() ); + MemWrite( &item->plotData.type, PlotDataType::Float ); + MemWrite( &item->plotData.data.f, val ); + TracyLfqCommit; +} + +void Profiler::PlotData( const char* name, double val ) +{ +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + TracyLfqPrepare( QueueType::PlotData ); + MemWrite( &item->plotData.name, (uint64_t)name ); + MemWrite( &item->plotData.time, GetTime() ); + MemWrite( &item->plotData.type, PlotDataType::Double ); + MemWrite( &item->plotData.data.d, val ); + TracyLfqCommit; +} + +void Profiler::ConfigurePlot( const char* name, PlotFormatType type ) +{ + TracyLfqPrepare( QueueType::PlotConfig ); + MemWrite( &item->plotConfig.name, (uint64_t)name ); + MemWrite( &item->plotConfig.type, (uint8_t)type ); + +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + + TracyLfqCommit; +} + + void Profiler::Message( const char* txt, size_t size, int callstack ) +{ + assert( size < std::numeric_limits::max() ); +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + if( callstack != 0 ) + { + InitRPMallocThread(); + tracy::GetProfiler().SendCallstack( callstack ); + } + + TracyLfqPrepare( callstack == 0 ? 
QueueType::Message : QueueType::MessageCallstack ); + auto ptr = (char*)tracy_malloc( size ); + memcpy( ptr, txt, size ); + MemWrite( &item->messageFat.time, GetTime() ); + MemWrite( &item->messageFat.text, (uint64_t)ptr ); + MemWrite( &item->messageFat.size, (uint16_t)size ); + TracyLfqCommit; +} + +void Profiler::Message( const char* txt, int callstack ) +{ +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + if( callstack != 0 ) + { + InitRPMallocThread(); + tracy::GetProfiler().SendCallstack( callstack ); + } + + TracyLfqPrepare( callstack == 0 ? QueueType::MessageLiteral : QueueType::MessageLiteralCallstack ); + MemWrite( &item->messageLiteral.time, GetTime() ); + MemWrite( &item->messageLiteral.text, (uint64_t)txt ); + TracyLfqCommit; +} + +void Profiler::MessageColor( const char* txt, size_t size, uint32_t color, int callstack ) +{ + assert( size < std::numeric_limits::max() ); +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + if( callstack != 0 ) + { + InitRPMallocThread(); + tracy::GetProfiler().SendCallstack( callstack ); + } + + TracyLfqPrepare( callstack == 0 ? QueueType::MessageColor : QueueType::MessageColorCallstack ); + auto ptr = (char*)tracy_malloc( size ); + memcpy( ptr, txt, size ); + MemWrite( &item->messageColorFat.time, GetTime() ); + MemWrite( &item->messageColorFat.text, (uint64_t)ptr ); + MemWrite( &item->messageColorFat.r, uint8_t( ( color ) & 0xFF ) ); + MemWrite( &item->messageColorFat.g, uint8_t( ( color >> 8 ) & 0xFF ) ); + MemWrite( &item->messageColorFat.b, uint8_t( ( color >> 16 ) & 0xFF ) ); + MemWrite( &item->messageColorFat.size, (uint16_t)size ); + TracyLfqCommit; +} + +void Profiler::MessageColor( const char* txt, uint32_t color, int callstack ) +{ +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + if( callstack != 0 ) + { + InitRPMallocThread(); + tracy::GetProfiler().SendCallstack( callstack ); + } + + TracyLfqPrepare( callstack == 0 ? 
QueueType::MessageLiteralColor : QueueType::MessageLiteralColorCallstack ); + MemWrite( &item->messageColorLiteral.time, GetTime() ); + MemWrite( &item->messageColorLiteral.text, (uint64_t)txt ); + MemWrite( &item->messageColorLiteral.r, uint8_t( ( color ) & 0xFF ) ); + MemWrite( &item->messageColorLiteral.g, uint8_t( ( color >> 8 ) & 0xFF ) ); + MemWrite( &item->messageColorLiteral.b, uint8_t( ( color >> 16 ) & 0xFF ) ); + TracyLfqCommit; +} + +void Profiler::MessageAppInfo( const char* txt, size_t size ) +{ + assert( size < std::numeric_limits::max() ); + InitRPMallocThread(); + auto ptr = (char*)tracy_malloc( size ); + memcpy( ptr, txt, size ); + TracyLfqPrepare( QueueType::MessageAppInfo ); + MemWrite( &item->messageFat.time, GetTime() ); + MemWrite( &item->messageFat.text, (uint64_t)ptr ); + MemWrite( &item->messageFat.size, (uint16_t)size ); + +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + + TracyLfqCommit; +} + +void Profiler::MemAlloc(const void* ptr, size_t size, bool secure) +{ + if( secure && !ProfilerAvailable() ) return; +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + const auto thread = GetThreadHandle(); + + GetProfiler().m_serialLock.lock(); + SendMemAlloc( QueueType::MemAlloc, thread, ptr, size ); + GetProfiler().m_serialLock.unlock(); +} + +void Profiler::MemFree( const void* ptr, bool secure ) +{ + if( secure && !ProfilerAvailable() ) return; +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + const auto thread = GetThreadHandle(); + + GetProfiler().m_serialLock.lock(); + SendMemFree( QueueType::MemFree, thread, ptr ); + GetProfiler().m_serialLock.unlock(); +} + +void Profiler::MemAllocCallstack( const void* ptr, size_t size, int depth, bool secure ) +{ + if( secure && !ProfilerAvailable() ) return; +#ifdef TRACY_HAS_CALLSTACK + auto& profiler = GetProfiler(); +# ifdef TRACY_ON_DEMAND + if( !profiler.IsConnected() ) return; +# endif + const auto thread = GetThreadHandle(); + + InitRPMallocThread(); + auto callstack = Callstack( depth ); + + profiler.m_serialLock.lock(); + SendCallstackSerial( callstack ); + SendMemAlloc( QueueType::MemAllocCallstack, thread, ptr, size ); + profiler.m_serialLock.unlock(); +#else + MemAlloc( ptr, size, secure ); +#endif +} + +void Profiler::MemFreeCallstack( const void* ptr, int depth, bool secure ) +{ + if( secure && !ProfilerAvailable() ) return; +#ifdef TRACY_HAS_CALLSTACK + auto& profiler = GetProfiler(); +# ifdef TRACY_ON_DEMAND + if( !profiler.IsConnected() ) return; +# endif + const auto thread = GetThreadHandle(); + + InitRPMallocThread(); + auto callstack = Callstack( depth ); + + profiler.m_serialLock.lock(); + SendCallstackSerial( callstack ); + SendMemFree( QueueType::MemFreeCallstack, thread, ptr ); + profiler.m_serialLock.unlock(); +#else + MemFree( ptr, secure ); +#endif +} + +void Profiler::MemAllocNamed( const void* ptr, size_t size, bool secure, const char* name ) +{ + if( secure && !ProfilerAvailable() ) return; +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + const auto thread = GetThreadHandle(); + + GetProfiler().m_serialLock.lock(); + SendMemName( name ); + SendMemAlloc( QueueType::MemAllocNamed, thread, ptr, size ); + GetProfiler().m_serialLock.unlock(); +} + +void Profiler::MemFreeNamed( const void* ptr, bool secure, const char* name ) +{ + if( secure && !ProfilerAvailable() ) return; +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + const auto thread = GetThreadHandle(); + + 
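+    // Editorial note: every memory event funnels through m_serialLock so that
+    // allocations and frees keep a single global order across threads; reordered
+    // alloc/free pairs would desynchronize the server's view of live memory. A
+    // hedged usage sketch with Tracy's public macros (TracyAlloc/TracyFree
+    // forward to MemAlloc/MemFree; illustrative, not part of this file):
+    //
+    //     void* operator new( std::size_t count )
+    //     {
+    //         void* ptr = malloc( count );
+    //         TracyAlloc( ptr, count );
+    //         return ptr;
+    //     }
+    //     void operator delete( void* ptr ) noexcept
+    //     {
+    //         TracyFree( ptr );
+    //         free( ptr );
+    //     }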
GetProfiler().m_serialLock.lock(); + SendMemName( name ); + SendMemFree( QueueType::MemFreeNamed, thread, ptr ); + GetProfiler().m_serialLock.unlock(); +} + +void Profiler::MemAllocCallstackNamed( const void* ptr, size_t size, int depth, bool secure, const char* name ) +{ + if( secure && !ProfilerAvailable() ) return; +#ifdef TRACY_HAS_CALLSTACK + auto& profiler = GetProfiler(); +# ifdef TRACY_ON_DEMAND + if( !profiler.IsConnected() ) return; +# endif + const auto thread = GetThreadHandle(); + + InitRPMallocThread(); + auto callstack = Callstack( depth ); + + profiler.m_serialLock.lock(); + SendCallstackSerial( callstack ); + SendMemName( name ); + SendMemAlloc( QueueType::MemAllocCallstackNamed, thread, ptr, size ); + profiler.m_serialLock.unlock(); +#else + MemAlloc( ptr, size, secure ); +#endif +} + +void Profiler::MemFreeCallstackNamed( const void* ptr, int depth, bool secure, const char* name ) +{ + if( secure && !ProfilerAvailable() ) return; +#ifdef TRACY_HAS_CALLSTACK + auto& profiler = GetProfiler(); +# ifdef TRACY_ON_DEMAND + if( !profiler.IsConnected() ) return; +# endif + const auto thread = GetThreadHandle(); + + InitRPMallocThread(); + auto callstack = Callstack( depth ); + + profiler.m_serialLock.lock(); + SendCallstackSerial( callstack ); + SendMemName( name ); + SendMemFree( QueueType::MemFreeCallstackNamed, thread, ptr ); + profiler.m_serialLock.unlock(); +#else + MemFree( ptr, secure ); +#endif +} + +void Profiler::SendCallstack( int depth ) +{ +#ifdef TRACY_HAS_CALLSTACK + auto ptr = Callstack( depth ); + TracyLfqPrepare( QueueType::Callstack ); + MemWrite( &item->callstackFat.ptr, (uint64_t)ptr ); + TracyLfqCommit; +#endif +} + +void Profiler::ParameterRegister( ParameterCallback cb ) { GetProfiler().m_paramCallback = cb; } +void Profiler::ParameterSetup( uint32_t idx, const char* name, bool isBool, int32_t val ) +{ + TracyLfqPrepare( QueueType::ParamSetup ); + tracy::MemWrite( &item->paramSetup.idx, idx ); + tracy::MemWrite( &item->paramSetup.name, (uint64_t)name ); + tracy::MemWrite( &item->paramSetup.isBool, (uint8_t)isBool ); + tracy::MemWrite( &item->paramSetup.val, val ); + +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + + TracyLfqCommit; +} + +void Profiler::SendCallstack( int depth, const char* skipBefore ) +{ +#ifdef TRACY_HAS_CALLSTACK + TracyLfqPrepare( QueueType::Callstack ); + auto ptr = Callstack( depth ); + CutCallstack( ptr, skipBefore ); + MemWrite( &item->callstackFat.ptr, (uint64_t)ptr ); + TracyLfqCommit; +#endif +} + +void Profiler::CutCallstack( void* callstack, const char* skipBefore ) +{ +#ifdef TRACY_HAS_CALLSTACK + auto data = (uintptr_t*)callstack; + const auto sz = *data++; + uintptr_t i; + for( i=0; i 100000000 ) // 100 ms + { + auto sysTime = m_sysTime.Get(); + if( sysTime >= 0 ) + { + m_sysTimeLast = t; + + TracyLfqPrepare( QueueType::SysTimeReport ); + MemWrite( &item->sysTime.time, GetTime() ); + MemWrite( &item->sysTime.sysTime, sysTime ); + TracyLfqCommit; + } + } +} +#endif + +void Profiler::HandleParameter( uint64_t payload ) +{ + assert( m_paramCallback ); + const auto idx = uint32_t( payload >> 32 ); + const auto val = int32_t( payload & 0xFFFFFFFF ); + m_paramCallback( idx, val ); + AckServerQuery(); +} + +#ifdef __ANDROID__ +// Implementation helpers of EnsureReadable(address). +// This is so far only needed on Android, where it is common for libraries to be mapped +// with only executable, not readable, permissions. 
Typical example (line from /proc/self/maps):
+/*
+746b63b000-746b6dc000 --xp 00042000 07:48 35 /apex/com.android.runtime/lib64/bionic/libc.so
+*/
+// See https://github.com/wolfpld/tracy/issues/125 .
+// To work around this, we parse /proc/self/maps and we use mprotect to set read permissions
+// on any mappings that contain symbol addresses hit by HandleSymbolCodeQuery.
+
+namespace {
+// Holds some information about a single memory mapping.
+struct MappingInfo {
+    // Start of address range. Inclusive.
+    uintptr_t start_address;
+    // End of address range. Exclusive, so the mapping is the half-open interval
+    // [start, end) and its length in bytes is `end - start`. As in /proc/self/maps.
+    uintptr_t end_address;
+    // Read/Write/Executable permissions.
+    bool perm_r, perm_w, perm_x;
+};
+}  // anonymous namespace
+
+// Internal implementation helper for LookUpMapping(address).
+//
+// Parses /proc/self/maps returning a vector<MappingInfo>.
+// /proc/self/maps is assumed to be sorted by ascending address, so the resulting
+// vector is sorted by ascending address too.
+static std::vector<MappingInfo> ParseMappings()
+{
+    std::vector<MappingInfo> result;
+    FILE* file = fopen( "/proc/self/maps", "r" );
+    if( !file ) return result;
+    char line[1024];
+    while( fgets( line, sizeof( line ), file ) )
+    {
+        uintptr_t start_addr;
+        uintptr_t end_addr;
+        if( sscanf( line, "%lx-%lx", &start_addr, &end_addr ) != 2 ) continue;
+        char* first_space = strchr( line, ' ' );
+        if( !first_space ) continue;
+        char* perm = first_space + 1;
+        char* second_space = strchr( perm, ' ' );
+        if( !second_space || second_space - perm != 4 ) continue;
+        result.emplace_back();
+        auto& mapping = result.back();
+        mapping.start_address = start_addr;
+        mapping.end_address = end_addr;
+        mapping.perm_r = perm[0] == 'r';
+        mapping.perm_w = perm[1] == 'w';
+        mapping.perm_x = perm[2] == 'x';
+    }
+    fclose( file );
+    return result;
+}
+
+// Internal implementation helper for LookUpMapping(address).
+//
+// Takes as input an `address` and a known vector<MappingInfo> `mappings`, assumed to be
+// sorted by increasing addresses, as /proc/self/maps seems to be.
+// Returns a pointer to the MappingInfo describing the mapping that this
+// address belongs to, or nullptr if the address isn't in `mappings`.
+static MappingInfo* LookUpMapping(std::vector<MappingInfo>& mappings, uintptr_t address)
+{
+    // Comparison function for std::lower_bound. Returns true if all addresses in `m1`
+    // are lower than `addr`.
+    auto Compare = []( const MappingInfo& m1, uintptr_t addr ) {
+        // '<=' because the address ranges are half-open intervals, [start, end).
+        return m1.end_address <= addr;
+    };
+    auto iter = std::lower_bound( mappings.begin(), mappings.end(), address, Compare );
+    if( iter == mappings.end() || iter->start_address > address) {
+        return nullptr;
+    }
+    return &*iter;
+}
+
+// Internal implementation helper for EnsureReadable(address).
+//
+// Takes as input an `address` and returns a pointer to a MappingInfo
+// describing the mapping that this address belongs to, or nullptr if
+// the address isn't in any known mapping.
+//
+// This function is stateful and not reentrant (assumes to be called from
+// only one thread). It holds a vector of mappings parsed from /proc/self/maps.
+//
+// Attempts to react to mappings changes by re-parsing /proc/self/maps.
+static MappingInfo* LookUpMapping(uintptr_t address)
+{
+    // Static state managed by this function. Not constant, we mutate that state as
+    // we turn some mappings readable. Initially parsed once here, updated as needed below.
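+    // Editorial example (hypothetical, not upstream code): the re-parse below
+    // exists because mappings can appear after the first parse, e.g.
+    //
+    //     void* h = dlopen( "libplugin.so", RTLD_NOW );  // maps new segments
+    //
+    // a symbol inside that library then misses the cached vector, and the
+    // second ParseMappings() pass picks the new mapping up.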
+ static std::vector s_mappings = ParseMappings(); + MappingInfo* mapping = LookUpMapping( s_mappings, address ); + if( mapping ) return mapping; + + // This address isn't in any known mapping. Try parsing again, maybe + // mappings changed. + s_mappings = ParseMappings(); + return LookUpMapping( s_mappings, address ); +} + +// Internal implementation helper for EnsureReadable(address). +// +// Attempts to make the specified `mapping` readable if it isn't already. +// Returns true if and only if the mapping is readable. +static bool EnsureReadable( MappingInfo& mapping ) +{ + if( mapping.perm_r ) + { + // The mapping is already readable. + return true; + } + int prot = PROT_READ; + if( mapping.perm_w ) prot |= PROT_WRITE; + if( mapping.perm_x ) prot |= PROT_EXEC; + if( mprotect( reinterpret_cast( mapping.start_address ), + mapping.end_address - mapping.start_address, prot ) == -1 ) + { + // Failed to make the mapping readable. Shouldn't happen, hasn't + // been observed yet. If it happened in practice, we should consider + // adding a bool to MappingInfo to track this to avoid retrying mprotect + // everytime on such mappings. + return false; + } + // The mapping is now readable. Update `mapping` so the next call will be fast. + mapping.perm_r = true; + return true; +} + +// Attempts to set the read permission on the entire mapping containing the +// specified address. Returns true if and only if the mapping is now readable. +static bool EnsureReadable( uintptr_t address ) +{ + MappingInfo* mapping = LookUpMapping(address); + return mapping && EnsureReadable( *mapping ); +} + +#endif // defined __ANDROID__ + +void Profiler::HandleSymbolQuery( uint64_t symbol ) +{ +#ifdef TRACY_HAS_CALLSTACK +#ifdef __ANDROID__ + // On Android it's common for code to be in mappings that are only executable + // but not readable. + if( !EnsureReadable( symbol ) ) + { + return; + } +#endif + const auto sym = DecodeSymbolAddress( symbol ); + + SendSingleString( sym.file ); + + QueueItem item; + MemWrite( &item.hdr.type, QueueType::SymbolInformation ); + MemWrite( &item.symbolInformation.line, sym.line ); + MemWrite( &item.symbolInformation.symAddr, symbol ); + + AppendData( &item, QueueDataSize[(int)QueueType::SymbolInformation] ); + + if( sym.needFree ) tracy_free( (void*)sym.file ); +#endif +} + +void Profiler::HandleSymbolCodeQuery( uint64_t symbol, uint32_t size ) +{ +#ifdef __ANDROID__ + // On Android it's common for code to be in mappings that are only executable + // but not readable. 
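+    // Editorial note: SendLongString() below reads `size` bytes straight from
+    // the symbol's address, which would fault on an --xp mapping, so the whole
+    // mapping is first promoted with mprotect( start, len, PROT_READ | ... )
+    // while preserving its existing write/execute bits (see EnsureReadable above).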
+    if( !EnsureReadable( symbol ) )
+    {
+        return;
+    }
+#endif
+    SendLongString( symbol, (const char*)symbol, size, QueueType::SymbolCode );
+}
+
+void Profiler::HandleSourceCodeQuery()
+{
+    assert( m_exectime != 0 );
+    assert( m_queryData );
+
+    struct stat st;
+    if( stat( m_queryData, &st ) == 0 && (uint64_t)st.st_mtime < m_exectime && st.st_size < ( TargetFrameSize - 16 ) )
+    {
+        FILE* f = fopen( m_queryData, "rb" );
+        tracy_free( m_queryData );
+        if( f )
+        {
+            auto ptr = (char*)tracy_malloc( st.st_size );
+            auto rd = fread( ptr, 1, st.st_size, f );
+            fclose( f );
+            if( rd == (size_t)st.st_size )
+            {
+                SendLongString( (uint64_t)ptr, ptr, rd, QueueType::SourceCode );
+            }
+            else
+            {
+                AckSourceCodeNotAvailable();
+            }
+            tracy_free( ptr );
+        }
+        else
+        {
+            AckSourceCodeNotAvailable();
+        }
+    }
+    else
+    {
+        tracy_free( m_queryData );
+        AckSourceCodeNotAvailable();
+    }
+    m_queryData = nullptr;
+}
+
+void Profiler::SendCodeLocation( uint64_t ptr )
+{
+#ifdef TRACY_HAS_CALLSTACK
+    const auto sym = DecodeCodeAddress( ptr );
+
+    SendSingleString( sym.file );
+
+    QueueItem item;
+    MemWrite( &item.hdr.type, QueueType::CodeInformation );
+    MemWrite( &item.codeInformation.ptr, ptr );
+    MemWrite( &item.codeInformation.line, sym.line );
+
+    AppendData( &item, QueueDataSize[(int)QueueType::CodeInformation] );
+
+    if( sym.needFree ) tracy_free( (void*)sym.file );
+#endif
+}
+
+#if ( defined _WIN32 || defined __CYGWIN__ ) && defined TRACY_TIMER_QPC
+int64_t Profiler::GetTimeQpc()
+{
+    LARGE_INTEGER t;
+    QueryPerformanceCounter( &t );
+    return t.QuadPart;
+}
+#endif
+
+}
+
+#endif
diff --git a/Source/ThirdParty/tracy/client/TracyProfiler.hpp b/Source/ThirdParty/tracy/client/TracyProfiler.hpp
new file mode 100644
index 000000000..0cec00f11
--- /dev/null
+++ b/Source/ThirdParty/tracy/client/TracyProfiler.hpp
@@ -0,0 +1,451 @@
+#ifndef __TRACYPROFILER_HPP__
+#define __TRACYPROFILER_HPP__
+
+#include <assert.h>
+#include <atomic>
+#include <stdint.h>
+#include <string.h>
+#include <time.h>
+
+#include "tracy_concurrentqueue.h"
+#include "TracyCallstack.hpp"
+#include "TracySysTime.hpp"
+#include "TracyFastVector.hpp"
+#include "../common/TracyQueue.hpp"
+#include "../common/TracyAlign.hpp"
+#include "../common/TracyAlloc.hpp"
+#include "../common/TracyMutex.hpp"
+#include "../common/TracyProtocol.hpp"
+
+#if defined _WIN32 || defined __CYGWIN__
+# include <intrin.h>
+#endif
+#ifdef __APPLE__
+# include <TargetConditionals.h>
+# include <mach/mach_time.h>
+#endif
+
+#if !defined TRACY_TIMER_FALLBACK && ( defined _WIN32 || defined __CYGWIN__ || ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) || ( defined TARGET_OS_IOS && TARGET_OS_IOS == 1 ) )
+# define TRACY_HW_TIMER
+#endif
+
+#if !defined TRACY_HW_TIMER
+# include <chrono>
+#endif
+
+namespace tracy
+{
+#if defined(TRACY_DELAYED_INIT) && defined(TRACY_MANUAL_LIFETIME)
+TRACY_API void StartupProfiler();
+TRACY_API void ShutdownProfiler();
+#endif
+
+class GpuCtx;
+class Profiler;
+class Socket;
+class UdpBroadcast;
+
+struct GpuCtxWrapper
+{
+    GpuCtx* ptr;
+};
+
+TRACY_API moodycamel::ConcurrentQueue<QueueItem>::ExplicitProducer* GetToken();
+TRACY_API Profiler& GetProfiler();
+TRACY_API std::atomic<uint32_t>& GetLockCounter();
+TRACY_API std::atomic<uint8_t>& GetGpuCtxCounter();
+TRACY_API GpuCtxWrapper& GetGpuCtx();
+TRACY_API uint64_t GetThreadHandle();
+TRACY_API void InitRPMallocThread();
+TRACY_API bool ProfilerAvailable();
+TRACY_API int64_t GetFrequencyQpc();
+
+#ifdef TRACY_ON_DEMAND
+struct LuaZoneState
+{
+    uint32_t counter;
+    bool active;
+};
+#endif
+
+
+#define TracyLfqPrepare( _type ) \
+    moodycamel::ConcurrentQueueDefaultTraits::index_t __magic; \
+    auto
__token = GetToken(); \ + auto& __tail = __token->get_tail_index(); \ + auto item = __token->enqueue_begin( __magic ); \ + MemWrite( &item->hdr.type, _type ); + +#define TracyLfqCommit \ + __tail.store( __magic + 1, std::memory_order_release ); + +#define TracyLfqPrepareC( _type ) \ + tracy::moodycamel::ConcurrentQueueDefaultTraits::index_t __magic; \ + auto __token = tracy::GetToken(); \ + auto& __tail = __token->get_tail_index(); \ + auto item = __token->enqueue_begin( __magic ); \ + tracy::MemWrite( &item->hdr.type, _type ); + +#define TracyLfqCommitC \ + __tail.store( __magic + 1, std::memory_order_release ); + + +class TRACY_API Profiler +{ + struct FrameImageQueueItem + { + void* image; + uint32_t frame; + uint16_t w; + uint16_t h; + uint8_t offset; + bool flip; + }; + +public: + Profiler(); + ~Profiler(); + + void SpawnWorkerThreads(); + + static tracy_force_inline int64_t GetTime() + { +#ifdef TRACY_HW_TIMER +# if defined TARGET_OS_IOS && TARGET_OS_IOS == 1 + return mach_absolute_time(); +# elif defined _WIN32 || defined __CYGWIN__ +# ifdef TRACY_TIMER_QPC + return GetTimeQpc(); +# else + return int64_t( __rdtsc() ); +# endif +# elif defined __i386 || defined _M_IX86 + uint32_t eax, edx; + asm volatile ( "rdtsc" : "=a" (eax), "=d" (edx) ); + return ( uint64_t( edx ) << 32 ) + uint64_t( eax ); +# elif defined __x86_64__ || defined _M_X64 + uint64_t rax, rdx; + asm volatile ( "rdtsc" : "=a" (rax), "=d" (rdx) ); + return (int64_t)(( rdx << 32 ) + rax); +# else +# error "TRACY_HW_TIMER detection logic needs fixing" +# endif +#else +# if defined __linux__ && defined CLOCK_MONOTONIC_RAW + struct timespec ts; + clock_gettime( CLOCK_MONOTONIC_RAW, &ts ); + return int64_t( ts.tv_sec ) * 1000000000ll + int64_t( ts.tv_nsec ); +# else + return std::chrono::duration_cast( std::chrono::high_resolution_clock::now().time_since_epoch() ).count(); +# endif +#endif + } + + tracy_force_inline uint32_t GetNextZoneId() + { + return m_zoneId.fetch_add( 1, std::memory_order_relaxed ); + } + + static tracy_force_inline QueueItem* QueueSerial() + { + auto& p = GetProfiler(); + p.m_serialLock.lock(); + return p.m_serialQueue.prepare_next(); + } + + static tracy_force_inline QueueItem* QueueSerialCallstack( void* ptr ) + { + auto& p = GetProfiler(); + p.m_serialLock.lock(); + p.SendCallstackSerial( ptr ); + return p.m_serialQueue.prepare_next(); + } + + static tracy_force_inline void QueueSerialFinish() + { + auto& p = GetProfiler(); + p.m_serialQueue.commit_next(); + p.m_serialLock.unlock(); + } + + static void SendFrameMark( const char* name ); + static void SendFrameMark( const char* name, QueueType type ); + static void PlotData( const char* name, int64_t val ); + static void PlotData( const char* name, float val ); + static void PlotData( const char* name, double val ); + static void ConfigurePlot( const char* name, PlotFormatType type ); + static void Message( const char* txt, size_t size, int callstack ); + static void Message( const char* txt, int callstack ); + static void MessageColor( const char* txt, size_t size, uint32_t color, int callstack ); + static void MessageColor( const char* txt, uint32_t color, int callstack ); + static void MessageAppInfo( const char* txt, size_t size ); + static void MemAlloc( const void* ptr, size_t size, bool secure ); + static void MemFree( const void* ptr, bool secure ); + static void MemAllocCallstack( const void* ptr, size_t size, int depth, bool secure ); + static void MemFreeCallstack( const void* ptr, int depth, bool secure ); + static void MemAllocNamed( 
const void* ptr, size_t size, bool secure, const char* name ); + static void MemFreeNamed( const void* ptr, bool secure, const char* name ); + static void MemAllocCallstackNamed( const void* ptr, size_t size, int depth, bool secure, const char* name ); + static void MemFreeCallstackNamed( const void* ptr, int depth, bool secure, const char* name ); + static void SendCallstack( int depth ); + static void ParameterRegister( ParameterCallback cb ); + + void SendCallstack( int depth, const char* skipBefore ); + static void CutCallstack( void* callstack, const char* skipBefore ); + + static bool ShouldExit(); + + tracy_force_inline bool IsConnected() const + { + return m_isConnected.load( std::memory_order_acquire ); + } + +#ifdef TRACY_ON_DEMAND + tracy_force_inline uint64_t ConnectionId() const + { + return m_connectionId.load( std::memory_order_acquire ); + } + + tracy_force_inline void DeferItem( const QueueItem& item ) + { + m_deferredLock.lock(); + auto dst = m_deferredQueue.push_next(); + memcpy( dst, &item, sizeof( item ) ); + m_deferredLock.unlock(); + } +#endif + + void RequestShutdown() { m_shutdown.store( true, std::memory_order_relaxed ); m_shutdownManual.store( true, std::memory_order_relaxed ); } + bool HasShutdownFinished() const { return m_shutdownFinished.load( std::memory_order_relaxed ); } + + void SendString( uint64_t str, const char* ptr, QueueType type ) { SendString( str, ptr, strlen( ptr ), type ); } + void SendString( uint64_t str, const char* ptr, size_t len, QueueType type ); + void SendSingleString( const char* ptr ) { SendSingleString( ptr, strlen( ptr ) ); } + void SendSingleString( const char* ptr, size_t len ); + void SendSecondString( const char* ptr ) { SendSecondString( ptr, strlen( ptr ) ); } + void SendSecondString( const char* ptr, size_t len ); + + + // Allocated source location data layout: + // 2b payload size + // 4b color + // 4b source line + // fsz function name + // 1b null terminator + // ssz source file name + // 1b null terminator + // nsz zone name (optional) + + static tracy_force_inline uint64_t AllocSourceLocation( uint32_t line, const char* source, const char* function ) + { + return AllocSourceLocation( line, source, function, nullptr, 0 ); + } + + static tracy_force_inline uint64_t AllocSourceLocation( uint32_t line, const char* source, const char* function, const char* name, size_t nameSz ) + { + return AllocSourceLocation( line, source, strlen(source), function, strlen(function), name, nameSz ); + } + + static tracy_force_inline uint64_t AllocSourceLocation( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz ) + { + return AllocSourceLocation( line, source, sourceSz, function, functionSz, nullptr, 0 ); + } + + static tracy_force_inline uint64_t AllocSourceLocation( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz ) + { + const auto sz32 = uint32_t( 2 + 4 + 4 + functionSz + 1 + sourceSz + 1 + nameSz ); + assert( sz32 <= std::numeric_limits::max() ); + const auto sz = uint16_t( sz32 ); + auto ptr = (char*)tracy_malloc( sz ); + memcpy( ptr, &sz, 2 ); + memset( ptr + 2, 0, 4 ); + memcpy( ptr + 6, &line, 4 ); + memcpy( ptr + 10, function, functionSz ); + ptr[10 + functionSz] = '\0'; + memcpy( ptr + 10 + functionSz + 1, source, sourceSz ); + ptr[10 + functionSz + 1 + sourceSz] = '\0'; + if( nameSz != 0 ) + { + memcpy( ptr + 10 + functionSz + 1 + sourceSz + 1, name, nameSz ); + } + return uint64_t( ptr ); + } + +private: + enum 
class DequeueStatus { DataDequeued, ConnectionLost, QueueEmpty }; + + static void LaunchWorker( void* ptr ) { ((Profiler*)ptr)->Worker(); } + void Worker(); + + void ClearQueues( tracy::moodycamel::ConsumerToken& token ); + void ClearSerial(); + DequeueStatus Dequeue( tracy::moodycamel::ConsumerToken& token ); + DequeueStatus DequeueContextSwitches( tracy::moodycamel::ConsumerToken& token, int64_t& timeStop ); + DequeueStatus DequeueSerial(); + bool CommitData(); + + tracy_force_inline bool AppendData( const void* data, size_t len ) + { + const auto ret = NeedDataSize( len ); + AppendDataUnsafe( data, len ); + return ret; + } + + tracy_force_inline bool NeedDataSize( size_t len ) + { + assert( len <= TargetFrameSize ); + bool ret = true; + if( m_bufferOffset - m_bufferStart + (int)len > TargetFrameSize ) + { + ret = CommitData(); + } + return ret; + } + + tracy_force_inline void AppendDataUnsafe( const void* data, size_t len ) + { + memcpy( m_buffer + m_bufferOffset, data, len ); + m_bufferOffset += int( len ); + } + + bool SendData( const char* data, size_t len ); + void SendLongString( uint64_t ptr, const char* str, size_t len, QueueType type ); + void SendSourceLocation( uint64_t ptr ); + void SendSourceLocationPayload( uint64_t ptr ); + void SendCallstackPayload( uint64_t ptr ); + void SendCallstackPayload64( uint64_t ptr ); + void SendCallstackAlloc( uint64_t ptr ); + void SendCallstackFrame( uint64_t ptr ); + void SendCodeLocation( uint64_t ptr ); + + bool HandleServerQuery(); + void HandleDisconnect(); + void HandleParameter( uint64_t payload ); + void HandleSymbolQuery( uint64_t symbol ); + void HandleSymbolCodeQuery( uint64_t symbol, uint32_t size ); + void HandleSourceCodeQuery(); + + void AckServerQuery(); + void AckSourceCodeNotAvailable(); + + void CalibrateTimer(); + void CalibrateDelay(); + void ReportTopology(); + + static tracy_force_inline void SendCallstackSerial( void* ptr ) + { +#ifdef TRACY_HAS_CALLSTACK + auto item = GetProfiler().m_serialQueue.prepare_next(); + MemWrite( &item->hdr.type, QueueType::CallstackSerial ); + MemWrite( &item->callstackFat.ptr, (uint64_t)ptr ); + GetProfiler().m_serialQueue.commit_next(); +#endif + } + + static tracy_force_inline void SendMemAlloc( QueueType type, const uint64_t thread, const void* ptr, size_t size ) + { + assert( type == QueueType::MemAlloc || type == QueueType::MemAllocCallstack || type == QueueType::MemAllocNamed || type == QueueType::MemAllocCallstackNamed ); + + auto item = GetProfiler().m_serialQueue.prepare_next(); + MemWrite( &item->hdr.type, type ); + MemWrite( &item->memAlloc.time, GetTime() ); + MemWrite( &item->memAlloc.thread, thread ); + MemWrite( &item->memAlloc.ptr, (uint64_t)ptr ); + if( compile_time_condition::value ) + { + memcpy( &item->memAlloc.size, &size, 4 ); + memset( &item->memAlloc.size + 4, 0, 2 ); + } + else + { + assert( sizeof( size ) == 8 ); + memcpy( &item->memAlloc.size, &size, 4 ); + memcpy( ((char*)&item->memAlloc.size)+4, ((char*)&size)+4, 2 ); + } + GetProfiler().m_serialQueue.commit_next(); + } + + static tracy_force_inline void SendMemFree( QueueType type, const uint64_t thread, const void* ptr ) + { + assert( type == QueueType::MemFree || type == QueueType::MemFreeCallstack || type == QueueType::MemFreeNamed || type == QueueType::MemFreeCallstackNamed ); + + auto item = GetProfiler().m_serialQueue.prepare_next(); + MemWrite( &item->hdr.type, type ); + MemWrite( &item->memFree.time, GetTime() ); + MemWrite( &item->memFree.thread, thread ); + MemWrite( &item->memFree.ptr, (uint64_t)ptr 
);
+        GetProfiler().m_serialQueue.commit_next();
+    }
+
+    static tracy_force_inline void SendMemName( const char* name )
+    {
+        assert( name );
+        auto item = GetProfiler().m_serialQueue.prepare_next();
+        MemWrite( &item->hdr.type, QueueType::MemNamePayload );
+        MemWrite( &item->memName.name, (uint64_t)name );
+        GetProfiler().m_serialQueue.commit_next();
+    }
+
+#if ( defined _WIN32 || defined __CYGWIN__ ) && defined TRACY_TIMER_QPC
+    static int64_t GetTimeQpc();
+#endif
+
+    double m_timerMul;
+    uint64_t m_resolution;
+    uint64_t m_delay;
+    std::atomic<int64_t> m_timeBegin;
+    uint64_t m_mainThread;
+    uint64_t m_epoch, m_exectime;
+    std::atomic<bool> m_shutdown;
+    std::atomic<bool> m_shutdownManual;
+    std::atomic<bool> m_shutdownFinished;
+    Socket* m_sock;
+    UdpBroadcast* m_broadcast;
+    bool m_noExit;
+    uint32_t m_userPort;
+    std::atomic<uint32_t> m_zoneId;
+    int64_t m_samplingPeriod;
+
+    uint64_t m_threadCtx;
+    int64_t m_refTimeThread;
+    int64_t m_refTimeSerial;
+    int64_t m_refTimeCtx;
+    int64_t m_refTimeGpu;
+
+    void* m_stream;     // LZ4_stream_t*
+    char* m_buffer;
+    int m_bufferOffset;
+    int m_bufferStart;
+
+    char* m_lz4Buf;
+
+    FastVector<QueueItem> m_serialQueue, m_serialDequeue;
+    TracyMutex m_serialLock;
+
+    std::atomic<uint64_t> m_frameCount;
+    std::atomic<bool> m_isConnected;
+#ifdef TRACY_ON_DEMAND
+    std::atomic<uint64_t> m_connectionId;
+
+    TracyMutex m_deferredLock;
+    FastVector<QueueItem> m_deferredQueue;
+#endif
+
+#ifdef TRACY_HAS_SYSTIME
+    void ProcessSysTime();
+
+    SysTime m_sysTime;
+    uint64_t m_sysTimeLast = 0;
+#else
+    void ProcessSysTime() {}
+#endif
+
+    ParameterCallback m_paramCallback;
+
+    char* m_queryData;
+    char* m_queryDataPtr;
+};
+
+}
+
+#endif
diff --git a/Source/ThirdParty/tracy/client/TracyRingBuffer.hpp b/Source/ThirdParty/tracy/client/TracyRingBuffer.hpp
new file mode 100644
index 000000000..29d935596
--- /dev/null
+++ b/Source/ThirdParty/tracy/client/TracyRingBuffer.hpp
@@ -0,0 +1,116 @@
+namespace tracy
+{
+
+template<size_t Size>
+class RingBuffer
+{
+public:
+    RingBuffer( int fd )
+        : m_fd( fd )
+    {
+        const auto pageSize = uint32_t( getpagesize() );
+        assert( Size >= pageSize );
+        assert( __builtin_popcount( Size ) == 1 );
+        m_mapSize = Size + pageSize;
+        auto mapAddr = mmap( nullptr, m_mapSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0 );
+        // mmap signals failure with MAP_FAILED, not a null pointer.
+        if( mapAddr == MAP_FAILED )
+        {
+            m_metadata = nullptr;
+            m_fd = 0;
+            close( fd );
+            return;
+        }
+        m_metadata = (perf_event_mmap_page*)mapAddr;
+        assert( m_metadata->data_offset == pageSize );
+        m_buffer = ((char*)mapAddr) + pageSize;
+    }
+
+    ~RingBuffer()
+    {
+        if( m_metadata ) munmap( m_metadata, m_mapSize );
+        if( m_fd ) close( m_fd );
+    }
+
+    RingBuffer( const RingBuffer& ) = delete;
+    RingBuffer& operator=( const RingBuffer& ) = delete;
+
+    RingBuffer( RingBuffer&& other )
+    {
+        // Take ownership of the mapping and leave `other` empty, so that its
+        // destructor is a no-op.
+        memcpy( (char*)this, (char*)&other, sizeof( RingBuffer ) );
+        other.m_metadata = nullptr;
+        other.m_fd = 0;
+    }
+
+    RingBuffer& operator=( RingBuffer&& other )
+    {
+        memcpy( (char*)this, (char*)&other, sizeof( RingBuffer ) );
+        other.m_metadata = nullptr;
+        other.m_fd = 0;
+        return *this;
+    }
+
+    bool IsValid() const { return m_metadata != nullptr; }
+
+    void Enable()
+    {
+        ioctl( m_fd, PERF_EVENT_IOC_ENABLE, 0 );
+    }
+
+    bool HasData() const
+    {
+        const auto head = LoadHead();
+        return head > m_metadata->data_tail;
+    }
+
+    void Read( void* dst, uint64_t offset, uint64_t cnt )
+    {
+        auto src = ( m_metadata->data_tail + offset ) % Size;
+        if( src + cnt <= Size )
+        {
+            memcpy( dst, m_buffer + src, cnt );
+        }
+        else
+        {
+            const auto s0 = Size - src;
+            memcpy( dst, m_buffer + src, s0 );
+            memcpy( (char*)dst + s0, m_buffer, cnt - s0 );
+        }
+    }
+
+    void Advance( uint64_t cnt )
+    {
+        StoreTail(
m_metadata->data_tail + cnt ); + } + + bool CheckTscCaps() const + { + return m_metadata->cap_user_time_zero; + } + + int64_t ConvertTimeToTsc( int64_t timestamp ) const + { + assert( m_metadata->cap_user_time_zero ); + const auto time = timestamp - m_metadata->time_zero; + const auto quot = time / m_metadata->time_mult; + const auto rem = time % m_metadata->time_mult; + return ( quot << m_metadata->time_shift ) + ( rem << m_metadata->time_shift ) / m_metadata->time_mult; + } + +private: + uint64_t LoadHead() const + { + return std::atomic_load_explicit( (const volatile std::atomic*)&m_metadata->data_head, std::memory_order_acquire ); + } + + void StoreTail( uint64_t tail ) + { + std::atomic_store_explicit( (volatile std::atomic*)&m_metadata->data_tail, tail, std::memory_order_release ); + } + + perf_event_mmap_page* m_metadata; + char* m_buffer; + + size_t m_mapSize; + int m_fd; +}; + +} diff --git a/Source/ThirdParty/tracy/client/TracyScoped.hpp b/Source/ThirdParty/tracy/client/TracyScoped.hpp new file mode 100644 index 000000000..fa6a52808 --- /dev/null +++ b/Source/ThirdParty/tracy/client/TracyScoped.hpp @@ -0,0 +1,174 @@ +#ifndef __TRACYSCOPED_HPP__ +#define __TRACYSCOPED_HPP__ + +#include +#include +#include + +#include "../common/TracySystem.hpp" +#include "../common/TracyAlign.hpp" +#include "../common/TracyAlloc.hpp" +#include "TracyProfiler.hpp" + +namespace tracy +{ +inline ScopedZone::ScopedZone( const SourceLocationData* srcloc, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( is_active ) +#endif +{ + if( !m_active ) return; +#ifdef TRACY_ON_DEMAND + m_connectionId = GetProfiler().ConnectionId(); +#endif + TracyLfqPrepare( QueueType::ZoneBegin ); + MemWrite( &item->zoneBegin.time, Profiler::GetTime() ); + MemWrite( &item->zoneBegin.srcloc, (uint64_t)srcloc ); + TracyLfqCommit; +} + +inline ScopedZone::ScopedZone( const SourceLocationData* srcloc, int depth, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( is_active ) +#endif +{ + if( !m_active ) return; +#ifdef TRACY_ON_DEMAND + m_connectionId = GetProfiler().ConnectionId(); +#endif + GetProfiler().SendCallstack( depth ); + + TracyLfqPrepare( QueueType::ZoneBeginCallstack ); + MemWrite( &item->zoneBegin.time, Profiler::GetTime() ); + MemWrite( &item->zoneBegin.srcloc, (uint64_t)srcloc ); + TracyLfqCommit; +} + +inline ScopedZone::ScopedZone( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( is_active ) +#endif +{ + if( !m_active ) return; +#ifdef TRACY_ON_DEMAND + m_connectionId = GetProfiler().ConnectionId(); +#endif + TracyLfqPrepare( QueueType::ZoneBeginAllocSrcLoc ); + const auto srcloc = Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz ); + MemWrite( &item->zoneBegin.time, Profiler::GetTime() ); + MemWrite( &item->zoneBegin.srcloc, srcloc ); + TracyLfqCommit; +} + +inline ScopedZone::ScopedZone( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, int depth, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( is_active ) +#endif +{ + if( !m_active ) return; +#ifdef TRACY_ON_DEMAND + m_connectionId = 
GetProfiler().ConnectionId(); +#endif + GetProfiler().SendCallstack( depth ); + + TracyLfqPrepare( QueueType::ZoneBeginAllocSrcLocCallstack ); + const auto srcloc = Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz ); + MemWrite( &item->zoneBegin.time, Profiler::GetTime() ); + MemWrite( &item->zoneBegin.srcloc, srcloc ); + TracyLfqCommit; +} + +inline ScopedZone::~ScopedZone() +{ + if( !m_active ) return; +#ifdef TRACY_ON_DEMAND + if( GetProfiler().ConnectionId() != m_connectionId ) return; +#endif + TracyLfqPrepare( QueueType::ZoneEnd ); + MemWrite( &item->zoneEnd.time, Profiler::GetTime() ); + TracyLfqCommit; +} + +inline void ScopedZone::Text( const char* txt, size_t size ) +{ + assert( size < std::numeric_limits::max() ); + if( !m_active ) return; +#ifdef TRACY_ON_DEMAND + if( GetProfiler().ConnectionId() != m_connectionId ) return; +#endif + auto ptr = (char*)tracy_malloc( size ); + memcpy( ptr, txt, size ); + TracyLfqPrepare( QueueType::ZoneText ); + MemWrite( &item->zoneTextFat.text, (uint64_t)ptr ); + MemWrite( &item->zoneTextFat.size, (uint16_t)size ); + TracyLfqCommit; +} + +inline void ScopedZone::Name( const char* txt, size_t size ) +{ + assert( size < std::numeric_limits::max() ); + if( !m_active ) return; +#ifdef TRACY_ON_DEMAND + if( GetProfiler().ConnectionId() != m_connectionId ) return; +#endif + auto ptr = (char*)tracy_malloc( size ); + memcpy( ptr, txt, size ); + TracyLfqPrepare( QueueType::ZoneName ); + MemWrite( &item->zoneTextFat.text, (uint64_t)ptr ); + MemWrite( &item->zoneTextFat.size, (uint16_t)size ); + TracyLfqCommit; +} + +inline void ScopedZone::Name( const Char* txt, size_t size ) +{ + assert( size < std::numeric_limits::max() ); + if( !m_active ) return; +#ifdef TRACY_ON_DEMAND + if( GetProfiler().ConnectionId() != m_connectionId ) return; +#endif + auto ptr = (char*)tracy_malloc( size ); + for( int i = 0; i < size; i++) + ptr[i] = (char)txt[i]; + TracyLfqPrepare( QueueType::ZoneName ); + MemWrite( &item->zoneTextFat.text, (uint64_t)ptr ); + MemWrite( &item->zoneTextFat.size, (uint16_t)size ); + TracyLfqCommit; +} + +inline void ScopedZone::Color( uint32_t color ) +{ + if( !m_active ) return; +#ifdef TRACY_ON_DEMAND + if( GetProfiler().ConnectionId() != m_connectionId ) return; +#endif + TracyLfqPrepare( QueueType::ZoneColor ); + MemWrite( &item->zoneColor.r, uint8_t( ( color ) & 0xFF ) ); + MemWrite( &item->zoneColor.g, uint8_t( ( color >> 8 ) & 0xFF ) ); + MemWrite( &item->zoneColor.b, uint8_t( ( color >> 16 ) & 0xFF ) ); + TracyLfqCommit; +} + +inline void ScopedZone::Value( uint64_t value ) +{ + if( !m_active ) return; +#ifdef TRACY_ON_DEMAND + if( GetProfiler().ConnectionId() != m_connectionId ) return; +#endif + TracyLfqPrepare( QueueType::ZoneValue ); + MemWrite( &item->zoneValue.value, value ); + TracyLfqCommit; +} + +inline bool ScopedZone::IsActive() const { return m_active; } + +} + +#endif diff --git a/Source/ThirdParty/tracy/client/TracySysTime.cpp b/Source/ThirdParty/tracy/client/TracySysTime.cpp new file mode 100644 index 000000000..e5903467d --- /dev/null +++ b/Source/ThirdParty/tracy/client/TracySysTime.cpp @@ -0,0 +1,108 @@ +#include "TracySysTime.hpp" + +#ifdef TRACY_HAS_SYSTIME + +# if defined _WIN32 || defined __CYGWIN__ +# include +# elif defined __linux__ +# include +# include +# elif defined __APPLE__ +# include +# include +# elif defined BSD +# include +# include +# endif + +namespace tracy +{ + +# if defined _WIN32 || defined __CYGWIN__ + +static inline uint64_t ConvertTime( const FILETIME& t 
)
+{
+    return ( uint64_t( t.dwHighDateTime ) << 32 ) | uint64_t( t.dwLowDateTime );
+}
+
+void SysTime::ReadTimes()
+{
+    FILETIME idleTime;
+    FILETIME kernelTime;
+    FILETIME userTime;
+
+    GetSystemTimes( &idleTime, &kernelTime, &userTime );
+
+    idle = ConvertTime( idleTime );
+    const auto kernel = ConvertTime( kernelTime );
+    const auto user = ConvertTime( userTime );
+    used = kernel + user;
+}
+
+# elif defined __linux__
+
+void SysTime::ReadTimes()
+{
+    uint64_t user, nice, system;
+    FILE* f = fopen( "/proc/stat", "r" );
+    if( f )
+    {
+        int read = fscanf( f, "cpu %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64, &user, &nice, &system, &idle );
+        fclose( f );
+        if( read == 4 )
+        {
+            used = user + nice + system;
+        }
+    }
+}
+
+# elif defined __APPLE__
+
+void SysTime::ReadTimes()
+{
+    host_cpu_load_info_data_t info;
+    mach_msg_type_number_t cnt = HOST_CPU_LOAD_INFO_COUNT;
+    host_statistics( mach_host_self(), HOST_CPU_LOAD_INFO, reinterpret_cast<host_info_t>( &info ), &cnt );
+    used = info.cpu_ticks[CPU_STATE_USER] + info.cpu_ticks[CPU_STATE_NICE] + info.cpu_ticks[CPU_STATE_SYSTEM];
+    idle = info.cpu_ticks[CPU_STATE_IDLE];
+}
+
+# elif defined BSD
+
+void SysTime::ReadTimes()
+{
+    u_long data[5];
+    size_t sz = sizeof( data );
+    sysctlbyname( "kern.cp_time", &data, &sz, nullptr, 0 );
+    used = data[0] + data[1] + data[2] + data[3];
+    idle = data[4];
+}
+
+#endif
+
+SysTime::SysTime()
+{
+    ReadTimes();
+}
+
+float SysTime::Get()
+{
+    const auto oldUsed = used;
+    const auto oldIdle = idle;
+
+    ReadTimes();
+
+    const auto diffIdle = idle - oldIdle;
+    const auto diffUsed = used - oldUsed;
+
+#if defined _WIN32 || defined __CYGWIN__
+    // On Windows the kernel time reported by GetSystemTimes() includes idle
+    // time, so subtract it to get the busy share.
+    return diffUsed == 0 ? -1 : ( diffUsed - diffIdle ) * 100.f / diffUsed;
+#elif defined __linux__ || defined __APPLE__ || defined BSD
+    const auto total = diffUsed + diffIdle;
+    return total == 0 ?
-1 : diffUsed * 100.f / total; +#endif +} + +} + +#endif diff --git a/Source/ThirdParty/tracy/client/TracySysTime.hpp b/Source/ThirdParty/tracy/client/TracySysTime.hpp new file mode 100644 index 000000000..fc6ba321a --- /dev/null +++ b/Source/ThirdParty/tracy/client/TracySysTime.hpp @@ -0,0 +1,36 @@ +#ifndef __TRACYSYSTIME_HPP__ +#define __TRACYSYSTIME_HPP__ + +#if defined _WIN32 || defined __CYGWIN__ || defined __linux__ || defined __APPLE__ +# define TRACY_HAS_SYSTIME +#else +# include +#endif + +#ifdef BSD +# define TRACY_HAS_SYSTIME +#endif + +#ifdef TRACY_HAS_SYSTIME + +#include + +namespace tracy +{ + +class SysTime +{ +public: + SysTime(); + float Get(); + + void ReadTimes(); + +private: + uint64_t idle, used; +}; + +} +#endif + +#endif diff --git a/Source/ThirdParty/tracy/client/TracySysTrace.cpp b/Source/ThirdParty/tracy/client/TracySysTrace.cpp new file mode 100644 index 000000000..972779770 --- /dev/null +++ b/Source/ThirdParty/tracy/client/TracySysTrace.cpp @@ -0,0 +1,1326 @@ +#include "TracySysTrace.hpp" + +#ifdef TRACY_HAS_SYSTEM_TRACING + +# if defined _WIN32 || defined __CYGWIN__ + +# ifndef NOMINMAX +# define NOMINMAX +# endif + +# define INITGUID +# include +# include +# include +# include +# include +# include +# include +# include + +# include "../common/TracyAlloc.hpp" +# include "../common/TracySystem.hpp" +# include "TracyProfiler.hpp" +# include "TracyThread.hpp" + +namespace tracy +{ + +static const GUID PerfInfoGuid = { 0xce1dbfb4, 0x137e, 0x4da6, { 0x87, 0xb0, 0x3f, 0x59, 0xaa, 0x10, 0x2c, 0xbc } }; +static const GUID DxgKrnlGuid = { 0x802ec45a, 0x1e99, 0x4b83, { 0x99, 0x20, 0x87, 0xc9, 0x82, 0x77, 0xba, 0x9d } }; + + +static TRACEHANDLE s_traceHandle; +static TRACEHANDLE s_traceHandle2; +static EVENT_TRACE_PROPERTIES* s_prop; +static DWORD s_pid; + +static EVENT_TRACE_PROPERTIES* s_propVsync; +static TRACEHANDLE s_traceHandleVsync; +static TRACEHANDLE s_traceHandleVsync2; +Thread* s_threadVsync = nullptr; + +struct CSwitch +{ + uint32_t newThreadId; + uint32_t oldThreadId; + int8_t newThreadPriority; + int8_t oldThreadPriority; + uint8_t previousCState; + int8_t spareByte; + int8_t oldThreadWaitReason; + int8_t oldThreadWaitMode; + int8_t oldThreadState; + int8_t oldThreadWaitIdealProcessor; + uint32_t newThreadWaitTime; + uint32_t reserved; +}; + +struct ReadyThread +{ + uint32_t threadId; + int8_t adjustReason; + int8_t adjustIncrement; + int8_t flag; + int8_t reserverd; +}; + +struct ThreadTrace +{ + uint32_t processId; + uint32_t threadId; + uint32_t stackBase; + uint32_t stackLimit; + uint32_t userStackBase; + uint32_t userStackLimit; + uint32_t startAddr; + uint32_t win32StartAddr; + uint32_t tebBase; + uint32_t subProcessTag; +}; + +struct StackWalkEvent +{ + uint64_t eventTimeStamp; + uint32_t stackProcess; + uint32_t stackThread; + uint64_t stack[192]; +}; + +struct VSyncInfo +{ + void* dxgAdapter; + uint32_t vidPnTargetId; + uint64_t scannedPhysicalAddress; + uint32_t vidPnSourceId; + uint32_t frameNumber; + int64_t frameQpcTime; + void* hFlipDevice; + uint32_t flipType; + uint64_t flipFenceId; +}; + +#ifdef __CYGWIN__ +extern "C" typedef DWORD (WINAPI *t_GetProcessIdOfThread)( HANDLE ); +extern "C" typedef DWORD (WINAPI *t_GetProcessImageFileNameA)( HANDLE, LPSTR, DWORD ); +extern "C" ULONG WMIAPI TraceSetInformation(TRACEHANDLE SessionHandle, TRACE_INFO_CLASS InformationClass, PVOID TraceInformation, ULONG InformationLength); +t_GetProcessIdOfThread GetProcessIdOfThread = (t_GetProcessIdOfThread)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), 
"GetProcessIdOfThread" ); +t_GetProcessImageFileNameA GetProcessImageFileNameA = (t_GetProcessImageFileNameA)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "K32GetProcessImageFileNameA" ); +#endif + +extern "C" typedef NTSTATUS (WINAPI *t_NtQueryInformationThread)( HANDLE, THREADINFOCLASS, PVOID, ULONG, PULONG ); +extern "C" typedef BOOL (WINAPI *t_EnumProcessModules)( HANDLE, HMODULE*, DWORD, LPDWORD ); +extern "C" typedef BOOL (WINAPI *t_GetModuleInformation)( HANDLE, HMODULE, LPMODULEINFO, DWORD ); +extern "C" typedef DWORD (WINAPI *t_GetModuleBaseNameA)( HANDLE, HMODULE, LPSTR, DWORD ); +extern "C" typedef HRESULT (WINAPI *t_GetThreadDescription)( HANDLE, PWSTR* ); + +t_NtQueryInformationThread NtQueryInformationThread = (t_NtQueryInformationThread)GetProcAddress( GetModuleHandleA( "ntdll.dll" ), "NtQueryInformationThread" ); +t_EnumProcessModules _EnumProcessModules = (t_EnumProcessModules)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "K32EnumProcessModules" ); +t_GetModuleInformation _GetModuleInformation = (t_GetModuleInformation)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "K32GetModuleInformation" ); +t_GetModuleBaseNameA _GetModuleBaseNameA = (t_GetModuleBaseNameA)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "K32GetModuleBaseNameA" ); + +static t_GetThreadDescription _GetThreadDescription = 0; + + +void WINAPI EventRecordCallback( PEVENT_RECORD record ) +{ +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + + const auto& hdr = record->EventHeader; + switch( hdr.ProviderId.Data1 ) + { + case 0x3d6fa8d1: // Thread Guid + if( hdr.EventDescriptor.Opcode == 36 ) + { + const auto cswitch = (const CSwitch*)record->UserData; + + TracyLfqPrepare( QueueType::ContextSwitch ); + MemWrite( &item->contextSwitch.time, hdr.TimeStamp.QuadPart ); + memcpy( &item->contextSwitch.oldThread, &cswitch->oldThreadId, sizeof( cswitch->oldThreadId ) ); + memcpy( &item->contextSwitch.newThread, &cswitch->newThreadId, sizeof( cswitch->newThreadId ) ); + memset( ((char*)&item->contextSwitch.oldThread)+4, 0, 4 ); + memset( ((char*)&item->contextSwitch.newThread)+4, 0, 4 ); + MemWrite( &item->contextSwitch.cpu, record->BufferContext.ProcessorNumber ); + MemWrite( &item->contextSwitch.reason, cswitch->oldThreadWaitReason ); + MemWrite( &item->contextSwitch.state, cswitch->oldThreadState ); + TracyLfqCommit; + } + else if( hdr.EventDescriptor.Opcode == 50 ) + { + const auto rt = (const ReadyThread*)record->UserData; + + TracyLfqPrepare( QueueType::ThreadWakeup ); + MemWrite( &item->threadWakeup.time, hdr.TimeStamp.QuadPart ); + memcpy( &item->threadWakeup.thread, &rt->threadId, sizeof( rt->threadId ) ); + memset( ((char*)&item->threadWakeup.thread)+4, 0, 4 ); + TracyLfqCommit; + } + else if( hdr.EventDescriptor.Opcode == 1 || hdr.EventDescriptor.Opcode == 3 ) + { + const auto tt = (const ThreadTrace*)record->UserData; + + uint64_t tid = tt->threadId; + if( tid == 0 ) return; + uint64_t pid = tt->processId; + TracyLfqPrepare( QueueType::TidToPid ); + MemWrite( &item->tidToPid.tid, tid ); + MemWrite( &item->tidToPid.pid, pid ); + TracyLfqCommit; + } + break; + case 0xdef2fe46: // StackWalk Guid + if( hdr.EventDescriptor.Opcode == 32 ) + { + const auto sw = (const StackWalkEvent*)record->UserData; + if( sw->stackProcess == s_pid && ( sw->stack[0] & 0x8000000000000000 ) == 0 ) + { + const uint64_t sz = ( record->UserDataLength - 16 ) / 8; + if( sz > 0 ) + { + auto trace = (uint64_t*)tracy_malloc( ( 1 + sz ) * sizeof( uint64_t ) ); + memcpy( trace, &sz, sizeof( 
uint64_t ) ); + memcpy( trace+1, sw->stack, sizeof( uint64_t ) * sz ); + TracyLfqPrepare( QueueType::CallstackSample ); + MemWrite( &item->callstackSampleFat.time, sw->eventTimeStamp ); + MemWrite( &item->callstackSampleFat.thread, (uint64_t)sw->stackThread ); + MemWrite( &item->callstackSampleFat.ptr, (uint64_t)trace ); + TracyLfqCommit; + } + } + } + break; + default: + break; + } +} + +static constexpr const char* VsyncName[] = { + "[0] Vsync", + "[1] Vsync", + "[2] Vsync", + "[3] Vsync", + "[4] Vsync", + "[5] Vsync", + "[6] Vsync", + "[7] Vsync", + "Vsync" +}; + +static uint32_t VsyncTarget[8] = {}; + +void WINAPI EventRecordCallbackVsync( PEVENT_RECORD record ) +{ +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + + const auto& hdr = record->EventHeader; + assert( hdr.ProviderId.Data1 == 0x802EC45A ); + assert( hdr.EventDescriptor.Id == 0x0011 ); + + const auto vs = (const VSyncInfo*)record->UserData; + + int idx = 0; + do + { + if( VsyncTarget[idx] == 0 ) + { + VsyncTarget[idx] = vs->vidPnTargetId; + break; + } + else if( VsyncTarget[idx] == vs->vidPnTargetId ) + { + break; + } + } + while( ++idx < 8 ); + + TracyLfqPrepare( QueueType::FrameMarkMsg ); + MemWrite( &item->frameMark.time, hdr.TimeStamp.QuadPart ); + MemWrite( &item->frameMark.name, uint64_t( VsyncName[idx] ) ); + TracyLfqCommit; +} + +static void SetupVsync() +{ +#if _WIN32_WINNT >= _WIN32_WINNT_WINBLUE + const auto psz = sizeof( EVENT_TRACE_PROPERTIES ) + MAX_PATH; + s_propVsync = (EVENT_TRACE_PROPERTIES*)tracy_malloc( psz ); + memset( s_propVsync, 0, sizeof( EVENT_TRACE_PROPERTIES ) ); + s_propVsync->LogFileMode = EVENT_TRACE_REAL_TIME_MODE; + s_propVsync->Wnode.BufferSize = psz; +#ifdef TRACY_TIMER_QPC + s_propVsync->Wnode.ClientContext = 1; +#else + s_propVsync->Wnode.ClientContext = 3; +#endif + s_propVsync->LoggerNameOffset = sizeof( EVENT_TRACE_PROPERTIES ); + strcpy( ((char*)s_propVsync) + sizeof( EVENT_TRACE_PROPERTIES ), "TracyVsync" ); + + auto backup = tracy_malloc( psz ); + memcpy( backup, s_propVsync, psz ); + + const auto controlStatus = ControlTraceA( 0, "TracyVsync", s_propVsync, EVENT_TRACE_CONTROL_STOP ); + if( controlStatus != ERROR_SUCCESS && controlStatus != ERROR_WMI_INSTANCE_NOT_FOUND ) + { + tracy_free( backup ); + tracy_free( s_propVsync ); + return; + } + + memcpy( s_propVsync, backup, psz ); + tracy_free( backup ); + + const auto startStatus = StartTraceA( &s_traceHandleVsync, "TracyVsync", s_propVsync ); + if( startStatus != ERROR_SUCCESS ) + { + tracy_free( s_propVsync ); + return; + } + + EVENT_FILTER_EVENT_ID fe = {}; + fe.FilterIn = TRUE; + fe.Count = 1; + fe.Events[0] = 0x0011; // VSyncDPC_Info + + EVENT_FILTER_DESCRIPTOR desc = {}; + desc.Ptr = (ULONGLONG)&fe; + desc.Size = sizeof( fe ); + desc.Type = EVENT_FILTER_TYPE_EVENT_ID; + + ENABLE_TRACE_PARAMETERS params = {}; + params.Version = ENABLE_TRACE_PARAMETERS_VERSION_2; + params.EnableProperty = EVENT_ENABLE_PROPERTY_IGNORE_KEYWORD_0; + params.SourceId = s_propVsync->Wnode.Guid; + params.EnableFilterDesc = &desc; + params.FilterDescCount = 1; + + uint64_t mask = 0x4000000000000001; // Microsoft_Windows_DxgKrnl_Performance | Base + if( EnableTraceEx2( s_traceHandleVsync, &DxgKrnlGuid, EVENT_CONTROL_CODE_ENABLE_PROVIDER, TRACE_LEVEL_INFORMATION, mask, mask, 0, ¶ms ) != ERROR_SUCCESS ) + { + tracy_free( s_propVsync ); + return; + } + + char loggerName[MAX_PATH]; + strcpy( loggerName, "TracyVsync" ); + + EVENT_TRACE_LOGFILEA log = {}; + log.LoggerName = loggerName; + log.ProcessTraceMode = 
PROCESS_TRACE_MODE_REAL_TIME | PROCESS_TRACE_MODE_EVENT_RECORD | PROCESS_TRACE_MODE_RAW_TIMESTAMP; + log.EventRecordCallback = EventRecordCallbackVsync; + + s_traceHandleVsync2 = OpenTraceA( &log ); + if( s_traceHandleVsync2 == (TRACEHANDLE)INVALID_HANDLE_VALUE ) + { + CloseTrace( s_traceHandleVsync ); + tracy_free( s_propVsync ); + return; + } + + s_threadVsync = (Thread*)tracy_malloc( sizeof( Thread ) ); + new(s_threadVsync) Thread( [] (void*) { + ThreadExitHandler threadExitHandler; + SetThreadPriority( GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL ); + SetThreadName( "Tracy Vsync" ); + ProcessTrace( &s_traceHandleVsync2, 1, nullptr, nullptr ); + }, nullptr ); +#endif +} + +bool SysTraceStart( int64_t& samplingPeriod ) +{ + if( !_GetThreadDescription ) _GetThreadDescription = (t_GetThreadDescription)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "GetThreadDescription" ); + + s_pid = GetCurrentProcessId(); + +#if defined _WIN64 + constexpr bool isOs64Bit = true; +#else + BOOL _iswow64; + IsWow64Process( GetCurrentProcess(), &_iswow64 ); + const bool isOs64Bit = _iswow64; +#endif + + TOKEN_PRIVILEGES priv = {}; + priv.PrivilegeCount = 1; + priv.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + if( LookupPrivilegeValue( nullptr, SE_SYSTEM_PROFILE_NAME, &priv.Privileges[0].Luid ) == 0 ) return false; + + HANDLE pt; + if( OpenProcessToken( GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &pt ) == 0 ) return false; + const auto adjust = AdjustTokenPrivileges( pt, FALSE, &priv, 0, nullptr, nullptr ); + CloseHandle( pt ); + if( adjust == 0 ) return false; + const auto status = GetLastError(); + if( status != ERROR_SUCCESS ) return false; + + if( isOs64Bit ) + { + TRACE_PROFILE_INTERVAL interval = {}; + interval.Interval = 1250; // 8 kHz + const auto intervalStatus = TraceSetInformation( 0, TraceSampledProfileIntervalInfo, &interval, sizeof( interval ) ); + if( intervalStatus != ERROR_SUCCESS ) return false; + samplingPeriod = 125*1000; + } + + const auto psz = sizeof( EVENT_TRACE_PROPERTIES ) + sizeof( KERNEL_LOGGER_NAME ); + s_prop = (EVENT_TRACE_PROPERTIES*)tracy_malloc( psz ); + memset( s_prop, 0, sizeof( EVENT_TRACE_PROPERTIES ) ); + ULONG flags = 0; +#ifndef TRACY_NO_CONTEXT_SWITCH + flags = EVENT_TRACE_FLAG_CSWITCH | EVENT_TRACE_FLAG_DISPATCHER | EVENT_TRACE_FLAG_THREAD; +#endif +#ifndef TRACY_NO_SAMPLING + if( isOs64Bit ) flags |= EVENT_TRACE_FLAG_PROFILE; +#endif + s_prop->EnableFlags = flags; + s_prop->LogFileMode = EVENT_TRACE_REAL_TIME_MODE; + s_prop->Wnode.BufferSize = psz; + s_prop->Wnode.Flags = WNODE_FLAG_TRACED_GUID; +#ifdef TRACY_TIMER_QPC + s_prop->Wnode.ClientContext = 1; +#else + s_prop->Wnode.ClientContext = 3; +#endif + s_prop->Wnode.Guid = SystemTraceControlGuid; + s_prop->BufferSize = 1024; + s_prop->MinimumBuffers = std::thread::hardware_concurrency() * 4; + s_prop->MaximumBuffers = std::thread::hardware_concurrency() * 6; + s_prop->LoggerNameOffset = sizeof( EVENT_TRACE_PROPERTIES ); + memcpy( ((char*)s_prop) + sizeof( EVENT_TRACE_PROPERTIES ), KERNEL_LOGGER_NAME, sizeof( KERNEL_LOGGER_NAME ) ); + + auto backup = tracy_malloc( psz ); + memcpy( backup, s_prop, psz ); + + const auto controlStatus = ControlTrace( 0, KERNEL_LOGGER_NAME, s_prop, EVENT_TRACE_CONTROL_STOP ); + if( controlStatus != ERROR_SUCCESS && controlStatus != ERROR_WMI_INSTANCE_NOT_FOUND ) + { + tracy_free( backup ); + tracy_free( s_prop ); + return false; + } + + memcpy( s_prop, backup, psz ); + tracy_free( backup ); + + const auto startStatus = StartTrace( &s_traceHandle, KERNEL_LOGGER_NAME, 
s_prop ); + if( startStatus != ERROR_SUCCESS ) + { + tracy_free( s_prop ); + return false; + } + + if( isOs64Bit ) + { + CLASSIC_EVENT_ID stackId; + stackId.EventGuid = PerfInfoGuid; + stackId.Type = 46; + const auto stackStatus = TraceSetInformation( s_traceHandle, TraceStackTracingInfo, &stackId, sizeof( stackId ) ); + if( stackStatus != ERROR_SUCCESS ) + { + tracy_free( s_prop ); + return false; + } + } + +#ifdef UNICODE + WCHAR KernelLoggerName[sizeof( KERNEL_LOGGER_NAME )]; +#else + char KernelLoggerName[sizeof( KERNEL_LOGGER_NAME )]; +#endif + memcpy( KernelLoggerName, KERNEL_LOGGER_NAME, sizeof( KERNEL_LOGGER_NAME ) ); + EVENT_TRACE_LOGFILE log = {}; + log.LoggerName = KernelLoggerName; + log.ProcessTraceMode = PROCESS_TRACE_MODE_REAL_TIME | PROCESS_TRACE_MODE_EVENT_RECORD | PROCESS_TRACE_MODE_RAW_TIMESTAMP; + log.EventRecordCallback = EventRecordCallback; + + s_traceHandle2 = OpenTrace( &log ); + if( s_traceHandle2 == (TRACEHANDLE)INVALID_HANDLE_VALUE ) + { + CloseTrace( s_traceHandle ); + tracy_free( s_prop ); + return false; + } + +#ifndef TRACY_NO_VSYNC_CAPTURE + SetupVsync(); +#endif + + return true; +} + +void SysTraceStop() +{ + if( s_threadVsync ) + { + CloseTrace( s_traceHandleVsync2 ); + CloseTrace( s_traceHandleVsync ); + s_threadVsync->~Thread(); + tracy_free( s_threadVsync ); + } + + CloseTrace( s_traceHandle2 ); + CloseTrace( s_traceHandle ); +} + +void SysTraceWorker( void* ptr ) +{ + ThreadExitHandler threadExitHandler; + SetThreadPriority( GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL ); + SetThreadName( "Tracy SysTrace" ); + ProcessTrace( &s_traceHandle2, 1, 0, 0 ); + ControlTrace( 0, KERNEL_LOGGER_NAME, s_prop, EVENT_TRACE_CONTROL_STOP ); + tracy_free( s_prop ); +} + +void SysTraceSendExternalName( uint64_t thread ) +{ + bool threadSent = false; + auto hnd = OpenThread( THREAD_QUERY_INFORMATION, FALSE, DWORD( thread ) ); + if( hnd == 0 ) + { + hnd = OpenThread( THREAD_QUERY_LIMITED_INFORMATION, FALSE, DWORD( thread ) ); + } + if( hnd != 0 ) + { + PWSTR tmp; + _GetThreadDescription( hnd, &tmp ); + char buf[256]; + if( tmp ) + { + auto ret = wcstombs( buf, tmp, 256 ); + if( ret != 0 ) + { + GetProfiler().SendString( thread, buf, ret, QueueType::ExternalThreadName ); + threadSent = true; + } + } + const auto pid = GetProcessIdOfThread( hnd ); + if( !threadSent && NtQueryInformationThread && _EnumProcessModules && _GetModuleInformation && _GetModuleBaseNameA ) + { + void* ptr; + ULONG retlen; + auto status = NtQueryInformationThread( hnd, (THREADINFOCLASS)9 /*ThreadQuerySetWin32StartAddress*/, &ptr, sizeof( &ptr ), &retlen ); + if( status == 0 ) + { + const auto phnd = OpenProcess( PROCESS_QUERY_INFORMATION | PROCESS_VM_READ, FALSE, pid ); + if( phnd != INVALID_HANDLE_VALUE ) + { + HMODULE modules[1024]; + DWORD needed; + if( _EnumProcessModules( phnd, modules, 1024 * sizeof( HMODULE ), &needed ) != 0 ) + { + const auto sz = std::min( DWORD( needed / sizeof( HMODULE ) ), DWORD( 1024 ) ); + for( DWORD i=0; i= (uint64_t)info.lpBaseOfDll && (uint64_t)ptr <= (uint64_t)info.lpBaseOfDll + (uint64_t)info.SizeOfImage ) + { + char buf2[1024]; + const auto modlen = _GetModuleBaseNameA( phnd, modules[i], buf2, 1024 ); + if( modlen != 0 ) + { + GetProfiler().SendString( thread, buf2, modlen, QueueType::ExternalThreadName ); + threadSent = true; + } + } + } + } + } + CloseHandle( phnd ); + } + } + } + CloseHandle( hnd ); + if( !threadSent ) + { + GetProfiler().SendString( thread, "???", 3, QueueType::ExternalThreadName ); + threadSent = true; + } + if( pid != 0 ) + { + { + 
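+            // Widen the DWORD pid to 64 bits first: the TidToPid queue event
+            // stores both the thread and the process id as uint64_t.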
uint64_t _pid = pid; + TracyLfqPrepare( QueueType::TidToPid ); + MemWrite( &item->tidToPid.tid, thread ); + MemWrite( &item->tidToPid.pid, _pid ); + TracyLfqCommit; + } + if( pid == 4 ) + { + GetProfiler().SendString( thread, "System", 6, QueueType::ExternalName ); + return; + } + else + { + const auto phnd = OpenProcess( PROCESS_QUERY_LIMITED_INFORMATION, FALSE, pid ); + if( phnd != INVALID_HANDLE_VALUE ) + { + char buf2[1024]; + const auto sz = GetProcessImageFileNameA( phnd, buf2, 1024 ); + CloseHandle( phnd ); + if( sz != 0 ) + { + auto ptr = buf2 + sz - 1; + while( ptr > buf2 && *ptr != '\\' ) ptr--; + if( *ptr == '\\' ) ptr++; + GetProfiler().SendString( thread, ptr, QueueType::ExternalName ); + return; + } + } + } + } + } + + if( !threadSent ) + { + GetProfiler().SendString( thread, "???", 3, QueueType::ExternalThreadName ); + } + GetProfiler().SendString( thread, "???", 3, QueueType::ExternalName ); +} + +} + +# elif defined __linux__ + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include "TracyProfiler.hpp" +# include "TracyRingBuffer.hpp" +# include "TracyThread.hpp" + +# ifdef __ANDROID__ +# include "TracySysTracePayload.hpp" +# endif + +namespace tracy +{ + +static const char BasePath[] = "/sys/kernel/debug/tracing/"; +static const char TracingOn[] = "tracing_on"; +static const char CurrentTracer[] = "current_tracer"; +static const char TraceOptions[] = "trace_options"; +static const char TraceClock[] = "trace_clock"; +static const char SchedSwitch[] = "events/sched/sched_switch/enable"; +static const char SchedWakeup[] = "events/sched/sched_wakeup/enable"; +static const char BufferSizeKb[] = "buffer_size_kb"; +static const char TracePipe[] = "trace_pipe"; + +static std::atomic traceActive { false }; +static Thread* s_threadSampling = nullptr; +static int s_numCpus = 0; + +static constexpr size_t RingBufSize = 64*1024; +static RingBuffer* s_ring = nullptr; + +static int perf_event_open( struct perf_event_attr* hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags ) +{ + return syscall( __NR_perf_event_open, hw_event, pid, cpu, group_fd, flags ); +} + +static void SetupSampling( int64_t& samplingPeriod ) +{ +#ifndef CLOCK_MONOTONIC_RAW + return; +#endif + + samplingPeriod = 100*1000; + + s_numCpus = (int)std::thread::hardware_concurrency(); + s_ring = (RingBuffer*)tracy_malloc( sizeof( RingBuffer ) * s_numCpus ); + + perf_event_attr pe = {}; + + pe.type = PERF_TYPE_SOFTWARE; + pe.size = sizeof( perf_event_attr ); + pe.config = PERF_COUNT_SW_CPU_CLOCK; + + pe.sample_freq = 10000; + pe.sample_type = PERF_SAMPLE_TID | PERF_SAMPLE_TIME | PERF_SAMPLE_CALLCHAIN; +#if LINUX_VERSION_CODE >= KERNEL_VERSION( 4, 8, 0 ) + pe.sample_max_stack = 127; +#endif + pe.exclude_callchain_kernel = 1; + + pe.disabled = 1; + pe.freq = 1; +#if !defined TRACY_HW_TIMER || !( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) + pe.use_clockid = 1; + pe.clockid = CLOCK_MONOTONIC_RAW; +#endif + + for( int i=0; i(); + tracy_free( s_ring ); + return; + } + new( s_ring+i ) RingBuffer( fd ); + } + + s_threadSampling = (Thread*)tracy_malloc( sizeof( Thread ) ); + new(s_threadSampling) Thread( [] (void*) { + ThreadExitHandler threadExitHandler; + SetThreadName( "Tracy Sampling" ); + sched_param sp = { 5 }; + pthread_setschedparam( pthread_self(), SCHED_FIFO, &sp ); + uint32_t currentPid = (uint32_t)getpid(); +#if defined 
TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) + for( int i=0; i(); + tracy_free( s_ring ); + const char* err = "Tracy Profiler: sampling is disabled due to non-native scheduler clock. Are you running under a VM?"; + Profiler::MessageAppInfo( err, strlen( err ) ); + return; + } + } +#endif + for( int i=0; i> 63; + const auto m2 = test >> 47; + if( m1 == m2 ) break; + } + while( --cnt > 0 ); + for( uint64_t j=1; j> 63; + const auto m2 = test >> 47; + if( m1 != m2 ) trace[j] = 0; + } + + // skip kernel frames + uint64_t j; + for( j=0; j= 0 ) break; + } + if( j == cnt ) + { + tracy_free( trace ); + } + else + { + if( j > 0 ) + { + cnt -= j; + memmove( trace+1, trace+1+j, sizeof( uint64_t ) * cnt ); + } + memcpy( trace, &cnt, sizeof( uint64_t ) ); + +#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) + t0 = s_ring[i].ConvertTimeToTsc( t0 ); +#endif + + TracyLfqPrepare( QueueType::CallstackSample ); + MemWrite( &item->callstackSampleFat.time, t0 ); + MemWrite( &item->callstackSampleFat.thread, (uint64_t)tid ); + MemWrite( &item->callstackSampleFat.ptr, (uint64_t)trace ); + TracyLfqCommit; + } + } + } + s_ring[i].Advance( hdr.size ); + } + if( !traceActive.load( std::memory_order_relaxed) ) break; + if( !hadData ) + { + std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + } + } + + for( int i=0; i(); + tracy_free( s_ring ); + }, nullptr ); +} + +#ifdef __ANDROID__ +static bool TraceWrite( const char* path, size_t psz, const char* val, size_t vsz ) +{ + // Explanation for "su root sh -c": there are 2 flavors of "su" in circulation + // on Android. The default Android su has the following syntax to run a command + // as root: + // su root 'command' + // and 'command' is exec'd not passed to a shell, so if shell interpretation is + // wanted, one needs to do: + // su root sh -c 'command' + // Besides that default Android 'su' command, some Android devices use a different + // su with a command-line interface closer to the familiar util-linux su found + // on Linux distributions. Fortunately, both the util-linux su and the one + // in https://github.com/topjohnwu/Magisk seem to be happy with the above + // `su root sh -c 'command'` command line syntax. 
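+    // For example, enabling the scheduler-switch event through this helper runs:
+    //   su root sh -c 'echo "1" > /sys/kernel/debug/tracing/events/sched/sched_switch/enable'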
+ char tmp[256]; + sprintf( tmp, "su root sh -c 'echo \"%s\" > %s%s'", val, BasePath, path ); + return system( tmp ) == 0; +} +#else +static bool TraceWrite( const char* path, size_t psz, const char* val, size_t vsz ) +{ + char tmp[256]; + memcpy( tmp, BasePath, sizeof( BasePath ) - 1 ); + memcpy( tmp + sizeof( BasePath ) - 1, path, psz ); + + int fd = open( tmp, O_WRONLY ); + if( fd < 0 ) return false; + + for(;;) + { + ssize_t cnt = write( fd, val, vsz ); + if( cnt == (ssize_t)vsz ) + { + close( fd ); + return true; + } + if( cnt < 0 ) + { + close( fd ); + return false; + } + vsz -= cnt; + val += cnt; + } +} +#endif + +#ifdef __ANDROID__ +void SysTraceInjectPayload() +{ + int pipefd[2]; + if( pipe( pipefd ) == 0 ) + { + const auto pid = fork(); + if( pid == 0 ) + { + // child + close( pipefd[1] ); + if( dup2( pipefd[0], STDIN_FILENO ) >= 0 ) + { + close( pipefd[0] ); + execlp( "su", "su", "root", "sh", "-c", "cat > /data/tracy_systrace", (char*)nullptr ); + exit( 1 ); + } + } + else if( pid > 0 ) + { + // parent + close( pipefd[0] ); + +#ifdef __aarch64__ + write( pipefd[1], tracy_systrace_aarch64_data, tracy_systrace_aarch64_size ); +#else + write( pipefd[1], tracy_systrace_armv7_data, tracy_systrace_armv7_size ); +#endif + close( pipefd[1] ); + waitpid( pid, nullptr, 0 ); + + system( "su root sh -c 'chmod 700 /data/tracy_systrace'" ); + } + } +} +#endif + +bool SysTraceStart( int64_t& samplingPeriod ) +{ +#ifndef CLOCK_MONOTONIC_RAW + return false; +#endif + + if( !TraceWrite( TracingOn, sizeof( TracingOn ), "0", 2 ) ) return false; + if( !TraceWrite( CurrentTracer, sizeof( CurrentTracer ), "nop", 4 ) ) return false; + TraceWrite( TraceOptions, sizeof( TraceOptions ), "norecord-cmd", 13 ); + TraceWrite( TraceOptions, sizeof( TraceOptions ), "norecord-tgid", 14 ); + TraceWrite( TraceOptions, sizeof( TraceOptions ), "noirq-info", 11 ); + TraceWrite( TraceOptions, sizeof( TraceOptions ), "noannotate", 11 ); +#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) + if( !TraceWrite( TraceClock, sizeof( TraceClock ), "x86-tsc", 8 ) ) return false; +#else + if( !TraceWrite( TraceClock, sizeof( TraceClock ), "mono_raw", 9 ) ) return false; +#endif + if( !TraceWrite( SchedSwitch, sizeof( SchedSwitch ), "1", 2 ) ) return false; + if( !TraceWrite( SchedWakeup, sizeof( SchedWakeup ), "1", 2 ) ) return false; + if( !TraceWrite( BufferSizeKb, sizeof( BufferSizeKb ), "4096", 5 ) ) return false; + +#if defined __ANDROID__ && ( defined __aarch64__ || defined __ARM_ARCH ) + SysTraceInjectPayload(); +#endif + + if( !TraceWrite( TracingOn, sizeof( TracingOn ), "1", 2 ) ) return false; + traceActive.store( true, std::memory_order_relaxed ); + + SetupSampling( samplingPeriod ); + + return true; +} + +void SysTraceStop() +{ + TraceWrite( TracingOn, sizeof( TracingOn ), "0", 2 ); + traceActive.store( false, std::memory_order_relaxed ); + if( s_threadSampling ) + { + s_threadSampling->~Thread(); + tracy_free( s_threadSampling ); + } +} + +static uint64_t ReadNumber( const char*& data ) +{ + auto ptr = data; + assert( *ptr >= '0' && *ptr <= '9' ); + uint64_t val = *ptr++ - '0'; + for(;;) + { + const uint8_t v = uint8_t( *ptr - '0' ); + if( v > 9 ) break; + val = val * 10 + v; + ptr++; + } + data = ptr; + return val; +} + +static uint8_t ReadState( char state ) +{ + switch( state ) + { + case 'D': return 101; + case 'I': return 102; + case 'R': return 103; + case 'S': return 104; + case 'T': return 105; + case 't': return 106; + case 'W': return 107; + case 
'X': return 108; + case 'Z': return 109; + default: return 100; + } +} + +#if defined __ANDROID__ && defined __ANDROID_API__ && __ANDROID_API__ < 18 +/*- + * Copyright (c) 2011 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Christos Zoulas. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +ssize_t getdelim(char **buf, size_t *bufsiz, int delimiter, FILE *fp) +{ + char *ptr, *eptr; + + if (*buf == NULL || *bufsiz == 0) { + *bufsiz = BUFSIZ; + if ((*buf = (char*)malloc(*bufsiz)) == NULL) + return -1; + } + + for (ptr = *buf, eptr = *buf + *bufsiz;;) { + int c = fgetc(fp); + if (c == -1) { + if (feof(fp)) + return ptr == *buf ? -1 : ptr - *buf; + else + return -1; + } + *ptr++ = c; + if (c == delimiter) { + *ptr = '\0'; + return ptr - *buf; + } + if (ptr + 2 >= eptr) { + char *nbuf; + size_t nbufsiz = *bufsiz * 2; + ssize_t d = ptr - *buf; + if ((nbuf = (char*)realloc(*buf, nbufsiz)) == NULL) + return -1; + *buf = nbuf; + *bufsiz = nbufsiz; + eptr = nbuf + nbufsiz; + ptr = nbuf + d; + } + } +} + +ssize_t getline(char **buf, size_t *bufsiz, FILE *fp) +{ + return getdelim(buf, bufsiz, '\n', fp); +} +#endif + +static void HandleTraceLine( const char* line ) +{ + line += 23; + while( *line != '[' ) line++; + line++; + const auto cpu = (uint8_t)ReadNumber( line ); + line++; // ']' + while( *line == ' ' ) line++; + +#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) + const auto time = ReadNumber( line ); +#else + const auto ts = ReadNumber( line ); + line++; // '.' 
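+    // Without the TSC trace clock the timestamp is printed as two dot-separated
+    // fields, seconds and microseconds; recombine them into nanoseconds below.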
+ const auto tus = ReadNumber( line ); + const auto time = ts * 1000000000ll + tus * 1000ll; +#endif + + line += 2; // ': ' + if( memcmp( line, "sched_switch", 12 ) == 0 ) + { + line += 14; + + while( memcmp( line, "prev_pid", 8 ) != 0 ) line++; + line += 9; + + const auto oldPid = ReadNumber( line ); + line++; + + while( memcmp( line, "prev_state", 10 ) != 0 ) line++; + line += 11; + + const auto oldState = (uint8_t)ReadState( *line ); + line += 5; + + while( memcmp( line, "next_pid", 8 ) != 0 ) line++; + line += 9; + + const auto newPid = ReadNumber( line ); + + uint8_t reason = 100; + + TracyLfqPrepare( QueueType::ContextSwitch ); + MemWrite( &item->contextSwitch.time, time ); + MemWrite( &item->contextSwitch.oldThread, oldPid ); + MemWrite( &item->contextSwitch.newThread, newPid ); + MemWrite( &item->contextSwitch.cpu, cpu ); + MemWrite( &item->contextSwitch.reason, reason ); + MemWrite( &item->contextSwitch.state, oldState ); + TracyLfqCommit; + } + else if( memcmp( line, "sched_wakeup", 12 ) == 0 ) + { + line += 14; + + while( memcmp( line, "pid=", 4 ) != 0 ) line++; + line += 4; + + const auto pid = ReadNumber( line ); + + TracyLfqPrepare( QueueType::ThreadWakeup ); + MemWrite( &item->threadWakeup.time, time ); + MemWrite( &item->threadWakeup.thread, pid ); + TracyLfqCommit; + } +} + +#ifdef __ANDROID__ +static void ProcessTraceLines( int fd ) +{ + // Linux pipe buffer is 64KB, additional 1KB is for unfinished lines + char* buf = (char*)tracy_malloc( (64+1)*1024 ); + char* line = buf; + + for(;;) + { + if( !traceActive.load( std::memory_order_relaxed ) ) break; + + const auto rd = read( fd, line, 64*1024 ); + if( rd <= 0 ) break; + +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) + { + if( rd < 64*1024 ) + { + assert( line[rd-1] == '\n' ); + line = buf; + std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + } + else + { + const auto end = line + rd; + line = end - 1; + while( line > buf && *line != '\n' ) line--; + if( line > buf ) + { + line++; + const auto lsz = end - line; + memmove( buf, line, lsz ); + line = buf + lsz; + } + } + continue; + } +#endif + + const auto end = line + rd; + line = buf; + for(;;) + { + auto next = (char*)memchr( line, '\n', end - line ); + if( !next ) + { + const auto lsz = end - line; + memmove( buf, line, lsz ); + line = buf + lsz; + break; + } + HandleTraceLine( line ); + line = ++next; + } + if( rd < 64*1024 ) + { + std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + } + } + + tracy_free( buf ); +} + +void SysTraceWorker( void* ptr ) +{ + ThreadExitHandler threadExitHandler; + SetThreadName( "Tracy SysTrace" ); + int pipefd[2]; + if( pipe( pipefd ) == 0 ) + { + const auto pid = fork(); + if( pid == 0 ) + { + // child + close( pipefd[0] ); + dup2( open( "/dev/null", O_WRONLY ), STDERR_FILENO ); + if( dup2( pipefd[1], STDOUT_FILENO ) >= 0 ) + { + close( pipefd[1] ); + sched_param sp = { 4 }; + pthread_setschedparam( pthread_self(), SCHED_FIFO, &sp ); +#if defined __ANDROID__ && ( defined __aarch64__ || defined __ARM_ARCH ) + execlp( "su", "su", "root", "sh", "-c", "/data/tracy_systrace", (char*)nullptr ); +#endif + execlp( "su", "su", "root", "sh", "-c", "cat /sys/kernel/debug/tracing/trace_pipe", (char*)nullptr ); + exit( 1 ); + } + } + else if( pid > 0 ) + { + // parent + close( pipefd[1] ); + sched_param sp = { 5 }; + pthread_setschedparam( pthread_self(), SCHED_FIFO, &sp ); + ProcessTraceLines( pipefd[0] ); + close( pipefd[0] ); + waitpid( pid, nullptr, 0 ); + } + } +} +#else +static void ProcessTraceLines( 
+static void ProcessTraceLines( int fd ) +{ + char* buf = (char*)tracy_malloc( 64*1024 ); + + struct pollfd pfd; + pfd.fd = fd; + pfd.events = POLLIN | POLLERR; + + for(;;) + { + while( poll( &pfd, 1, 0 ) <= 0 ) + { + if( !traceActive.load( std::memory_order_relaxed ) ) break; + std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + } + + const auto rd = read( fd, buf, 64*1024 ); + if( rd <= 0 ) break; + +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) continue; +#endif + + auto line = buf; + const auto end = buf + rd; + for(;;) + { + auto next = (char*)memchr( line, '\n', end - line ); + if( !next ) break; + HandleTraceLine( line ); + line = ++next; + } + } + + tracy_free( buf ); +} + +void SysTraceWorker( void* ptr ) +{ + ThreadExitHandler threadExitHandler; + SetThreadName( "Tracy SysTrace" ); + char tmp[256]; + memcpy( tmp, BasePath, sizeof( BasePath ) - 1 ); + memcpy( tmp + sizeof( BasePath ) - 1, TracePipe, sizeof( TracePipe ) ); + + int fd = open( tmp, O_RDONLY ); + if( fd < 0 ) return; + sched_param sp = { 5 }; + pthread_setschedparam( pthread_self(), SCHED_FIFO, &sp ); + ProcessTraceLines( fd ); + close( fd ); +} +#endif + +void SysTraceSendExternalName( uint64_t thread ) +{ + FILE* f; + char fn[256]; + sprintf( fn, "/proc/%" PRIu64 "/comm", thread ); + f = fopen( fn, "rb" ); + if( f ) + { + char buf[256]; + const auto sz = fread( buf, 1, 256, f ); + if( sz > 0 && buf[sz-1] == '\n' ) buf[sz-1] = '\0'; + GetProfiler().SendString( thread, buf, QueueType::ExternalThreadName ); + fclose( f ); + } + else + { + GetProfiler().SendString( thread, "???", 3, QueueType::ExternalThreadName ); + } + + sprintf( fn, "/proc/%" PRIu64 "/status", thread ); + f = fopen( fn, "rb" ); + if( f ) + { + int pid = -1; + size_t lsz = 1024; + auto line = (char*)tracy_malloc( lsz ); + for(;;) + { + auto rd = getline( &line, &lsz, f ); + if( rd <= 0 ) break; + if( memcmp( "Tgid:\t", line, 6 ) == 0 ) + { + pid = atoi( line + 6 ); + break; + } + } + tracy_free( line ); + fclose( f ); + if( pid >= 0 ) + { + { + uint64_t _pid = pid; + TracyLfqPrepare( QueueType::TidToPid ); + MemWrite( &item->tidToPid.tid, thread ); + MemWrite( &item->tidToPid.pid, _pid ); + TracyLfqCommit; + } + sprintf( fn, "/proc/%i/comm", pid ); + f = fopen( fn, "rb" ); + if( f ) + { + char buf[256]; + const auto sz = fread( buf, 1, 256, f ); + if( sz > 0 && buf[sz-1] == '\n' ) buf[sz-1] = '\0'; + GetProfiler().SendString( thread, buf, QueueType::ExternalName ); + fclose( f ); + return; + } + } + } + GetProfiler().SendString( thread, "???", 3, QueueType::ExternalName ); +} + +} + +# endif + +#endif diff --git a/Source/ThirdParty/tracy/client/TracySysTrace.hpp b/Source/ThirdParty/tracy/client/TracySysTrace.hpp new file mode 100644 index 000000000..688cbf2ae --- /dev/null +++ b/Source/ThirdParty/tracy/client/TracySysTrace.hpp @@ -0,0 +1,25 @@ +#ifndef __TRACYSYSTRACE_HPP__ +#define __TRACYSYSTRACE_HPP__ + +#if !defined TRACY_NO_SYSTEM_TRACING && ( defined _WIN32 || defined __CYGWIN__ || defined __linux__ ) +# define TRACY_HAS_SYSTEM_TRACING +#endif + +#ifdef TRACY_HAS_SYSTEM_TRACING + +#include <stdint.h> + +namespace tracy +{ + +bool SysTraceStart( int64_t& samplingPeriod ); +void SysTraceStop(); +void SysTraceWorker( void* ptr ); + +void SysTraceSendExternalName( uint64_t thread ); + +} + +#endif + +#endif diff --git a/Source/ThirdParty/tracy/client/TracySysTracePayload.hpp b/Source/ThirdParty/tracy/client/TracySysTracePayload.hpp new file mode 100644 index 000000000..7c292f9d0 --- /dev/null +++ b/Source/ThirdParty/tracy/client/TracySysTracePayload.hpp @@ -0,0
+1,78 @@ +// File: 'extra/systrace/tracy_systrace.armv7' (1149 bytes) +// File: 'extra/systrace/tracy_systrace.aarch64' (1650 bytes) + +// Exported using binary_to_compressed_c.cpp + +namespace tracy +{ + +static const unsigned int tracy_systrace_armv7_size = 1149; +static const unsigned int tracy_systrace_armv7_data[1152/4] = +{ + 0x464c457f, 0x00010101, 0x00000000, 0x00000000, 0x00280003, 0x00000001, 0x000001f0, 0x00000034, 0x00000000, 0x05000200, 0x00200034, 0x00280007, + 0x00000000, 0x00000006, 0x00000034, 0x00000034, 0x00000034, 0x000000e0, 0x000000e0, 0x00000004, 0x00000004, 0x00000003, 0x00000114, 0x00000114, + 0x00000114, 0x00000013, 0x00000013, 0x00000004, 0x00000001, 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x000003fd, 0x000003fd, 0x00000005, + 0x00001000, 0x00000001, 0x000003fd, 0x000013fd, 0x000013fd, 0x00000080, 0x000000b3, 0x00000006, 0x00001000, 0x00000002, 0x00000400, 0x00001400, + 0x00001400, 0x0000007d, 0x000000b0, 0x00000006, 0x00000004, 0x6474e551, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000006, + 0x00000004, 0x70000001, 0x000003a4, 0x000003a4, 0x000003a4, 0x00000008, 0x00000008, 0x00000004, 0x00000004, 0x7379732f, 0x2f6d6574, 0x2f6e6962, + 0x6b6e696c, 0x00007265, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000001, 0x00000000, 0x00000000, 0x00000012, 0x00000016, 0x00000000, + 0x00000000, 0x00000012, 0x6f6c6400, 0x006e6570, 0x4342494c, 0x62696c00, 0x732e6c64, 0x6c64006f, 0x006d7973, 0x00000001, 0x00000003, 0x00000001, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00010001, 0x0000000d, 0x00000010, 0x00000000, 0x00050d63, 0x00020000, 0x00000008, + 0x00000000, 0x000014bc, 0x00000116, 0x000014c0, 0x00000216, 0xe52de004, 0xe59fe004, 0xe08fe00e, 0xe5bef008, 0x000012dc, 0xe28fc600, 0xe28cca01, + 0xe5bcf2dc, 0xe28fc600, 0xe28cca01, 0xe5bcf2d4, 0xe92d4ff0, 0xe28db01c, 0xe24dd024, 0xe24dd801, 0xe59f017c, 0xe3a01001, 0xe3a08001, 0xe08f0000, + 0xebfffff0, 0xe59f116c, 0xe1a04000, 0xe08f1001, 0xebffffef, 0xe59f1160, 0xe1a06000, 0xe1a00004, 0xe08f1001, 0xebffffea, 0xe59f1150, 0xe1a07000, + 0xe1a00004, 0xe08f1001, 0xebffffe5, 0xe59f1140, 0xe1a05000, 0xe1a00004, 0xe08f1001, 0xebffffe0, 0xe58d0004, 0xe1a00004, 0xe59f1128, 0xe08f1001, + 0xebffffdb, 0xe59f1120, 0xe1a0a000, 0xe1a00004, 0xe08f1001, 0xebffffd6, 0xe1a04000, 0xe59f010c, 0xe3a01000, 0xe3a09000, 0xe08f0000, 0xe12fff36, + 0xe1a06000, 0xe3700001, 0xca000001, 0xe3a00000, 0xe12fff37, 0xe3a00009, 0xe3a01001, 0xe1cd01bc, 0xe3a00008, 0xe1cd01b4, 0xe3090680, 0xe3400098, + 0xe3a02000, 0xe58d000c, 0xe28d0010, 0xe58d7000, 0xe58d6018, 0xe58d8010, 0xe58d9008, 0xe12fff35, 0xe3500000, 0xca00001d, 0xe28d7018, 0xe28d8010, + 0xe28d9020, 0xe1a00007, 0xe3a01001, 0xe3a02000, 0xe12fff35, 0xe3500000, 0xda00000a, 0xe1a00006, 0xe1a01009, 0xe3a02801, 0xe12fff3a, 0xe3500001, + 0xba00000e, 0xe1a02000, 0xe3a00001, 0xe1a01009, 0xe12fff34, 0xea000003, 0xe59d2004, 0xe28d0008, 0xe3a01000, 0xe12fff32, 0xe1a00008, 0xe3a01001, + 0xe3a02000, 0xe12fff35, 0xe3500001, 0xbaffffe4, 0xe59d1000, 0xe3a00000, 0xe12fff31, 0xe24bd01c, 0xe8bd8ff0, 0x00000198, 0x00000190, 0x00000181, + 0x00000172, 0x00000163, 0x00000159, 0x0000014a, 0x00000138, 0x7ffffe4c, 0x00000001, 0x6362696c, 0x006f732e, 0x6e65706f, 0x69786500, 0x6f700074, + 0x6e006c6c, 0x736f6e61, 0x7065656c, 0x61657200, 0x72770064, 0x00657469, 0x7379732f, 0x72656b2f, 0x2f6c656e, 0x75626564, 0x72742f67, 0x6e696361, + 0x72742f67, 0x5f656361, 0x65706970, 0x00000000, 0x00000003, 0x000014b0, 0x00000002, 0x00000010, 0x00000017, 0x000001b4, 0x00000014, 0x00000011, + 
0x00000015, 0x00000000, 0x00000006, 0x00000128, 0x0000000b, 0x00000010, 0x00000005, 0x00000158, 0x0000000a, 0x0000001c, 0x6ffffef5, 0x00000174, + 0x00000001, 0x0000000d, 0x0000001e, 0x00000008, 0x6ffffffb, 0x00000001, 0x6ffffff0, 0x0000018c, 0x6ffffffe, 0x00000194, 0x6fffffff, 0x00000001, +}; + +static const unsigned int tracy_systrace_aarch64_size = 1650; +static const unsigned int tracy_systrace_aarch64_data[1652/4] = +{ + 0x464c457f, 0x00010102, 0x00000000, 0x00000000, 0x00b70003, 0x00000001, 0x000002e0, 0x00000000, 0x00000040, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00380040, 0x00400006, 0x00000000, 0x00000006, 0x00000005, 0x00000040, 0x00000000, 0x00000040, 0x00000000, 0x00000040, 0x00000000, + 0x00000150, 0x00000000, 0x00000150, 0x00000000, 0x00000008, 0x00000000, 0x00000003, 0x00000004, 0x00000190, 0x00000000, 0x00000190, 0x00000000, + 0x00000190, 0x00000000, 0x00000015, 0x00000000, 0x00000015, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000005, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x000004e1, 0x00000000, 0x000004e1, 0x00000000, 0x00001000, 0x00000000, 0x00000001, 0x00000006, + 0x000004e8, 0x00000000, 0x000014e8, 0x00000000, 0x000014e8, 0x00000000, 0x0000018a, 0x00000000, 0x00000190, 0x00000000, 0x00001000, 0x00000000, + 0x00000002, 0x00000006, 0x000004e8, 0x00000000, 0x000014e8, 0x00000000, 0x000014e8, 0x00000000, 0x00000160, 0x00000000, 0x00000160, 0x00000000, + 0x00000008, 0x00000000, 0x6474e551, 0x00000006, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000008, 0x00000000, 0x7379732f, 0x2f6d6574, 0x2f6e6962, 0x6b6e696c, 0x34367265, 0x00000000, 0x00000001, 0x00000001, + 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00090003, 0x000002e0, 0x00000000, 0x00000000, 0x00000000, 0x00000010, 0x00000012, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x0000000a, 0x00000012, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x62696c00, 0x732e6c64, 0x6c64006f, 0x006d7973, 0x706f6c64, 0x4c006e65, + 0x00434249, 0x00000000, 0x00000000, 0x00000000, 0x00010001, 0x00000001, 0x00000010, 0x00000000, 0x00050d63, 0x00020000, 0x00000017, 0x00000000, + 0x00001668, 0x00000000, 0x00000402, 0x00000002, 0x00000000, 0x00000000, 0x00001670, 0x00000000, 0x00000402, 0x00000003, 0x00000000, 0x00000000, + 0xa9bf7bf0, 0xb0000010, 0xf9433211, 0x91198210, 0xd61f0220, 0xd503201f, 0xd503201f, 0xd503201f, 0xb0000010, 0xf9433611, 0x9119a210, 0xd61f0220, + 0xb0000010, 0xf9433a11, 0x9119c210, 0xd61f0220, 0xa9bb67fc, 0xa9015ff8, 0xa90257f6, 0xa9034ff4, 0xa9047bfd, 0x910103fd, 0xd14043ff, 0xd10083ff, + 0x90000000, 0x91124000, 0x52800021, 0x52800039, 0x97ffffec, 0x90000001, 0x91126021, 0xaa0003f7, 0x97ffffec, 0x90000001, 0xaa0003f8, 0x91127421, + 0xaa1703e0, 0x97ffffe7, 0x90000001, 0xaa0003f3, 0x91128821, 0xaa1703e0, 0x97ffffe2, 0x90000001, 0xaa0003f4, 0x91129c21, 0xaa1703e0, 0x97ffffdd, + 0x90000001, 0xaa0003f5, 0x9112c421, 0xaa1703e0, 0x97ffffd8, 0x90000001, 0xaa0003f6, 0x9112d821, 0xaa1703e0, 0x97ffffd3, 0xaa0003f7, 0x90000000, + 0x9112f000, 0x2a1f03e1, 0xd63f0300, 0x2a0003f8, 0x36f80060, 0x2a1f03e0, 0xd63f0260, 0x90000009, 0x3dc12120, 0x52800128, 0x79003be8, 0x52800108, + 0x910043e0, 0x52800021, 0x2a1f03e2, 0xb9001bf8, 0xb90013f9, 0x79002be8, 0x3d8003e0, 0xd63f0280, 0x7100001f, 0x5400036c, 0x910063e0, 0x52800021, + 0x2a1f03e2, 0xd63f0280, 0x7100001f, 0x5400018d, 0x910083e1, 
0x52a00022, 0x2a1803e0, 0xd63f02c0, 0xf100041f, 0x540001eb, 0xaa0003e2, 0x910083e1, + 0x52800020, 0xd63f02e0, 0x14000004, 0x910003e0, 0xaa1f03e1, 0xd63f02a0, 0x910043e0, 0x52800021, 0x2a1f03e2, 0xd63f0280, 0x7100041f, 0x54fffceb, + 0x2a1f03e0, 0xd63f0260, 0x914043ff, 0x910083ff, 0xa9447bfd, 0xa9434ff4, 0xa94257f6, 0xa9415ff8, 0xa8c567fc, 0xd65f03c0, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00989680, 0x00000000, 0x6362696c, 0x006f732e, 0x6e65706f, 0x69786500, 0x6f700074, 0x6e006c6c, 0x736f6e61, 0x7065656c, + 0x61657200, 0x72770064, 0x00657469, 0x7379732f, 0x72656b2f, 0x2f6c656e, 0x75626564, 0x72742f67, 0x6e696361, 0x72742f67, 0x5f656361, 0x65706970, + 0x00000000, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x6ffffef5, 0x00000000, 0x000001a8, 0x00000000, 0x00000005, 0x00000000, + 0x00000228, 0x00000000, 0x00000006, 0x00000000, 0x000001c8, 0x00000000, 0x0000000a, 0x00000000, 0x0000001c, 0x00000000, 0x0000000b, 0x00000000, + 0x00000018, 0x00000000, 0x00000015, 0x00000000, 0x00000000, 0x00000000, 0x00000003, 0x00000000, 0x00001650, 0x00000000, 0x00000002, 0x00000000, + 0x00000030, 0x00000000, 0x00000014, 0x00000000, 0x00000007, 0x00000000, 0x00000017, 0x00000000, 0x00000270, 0x00000000, 0x0000001e, 0x00000000, + 0x00000008, 0x00000000, 0x6ffffffb, 0x00000000, 0x00000001, 0x00000000, 0x6ffffffe, 0x00000000, 0x00000250, 0x00000000, 0x6fffffff, 0x00000000, + 0x00000001, 0x00000000, 0x6ffffff0, 0x00000000, 0x00000244, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x000002a0, 0x00000000, 0x000002a0, +}; + +} diff --git a/Source/ThirdParty/tracy/client/TracyThread.hpp b/Source/ThirdParty/tracy/client/TracyThread.hpp new file mode 100644 index 000000000..edd255e87 --- /dev/null +++ b/Source/ThirdParty/tracy/client/TracyThread.hpp @@ -0,0 +1,85 @@ +#ifndef __TRACYTHREAD_HPP__ +#define __TRACYTHREAD_HPP__ + +#if defined _WIN32 || defined __CYGWIN__ +# include +#else +# include +#endif + +#ifdef TRACY_MANUAL_LIFETIME +# include "tracy_rpmalloc.hpp" +#endif + +namespace tracy +{ + +class ThreadExitHandler +{ +public: + ~ThreadExitHandler() + { +#ifdef TRACY_MANUAL_LIFETIME + rpmalloc_thread_finalize(); +#endif + } +}; + +#if defined _WIN32 || defined __CYGWIN__ + +class Thread +{ +public: + Thread( void(*func)( void* ptr ), void* ptr ) + : m_func( func ) + , m_ptr( ptr ) + , m_hnd( CreateThread( nullptr, 0, Launch, this, 0, nullptr ) ) + {} + + ~Thread() + { + WaitForSingleObject( m_hnd, INFINITE ); + CloseHandle( m_hnd ); + } + + HANDLE Handle() const { return m_hnd; } + +private: + static DWORD WINAPI Launch( void* ptr ) { ((Thread*)ptr)->m_func( ((Thread*)ptr)->m_ptr ); return 0; } + + void(*m_func)( void* ptr ); + void* m_ptr; + HANDLE m_hnd; +}; + +#else + +class Thread +{ +public: + Thread( void(*func)( void* ptr ), void* ptr ) + : m_func( func ) + , m_ptr( ptr ) + { + pthread_create( &m_thread, nullptr, Launch, this ); + } + + ~Thread() + { + pthread_join( m_thread, nullptr ); + } + + pthread_t Handle() const { return m_thread; } + +private: + static void* Launch( void* ptr ) { ((Thread*)ptr)->m_func( ((Thread*)ptr)->m_ptr ); return nullptr; } + void(*m_func)( void* ptr ); + void* m_ptr; + pthread_t m_thread; +}; + +#endif 
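Both Thread variants above join in the destructor (WaitForSingleObject / pthread_join), so letting a Thread object go out of scope blocks until its worker function returns. A minimal usage sketch under that assumption; the Worker function and workerActive flag are hypothetical, mirroring how SysTraceWorker polls the traceActive flag:

#include "TracyThread.hpp"
#include <atomic>

static std::atomic<bool> workerActive { true };

static void Worker( void* ptr )
{
    while( workerActive.load( std::memory_order_relaxed ) )
    {
        // ...poll for data, emit events...
    }
}

int main()
{
    {
        tracy::Thread worker( Worker, nullptr );
        // ...main work...
        workerActive.store( false, std::memory_order_relaxed );
    } // ~Thread joins here, so the worker has fully exited past this brace
    return 0;
}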
+ +} + +#endif diff --git a/Source/ThirdParty/tracy/client/tracy_concurrentqueue.h b/Source/ThirdParty/tracy/client/tracy_concurrentqueue.h new file mode 100644 index 000000000..bf095bc36 --- /dev/null +++ b/Source/ThirdParty/tracy/client/tracy_concurrentqueue.h @@ -0,0 +1,1445 @@ +// Provides a C++11 implementation of a multi-producer, multi-consumer lock-free queue. +// An overview, including benchmark results, is provided here: +// http://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++ +// The full design is also described in excruciating detail at: +// http://moodycamel.com/blog/2014/detailed-design-of-a-lock-free-queue + +// Simplified BSD license: +// Copyright (c) 2013-2016, Cameron Desrochers. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// - Redistributions of source code must retain the above copyright notice, this list of +// conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, this list of +// conditions and the following disclaimer in the documentation and/or other materials +// provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL +// THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR +// TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +#pragma once + +#include "../common/TracyAlloc.hpp" +#include "../common/TracySystem.hpp" + +#if defined(__GNUC__) +// Disable -Wconversion warnings (spuriously triggered when Traits::size_t and +// Traits::index_t are set to < 32 bits, causing integer promotion, causing warnings +// upon assigning any computed values) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + +#if defined(__APPLE__) +#include "TargetConditionals.h" +#endif + +#include <atomic> // Requires C++11. Sorry VS2010.
+#include <cassert> +#include <cstddef> // for max_align_t +#include <cstdint> +#include <cstdlib> +#include <type_traits> +#include <algorithm> +#include <utility> +#include <limits> +#include <climits> // for CHAR_BIT +#include <array> +#include <thread> // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading + +namespace tracy +{ + +// Compiler-specific likely/unlikely hints +namespace moodycamel { namespace details { +#if defined(__GNUC__) + inline bool cqLikely(bool x) { return __builtin_expect((x), true); } + inline bool cqUnlikely(bool x) { return __builtin_expect((x), false); } +#else + inline bool cqLikely(bool x) { return x; } + inline bool cqUnlikely(bool x) { return x; } +#endif +} } + +namespace +{ + // to avoid MSVC warning 4127: conditional expression is constant + template <bool condition> + struct compile_time_condition + { + static const bool value = false; + }; + template <> + struct compile_time_condition<true> + { + static const bool value = true; + }; +} + +namespace moodycamel { +namespace details { + template<typename T> + struct const_numeric_max { + static_assert(std::is_integral<T>::value, "const_numeric_max can only be used with integers"); + static const T value = std::numeric_limits<T>::is_signed + ? (static_cast<T>(1) << (sizeof(T) * CHAR_BIT - 1)) - static_cast<T>(1) + : static_cast<T>(-1); + }; + +#if defined(__GLIBCXX__) + typedef ::max_align_t std_max_align_t; // libstdc++ forgot to add it to std:: for a while +#else + typedef std::max_align_t std_max_align_t; // Others (e.g. MSVC) insist it can *only* be accessed via std:: +#endif + + // Some platforms have incorrectly set max_align_t to a type with <8 bytes alignment even while supporting + // 8-byte aligned scalar values (*cough* 32-bit iOS). Work around this with our own union. See issue #64. + typedef union { + std_max_align_t x; + long long y; + void* z; + } max_align_t; +} + +// Default traits for the ConcurrentQueue. To change some of the +// traits without re-implementing all of them, inherit from this +// struct and shadow the declarations you wish to be different; +// since the traits are used as a template type parameter, the +// shadowed declarations will be used where defined, and the defaults +// otherwise. +struct ConcurrentQueueDefaultTraits +{ + // General-purpose size type. std::size_t is strongly recommended. + typedef std::size_t size_t; + + // The type used for the enqueue and dequeue indices. Must be at least as + // large as size_t. Should be significantly larger than the number of elements + // you expect to hold at once, especially if you have a high turnover rate; + // for example, on 32-bit x86, if you expect to have over a hundred million + // elements or pump several million elements through your queue in a very + // short space of time, using a 32-bit type *may* trigger a race condition. + // A 64-bit int type is recommended in that case, and in practice will + // prevent a race condition no matter the usage of the queue. Note that + // whether the queue is lock-free with a 64-bit int type depends on whether + // std::atomic<std::uint64_t> is lock-free, which is platform-specific. + typedef std::size_t index_t; + + // Internally, all elements are enqueued and dequeued from multi-element + // blocks; this is the smallest controllable unit. If you expect few elements + // but many producers, a smaller block size should be favoured. For few producers + // and/or many elements, a larger block size is preferred. A sane default + // is provided. Must be a power of 2. + static const size_t BLOCK_SIZE = 64*1024; +
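As the traits comment above says, customization works by inheriting and shadowing individual declarations. A hypothetical override (not used by this port, which instead bakes its choices, such as the 64KB BLOCK_SIZE, directly into the defaults) might look like:

struct SmallBlockTraits : tracy::moodycamel::ConcurrentQueueDefaultTraits
{
    // Many producers with few elements each: favour small blocks.
    static const size_t BLOCK_SIZE = 256; // must remain a power of 2

    // 64-bit indices sidestep the 32-bit wrap-around race described above.
    typedef std::uint64_t index_t;
};

// The traits then travel as the second template argument:
// tracy::moodycamel::ConcurrentQueue<Event, SmallBlockTraits> queue;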
+ // For explicit producers (i.e. when using a producer token), the block is + // checked for being empty by iterating through a list of flags, one per element. + // For large block sizes, this is too inefficient, and switching to an atomic + // counter-based approach is faster. The switch is made for block sizes strictly + // larger than this threshold. + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32; + + // How many full blocks can be expected for a single explicit producer? This should + // reflect that number's maximum for optimal performance. Must be a power of 2. + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32; + + // Controls the number of items that an explicit consumer (i.e. one with a token) + // must consume before it causes all consumers to rotate and move on to the next + // internal queue. + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256; + + // The maximum number of elements (inclusive) that can be enqueued to a sub-queue. + // Enqueue operations that would cause this limit to be surpassed will fail. Note + // that this limit is enforced at the block level (for performance reasons), i.e. + // it's rounded up to the nearest block size. + static const size_t MAX_SUBQUEUE_SIZE = details::const_numeric_max<size_t>::value; + + + // Memory allocation can be customized if needed. + // malloc should return nullptr on failure, and handle alignment like std::malloc. +#if defined(malloc) || defined(free) + // Gah, this is 2015, stop defining macros that break standard code already! + // Work around malloc/free being special macros: + static inline void* WORKAROUND_malloc(size_t size) { return malloc(size); } + static inline void WORKAROUND_free(void* ptr) { return free(ptr); } + static inline void* (malloc)(size_t size) { return WORKAROUND_malloc(size); } + static inline void (free)(void* ptr) { return WORKAROUND_free(ptr); } +#else + static inline void* malloc(size_t size) { return tracy::tracy_malloc(size); } + static inline void free(void* ptr) { return tracy::tracy_free(ptr); } +#endif +}; + + +// When producing or consuming many elements, the most efficient way is to: +// 1) Use one of the bulk-operation methods of the queue with a token +// 2) Failing that, use the bulk-operation methods without a token +// 3) Failing that, create a token and use that with the single-item methods +// 4) Failing that, use the single-parameter methods of the queue +// Having said that, don't create tokens willy-nilly -- ideally there should be +// a maximum of one token per thread (of each kind).
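In the upstream moodycamel queue, that checklist translates into code like the sketch below. This is upstream API only: the trimmed fork in this file exposes just enqueue_begin and try_dequeue_bulk_single, which Tracy drives through its TracyLfqPrepare/TracyLfqCommit macros seen earlier in this diff.

#include "concurrentqueue.h" // upstream moodycamel header, not this fork
#include <cstddef>

int main()
{
    moodycamel::ConcurrentQueue<int> q;

    moodycamel::ProducerToken ptok( q ); // at most one per producing thread
    for( int i = 0; i < 1000; i++ ) q.enqueue( ptok, i ); // token + single-item (option 3)

    moodycamel::ConsumerToken ctok( q ); // at most one per consuming thread
    int items[64];
    std::size_t n;
    while( ( n = q.try_dequeue_bulk( ctok, items, 64 ) ) != 0 ) // token + bulk (option 1)
    {
        // ...process n items...
    }
    return 0;
}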
+struct ProducerToken; +struct ConsumerToken; + +template class ConcurrentQueue; + + +namespace details +{ + struct ConcurrentQueueProducerTypelessBase + { + ConcurrentQueueProducerTypelessBase* next; + std::atomic inactive; + ProducerToken* token; + uint64_t threadId; + + ConcurrentQueueProducerTypelessBase() + : next(nullptr), inactive(false), token(nullptr), threadId(0) + { + } + }; + + template + static inline bool circular_less_than(T a, T b) + { +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4554) +#endif + static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "circular_less_than is intended to be used only with unsigned integer types"); + return static_cast(a - b) > static_cast(static_cast(1) << static_cast(sizeof(T) * CHAR_BIT - 1)); +#ifdef _MSC_VER +#pragma warning(pop) +#endif + } + + template + static inline char* align_for(char* ptr) + { + const std::size_t alignment = std::alignment_of::value; + return ptr + (alignment - (reinterpret_cast(ptr) % alignment)) % alignment; + } + + template + static inline T ceil_to_pow_2(T x) + { + static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "ceil_to_pow_2 is intended to be used only with unsigned integer types"); + + // Adapted from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + for (std::size_t i = 1; i < sizeof(T); i <<= 1) { + x |= x >> (i << 3); + } + ++x; + return x; + } + + template + static inline void swap_relaxed(std::atomic& left, std::atomic& right) + { + T temp = std::move(left.load(std::memory_order_relaxed)); + left.store(std::move(right.load(std::memory_order_relaxed)), std::memory_order_relaxed); + right.store(std::move(temp), std::memory_order_relaxed); + } + + template + static inline T const& nomove(T const& x) + { + return x; + } + + template + struct nomove_if + { + template + static inline T const& eval(T const& x) + { + return x; + } + }; + + template<> + struct nomove_if + { + template + static inline auto eval(U&& x) + -> decltype(std::forward(x)) + { + return std::forward(x); + } + }; + + template + static inline auto deref_noexcept(It& it) noexcept -> decltype(*it) + { + return *it; + } + +#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) + template struct is_trivially_destructible : std::is_trivially_destructible { }; +#else + template struct is_trivially_destructible : std::has_trivial_destructor { }; +#endif + + template struct static_is_lock_free_num { enum { value = 0 }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_CHAR_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_SHORT_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_INT_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_LONG_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_LLONG_LOCK_FREE }; }; + template struct static_is_lock_free : static_is_lock_free_num::type> { }; + template<> struct static_is_lock_free { enum { value = ATOMIC_BOOL_LOCK_FREE }; }; + template struct static_is_lock_free { enum { value = ATOMIC_POINTER_LOCK_FREE }; }; +} + + +struct ProducerToken +{ + template + explicit ProducerToken(ConcurrentQueue& queue); + + ProducerToken(ProducerToken&& other) noexcept + : producer(other.producer) + { + other.producer = nullptr; + if (producer != nullptr) { + producer->token = this; + } + } + + 
inline ProducerToken& operator=(ProducerToken&& other) noexcept + { + swap(other); + return *this; + } + + void swap(ProducerToken& other) noexcept + { + std::swap(producer, other.producer); + if (producer != nullptr) { + producer->token = this; + } + if (other.producer != nullptr) { + other.producer->token = &other; + } + } + + // A token is always valid unless: + // 1) Memory allocation failed during construction + // 2) It was moved via the move constructor + // (Note: assignment does a swap, leaving both potentially valid) + // 3) The associated queue was destroyed + // Note that if valid() returns true, that only indicates + // that the token is valid for use with a specific queue, + // but not which one; that's up to the user to track. + inline bool valid() const { return producer != nullptr; } + + ~ProducerToken() + { + if (producer != nullptr) { + producer->token = nullptr; + producer->inactive.store(true, std::memory_order_release); + } + } + + // Disable copying and assignment + ProducerToken(ProducerToken const&) = delete; + ProducerToken& operator=(ProducerToken const&) = delete; + +private: + template friend class ConcurrentQueue; + +protected: + details::ConcurrentQueueProducerTypelessBase* producer; +}; + + +struct ConsumerToken +{ + template + explicit ConsumerToken(ConcurrentQueue& q); + + ConsumerToken(ConsumerToken&& other) noexcept + : initialOffset(other.initialOffset), lastKnownGlobalOffset(other.lastKnownGlobalOffset), itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), currentProducer(other.currentProducer), desiredProducer(other.desiredProducer) + { + } + + inline ConsumerToken& operator=(ConsumerToken&& other) noexcept + { + swap(other); + return *this; + } + + void swap(ConsumerToken& other) noexcept + { + std::swap(initialOffset, other.initialOffset); + std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset); + std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent); + std::swap(currentProducer, other.currentProducer); + std::swap(desiredProducer, other.desiredProducer); + } + + // Disable copying and assignment + ConsumerToken(ConsumerToken const&) = delete; + ConsumerToken& operator=(ConsumerToken const&) = delete; + +private: + template friend class ConcurrentQueue; + +private: // but shared with ConcurrentQueue + std::uint32_t initialOffset; + std::uint32_t lastKnownGlobalOffset; + std::uint32_t itemsConsumedFromCurrent; + details::ConcurrentQueueProducerTypelessBase* currentProducer; + details::ConcurrentQueueProducerTypelessBase* desiredProducer; +}; + + +template +class ConcurrentQueue +{ +public: + struct ExplicitProducer; + + typedef moodycamel::ProducerToken producer_token_t; + typedef moodycamel::ConsumerToken consumer_token_t; + + typedef typename Traits::index_t index_t; + typedef typename Traits::size_t size_t; + + static const size_t BLOCK_SIZE = static_cast(Traits::BLOCK_SIZE); + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = static_cast(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD); + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = static_cast(Traits::EXPLICIT_INITIAL_INDEX_SIZE); + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = static_cast(Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE); +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4307) // + integral constant overflow (that's what the ternary expression is for!) 
+#pragma warning(disable: 4309) // static_cast: Truncation of constant value +#endif + static const size_t MAX_SUBQUEUE_SIZE = (details::const_numeric_max::value - static_cast(Traits::MAX_SUBQUEUE_SIZE) < BLOCK_SIZE) ? details::const_numeric_max::value : ((static_cast(Traits::MAX_SUBQUEUE_SIZE) + (BLOCK_SIZE - 1)) / BLOCK_SIZE * BLOCK_SIZE); +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + static_assert(!std::numeric_limits::is_signed && std::is_integral::value, "Traits::size_t must be an unsigned integral type"); + static_assert(!std::numeric_limits::is_signed && std::is_integral::value, "Traits::index_t must be an unsigned integral type"); + static_assert(sizeof(index_t) >= sizeof(size_t), "Traits::index_t must be at least as wide as Traits::size_t"); + static_assert((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)"); + static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) && !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a power of 2 (and greater than 1)"); + static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) && !(EXPLICIT_INITIAL_INDEX_SIZE & (EXPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)"); + +public: + // Creates a queue with at least `capacity` element slots; note that the + // actual number of elements that can be inserted without additional memory + // allocation depends on the number of producers and the block size (e.g. if + // the block size is equal to `capacity`, only a single block will be allocated + // up-front, which means only a single producer will be able to enqueue elements + // without an extra allocation -- blocks aren't shared between producers). + // This method is not thread safe -- it is up to the user to ensure that the + // queue is fully constructed before it starts being used by other threads (this + // includes making the memory effects of construction visible, possibly with a + // memory barrier). + explicit ConcurrentQueue(size_t capacity = 6 * BLOCK_SIZE) + : producerListTail(nullptr), + producerCount(0), + initialBlockPoolIndex(0), + nextExplicitConsumerId(0), + globalExplicitConsumerOffset(0) + { + populate_initial_block_list(capacity / BLOCK_SIZE + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1)); + } + + // Computes the correct amount of pre-allocated blocks for you based + // on the minimum number of elements you want available at any given + // time, and the maximum concurrent number of each type of producer. + ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers) + : producerListTail(nullptr), + producerCount(0), + initialBlockPoolIndex(0), + nextExplicitConsumerId(0), + globalExplicitConsumerOffset(0) + { + size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * (maxExplicitProducers + 1) + 2 * (maxExplicitProducers); + populate_initial_block_list(blocks); + } + + // Note: The queue should not be accessed concurrently while it's + // being deleted. It's up to the user to synchronize this. + // This method is not thread safe. 
+ ~ConcurrentQueue() + { + // Destroy producers + auto ptr = producerListTail.load(std::memory_order_relaxed); + while (ptr != nullptr) { + auto next = ptr->next_prod(); + if (ptr->token != nullptr) { + ptr->token->producer = nullptr; + } + destroy(ptr); + ptr = next; + } + + // Destroy global free list + auto block = freeList.head_unsafe(); + while (block != nullptr) { + auto next = block->freeListNext.load(std::memory_order_relaxed); + if (block->dynamicallyAllocated) { + destroy(block); + } + block = next; + } + + // Destroy initial free list + destroy_array(initialBlockPool, initialBlockPoolSize); + } + + // Disable copying and copy assignment + ConcurrentQueue(ConcurrentQueue const&) = delete; + ConcurrentQueue(ConcurrentQueue&& other) = delete; + ConcurrentQueue& operator=(ConcurrentQueue const&) = delete; + ConcurrentQueue& operator=(ConcurrentQueue&& other) = delete; + +public: + tracy_force_inline T* enqueue_begin(producer_token_t const& token, index_t& currentTailIndex) + { + return static_cast(token.producer)->ConcurrentQueue::ExplicitProducer::enqueue_begin(currentTailIndex); + } + + template + size_t try_dequeue_bulk_single(consumer_token_t& token, NotifyThread notifyThread, ProcessData processData ) + { + if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { + if (!update_current_producer_after_rotation(token)) { + return 0; + } + } + + size_t count = static_cast(token.currentProducer)->dequeue_bulk(notifyThread, processData); + token.itemsConsumedFromCurrent += static_cast(count); + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = static_cast(token.currentProducer)->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + if( count == 0 ) + { + while (ptr != static_cast(token.currentProducer)) { + auto dequeued = ptr->dequeue_bulk(notifyThread, processData); + if (dequeued != 0) { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = static_cast(dequeued); + return dequeued; + } + ptr = ptr->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + } + return 0; + } + else + { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = 0; + return count; + } + } + + + // Returns an estimate of the total number of elements currently in the queue. This + // estimate is only accurate if the queue has completely stabilized before it is called + // (i.e. all enqueue and dequeue operations have completed and their memory effects are + // visible on the calling thread, and no further operations start while this method is + // being called). + // Thread-safe. + size_t size_approx() const + { + size_t size = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + size += ptr->size_approx(); + } + return size; + } + + + // Returns true if the underlying atomic variables used by + // the queue are lock-free (they should be on most platforms). + // Thread-safe. 
+ static bool is_lock_free() + { + return + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2; + } + + +private: + friend struct ProducerToken; + friend struct ConsumerToken; + friend struct ExplicitProducer; + + + /////////////////////////////// + // Queue methods + /////////////////////////////// + + inline bool update_current_producer_after_rotation(consumer_token_t& token) + { + // Ah, there's been a rotation, figure out where we should be! + auto tail = producerListTail.load(std::memory_order_acquire); + if (token.desiredProducer == nullptr && tail == nullptr) { + return false; + } + auto prodCount = producerCount.load(std::memory_order_relaxed); + auto globalOffset = globalExplicitConsumerOffset.load(std::memory_order_relaxed); + if (details::cqUnlikely(token.desiredProducer == nullptr)) { + // Aha, first time we're dequeueing anything. + // Figure out our local position + // Note: offset is from start, not end, but we're traversing from end -- subtract from count first + std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount); + token.desiredProducer = tail; + for (std::uint32_t i = 0; i != offset; ++i) { + token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + } + + std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset; + if (delta >= prodCount) { + delta = delta % prodCount; + } + for (std::uint32_t i = 0; i != delta; ++i) { + token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + + token.lastKnownGlobalOffset = globalOffset; + token.currentProducer = token.desiredProducer; + token.itemsConsumedFromCurrent = 0; + return true; + } + + + /////////////////////////// + // Free list + /////////////////////////// + + template + struct FreeListNode + { + FreeListNode() : freeListRefs(0), freeListNext(nullptr) { } + + std::atomic freeListRefs; + std::atomic freeListNext; + }; + + // A simple CAS-based lock-free free list. Not the fastest thing in the world under heavy contention, but + // simple and correct (assuming nodes are never freed until after the free list is destroyed), and fairly + // speedy under low contention. + template // N must inherit FreeListNode or have the same fields (and initialization of them) + struct FreeList + { + FreeList() : freeListHead(nullptr) { } + FreeList(FreeList&& other) : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) { other.freeListHead.store(nullptr, std::memory_order_relaxed); } + void swap(FreeList& other) { details::swap_relaxed(freeListHead, other.freeListHead); } + + FreeList(FreeList const&) = delete; + FreeList& operator=(FreeList const&) = delete; + + inline void add(N* node) + { + // We know that the should-be-on-freelist bit is 0 at this point, so it's safe to + // set it using a fetch_add + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, std::memory_order_acq_rel) == 0) { + // Oh look! We were the last ones referencing this node, and we know + // we want to add it to the free list, so let's do it! 
+ add_knowing_refcount_is_zero(node); + } + } + + inline N* try_get() + { + auto head = freeListHead.load(std::memory_order_acquire); + while (head != nullptr) { + auto prevHead = head; + auto refs = head->freeListRefs.load(std::memory_order_relaxed); + if ((refs & REFS_MASK) == 0 || !head->freeListRefs.compare_exchange_strong(refs, refs + 1, std::memory_order_acquire, std::memory_order_relaxed)) { + head = freeListHead.load(std::memory_order_acquire); + continue; + } + + // Good, reference count has been incremented (it wasn't at zero), which means we can read the + // next and not worry about it changing between now and the time we do the CAS + auto next = head->freeListNext.load(std::memory_order_relaxed); + if (freeListHead.compare_exchange_strong(head, next, std::memory_order_acquire, std::memory_order_relaxed)) { + // Yay, got the node. This means it was on the list, which means shouldBeOnFreeList must be false no + // matter the refcount (because nobody else knows it's been taken off yet, it can't have been put back on). + assert((head->freeListRefs.load(std::memory_order_relaxed) & SHOULD_BE_ON_FREELIST) == 0); + + // Decrease refcount twice, once for our ref, and once for the list's ref + head->freeListRefs.fetch_sub(2, std::memory_order_release); + return head; + } + + // OK, the head must have changed on us, but we still need to decrease the refcount we increased. + // Note that we don't need to release any memory effects, but we do need to ensure that the reference + // count decrement happens-after the CAS on the head. + refs = prevHead->freeListRefs.fetch_sub(1, std::memory_order_acq_rel); + if (refs == SHOULD_BE_ON_FREELIST + 1) { + add_knowing_refcount_is_zero(prevHead); + } + } + + return nullptr; + } + + // Useful for traversing the list when there's no contention (e.g. to destroy remaining nodes) + N* head_unsafe() const { return freeListHead.load(std::memory_order_relaxed); } + + private: + inline void add_knowing_refcount_is_zero(N* node) + { + // Since the refcount is zero, and nobody can increase it once it's zero (except us, and we run + // only one copy of this method per node at a time, i.e. the single thread case), then we know + // we can safely change the next pointer of the node; however, once the refcount is back above + // zero, then other threads could increase it (happens under heavy contention, when the refcount + // goes to zero in between a load and a refcount increment of a node in try_get, then back up to + // something non-zero, then the refcount increment is done by the other thread) -- so, if the CAS + // to add the node to the actual list fails, decrease the refcount and leave the add operation to + // the next thread who puts the refcount back at zero (which could be us, hence the loop). 
+ auto head = freeListHead.load(std::memory_order_relaxed); + while (true) { + node->freeListNext.store(head, std::memory_order_relaxed); + node->freeListRefs.store(1, std::memory_order_release); + if (!freeListHead.compare_exchange_strong(head, node, std::memory_order_release, std::memory_order_relaxed)) { + // Hmm, the add failed, but we can only try again when the refcount goes back to zero + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1, std::memory_order_release) == 1) { + continue; + } + } + return; + } + } + + private: + // Implemented like a stack, but where node order doesn't matter (nodes are inserted out of order under contention) + std::atomic freeListHead; + + static const std::uint32_t REFS_MASK = 0x7FFFFFFF; + static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000; + }; + + + /////////////////////////// + // Block + /////////////////////////// + + struct Block + { + Block() + : next(nullptr), elementsCompletelyDequeued(0), freeListRefs(0), freeListNext(nullptr), shouldBeOnFreeList(false), dynamicallyAllocated(true) + { + } + + inline bool is_empty() const + { + if (compile_time_condition::value) { + // Check flags + for (size_t i = 0; i < BLOCK_SIZE; ++i) { + if (!emptyFlags[i].load(std::memory_order_relaxed)) { + return false; + } + } + + // Aha, empty; make sure we have all other memory effects that happened before the empty flags were set + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + else { + // Check counter + if (elementsCompletelyDequeued.load(std::memory_order_relaxed) == BLOCK_SIZE) { + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <= BLOCK_SIZE); + return false; + } + } + + // Returns true if the block is now empty (does not apply in explicit context) + inline bool set_empty(index_t i) + { + if (BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set flag + assert(!emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].load(std::memory_order_relaxed)); + emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].store(true, std::memory_order_release); + return false; + } + else { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add(1, std::memory_order_release); + assert(prevVal < BLOCK_SIZE); + return prevVal == BLOCK_SIZE - 1; + } + } + + // Sets multiple contiguous item statuses to 'empty' (assumes no wrapping and count > 0). + // Returns true if the block is now empty (does not apply in explicit context). 
+ inline bool set_many_empty(index_t i, size_t count) + { + if (compile_time_condition::value) { + // Set flags + std::atomic_thread_fence(std::memory_order_release); + i = BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1)) - count + 1; + for (size_t j = 0; j != count; ++j) { + assert(!emptyFlags[i + j].load(std::memory_order_relaxed)); + emptyFlags[i + j].store(true, std::memory_order_relaxed); + } + return false; + } + else { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add(count, std::memory_order_release); + assert(prevVal + count <= BLOCK_SIZE); + return prevVal + count == BLOCK_SIZE; + } + } + + inline void set_all_empty() + { + if (BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set all flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(true, std::memory_order_relaxed); + } + } + else { + // Reset counter + elementsCompletelyDequeued.store(BLOCK_SIZE, std::memory_order_relaxed); + } + } + + inline void reset_empty() + { + if (compile_time_condition::value) { + // Reset flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(false, std::memory_order_relaxed); + } + } + else { + // Reset counter + elementsCompletelyDequeued.store(0, std::memory_order_relaxed); + } + } + + inline T* operator[](index_t idx) noexcept { return static_cast(static_cast(elements)) + static_cast(idx & static_cast(BLOCK_SIZE - 1)); } + inline T const* operator[](index_t idx) const noexcept { return static_cast(static_cast(elements)) + static_cast(idx & static_cast(BLOCK_SIZE - 1)); } + + private: + // IMPORTANT: This must be the first member in Block, so that if T depends on the alignment of + // addresses returned by malloc, that alignment will be preserved. Apparently clang actually + // generates code that uses this assumption for AVX instructions in some cases. Ideally, we + // should also align Block to the alignment of T in case it's higher than malloc's 16-byte + // alignment, but this is hard to do in a cross-platform way. Assert for this case: + static_assert(std::alignment_of::value <= std::alignment_of::value, "The queue does not support super-aligned types at this time"); + // Additionally, we need the alignment of Block itself to be a multiple of max_align_t since + // otherwise the appropriate padding will not be added at the end of Block in order to make + // arrays of Blocks all be properly aligned (not just the first one). We use a union to force + // this. + union { + char elements[sizeof(T) * BLOCK_SIZE]; + details::max_align_t dummy; + }; + public: + Block* next; + std::atomic elementsCompletelyDequeued; + std::atomic emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD ? 
BLOCK_SIZE : 1]; + public: + std::atomic freeListRefs; + std::atomic freeListNext; + std::atomic shouldBeOnFreeList; + bool dynamicallyAllocated; // Perhaps a better name for this would be 'isNotPartOfInitialBlockPool' + }; + static_assert(std::alignment_of::value >= std::alignment_of::value, "Internal error: Blocks must be at least as aligned as the type they are wrapping"); + + + /////////////////////////// + // Producer base + /////////////////////////// + + struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase + { + ProducerBase(ConcurrentQueue* parent_) : + tailIndex(0), + headIndex(0), + dequeueOptimisticCount(0), + dequeueOvercommit(0), + tailBlock(nullptr), + parent(parent_) + { + } + + virtual ~ProducerBase() { }; + + template + inline size_t dequeue_bulk(NotifyThread notifyThread, ProcessData processData) + { + return static_cast(this)->dequeue_bulk(notifyThread, processData); + } + + inline ProducerBase* next_prod() const { return static_cast(next); } + + inline size_t size_approx() const + { + auto tail = tailIndex.load(std::memory_order_relaxed); + auto head = headIndex.load(std::memory_order_relaxed); + return details::circular_less_than(head, tail) ? static_cast(tail - head) : 0; + } + + inline index_t getTail() const { return tailIndex.load(std::memory_order_relaxed); } + protected: + std::atomic tailIndex; // Where to enqueue to next + std::atomic headIndex; // Where to dequeue from next + + std::atomic dequeueOptimisticCount; + std::atomic dequeueOvercommit; + + Block* tailBlock; + + public: + ConcurrentQueue* parent; + }; + + + public: + /////////////////////////// + // Explicit queue + /////////////////////////// + struct ExplicitProducer : public ProducerBase + { + explicit ExplicitProducer(ConcurrentQueue* _parent) : + ProducerBase(_parent), + blockIndex(nullptr), + pr_blockIndexSlotsUsed(0), + pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1), + pr_blockIndexFront(0), + pr_blockIndexEntries(nullptr), + pr_blockIndexRaw(nullptr) + { + size_t poolBasedIndexSize = details::ceil_to_pow_2(_parent->initialBlockPoolSize) >> 1; + if (poolBasedIndexSize > pr_blockIndexSize) { + pr_blockIndexSize = poolBasedIndexSize; + } + + new_block_index(0); // This creates an index with double the number of current entries, i.e. EXPLICIT_INITIAL_INDEX_SIZE + } + + ~ExplicitProducer() + { + // Destruct any elements not yet dequeued. + // Since we're in the destructor, we can assume all elements + // are either completely dequeued or completely not (no halfways). 
+ if (this->tailBlock != nullptr) { // Note this means there must be a block index too + // First find the block that's partially dequeued, if any + Block* halfDequeuedBlock = nullptr; + if ((this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) != 0) { + // The head's not on a block boundary, meaning a block somewhere is partially dequeued + // (or the head block is the tail block and was fully dequeued, but the head/tail are still not on a boundary) + size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & (pr_blockIndexSize - 1); + while (details::circular_less_than(pr_blockIndexEntries[i].base + BLOCK_SIZE, this->headIndex.load(std::memory_order_relaxed))) { + i = (i + 1) & (pr_blockIndexSize - 1); + } + assert(details::circular_less_than(pr_blockIndexEntries[i].base, this->headIndex.load(std::memory_order_relaxed))); + halfDequeuedBlock = pr_blockIndexEntries[i].block; + } + + // Start at the head block (note the first line in the loop gives us the head from the tail on the first iteration) + auto block = this->tailBlock; + do { + block = block->next; + if (block->ConcurrentQueue::Block::is_empty()) { + continue; + } + + size_t i = 0; // Offset into block + if (block == halfDequeuedBlock) { + i = static_cast(this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); + } + + // Walk through all the items in the block; if this is the tail block, we need to stop when we reach the tail index + auto lastValidIndex = (this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) == 0 ? BLOCK_SIZE : static_cast(this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); + while (i != BLOCK_SIZE && (block != this->tailBlock || i != lastValidIndex)) { + (*block)[i++]->~T(); + } + } while (block != this->tailBlock); + } + + // Destroy all blocks that we own + if (this->tailBlock != nullptr) { + auto block = this->tailBlock; + do { + auto nextBlock = block->next; + if (block->dynamicallyAllocated) { + destroy(block); + } + else { + this->parent->add_block_to_free_list(block); + } + block = nextBlock; + } while (block != this->tailBlock); + } + + // Destroy the block indices + auto header = static_cast(pr_blockIndexRaw); + while (header != nullptr) { + auto prev = static_cast(header->prev); + header->~BlockIndexHeader(); + (Traits::free)(header); + header = prev; + } + } + + inline void enqueue_begin_alloc(index_t currentTailIndex) + { + // We reached the end of a block, start a new one + if (this->tailBlock != nullptr && this->tailBlock->next->ConcurrentQueue::Block::is_empty()) { + // We can re-use the block ahead of us, it's empty! + this->tailBlock = this->tailBlock->next; + this->tailBlock->ConcurrentQueue::Block::reset_empty(); + + // We'll put the block on the block index (guaranteed to be room since we're conceptually removing the + // last block from it first -- except instead of removing then adding, we can just overwrite). + // Note that there must be a valid block index here, since even if allocation failed in the ctor, + // it would have been re-attempted when adding the first block to the queue; since there is such + // a block, a block index must have been successfully allocated. + } + else { + // We're going to need a new block; check that the block index has room + if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize) { + // Hmm, the circular block index is already full -- we'll need + // to allocate a new index. 
Note pr_blockIndexRaw can only be nullptr if + // the initial allocation failed in the constructor. + new_block_index(pr_blockIndexSlotsUsed); + } + + // Insert a new block in the circular linked list + auto newBlock = this->parent->ConcurrentQueue::requisition_block(); + newBlock->ConcurrentQueue::Block::reset_empty(); + if (this->tailBlock == nullptr) { + newBlock->next = newBlock; + } + else { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + ++pr_blockIndexSlotsUsed; + } + + // Add block to block index + auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + blockIndex.load(std::memory_order_relaxed)->front.store(pr_blockIndexFront, std::memory_order_release); + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + tracy_force_inline T* enqueue_begin(index_t& currentTailIndex) + { + currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); + if (details::cqUnlikely((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0)) { + this->enqueue_begin_alloc(currentTailIndex); + } + return (*this->tailBlock)[currentTailIndex]; + } + + tracy_force_inline std::atomic& get_tail_index() + { + return this->tailIndex; + } + + template + size_t dequeue_bulk(NotifyThread notifyThread, ProcessData processData) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); + if (details::circular_less_than(0, desiredCount)) { + desiredCount = desiredCount < 8192 ? desiredCount : 8192; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); + assert(overcommit <= myDequeueCount); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) { + actualCount = desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) { + this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this + // will never exceed tail. + auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); + + // Determine which block the first element is in + auto localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); + + auto headBase = localBlockIndex->entries[localBlockIndexHead].base; + auto firstBlockBaseIndex = firstIndex & ~static_cast(BLOCK_SIZE - 1); + auto offset = static_cast(static_cast::type>(firstBlockBaseIndex - headBase) / BLOCK_SIZE); + auto indexIndex = (localBlockIndexHead + offset) & (localBlockIndex->size - 1); + + notifyThread( this->threadId ); + + // Iterate the blocks and dequeue + auto index = firstIndex; + do { + auto firstIndexInBlock = index; + auto endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? 
firstIndex + static_cast(actualCount) : endIndex; + auto block = localBlockIndex->entries[indexIndex].block; + + const auto sz = endIndex - index; + processData( (*block)[index], sz ); + index += sz; + + block->ConcurrentQueue::Block::set_many_empty(firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } + else { + // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent + this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); + } + } + + return 0; + } + + private: + struct BlockIndexEntry + { + index_t base; + Block* block; + }; + + struct BlockIndexHeader + { + size_t size; + std::atomic front; // Current slot (not next, like pr_blockIndexFront) + BlockIndexEntry* entries; + void* prev; + }; + + + bool new_block_index(size_t numberOfFilledSlotsToExpose) + { + auto prevBlockSizeMask = pr_blockIndexSize - 1; + + // Create the new block + pr_blockIndexSize <<= 1; + auto newRawPtr = static_cast((Traits::malloc)(sizeof(BlockIndexHeader) + std::alignment_of::value - 1 + sizeof(BlockIndexEntry) * pr_blockIndexSize)); + if (newRawPtr == nullptr) { + pr_blockIndexSize >>= 1; // Reset to allow graceful retry + return false; + } + + auto newBlockIndexEntries = reinterpret_cast(details::align_for(newRawPtr + sizeof(BlockIndexHeader))); + + // Copy in all the old indices, if any + size_t j = 0; + if (pr_blockIndexSlotsUsed != 0) { + auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask; + do { + newBlockIndexEntries[j++] = pr_blockIndexEntries[i]; + i = (i + 1) & prevBlockSizeMask; + } while (i != pr_blockIndexFront); + } + + // Update everything + auto header = new (newRawPtr) BlockIndexHeader; + header->size = pr_blockIndexSize; + header->front.store(numberOfFilledSlotsToExpose - 1, std::memory_order_relaxed); + header->entries = newBlockIndexEntries; + header->prev = pr_blockIndexRaw; // we link the new block to the old one so we can free it later + + pr_blockIndexFront = j; + pr_blockIndexEntries = newBlockIndexEntries; + pr_blockIndexRaw = newRawPtr; + blockIndex.store(header, std::memory_order_release); + + return true; + } + + private: + std::atomic blockIndex; + + // To be used by producer only -- consumer must use the ones in referenced by blockIndex + size_t pr_blockIndexSlotsUsed; + size_t pr_blockIndexSize; + size_t pr_blockIndexFront; // Next slot (not current) + BlockIndexEntry* pr_blockIndexEntries; + void* pr_blockIndexRaw; + }; + + ExplicitProducer* get_explicit_producer(producer_token_t const& token) + { + return static_cast(token.producer); + } + + private: + + ////////////////////////////////// + // Block pool manipulation + ////////////////////////////////// + + void populate_initial_block_list(size_t blockCount) + { + initialBlockPoolSize = blockCount; + if (initialBlockPoolSize == 0) { + initialBlockPool = nullptr; + return; + } + + initialBlockPool = create_array(blockCount); + if (initialBlockPool == nullptr) { + initialBlockPoolSize = 0; + } + for (size_t i = 0; i < initialBlockPoolSize; ++i) { + initialBlockPool[i].dynamicallyAllocated = false; + } + } + + inline Block* try_get_block_from_initial_pool() + { + if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= initialBlockPoolSize) { + return nullptr; + } + + auto index = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed); + + return index < initialBlockPoolSize ? 
(initialBlockPool + index) : nullptr; + } + + inline void add_block_to_free_list(Block* block) + { + freeList.add(block); + } + + inline void add_blocks_to_free_list(Block* block) + { + while (block != nullptr) { + auto next = block->next; + add_block_to_free_list(block); + block = next; + } + } + + inline Block* try_get_block_from_free_list() + { + return freeList.try_get(); + } + + // Gets a free block from one of the memory pools, or allocates a new one (if applicable) + Block* requisition_block() + { + auto block = try_get_block_from_initial_pool(); + if (block != nullptr) { + return block; + } + + block = try_get_block_from_free_list(); + if (block != nullptr) { + return block; + } + + return create(); + } + + + ////////////////////////////////// + // Producer list manipulation + ////////////////////////////////// + + ProducerBase* recycle_or_create_producer() + { + bool recycled; + return recycle_or_create_producer(recycled); + } + + ProducerBase* recycle_or_create_producer(bool& recycled) + { + // Try to re-use one first + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr->inactive.load(std::memory_order_relaxed)) { + if( ptr->size_approx() == 0 ) + { + bool expected = true; + if (ptr->inactive.compare_exchange_strong(expected, /* desired */ false, std::memory_order_acquire, std::memory_order_relaxed)) { + // We caught one! It's been marked as activated, the caller can have it + recycled = true; + return ptr; + } + } + } + } + + recycled = false; + return add_producer(static_cast(create(this))); + } + + ProducerBase* add_producer(ProducerBase* producer) + { + // Handle failed memory allocation + if (producer == nullptr) { + return nullptr; + } + + producerCount.fetch_add(1, std::memory_order_relaxed); + + // Add it to the lock-free list + auto prevTail = producerListTail.load(std::memory_order_relaxed); + do { + producer->next = prevTail; + } while (!producerListTail.compare_exchange_weak(prevTail, producer, std::memory_order_release, std::memory_order_relaxed)); + + return producer; + } + + void reown_producers() + { + // After another instance is moved-into/swapped-with this one, all the + // producers we stole still think their parents are the other queue. + // So fix them up! 
+ for (auto ptr = producerListTail.load(std::memory_order_relaxed); ptr != nullptr; ptr = ptr->next_prod()) { + ptr->parent = this; + } + } + + ////////////////////////////////// + // Utility functions + ////////////////////////////////// + + template + static inline U* create_array(size_t count) + { + assert(count > 0); + return static_cast((Traits::malloc)(sizeof(U) * count)); + } + + template + static inline void destroy_array(U* p, size_t count) + { + ((void)count); + if (p != nullptr) { + assert(count > 0); + (Traits::free)(p); + } + } + + template + static inline U* create() + { + auto p = (Traits::malloc)(sizeof(U)); + return new (p) U; + } + + template + static inline U* create(A1&& a1) + { + auto p = (Traits::malloc)(sizeof(U)); + return new (p) U(std::forward(a1)); + } + + template + static inline void destroy(U* p) + { + if (p != nullptr) { + p->~U(); + } + (Traits::free)(p); + } + +private: + std::atomic producerListTail; + std::atomic producerCount; + + std::atomic initialBlockPoolIndex; + Block* initialBlockPool; + size_t initialBlockPoolSize; + + FreeList freeList; + + std::atomic nextExplicitConsumerId; + std::atomic globalExplicitConsumerOffset; +}; + + +template +ProducerToken::ProducerToken(ConcurrentQueue& queue) + : producer(queue.recycle_or_create_producer()) +{ + if (producer != nullptr) { + producer->token = this; + producer->threadId = detail::GetThreadHandleImpl(); + } +} + +template +ConsumerToken::ConsumerToken(ConcurrentQueue& queue) + : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr) +{ + initialOffset = queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release); + lastKnownGlobalOffset = static_cast(-1); +} + +template +inline void swap(ConcurrentQueue& a, ConcurrentQueue& b) noexcept +{ + a.swap(b); +} + +inline void swap(ProducerToken& a, ProducerToken& b) noexcept +{ + a.swap(b); +} + +inline void swap(ConsumerToken& a, ConsumerToken& b) noexcept +{ + a.swap(b); +} + +} + +} /* namespace tracy */ + +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif diff --git a/Source/ThirdParty/tracy/client/tracy_rpmalloc.cpp b/Source/ThirdParty/tracy/client/tracy_rpmalloc.cpp new file mode 100644 index 000000000..8aae78e03 --- /dev/null +++ b/Source/ThirdParty/tracy/client/tracy_rpmalloc.cpp @@ -0,0 +1,2495 @@ +#ifdef TRACY_ENABLE + +/* rpmalloc.c - Memory allocator - Public Domain - 2016 Mattias Jansson + * + * This library provides a cross-platform lock free thread caching malloc implementation in C11. + * The latest source code is always available at + * + * https://github.com/mjansson/rpmalloc + * + * This library is put in the public domain; you can redistribute it and/or modify it without any restrictions. + * + */ + +#include "tracy_rpmalloc.hpp" + +/// Build time configurable limits +#ifndef HEAP_ARRAY_SIZE +//! Size of heap hashmap +#define HEAP_ARRAY_SIZE 47 +#endif +#ifndef ENABLE_THREAD_CACHE +//! Enable per-thread cache +#define ENABLE_THREAD_CACHE 1 +#endif +#ifndef ENABLE_GLOBAL_CACHE +//! Enable global cache shared between all threads, requires thread cache +#define ENABLE_GLOBAL_CACHE 1 +#endif +#ifndef ENABLE_VALIDATE_ARGS +//! Enable validation of args to public entry points +#define ENABLE_VALIDATE_ARGS 0 +#endif +#ifndef ENABLE_STATISTICS +//! Enable statistics collection +#define ENABLE_STATISTICS 0 +#endif +#ifndef ENABLE_ASSERTS +//! Enable asserts +#define ENABLE_ASSERTS 0 +#endif +#ifndef ENABLE_OVERRIDE +//! 
Override standard library malloc/free and new/delete entry points +#define ENABLE_OVERRIDE 0 +#endif +#ifndef ENABLE_PRELOAD +//! Support preloading +#define ENABLE_PRELOAD 0 +#endif +#ifndef DISABLE_UNMAP +//! Disable unmapping memory pages +#define DISABLE_UNMAP 0 +#endif +#ifndef DEFAULT_SPAN_MAP_COUNT +//! Default number of spans to map in call to map more virtual memory (default values yield 4MiB here) +#define DEFAULT_SPAN_MAP_COUNT 64 +#endif + +#if ENABLE_THREAD_CACHE +#ifndef ENABLE_UNLIMITED_CACHE +//! Unlimited thread and global cache +#define ENABLE_UNLIMITED_CACHE 0 +#endif +#ifndef ENABLE_UNLIMITED_THREAD_CACHE +//! Unlimited cache disables any thread cache limitations +#define ENABLE_UNLIMITED_THREAD_CACHE ENABLE_UNLIMITED_CACHE +#endif +#if !ENABLE_UNLIMITED_THREAD_CACHE +#ifndef THREAD_CACHE_MULTIPLIER +//! Multiplier for thread cache (cache limit will be span release count multiplied by this value) +#define THREAD_CACHE_MULTIPLIER 16 +#endif +#ifndef ENABLE_ADAPTIVE_THREAD_CACHE +//! Enable adaptive size of per-thread cache (still bounded by THREAD_CACHE_MULTIPLIER hard limit) +#define ENABLE_ADAPTIVE_THREAD_CACHE 0 +#endif +#endif +#endif + +#if ENABLE_GLOBAL_CACHE && ENABLE_THREAD_CACHE +#ifndef ENABLE_UNLIMITED_GLOBAL_CACHE +//! Unlimited cache disables any global cache limitations +#define ENABLE_UNLIMITED_GLOBAL_CACHE ENABLE_UNLIMITED_CACHE +#endif +#if !ENABLE_UNLIMITED_GLOBAL_CACHE +//! Multiplier for global cache (cache limit will be span release count multiplied by this value) +#define GLOBAL_CACHE_MULTIPLIER (THREAD_CACHE_MULTIPLIER * 6) +#endif +#else +# undef ENABLE_GLOBAL_CACHE +# define ENABLE_GLOBAL_CACHE 0 +#endif + +#if !ENABLE_THREAD_CACHE || ENABLE_UNLIMITED_THREAD_CACHE +# undef ENABLE_ADAPTIVE_THREAD_CACHE +# define ENABLE_ADAPTIVE_THREAD_CACHE 0 +#endif + +#if DISABLE_UNMAP && !ENABLE_GLOBAL_CACHE +# error Must use global cache if unmap is disabled +#endif + +#if defined( _WIN32 ) || defined( __WIN32__ ) || defined( _WIN64 ) +# define PLATFORM_WINDOWS 1 +# define PLATFORM_POSIX 0 +#else +# define PLATFORM_WINDOWS 0 +# define PLATFORM_POSIX 1 +#endif + +#define _Static_assert static_assert + +/// Platform and arch specifics +#ifndef FORCEINLINE +# if defined(_MSC_VER) && !defined(__clang__) +# define FORCEINLINE inline __forceinline +# else +# define FORCEINLINE inline __attribute__((__always_inline__)) +# endif +#endif +#if PLATFORM_WINDOWS +# ifndef WIN32_LEAN_AND_MEAN +# define WIN32_LEAN_AND_MEAN +# endif +# include +# if ENABLE_VALIDATE_ARGS +# include +# endif +#else +# include +# include +# include +# if defined(__APPLE__) +# include +# include +# include +# endif +# if defined(__HAIKU__) +# include +# include +# endif +#endif + +#include +#include + +#if ENABLE_ASSERTS +# undef NDEBUG +# if defined(_MSC_VER) && !defined(_DEBUG) +# define _DEBUG +# endif +# include +#else +# undef assert +# define assert(x) do {} while(0) +#endif +#if ENABLE_STATISTICS +# include +#endif + +#include + +namespace tracy +{ + +typedef std::atomic atomic32_t; +typedef std::atomic atomic64_t; +typedef std::atomic atomicptr_t; + +#define atomic_thread_fence_acquire() std::atomic_thread_fence(std::memory_order_acquire) +#define atomic_thread_fence_release() std::atomic_thread_fence(std::memory_order_release) + +static FORCEINLINE int32_t atomic_load32(atomic32_t* src) { return std::atomic_load_explicit(src, std::memory_order_relaxed); } +static FORCEINLINE void atomic_store32(atomic32_t* dst, int32_t val) { std::atomic_store_explicit(dst, val, 
std::memory_order_relaxed); }
+static FORCEINLINE int32_t atomic_incr32(atomic32_t* val) { return std::atomic_fetch_add_explicit(val, 1, std::memory_order_relaxed) + 1; }
+#if ENABLE_STATISTICS || ENABLE_ADAPTIVE_THREAD_CACHE
+static FORCEINLINE int32_t atomic_decr32(atomic32_t* val) { return std::atomic_fetch_add_explicit(val, -1, std::memory_order_relaxed) - 1; }
+#endif
+static FORCEINLINE int32_t atomic_add32(atomic32_t* val, int32_t add) { return std::atomic_fetch_add_explicit(val, add, std::memory_order_relaxed) + add; }
+static FORCEINLINE void* atomic_load_ptr(atomicptr_t* src) { return std::atomic_load_explicit(src, std::memory_order_relaxed); }
+static FORCEINLINE void atomic_store_ptr(atomicptr_t* dst, void* val) { std::atomic_store_explicit(dst, val, std::memory_order_relaxed); }
+static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { return std::atomic_compare_exchange_weak_explicit(dst, &ref, val, std::memory_order_release, std::memory_order_acquire); }
+
+#if defined(_MSC_VER) && !defined(__clang__)
+# define EXPECTED(x) (x)
+# define UNEXPECTED(x) (x)
+#else
+# define EXPECTED(x) __builtin_expect((x), 1)
+# define UNEXPECTED(x) __builtin_expect((x), 0)
+#endif
+
+/// Preconfigured limits and sizes
+//! Granularity of a small allocation block
+#define SMALL_GRANULARITY 16
+//! Small granularity shift count
+#define SMALL_GRANULARITY_SHIFT 4
+//! Number of small block size classes
+#define SMALL_CLASS_COUNT 65
+//! Maximum size of a small block
+#define SMALL_SIZE_LIMIT (SMALL_GRANULARITY * (SMALL_CLASS_COUNT - 1))
+//! Granularity of a medium allocation block
+#define MEDIUM_GRANULARITY 512
+//! Medium granularity shift count
+#define MEDIUM_GRANULARITY_SHIFT 9
+//! Number of medium block size classes
+#define MEDIUM_CLASS_COUNT 61
+//! Total number of small + medium size classes
+#define SIZE_CLASS_COUNT (SMALL_CLASS_COUNT + MEDIUM_CLASS_COUNT)
+//! Number of large block size classes
+#define LARGE_CLASS_COUNT 32
+//! Maximum size of a medium block
+#define MEDIUM_SIZE_LIMIT (SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY * MEDIUM_CLASS_COUNT))
+//! Maximum size of a large block
+#define LARGE_SIZE_LIMIT ((LARGE_CLASS_COUNT * _memory_span_size) - SPAN_HEADER_SIZE)
+//! Size of a span header (must be a multiple of SMALL_GRANULARITY)
+#define SPAN_HEADER_SIZE 96
+
+#if ENABLE_VALIDATE_ARGS
+//! Maximum allocation size to avoid integer overflow
+#undef MAX_ALLOC_SIZE
+#define MAX_ALLOC_SIZE (((size_t)-1) - _memory_span_size)
+#endif
+
+#define pointer_offset(ptr, ofs) (void*)((char*)(ptr) + (ptrdiff_t)(ofs))
+#define pointer_diff(first, second) (ptrdiff_t)((const char*)(first) - (const char*)(second))
+
+#define INVALID_POINTER ((void*)((uintptr_t)-1))
+
+/// Data types
+//! A memory heap, per thread
+typedef struct heap_t heap_t;
+//! Heap spans per size class
+typedef struct heap_class_t heap_class_t;
+//! Span of memory pages
+typedef struct span_t span_t;
+//! Span list
+typedef struct span_list_t span_list_t;
+//! Span active data
+typedef struct span_active_t span_active_t;
+//! Size class definition
+typedef struct size_class_t size_class_t;
+//! Global cache
+typedef struct global_cache_t global_cache_t;
+
+//! Flag indicating span is the first (master) span of a split superspan
+#define SPAN_FLAG_MASTER 1U
+//! Flag indicating span is a secondary (sub) span of a split superspan
+#define SPAN_FLAG_SUBSPAN 2U
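
The wrappers above commit every atomic access to an explicit std::memory_order, so each acquire/release pairing can be audited at the call site instead of relying on the sequentially consistent defaults. A minimal compilable sketch of the same idiom, using hypothetical names (counter32_t, counter_incr) that are not part of this patch:

#include <atomic>
#include <cstdint>

using counter32_t = std::atomic<int32_t>;

// fetch_add returns the value before the addition, so report the new value.
static inline int32_t counter_incr(counter32_t* v) {
    return std::atomic_fetch_add_explicit(v, 1, std::memory_order_relaxed) + 1;
}

static inline int32_t counter_decr(counter32_t* v) {
    return std::atomic_fetch_add_explicit(v, -1, std::memory_order_relaxed) - 1;
}

Relaxed ordering is enough for these counters because they are bookkeeping, not synchronization points; the allocator issues explicit fences where publication actually matters.

+//!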
Flag indicating span has blocks with increased alignment +#define SPAN_FLAG_ALIGNED_BLOCKS 4U + +#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS +struct span_use_t { + //! Current number of spans used (actually used, not in cache) + atomic32_t current; + //! High water mark of spans used + uint32_t high; +#if ENABLE_STATISTICS + //! Number of spans transitioned to global cache + uint32_t spans_to_global; + //! Number of spans transitioned from global cache + uint32_t spans_from_global; + //! Number of spans transitioned to thread cache + uint32_t spans_to_cache; + //! Number of spans transitioned from thread cache + uint32_t spans_from_cache; + //! Number of spans transitioned to reserved state + uint32_t spans_to_reserved; + //! Number of spans transitioned from reserved state + uint32_t spans_from_reserved; + //! Number of raw memory map calls + uint32_t spans_map_calls; +#endif +}; +typedef struct span_use_t span_use_t; +#endif + +#if ENABLE_STATISTICS +struct size_class_use_t { + //! Current number of allocations + atomic32_t alloc_current; + //! Peak number of allocations + int32_t alloc_peak; + //! Total number of allocations + int32_t alloc_total; + //! Total number of frees + atomic32_t free_total; + //! Number of spans in use + uint32_t spans_current; + //! Number of spans transitioned to cache + uint32_t spans_peak; + //! Number of spans transitioned to cache + uint32_t spans_to_cache; + //! Number of spans transitioned from cache + uint32_t spans_from_cache; + //! Number of spans transitioned from reserved state + uint32_t spans_from_reserved; + //! Number of spans mapped + uint32_t spans_map_calls; +}; +typedef struct size_class_use_t size_class_use_t; +#endif + +typedef enum span_state_t { + SPAN_STATE_ACTIVE = 0, + SPAN_STATE_PARTIAL, + SPAN_STATE_FULL +} span_state_t; + +//A span can either represent a single span of memory pages with size declared by span_map_count configuration variable, +//or a set of spans in a continuous region, a super span. Any reference to the term "span" usually refers to both a single +//span or a super span. A super span can further be divided into multiple spans (or this, super spans), where the first +//(super)span is the master and subsequent (super)spans are subspans. The master span keeps track of how many subspans +//that are still alive and mapped in virtual memory, and once all subspans and master have been unmapped the entire +//superspan region is released and unmapped (on Windows for example, the entire superspan range has to be released +//in the same call to release the virtual memory range, but individual subranges can be decommitted individually +//to reduce physical memory use). +struct span_t { + //! Free list + void* free_list; + //! State + uint32_t state; + //! Used count when not active (not including deferred free list) + uint32_t used_count; + //! Block count + uint32_t block_count; + //! Size class + uint32_t size_class; + //! Index of last block initialized in free list + uint32_t free_list_limit; + //! Span list size when part of a cache list, or size of deferred free list when partial/full + uint32_t list_size; + //! Deferred free list + atomicptr_t free_list_deferred; + //! Size of a block + uint32_t block_size; + //! Flags and counters + uint32_t flags; + //! Number of spans + uint32_t span_count; + //! Total span counter for master spans, distance for subspans + uint32_t total_spans_or_distance; + //! Remaining span counter, for master spans + atomic32_t remaining_spans; + //! 
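
The span_t comment above describes the master/subspan scheme: a master records how many spans of its reservation are still live, while a subspan records only its distance back to the master. A sketch of the address recovery that scheme enables, assuming 64 KiB spans and illustrative names:

#include <cstdint>

constexpr uintptr_t kSpanSize = 64 * 1024;   // assumed span size

struct SpanHeader {
    bool isSubspan;                  // stands in for SPAN_FLAG_SUBSPAN
    uint32_t totalSpansOrDistance;   // span count for masters, distance for subspans
};

SpanHeader* MasterOf(SpanHeader* span) {
    if (!span->isSubspan)
        return span;                 // a master is its own owner
    // Step back 'distance' whole spans to land on the master's header.
    return (SpanHeader*)((char*)span - (uintptr_t)span->totalSpansOrDistance * kSpanSize);
}

+//!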
Alignment offset + uint32_t align_offset; + //! Owning heap + heap_t* heap; + //! Next span + span_t* next; + //! Previous span + span_t* prev; +}; +_Static_assert(sizeof(span_t) <= SPAN_HEADER_SIZE, "span size mismatch"); + +struct heap_class_t { + //! Free list of active span + void* free_list; + //! Double linked list of partially used spans with free blocks for each size class. + // Current active span is at head of list. Previous span pointer in head points to tail span of list. + span_t* partial_span; +}; + +struct heap_t { + //! Active and semi-used span data per size class + heap_class_t span_class[SIZE_CLASS_COUNT]; +#if ENABLE_THREAD_CACHE + //! List of free spans (single linked list) + span_t* span_cache[LARGE_CLASS_COUNT]; + //! List of deferred free spans of class 0 (single linked list) + atomicptr_t span_cache_deferred; +#endif +#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS + //! Current and high water mark of spans used per span count + span_use_t span_use[LARGE_CLASS_COUNT]; +#endif + //! Mapped but unused spans + span_t* span_reserve; + //! Master span for mapped but unused spans + span_t* span_reserve_master; + //! Number of mapped but unused spans + size_t spans_reserved; + //! Next heap in id list + heap_t* next_heap; + //! Next heap in orphan list + heap_t* next_orphan; + //! Memory pages alignment offset + size_t align_offset; + //! Heap ID + int32_t id; +#if ENABLE_STATISTICS + //! Number of bytes transitioned thread -> global + size_t thread_to_global; + //! Number of bytes transitioned global -> thread + size_t global_to_thread; + //! Allocation stats per size class + size_class_use_t size_class_use[SIZE_CLASS_COUNT + 1]; +#endif +}; + +struct size_class_t { + //! Size of blocks in this class + uint32_t block_size; + //! Number of blocks in each chunk + uint16_t block_count; + //! Class index this class is merged with + uint16_t class_idx; +}; +_Static_assert(sizeof(size_class_t) == 8, "Size class size mismatch"); + +struct global_cache_t { + //! Cache list pointer + atomicptr_t cache; + //! Cache size + atomic32_t size; + //! ABA counter + atomic32_t counter; +}; + +/// Global data +//! Initialized flag +static int _rpmalloc_initialized; +//! Configuration +static rpmalloc_config_t _memory_config; +//! Memory page size +static size_t _memory_page_size; +//! Shift to divide by page size +static size_t _memory_page_size_shift; +//! Granularity at which memory pages are mapped by OS +static size_t _memory_map_granularity; +#if RPMALLOC_CONFIGURABLE +//! Size of a span of memory pages +static size_t _memory_span_size; +//! Shift to divide by span size +static size_t _memory_span_size_shift; +//! Mask to get to start of a memory span +static uintptr_t _memory_span_mask; +#else +//! Hardwired span size (64KiB) +#define _memory_span_size (64 * 1024) +#define _memory_span_size_shift 16 +#define _memory_span_mask (~((uintptr_t)(_memory_span_size - 1))) +#endif +//! Number of spans to map in each map call +static size_t _memory_span_map_count; +//! Number of spans to release from thread cache to global cache (single spans) +static size_t _memory_span_release_count; +//! Number of spans to release from thread cache to global cache (large multiple spans) +static size_t _memory_span_release_count_large; +//! Global size classes +static size_class_t _memory_size_class[SIZE_CLASS_COUNT]; +//! Run-time size limit of medium blocks +static size_t _memory_medium_size_limit; +//! Heap ID counter +static atomic32_t _memory_heap_id; +//! 
Huge page support +static int _memory_huge_pages; +#if ENABLE_GLOBAL_CACHE +//! Global span cache +static global_cache_t _memory_span_cache[LARGE_CLASS_COUNT]; +#endif +//! All heaps +static atomicptr_t _memory_heaps[HEAP_ARRAY_SIZE]; +//! Orphaned heaps +static atomicptr_t _memory_orphan_heaps; +//! Running orphan counter to avoid ABA issues in linked list +static atomic32_t _memory_orphan_counter; +#if ENABLE_STATISTICS +//! Active heap count +static atomic32_t _memory_active_heaps; +//! Number of currently mapped memory pages +static atomic32_t _mapped_pages; +//! Peak number of concurrently mapped memory pages +static int32_t _mapped_pages_peak; +//! Number of currently unused spans +static atomic32_t _reserved_spans; +//! Running counter of total number of mapped memory pages since start +static atomic32_t _mapped_total; +//! Running counter of total number of unmapped memory pages since start +static atomic32_t _unmapped_total; +//! Number of currently mapped memory pages in OS calls +static atomic32_t _mapped_pages_os; +//! Number of currently allocated pages in huge allocations +static atomic32_t _huge_pages_current; +//! Peak number of currently allocated pages in huge allocations +static int32_t _huge_pages_peak; +#endif + +//! Current thread heap +#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD +static pthread_key_t _memory_thread_heap; +#else +# ifdef _MSC_VER +# define _Thread_local __declspec(thread) +# define TLS_MODEL +# else +# define TLS_MODEL __attribute__((tls_model("initial-exec"))) +# if !defined(__clang__) && defined(__GNUC__) +# define _Thread_local __thread +# endif +# endif +static _Thread_local heap_t* _memory_thread_heap TLS_MODEL; +#endif + +static inline heap_t* +get_thread_heap_raw(void) { +#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD + return pthread_getspecific(_memory_thread_heap); +#else + return _memory_thread_heap; +#endif +} + +//! Get the current thread heap +static inline heap_t* +get_thread_heap(void) { + heap_t* heap = get_thread_heap_raw(); +#if ENABLE_PRELOAD + if (EXPECTED(heap != 0)) + return heap; + rpmalloc_initialize(); + return get_thread_heap_raw(); +#else + return heap; +#endif +} + +//! Set the current thread heap +static void +set_thread_heap(heap_t* heap) { +#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD + pthread_setspecific(_memory_thread_heap, heap); +#else + _memory_thread_heap = heap; +#endif +} + +//! Default implementation to map more virtual memory +static void* +_memory_map_os(size_t size, size_t* offset); + +//! Default implementation to unmap virtual memory +static void +_memory_unmap_os(void* address, size_t size, size_t offset, size_t release); + +//! 
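
get_thread_heap_raw above is the whole fast path: one heap pointer per thread, read without any locking. A stripped-down sketch of the same pattern (the pthread_key_t branch exists only for the ENABLE_PRELOAD platforms and is omitted here; names are illustrative):

#include <cstddef>

struct Heap { /* per-thread allocator state */ };

static thread_local Heap* t_heap = nullptr;

Heap* GetThreadHeap() {
    return t_heap;   // no synchronization: each thread only reads its own slot
}

void SetThreadHeap(Heap* heap) {
    t_heap = heap;
}

+//!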
Lookup a memory heap from heap ID +static heap_t* +_memory_heap_lookup(int32_t id) { + uint32_t list_idx = id % HEAP_ARRAY_SIZE; + heap_t* heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); + while (heap && (heap->id != id)) + heap = heap->next_heap; + return heap; +} + +#if ENABLE_STATISTICS +# define _memory_statistics_inc(counter, value) counter += value +# define _memory_statistics_dec(counter, value) counter -= value +# define _memory_statistics_add(atomic_counter, value) atomic_add32(atomic_counter, (int32_t)(value)) +# define _memory_statistics_add_peak(atomic_counter, value, peak) do { int32_t _cur_count = atomic_add32(atomic_counter, (int32_t)(value)); if (_cur_count > (peak)) peak = _cur_count; } while (0) +# define _memory_statistics_sub(atomic_counter, value) atomic_add32(atomic_counter, -(int32_t)(value)) +# define _memory_statistics_inc_alloc(heap, class_idx) do { \ + int32_t alloc_current = atomic_incr32(&heap->size_class_use[class_idx].alloc_current); \ + if (alloc_current > heap->size_class_use[class_idx].alloc_peak) \ + heap->size_class_use[class_idx].alloc_peak = alloc_current; \ + heap->size_class_use[class_idx].alloc_total++; \ +} while(0) +# define _memory_statistics_inc_free(heap, class_idx) do { \ + atomic_decr32(&heap->size_class_use[class_idx].alloc_current); \ + atomic_incr32(&heap->size_class_use[class_idx].free_total); \ +} while(0) +#else +# define _memory_statistics_inc(counter, value) do {} while(0) +# define _memory_statistics_dec(counter, value) do {} while(0) +# define _memory_statistics_add(atomic_counter, value) do {} while(0) +# define _memory_statistics_add_peak(atomic_counter, value, peak) do {} while (0) +# define _memory_statistics_sub(atomic_counter, value) do {} while(0) +# define _memory_statistics_inc_alloc(heap, class_idx) do {} while(0) +# define _memory_statistics_inc_free(heap, class_idx) do {} while(0) +#endif + +static void +_memory_heap_cache_insert(heap_t* heap, span_t* span); + +//! Map more virtual memory +static void* +_memory_map(size_t size, size_t* offset) { + assert(!(size % _memory_page_size)); + assert(size >= _memory_page_size); + _memory_statistics_add_peak(&_mapped_pages, (size >> _memory_page_size_shift), _mapped_pages_peak); + _memory_statistics_add(&_mapped_total, (size >> _memory_page_size_shift)); + return _memory_config.memory_map(size, offset); +} + +//! Unmap virtual memory +static void +_memory_unmap(void* address, size_t size, size_t offset, size_t release) { + assert(!release || (release >= size)); + assert(!release || (release >= _memory_page_size)); + if (release) { + assert(!(release % _memory_page_size)); + _memory_statistics_sub(&_mapped_pages, (release >> _memory_page_size_shift)); + _memory_statistics_add(&_unmapped_total, (release >> _memory_page_size_shift)); + } + _memory_config.memory_unmap(address, size, offset, release); +} + +//! Declare the span to be a subspan and store distance from master span and span count +static void +_memory_span_mark_as_subspan_unless_master(span_t* master, span_t* subspan, size_t span_count) { + assert((subspan != master) || (subspan->flags & SPAN_FLAG_MASTER)); + if (subspan != master) { + subspan->flags = SPAN_FLAG_SUBSPAN; + subspan->total_spans_or_distance = (uint32_t)((uintptr_t)pointer_diff(subspan, master) >> _memory_span_size_shift); + subspan->align_offset = 0; + } + subspan->span_count = (uint32_t)span_count; +} + +//! 
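
_memory_heap_lookup above resolves a heap ID through a fixed-size hash table whose buckets are intrusive singly linked chains. A self-contained sketch of that lookup, single-threaded for brevity (the real table entries are read with atomic loads), reusing the 47-bucket HEAP_ARRAY_SIZE from this file:

#include <cstdint>

constexpr uint32_t kHeapArraySize = 47;   // HEAP_ARRAY_SIZE above

struct Heap {
    int32_t id;
    Heap* nextHeap;   // intrusive chain within one bucket
};

Heap* g_heaps[kHeapArraySize];   // zero-initialized bucket heads

Heap* LookupHeap(int32_t id) {
    Heap* heap = g_heaps[(uint32_t)id % kHeapArraySize];
    while (heap && heap->id != id)
        heap = heap->nextHeap;
    return heap;
}

+//!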
Use reserved spans to fulfill a memory map request (reserve size must be checked by caller)
+static span_t*
+_memory_map_from_reserve(heap_t* heap, size_t span_count) {
+	//Update the heap span reserve
+	span_t* span = heap->span_reserve;
+	heap->span_reserve = (span_t*)pointer_offset(span, span_count * _memory_span_size);
+	heap->spans_reserved -= span_count;
+
+	_memory_span_mark_as_subspan_unless_master(heap->span_reserve_master, span, span_count);
+	if (span_count <= LARGE_CLASS_COUNT)
+		_memory_statistics_inc(heap->span_use[span_count - 1].spans_from_reserved, 1);
+
+	return span;
+}
+
+//! Get the aligned number of spans to map in based on wanted count, configured mapping granularity and the page size
+static size_t
+_memory_map_align_span_count(size_t span_count) {
+	size_t request_count = (span_count > _memory_span_map_count) ? span_count : _memory_span_map_count;
+	if ((_memory_page_size > _memory_span_size) && ((request_count * _memory_span_size) % _memory_page_size))
+		request_count += _memory_span_map_count - (request_count % _memory_span_map_count);
+	return request_count;
+}
+
+//! Store the given spans as reserve in the given heap
+static void
+_memory_heap_set_reserved_spans(heap_t* heap, span_t* master, span_t* reserve, size_t reserve_span_count) {
+	heap->span_reserve_master = master;
+	heap->span_reserve = reserve;
+	heap->spans_reserved = reserve_span_count;
+}
+
+//! Setup a newly mapped span
+static void
+_memory_span_initialize(span_t* span, size_t total_span_count, size_t span_count, size_t align_offset) {
+	span->total_spans_or_distance = (uint32_t)total_span_count;
+	span->span_count = (uint32_t)span_count;
+	span->align_offset = (uint32_t)align_offset;
+	span->flags = SPAN_FLAG_MASTER;
+	atomic_store32(&span->remaining_spans, (int32_t)total_span_count);
+}
+
+//! Map an aligned set of spans, taking configured mapping granularity and the page size into account
+static span_t*
+_memory_map_aligned_span_count(heap_t* heap, size_t span_count) {
+	//If we already have some, but not enough, reserved spans, release those to heap cache and map a new
+	//full set of spans. Otherwise we would waste memory if page size > span size (huge pages)
+	size_t aligned_span_count = _memory_map_align_span_count(span_count);
+	size_t align_offset = 0;
+	span_t* span = (span_t*)_memory_map(aligned_span_count * _memory_span_size, &align_offset);
+	if (!span)
+		return 0;
+	_memory_span_initialize(span, aligned_span_count, span_count, align_offset);
+	_memory_statistics_add(&_reserved_spans, aligned_span_count);
+	if (span_count <= LARGE_CLASS_COUNT)
+		_memory_statistics_inc(heap->span_use[span_count - 1].spans_map_calls, 1);
+	if (aligned_span_count > span_count) {
+		if (heap->spans_reserved) {
+			_memory_span_mark_as_subspan_unless_master(heap->span_reserve_master, heap->span_reserve, heap->spans_reserved);
+			_memory_heap_cache_insert(heap, heap->span_reserve);
+		}
+		_memory_heap_set_reserved_spans(heap, span, (span_t*)pointer_offset(span, span_count * _memory_span_size), aligned_span_count - span_count);
+	}
+	return span;
+}
+
+//! Map in memory pages for the given number of spans (or use previously reserved pages)
+static span_t*
+_memory_map_spans(heap_t* heap, size_t span_count) {
+	if (span_count <= heap->spans_reserved)
+		return _memory_map_from_reserve(heap, span_count);
+	return _memory_map_aligned_span_count(heap, span_count);
+}
+
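
To make the rounding in _memory_map_align_span_count concrete, here is a worked example with assumed values (2 MiB huge pages, 64 KiB spans, the default map count of 64): a request for 70 spans gets padded to 128 so the mapping ends on a page boundary.

#include <cstdio>
#include <cstddef>

int main() {
    const size_t spanSize = 64 * 1024;
    const size_t pageSize = 2 * 1024 * 1024;   // huge pages, so page > span
    const size_t mapCount = 64;                // DEFAULT_SPAN_MAP_COUNT
    size_t request = 70;
    if (request < mapCount)
        request = mapCount;                    // never map fewer than mapCount spans
    if (pageSize > spanSize && (request * spanSize) % pageSize)
        request += mapCount - (request % mapCount);
    printf("mapping %zu spans\n", request);    // prints 128
    return 0;
}

+//!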
Unmap memory pages for the given number of spans (or mark as unused if no partial unmappings) +static void +_memory_unmap_span(span_t* span) { + assert((span->flags & SPAN_FLAG_MASTER) || (span->flags & SPAN_FLAG_SUBSPAN)); + assert(!(span->flags & SPAN_FLAG_MASTER) || !(span->flags & SPAN_FLAG_SUBSPAN)); + + int is_master = !!(span->flags & SPAN_FLAG_MASTER); + span_t* master = is_master ? span : (span_t*)(pointer_offset(span, -(int32_t)(span->total_spans_or_distance * _memory_span_size))); + assert(is_master || (span->flags & SPAN_FLAG_SUBSPAN)); + assert(master->flags & SPAN_FLAG_MASTER); + + size_t span_count = span->span_count; + if (!is_master) { + //Directly unmap subspans (unless huge pages, in which case we defer and unmap entire page range with master) + assert(span->align_offset == 0); + if (_memory_span_size >= _memory_page_size) { + _memory_unmap(span, span_count * _memory_span_size, 0, 0); + _memory_statistics_sub(&_reserved_spans, span_count); + } + } else { + //Special double flag to denote an unmapped master + //It must be kept in memory since span header must be used + span->flags |= SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN; + } + + if (atomic_add32(&master->remaining_spans, -(int32_t)span_count) <= 0) { + //Everything unmapped, unmap the master span with release flag to unmap the entire range of the super span + assert(!!(master->flags & SPAN_FLAG_MASTER) && !!(master->flags & SPAN_FLAG_SUBSPAN)); + size_t unmap_count = master->span_count; + if (_memory_span_size < _memory_page_size) + unmap_count = master->total_spans_or_distance; + _memory_statistics_sub(&_reserved_spans, unmap_count); + _memory_unmap(master, unmap_count * _memory_span_size, master->align_offset, master->total_spans_or_distance * _memory_span_size); + } +} + +#if ENABLE_THREAD_CACHE + +//! Unmap a single linked list of spans +static void +_memory_unmap_span_list(span_t* span) { + size_t list_size = span->list_size; + for (size_t ispan = 0; ispan < list_size; ++ispan) { + span_t* next_span = span->next; + _memory_unmap_span(span); + span = next_span; + } + assert(!span); +} + +//! Add span to head of single linked span list +static size_t +_memory_span_list_push(span_t** head, span_t* span) { + span->next = *head; + if (*head) + span->list_size = (*head)->list_size + 1; + else + span->list_size = 1; + *head = span; + return span->list_size; +} + +//! Remove span from head of single linked span list, returns the new list head +static span_t* +_memory_span_list_pop(span_t** head) { + span_t* span = *head; + span_t* next_span = 0; + if (span->list_size > 1) { + assert(span->next); + next_span = span->next; + assert(next_span); + next_span->list_size = span->list_size - 1; + } + *head = next_span; + return span; +} + +//! Split a single linked span list +static span_t* +_memory_span_list_split(span_t* span, size_t limit) { + span_t* next = 0; + if (limit < 2) + limit = 2; + if (span->list_size > limit) { + uint32_t list_size = 1; + span_t* last = span; + next = span->next; + while (list_size < limit) { + last = next; + next = next->next; + ++list_size; + } + last->next = 0; + assert(next); + next->list_size = span->list_size - list_size; + span->list_size = list_size; + span->prev = 0; + } + return next; +} + +#endif + +//! 
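
The unmap path above is reference counted: every subspan release decrements the master's remaining_spans, and whichever caller drives it to zero unmaps the whole superspan range. A simplified sketch of that protocol with hypothetical names:

#include <atomic>
#include <cstdint>

struct MasterSpan {
    std::atomic<int32_t> remainingSpans;   // spans still mapped in this reservation
};

// Returns true when the caller is responsible for unmapping the entire range.
bool ReleaseSpans(MasterSpan* master, int32_t spanCount) {
    // fetch_add returns the prior value; <= 0 after our decrement means we were last.
    return master->remainingSpans.fetch_add(-spanCount, std::memory_order_relaxed) - spanCount <= 0;
}

+//!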
Add a span to partial span double linked list at the head +static void +_memory_span_partial_list_add(span_t** head, span_t* span) { + if (*head) { + span->next = *head; + //Maintain pointer to tail span + span->prev = (*head)->prev; + (*head)->prev = span; + } else { + span->next = 0; + span->prev = span; + } + *head = span; +} + +//! Add a span to partial span double linked list at the tail +static void +_memory_span_partial_list_add_tail(span_t** head, span_t* span) { + span->next = 0; + if (*head) { + span_t* tail = (*head)->prev; + tail->next = span; + span->prev = tail; + //Maintain pointer to tail span + (*head)->prev = span; + } else { + span->prev = span; + *head = span; + } +} + +//! Pop head span from partial span double linked list +static void +_memory_span_partial_list_pop_head(span_t** head) { + span_t* span = *head; + *head = span->next; + if (*head) { + //Maintain pointer to tail span + (*head)->prev = span->prev; + } +} + +//! Remove a span from partial span double linked list +static void +_memory_span_partial_list_remove(span_t** head, span_t* span) { + if (UNEXPECTED(*head == span)) { + _memory_span_partial_list_pop_head(head); + } else { + span_t* next_span = span->next; + span_t* prev_span = span->prev; + prev_span->next = next_span; + if (EXPECTED(next_span != 0)) { + next_span->prev = prev_span; + } else { + //Update pointer to tail span + (*head)->prev = prev_span; + } + } +} + +#if ENABLE_GLOBAL_CACHE + +//! Insert the given list of memory page spans in the global cache +static void +_memory_cache_insert(global_cache_t* cache, span_t* span, size_t cache_limit) { + assert((span->list_size == 1) || (span->next != 0)); + int32_t list_size = (int32_t)span->list_size; + //Unmap if cache has reached the limit + if (atomic_add32(&cache->size, list_size) > (int32_t)cache_limit) { +#if !ENABLE_UNLIMITED_GLOBAL_CACHE + _memory_unmap_span_list(span); + atomic_add32(&cache->size, -list_size); + return; +#endif + } + void* current_cache, *new_cache; + do { + current_cache = atomic_load_ptr(&cache->cache); + span->prev = (span_t*)((uintptr_t)current_cache & _memory_span_mask); + new_cache = (void*)((uintptr_t)span | ((uintptr_t)atomic_incr32(&cache->counter) & ~_memory_span_mask)); + } while (!atomic_cas_ptr(&cache->cache, new_cache, current_cache)); +} + +//! Extract a number of memory page spans from the global cache +static span_t* +_memory_cache_extract(global_cache_t* cache) { + uintptr_t span_ptr; + do { + void* global_span = atomic_load_ptr(&cache->cache); + span_ptr = (uintptr_t)global_span & _memory_span_mask; + if (span_ptr) { + span_t* span = (span_t*)span_ptr; + //By accessing the span ptr before it is swapped out of list we assume that a contending thread + //does not manage to traverse the span to being unmapped before we access it + void* new_cache = (void*)((uintptr_t)span->prev | ((uintptr_t)atomic_incr32(&cache->counter) & ~_memory_span_mask)); + if (atomic_cas_ptr(&cache->cache, new_cache, global_span)) { + atomic_add32(&cache->size, -(int32_t)span->list_size); + return span; + } + } + } while (span_ptr); + return 0; +} + +//! 
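
_memory_cache_insert below guards its lock-free stack against ABA by packing a counter into the pointer itself: spans are span-size aligned, so the low bits of the head pointer are free to hold a tag. A sketch of that tagging, assuming 64 KiB alignment and illustrative names:

#include <atomic>
#include <cstdint>

constexpr uintptr_t kSpanMask = ~(uintptr_t)(64 * 1024 - 1);   // assumed 64 KiB spans

std::atomic<uintptr_t> g_cacheHead{0};
std::atomic<uint32_t> g_abaCounter{0};

void PushTagged(uintptr_t span /* span-size aligned */, uintptr_t* nextField) {
    uintptr_t oldHead = g_cacheHead.load(std::memory_order_relaxed);
    uintptr_t newHead;
    do {
        *nextField = oldHead & kSpanMask;   // strip the tag before linking
        // A fresh counter in the low bits makes a recycled head compare unequal.
        newHead = span | ((uintptr_t)g_abaCounter.fetch_add(1, std::memory_order_relaxed) & ~kSpanMask);
    } while (!g_cacheHead.compare_exchange_weak(oldHead, newHead,
                                                std::memory_order_release,
                                                std::memory_order_relaxed));
}

+//!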
Finalize a global cache, only valid from allocator finalization (not thread safe) +static void +_memory_cache_finalize(global_cache_t* cache) { + void* current_cache = atomic_load_ptr(&cache->cache); + span_t* span = (span_t*)((uintptr_t)current_cache & _memory_span_mask); + while (span) { + span_t* skip_span = (span_t*)((uintptr_t)span->prev & _memory_span_mask); + atomic_add32(&cache->size, -(int32_t)span->list_size); + _memory_unmap_span_list(span); + span = skip_span; + } + assert(!atomic_load32(&cache->size)); + atomic_store_ptr(&cache->cache, 0); + atomic_store32(&cache->size, 0); +} + +//! Insert the given list of memory page spans in the global cache +static void +_memory_global_cache_insert(span_t* span) { + size_t span_count = span->span_count; +#if ENABLE_UNLIMITED_GLOBAL_CACHE + _memory_cache_insert(&_memory_span_cache[span_count - 1], span, 0); +#else + const size_t cache_limit = (GLOBAL_CACHE_MULTIPLIER * ((span_count == 1) ? _memory_span_release_count : _memory_span_release_count_large)); + _memory_cache_insert(&_memory_span_cache[span_count - 1], span, cache_limit); +#endif +} + +//! Extract a number of memory page spans from the global cache for large blocks +static span_t* +_memory_global_cache_extract(size_t span_count) { + span_t* span = _memory_cache_extract(&_memory_span_cache[span_count - 1]); + assert(!span || (span->span_count == span_count)); + return span; +} + +#endif + +#if ENABLE_THREAD_CACHE +//! Adopt the deferred span cache list +static void +_memory_heap_cache_adopt_deferred(heap_t* heap) { + atomic_thread_fence_acquire(); + span_t* span = (span_t*)atomic_load_ptr(&heap->span_cache_deferred); + if (!span) + return; + do { + span = (span_t*)atomic_load_ptr(&heap->span_cache_deferred); + } while (!atomic_cas_ptr(&heap->span_cache_deferred, 0, span)); + while (span) { + span_t* next_span = span->next; + _memory_span_list_push(&heap->span_cache[0], span); +#if ENABLE_STATISTICS + atomic_decr32(&heap->span_use[span->span_count - 1].current); + ++heap->size_class_use[span->size_class].spans_to_cache; + --heap->size_class_use[span->size_class].spans_current; +#endif + span = next_span; + } +} +#endif + +//! Insert a single span into thread heap cache, releasing to global cache if overflow +static void +_memory_heap_cache_insert(heap_t* heap, span_t* span) { +#if ENABLE_THREAD_CACHE + size_t span_count = span->span_count; + size_t idx = span_count - 1; + _memory_statistics_inc(heap->span_use[idx].spans_to_cache, 1); + if (!idx) + _memory_heap_cache_adopt_deferred(heap); +#if ENABLE_UNLIMITED_THREAD_CACHE + _memory_span_list_push(&heap->span_cache[idx], span); +#else + const size_t release_count = (!idx ? 
_memory_span_release_count : _memory_span_release_count_large); + size_t current_cache_size = _memory_span_list_push(&heap->span_cache[idx], span); + if (current_cache_size <= release_count) + return; + const size_t hard_limit = release_count * THREAD_CACHE_MULTIPLIER; + if (current_cache_size <= hard_limit) { +#if ENABLE_ADAPTIVE_THREAD_CACHE + //Require 25% of high water mark to remain in cache (and at least 1, if use is 0) + const size_t high_mark = heap->span_use[idx].high; + const size_t min_limit = (high_mark >> 2) + release_count + 1; + if (current_cache_size < min_limit) + return; +#else + return; +#endif + } + heap->span_cache[idx] = _memory_span_list_split(span, release_count); + assert(span->list_size == release_count); +#if ENABLE_STATISTICS + heap->thread_to_global += (size_t)span->list_size * span_count * _memory_span_size; + heap->span_use[idx].spans_to_global += span->list_size; +#endif +#if ENABLE_GLOBAL_CACHE + _memory_global_cache_insert(span); +#else + _memory_unmap_span_list(span); +#endif +#endif +#else + (void)sizeof(heap); + _memory_unmap_span(span); +#endif +} + +//! Extract the given number of spans from the different cache levels +static span_t* +_memory_heap_thread_cache_extract(heap_t* heap, size_t span_count) { +#if ENABLE_THREAD_CACHE + size_t idx = span_count - 1; + if (!idx) + _memory_heap_cache_adopt_deferred(heap); + if (heap->span_cache[idx]) { +#if ENABLE_STATISTICS + heap->span_use[idx].spans_from_cache++; +#endif + return _memory_span_list_pop(&heap->span_cache[idx]); + } +#endif + return 0; +} + +static span_t* +_memory_heap_reserved_extract(heap_t* heap, size_t span_count) { + if (heap->spans_reserved >= span_count) + return _memory_map_spans(heap, span_count); + return 0; +} + +//! Extract a span from the global cache +static span_t* +_memory_heap_global_cache_extract(heap_t* heap, size_t span_count) { +#if ENABLE_GLOBAL_CACHE + size_t idx = span_count - 1; + heap->span_cache[idx] = _memory_global_cache_extract(span_count); + if (heap->span_cache[idx]) { +#if ENABLE_STATISTICS + heap->global_to_thread += (size_t)heap->span_cache[idx]->list_size * span_count * _memory_span_size; + heap->span_use[idx].spans_from_global += heap->span_cache[idx]->list_size; +#endif + return _memory_span_list_pop(&heap->span_cache[idx]); + } +#endif + return 0; +} + +//! 
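
The thresholds in _memory_heap_cache_insert above work in two stages: nothing is released while the thread cache holds at most release_count spans, and a batch is only split off to the global cache once the cache exceeds release_count * THREAD_CACHE_MULTIPLIER. Worked numbers under an assumed release count of 64 (the actual value is derived from the span map count at initialization):

#include <cstdio>
#include <cstddef>

int main() {
    const size_t releaseCount = 64;              // assumed _memory_span_release_count
    const size_t hardLimit = releaseCount * 16;  // THREAD_CACHE_MULTIPLIER above
    size_t cacheSize = 1025;                     // spans now sitting in the thread cache
    if (cacheSize > releaseCount && cacheSize > hardLimit)
        printf("split %zu spans off to the global cache\n", releaseCount);   // prints 64
    return 0;
}

+//!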
Get a span from one of the cache levels (thread cache, reserved, global cache) or fallback to mapping more memory +static span_t* +_memory_heap_extract_new_span(heap_t* heap, size_t span_count, uint32_t class_idx) { + (void)sizeof(class_idx); +#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS + uint32_t idx = (uint32_t)span_count - 1; + uint32_t current_count = (uint32_t)atomic_incr32(&heap->span_use[idx].current); + if (current_count > heap->span_use[idx].high) + heap->span_use[idx].high = current_count; +#if ENABLE_STATISTICS + uint32_t spans_current = ++heap->size_class_use[class_idx].spans_current; + if (spans_current > heap->size_class_use[class_idx].spans_peak) + heap->size_class_use[class_idx].spans_peak = spans_current; +#endif +#endif + span_t* span = _memory_heap_thread_cache_extract(heap, span_count); + if (EXPECTED(span != 0)) { + _memory_statistics_inc(heap->size_class_use[class_idx].spans_from_cache, 1); + return span; + } + span = _memory_heap_reserved_extract(heap, span_count); + if (EXPECTED(span != 0)) { + _memory_statistics_inc(heap->size_class_use[class_idx].spans_from_reserved, 1); + return span; + } + span = _memory_heap_global_cache_extract(heap, span_count); + if (EXPECTED(span != 0)) { + _memory_statistics_inc(heap->size_class_use[class_idx].spans_from_cache, 1); + return span; + } + //Final fallback, map in more virtual memory + span = _memory_map_spans(heap, span_count); + _memory_statistics_inc(heap->size_class_use[class_idx].spans_map_calls, 1); + return span; +} + +//! Move the span (used for small or medium allocations) to the heap thread cache +static void +_memory_span_release_to_cache(heap_t* heap, span_t* span) { + heap_class_t* heap_class = heap->span_class + span->size_class; + assert(heap_class->partial_span != span); + if (span->state == SPAN_STATE_PARTIAL) + _memory_span_partial_list_remove(&heap_class->partial_span, span); +#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS + atomic_decr32(&heap->span_use[0].current); +#endif + _memory_statistics_inc(heap->span_use[0].spans_to_cache, 1); + _memory_statistics_inc(heap->size_class_use[span->size_class].spans_to_cache, 1); + _memory_statistics_dec(heap->size_class_use[span->size_class].spans_current, 1); + _memory_heap_cache_insert(heap, span); +} + +//! Initialize a (partial) free list up to next system memory page, while reserving the first block +//! as allocated, returning number of blocks in list +static uint32_t +free_list_partial_init(void** list, void** first_block, void* page_start, void* block_start, + uint32_t block_count, uint32_t block_size) { + assert(block_count); + *first_block = block_start; + if (block_count > 1) { + void* free_block = pointer_offset(block_start, block_size); + void* block_end = pointer_offset(block_start, block_size * block_count); + //If block size is less than half a memory page, bound init to next memory page boundary + if (block_size < (_memory_page_size >> 1)) { + void* page_end = pointer_offset(page_start, _memory_page_size); + if (page_end < block_end) + block_end = page_end; + } + *list = free_block; + block_count = 2; + void* next_block = pointer_offset(free_block, block_size); + while (next_block < block_end) { + *((void**)free_block) = next_block; + free_block = next_block; + ++block_count; + next_block = pointer_offset(next_block, block_size); + } + *((void**)free_block) = 0; + } else { + *list = 0; + } + return block_count; +} + +//! 
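
free_list_partial_init above builds an intrusive free list, where each free block's first word points at the next block, and it deliberately stops at the next page boundary so cold pages are never touched just to thread pointers through them. A minimal sketch of the linking step alone, without the page clamp:

#include <cstdint>

// Threads 'count' blocks of 'blockSize' bytes starting at 'start' into a singly
// linked free list and returns the head; the final block terminates the list.
void* BuildFreeList(void* start, uint32_t blockSize, uint32_t count) {
    if (count == 0)
        return nullptr;
    char* block = (char*)start;
    for (uint32_t i = 0; i + 1 < count; ++i, block += blockSize)
        *(void**)block = block + blockSize;   // first word links to the next block
    *(void**)block = nullptr;
    return start;
}

+//!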
Initialize an unused span (from cache or mapped) to be new active span +static void* +_memory_span_set_new_active(heap_t* heap, heap_class_t* heap_class, span_t* span, uint32_t class_idx) { + assert(span->span_count == 1); + size_class_t* size_class = _memory_size_class + class_idx; + span->size_class = class_idx; + span->heap = heap; + span->flags &= ~SPAN_FLAG_ALIGNED_BLOCKS; + span->block_count = size_class->block_count; + span->block_size = size_class->block_size; + span->state = SPAN_STATE_ACTIVE; + span->free_list = 0; + + //Setup free list. Only initialize one system page worth of free blocks in list + void* block; + span->free_list_limit = free_list_partial_init(&heap_class->free_list, &block, + span, pointer_offset(span, SPAN_HEADER_SIZE), size_class->block_count, size_class->block_size); + atomic_store_ptr(&span->free_list_deferred, 0); + span->list_size = 0; + atomic_thread_fence_release(); + + _memory_span_partial_list_add(&heap_class->partial_span, span); + return block; +} + +//! Promote a partially used span (from heap used list) to be new active span +static void +_memory_span_set_partial_active(heap_class_t* heap_class, span_t* span) { + assert(span->state == SPAN_STATE_PARTIAL); + assert(span->block_count == _memory_size_class[span->size_class].block_count); + //Move data to heap size class and set span as active + heap_class->free_list = span->free_list; + span->state = SPAN_STATE_ACTIVE; + span->free_list = 0; + assert(heap_class->free_list); +} + +//! Mark span as full (from active) +static void +_memory_span_set_active_full(heap_class_t* heap_class, span_t* span) { + assert(span->state == SPAN_STATE_ACTIVE); + assert(span == heap_class->partial_span); + _memory_span_partial_list_pop_head(&heap_class->partial_span); + span->used_count = span->block_count; + span->state = SPAN_STATE_FULL; + span->free_list = 0; +} + +//! Move span from full to partial state +static void +_memory_span_set_full_partial(heap_t* heap, span_t* span) { + assert(span->state == SPAN_STATE_FULL); + heap_class_t* heap_class = &heap->span_class[span->size_class]; + span->state = SPAN_STATE_PARTIAL; + _memory_span_partial_list_add_tail(&heap_class->partial_span, span); +} + +static void* +_memory_span_extract_deferred(span_t* span) { + void* free_list; + do { + free_list = atomic_load_ptr(&span->free_list_deferred); + } while ((free_list == INVALID_POINTER) || !atomic_cas_ptr(&span->free_list_deferred, INVALID_POINTER, free_list)); + span->list_size = 0; + atomic_store_ptr(&span->free_list_deferred, 0); + atomic_thread_fence_release(); + return free_list; +} + +//! Pop first block from a free list +static void* +free_list_pop(void** list) { + void* block = *list; + *list = *((void**)block); + return block; +} + +//! 
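
free_list_pop above is the entire small-allocation fast path: the head block is the result and its first word becomes the new head. A tiny self-checking example of the same pop, built on a hand-made three-slot list:

#include <cassert>

static void* pop(void** list) {
    void* block = *list;
    *list = *(void**)block;   // a free block stores its successor in its first word
    return block;
}

int main() {
    void* slots[3] = { &slots[1], &slots[2], nullptr };   // slots[i] links to slots[i+1]
    void* head = &slots[0];
    assert(pop(&head) == &slots[0]);
    assert(head == &slots[1]);
    return 0;
}

+//!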
Allocate a small/medium sized memory block from the given heap +static void* +_memory_allocate_from_heap_fallback(heap_t* heap, uint32_t class_idx) { + heap_class_t* heap_class = &heap->span_class[class_idx]; + void* block; + + span_t* active_span = heap_class->partial_span; + if (EXPECTED(active_span != 0)) { + assert(active_span->state == SPAN_STATE_ACTIVE); + assert(active_span->block_count == _memory_size_class[active_span->size_class].block_count); + //Swap in free list if not empty + if (active_span->free_list) { + heap_class->free_list = active_span->free_list; + active_span->free_list = 0; + return free_list_pop(&heap_class->free_list); + } + //If the span did not fully initialize free list, link up another page worth of blocks + if (active_span->free_list_limit < active_span->block_count) { + void* block_start = pointer_offset(active_span, SPAN_HEADER_SIZE + (active_span->free_list_limit * active_span->block_size)); + active_span->free_list_limit += free_list_partial_init(&heap_class->free_list, &block, + (void*)((uintptr_t)block_start & ~(_memory_page_size - 1)), block_start, + active_span->block_count - active_span->free_list_limit, active_span->block_size); + return block; + } + //Swap in deferred free list + atomic_thread_fence_acquire(); + if (atomic_load_ptr(&active_span->free_list_deferred)) { + heap_class->free_list = _memory_span_extract_deferred(active_span); + return free_list_pop(&heap_class->free_list); + } + + //If the active span is fully allocated, mark span as free floating (fully allocated and not part of any list) + assert(!heap_class->free_list); + assert(active_span->free_list_limit >= active_span->block_count); + _memory_span_set_active_full(heap_class, active_span); + } + assert(!heap_class->free_list); + + //Try promoting a semi-used span to active + active_span = heap_class->partial_span; + if (EXPECTED(active_span != 0)) { + _memory_span_set_partial_active(heap_class, active_span); + return free_list_pop(&heap_class->free_list); + } + assert(!heap_class->free_list); + assert(!heap_class->partial_span); + + //Find a span in one of the cache levels + active_span = _memory_heap_extract_new_span(heap, 1, class_idx); + + //Mark span as owned by this heap and set base data, return first block + return _memory_span_set_new_active(heap, heap_class, active_span, class_idx); +} + +//! Allocate a small sized memory block from the given heap +static void* +_memory_allocate_small(heap_t* heap, size_t size) { + //Small sizes have unique size classes + const uint32_t class_idx = (uint32_t)((size + (SMALL_GRANULARITY - 1)) >> SMALL_GRANULARITY_SHIFT); + _memory_statistics_inc_alloc(heap, class_idx); + if (EXPECTED(heap->span_class[class_idx].free_list != 0)) + return free_list_pop(&heap->span_class[class_idx].free_list); + return _memory_allocate_from_heap_fallback(heap, class_idx); +} + +//! Allocate a medium sized memory block from the given heap +static void* +_memory_allocate_medium(heap_t* heap, size_t size) { + //Calculate the size class index and do a dependent lookup of the final class index (in case of merged classes) + const uint32_t base_idx = (uint32_t)(SMALL_CLASS_COUNT + ((size - (SMALL_SIZE_LIMIT + 1)) >> MEDIUM_GRANULARITY_SHIFT)); + const uint32_t class_idx = _memory_size_class[base_idx].class_idx; + _memory_statistics_inc_alloc(heap, class_idx); + if (EXPECTED(heap->span_class[class_idx].free_list != 0)) + return free_list_pop(&heap->span_class[class_idx].free_list); + return _memory_allocate_from_heap_fallback(heap, class_idx); +} + +//! 
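
Because small classes advance in fixed 16-byte steps, _memory_allocate_small above turns a size into a class index with one add and one shift. A worked example of that mapping (SMALL_GRANULARITY 16, shift 4, as defined earlier in this file):

#include <cstdio>
#include <cstddef>

int main() {
    const size_t granularity = 16, shift = 4;
    for (size_t size : {1, 16, 24, 100}) {
        size_t classIdx = (size + granularity - 1) >> shift;
        printf("size %3zu -> class %2zu (block %3zu bytes)\n",
               size, classIdx, classIdx * granularity);   // 24 bytes lands in the 32-byte class
    }
    return 0;
}

+//!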
Allocate a large sized memory block from the given heap +static void* +_memory_allocate_large(heap_t* heap, size_t size) { + //Calculate number of needed max sized spans (including header) + //Since this function is never called if size > LARGE_SIZE_LIMIT + //the span_count is guaranteed to be <= LARGE_CLASS_COUNT + size += SPAN_HEADER_SIZE; + size_t span_count = size >> _memory_span_size_shift; + if (size & (_memory_span_size - 1)) + ++span_count; + size_t idx = span_count - 1; + + //Find a span in one of the cache levels + span_t* span = _memory_heap_extract_new_span(heap, span_count, SIZE_CLASS_COUNT); + + //Mark span as owned by this heap and set base data + assert(span->span_count == span_count); + span->size_class = (uint32_t)(SIZE_CLASS_COUNT + idx); + span->heap = heap; + atomic_thread_fence_release(); + + return pointer_offset(span, SPAN_HEADER_SIZE); +} + +//! Allocate a huge block by mapping memory pages directly +static void* +_memory_allocate_huge(size_t size) { + size += SPAN_HEADER_SIZE; + size_t num_pages = size >> _memory_page_size_shift; + if (size & (_memory_page_size - 1)) + ++num_pages; + size_t align_offset = 0; + span_t* span = (span_t*)_memory_map(num_pages * _memory_page_size, &align_offset); + if (!span) + return span; + //Store page count in span_count + span->size_class = (uint32_t)-1; + span->span_count = (uint32_t)num_pages; + span->align_offset = (uint32_t)align_offset; + _memory_statistics_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak); + + return pointer_offset(span, SPAN_HEADER_SIZE); +} + +//! Allocate a block larger than medium size +static void* +_memory_allocate_oversized(heap_t* heap, size_t size) { + if (size <= LARGE_SIZE_LIMIT) + return _memory_allocate_large(heap, size); + return _memory_allocate_huge(size); +} + +//! Allocate a block of the given size +static void* +_memory_allocate(heap_t* heap, size_t size) { + if (EXPECTED(size <= SMALL_SIZE_LIMIT)) + return _memory_allocate_small(heap, size); + else if (size <= _memory_medium_size_limit) + return _memory_allocate_medium(heap, size); + return _memory_allocate_oversized(heap, size); +} + +//! Allocate a new heap +static heap_t* +_memory_allocate_heap(void) { + void* raw_heap; + void* next_raw_heap; + uintptr_t orphan_counter; + heap_t* heap; + heap_t* next_heap; + //Try getting an orphaned heap + atomic_thread_fence_acquire(); + do { + raw_heap = atomic_load_ptr(&_memory_orphan_heaps); + heap = (heap_t*)((uintptr_t)raw_heap & ~(uintptr_t)0x1FF); + if (!heap) + break; + next_heap = heap->next_orphan; + orphan_counter = (uintptr_t)atomic_incr32(&_memory_orphan_counter); + next_raw_heap = (void*)((uintptr_t)next_heap | (orphan_counter & (uintptr_t)0x1FF)); + } while (!atomic_cas_ptr(&_memory_orphan_heaps, next_raw_heap, raw_heap)); + + if (!heap) { + //Map in pages for a new heap + size_t align_offset = 0; + heap = (heap_t*)_memory_map((1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size, &align_offset); + if (!heap) + return heap; + memset((char*)heap, 0, sizeof(heap_t)); + heap->align_offset = align_offset; + + //Get a new heap ID + do { + heap->id = atomic_incr32(&_memory_heap_id); + if (_memory_heap_lookup(heap->id)) + heap->id = 0; + } while (!heap->id); + + //Link in heap in heap ID map + size_t list_idx = heap->id % HEAP_ARRAY_SIZE; + do { + next_heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); + heap->next_heap = next_heap; + } while (!atomic_cas_ptr(&_memory_heaps[list_idx], heap, next_heap)); + } + + return heap; +} + +//! 
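
The large path above sizes a request in whole spans: add the 96-byte span header, divide by the span size, and round up. Worked numbers with the assumed 64 KiB span size:

#include <cstdio>
#include <cstddef>

int main() {
    const size_t spanSize = 64 * 1024, headerSize = 96, shift = 16;
    size_t request = 128 * 1024 - 96;      // fits exactly two spans once the header is added
    size_t total = request + headerSize;
    size_t spanCount = total >> shift;
    if (total & (spanSize - 1))
        ++spanCount;                       // a partial tail still costs a full span
    printf("%zu bytes -> %zu spans\n", request, spanCount);   // prints 2
    return 0;
}

+//!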
Deallocate the given small/medium memory block in the current thread local heap +static void +_memory_deallocate_direct(span_t* span, void* block) { + assert(span->heap == get_thread_heap_raw()); + uint32_t state = span->state; + //Add block to free list + *((void**)block) = span->free_list; + span->free_list = block; + if (UNEXPECTED(state == SPAN_STATE_ACTIVE)) + return; + uint32_t used = --span->used_count; + uint32_t free = span->list_size; + if (UNEXPECTED(used == free)) + _memory_span_release_to_cache(span->heap, span); + else if (UNEXPECTED(state == SPAN_STATE_FULL)) + _memory_span_set_full_partial(span->heap, span); +} + +//! Put the block in the deferred free list of the owning span +static void +_memory_deallocate_defer(span_t* span, void* block) { + atomic_thread_fence_acquire(); + if (span->state == SPAN_STATE_FULL) { + if ((span->list_size + 1) == span->block_count) { + //Span will be completely freed by deferred deallocations, no other thread can + //currently touch it. Safe to move to owner heap deferred cache + span_t* last_head; + heap_t* heap = span->heap; + do { + last_head = (span_t*)atomic_load_ptr(&heap->span_cache_deferred); + span->next = last_head; + } while (!atomic_cas_ptr(&heap->span_cache_deferred, span, last_head)); + return; + } + } + + void* free_list; + do { + atomic_thread_fence_acquire(); + free_list = atomic_load_ptr(&span->free_list_deferred); + *((void**)block) = free_list; + } while ((free_list == INVALID_POINTER) || !atomic_cas_ptr(&span->free_list_deferred, INVALID_POINTER, free_list)); + ++span->list_size; + atomic_store_ptr(&span->free_list_deferred, block); +} + +static void +_memory_deallocate_small_or_medium(span_t* span, void* p) { + _memory_statistics_inc_free(span->heap, span->size_class); + if (span->flags & SPAN_FLAG_ALIGNED_BLOCKS) { + //Realign pointer to block start + void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); + uint32_t block_offset = (uint32_t)pointer_diff(p, blocks_start); + p = pointer_offset(p, -(int32_t)(block_offset % span->block_size)); + } + //Check if block belongs to this heap or if deallocation should be deferred + if (span->heap == get_thread_heap_raw()) + _memory_deallocate_direct(span, p); + else + _memory_deallocate_defer(span, p); +} + +//! 
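
Cross-thread frees above never touch the owner's structures directly; they park blocks on a deferred list whose head doubles as a spinlock, with INVALID_POINTER as the "locked" sentinel. A heavily simplified single-list sketch of that handshake (the real code also maintains list_size and per-span state):

#include <atomic>
#include <cstdint>

static void* const kLocked = (void*)(uintptr_t)-1;    // INVALID_POINTER stand-in
std::atomic<void*> g_deferredHead{nullptr};

void DeferFree(void* block) {
    void* head;
    do {
        head = g_deferredHead.load(std::memory_order_acquire);
        *(void**)block = head;   // tentatively link onto the current list
        // Retry while the list is locked or another thread moved the head.
    } while (head == kLocked ||
             !g_deferredHead.compare_exchange_weak(head, kLocked,
                                                   std::memory_order_acquire,
                                                   std::memory_order_relaxed));
    // We hold the sentinel; publishing the block releases the list again.
    g_deferredHead.store(block, std::memory_order_release);
}

+//!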
Deallocate the given large memory block to the current heap +static void +_memory_deallocate_large(span_t* span) { + //Decrease counter + assert(span->span_count == ((size_t)span->size_class - SIZE_CLASS_COUNT + 1)); + assert(span->size_class >= SIZE_CLASS_COUNT); + assert(span->size_class - SIZE_CLASS_COUNT < LARGE_CLASS_COUNT); + assert(!(span->flags & SPAN_FLAG_MASTER) || !(span->flags & SPAN_FLAG_SUBSPAN)); + assert((span->flags & SPAN_FLAG_MASTER) || (span->flags & SPAN_FLAG_SUBSPAN)); + //Large blocks can always be deallocated and transferred between heaps + //Investigate if it is better to defer large spans as well through span_cache_deferred, + //possibly with some heuristics to pick either scheme at runtime per deallocation + heap_t* heap = get_thread_heap(); + if (!heap) return; +#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS + size_t idx = span->span_count - 1; + atomic_decr32(&span->heap->span_use[idx].current); +#endif + if ((span->span_count > 1) && !heap->spans_reserved) { + heap->span_reserve = span; + heap->spans_reserved = span->span_count; + if (span->flags & SPAN_FLAG_MASTER) { + heap->span_reserve_master = span; + } else { //SPAN_FLAG_SUBSPAN + uint32_t distance = span->total_spans_or_distance; + span_t* master = (span_t*)pointer_offset(span, -(int32_t)(distance * _memory_span_size)); + heap->span_reserve_master = master; + assert(master->flags & SPAN_FLAG_MASTER); + assert(atomic_load32(&master->remaining_spans) >= (int32_t)span->span_count); + } + _memory_statistics_inc(heap->span_use[idx].spans_to_reserved, 1); + } else { + //Insert into cache list + _memory_heap_cache_insert(heap, span); + } +} + +//! Deallocate the given huge span +static void +_memory_deallocate_huge(span_t* span) { + //Oversized allocation, page count is stored in span_count + size_t num_pages = span->span_count; + _memory_unmap(span, num_pages * _memory_page_size, span->align_offset, num_pages * _memory_page_size); + _memory_statistics_sub(&_huge_pages_current, num_pages); +} + +//! Deallocate the given block +static void +_memory_deallocate(void* p) { + //Grab the span (always at start of span, using span alignment) + span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask); + if (UNEXPECTED(!span)) + return; + if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) + _memory_deallocate_small_or_medium(span, p); + else if (span->size_class != (uint32_t)-1) + _memory_deallocate_large(span); + else + _memory_deallocate_huge(span); +} + +//! 
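
_memory_deallocate above finds the owning span with a single mask: spans sit on span-size boundaries, so clearing the low bits of any interior pointer yields the span header. A sketch, assuming 64 KiB spans and illustrative names:

#include <cstdint>

constexpr uintptr_t kSpanMask = ~(uintptr_t)(64 * 1024 - 1);   // assumed span size

struct Span { uint32_t sizeClass; };   // header lives in the span's first bytes

Span* SpanOf(void* p) {
    // Works for any pointer inside the span, not just the first block.
    return (Span*)((uintptr_t)p & kSpanMask);
}

This is also why the size_class field alone can dispatch small/medium versus large versus huge frees without a size ever being passed to free.

+//!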
Reallocate the given block to the given size +static void* +_memory_reallocate(void* p, size_t size, size_t oldsize, unsigned int flags) { + if (p) { + //Grab the span using guaranteed span alignment + span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask); + if (span->heap) { + if (span->size_class < SIZE_CLASS_COUNT) { + //Small/medium sized block + assert(span->span_count == 1); + void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); + uint32_t block_offset = (uint32_t)pointer_diff(p, blocks_start); + uint32_t block_idx = block_offset / span->block_size; + void* block = pointer_offset(blocks_start, block_idx * span->block_size); + if (!oldsize) + oldsize = span->block_size - (uint32_t)pointer_diff(p, block); + if ((size_t)span->block_size >= size) { + //Still fits in block, never mind trying to save memory, but preserve data if alignment changed + if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE)) + memmove(block, p, oldsize); + return block; + } + } else { + //Large block + size_t total_size = size + SPAN_HEADER_SIZE; + size_t num_spans = total_size >> _memory_span_size_shift; + if (total_size & (_memory_span_mask - 1)) + ++num_spans; + size_t current_spans = span->span_count; + assert(current_spans == ((span->size_class - SIZE_CLASS_COUNT) + 1)); + void* block = pointer_offset(span, SPAN_HEADER_SIZE); + if (!oldsize) + oldsize = (current_spans * _memory_span_size) - (size_t)pointer_diff(p, block) - SPAN_HEADER_SIZE; + if ((current_spans >= num_spans) && (num_spans >= (current_spans / 2))) { + //Still fits in block, never mind trying to save memory, but preserve data if alignment changed + if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE)) + memmove(block, p, oldsize); + return block; + } + } + } else { + //Oversized block + size_t total_size = size + SPAN_HEADER_SIZE; + size_t num_pages = total_size >> _memory_page_size_shift; + if (total_size & (_memory_page_size - 1)) + ++num_pages; + //Page count is stored in span_count + size_t current_pages = span->span_count; + void* block = pointer_offset(span, SPAN_HEADER_SIZE); + if (!oldsize) + oldsize = (current_pages * _memory_page_size) - (size_t)pointer_diff(p, block) - SPAN_HEADER_SIZE; + if ((current_pages >= num_pages) && (num_pages >= (current_pages / 2))) { + //Still fits in block, never mind trying to save memory, but preserve data if alignment changed + if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE)) + memmove(block, p, oldsize); + return block; + } + } + } else { + oldsize = 0; + } + + //Size is greater than block size, need to allocate a new block and deallocate the old + heap_t* heap = get_thread_heap(); + //Avoid hysteresis by overallocating if increase is small (below 37%) + size_t lower_bound = oldsize + (oldsize >> 2) + (oldsize >> 3); + size_t new_size = (size > lower_bound) ? size : ((size > oldsize) ? lower_bound : size); + void* block = _memory_allocate(heap, new_size); + if (p && block) { + if (!(flags & RPMALLOC_NO_PRESERVE)) + memcpy(block, p, oldsize < new_size ? oldsize : new_size); + _memory_deallocate(p); + } + + return block; +} + +//! 
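The growth policy at the tail of `_memory_reallocate` deserves a closer look: `(oldsize >> 2) + (oldsize >> 3)` is 37.5% of `oldsize`, so any request growing by less than that is rounded up to 1.375x the old size, which keeps a buffer that grows a few bytes at a time from copying on every call. The decision distilled into runnable form (the helper name is illustrative):

```cpp
#include <cstddef>
#include <cstdio>

// Mirrors the new_size computation at the end of _memory_reallocate.
static size_t ReallocTarget(size_t oldsize, size_t size)
{
    size_t lowerBound = oldsize + (oldsize >> 2) + (oldsize >> 3); // 1.375 * oldsize
    return (size > lowerBound) ? size : ((size > oldsize) ? lowerBound : size);
}

int main()
{
    printf("%zu\n", ReallocTarget(1000, 1100)); // small growth -> rounded up to 1375
    printf("%zu\n", ReallocTarget(1000, 4000)); // large growth -> exactly 4000
    printf("%zu\n", ReallocTarget(1000, 500));  // shrink       -> exactly 500
    return 0;
}
```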
Get the usable size of the given block +static size_t +_memory_usable_size(void* p) { + //Grab the span using guaranteed span alignment + span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask); + if (span->heap) { + //Small/medium block + if (span->size_class < SIZE_CLASS_COUNT) { + void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); + return span->block_size - ((size_t)pointer_diff(p, blocks_start) % span->block_size); + } + + //Large block + size_t current_spans = (span->size_class - SIZE_CLASS_COUNT) + 1; + return (current_spans * _memory_span_size) - (size_t)pointer_diff(p, span); + } + + //Oversized block, page count is stored in span_count + size_t current_pages = span->span_count; + return (current_pages * _memory_page_size) - (size_t)pointer_diff(p, span); +} + +//! Adjust and optimize the size class properties for the given class +static void +_memory_adjust_size_class(size_t iclass) { + size_t block_size = _memory_size_class[iclass].block_size; + size_t block_count = (_memory_span_size - SPAN_HEADER_SIZE) / block_size; + + _memory_size_class[iclass].block_count = (uint16_t)block_count; + _memory_size_class[iclass].class_idx = (uint16_t)iclass; + + //Check if previous size classes can be merged + size_t prevclass = iclass; + while (prevclass > 0) { + --prevclass; + //A class can be merged if number of pages and number of blocks are equal + if (_memory_size_class[prevclass].block_count == _memory_size_class[iclass].block_count) + memcpy(_memory_size_class + prevclass, _memory_size_class + iclass, sizeof(_memory_size_class[iclass])); + else + break; + } +} + +static void +_memory_heap_finalize(void* heapptr) { + heap_t* heap = (heap_t*)heapptr; + if (!heap) + return; + //Release thread cache spans back to global cache +#if ENABLE_THREAD_CACHE + _memory_heap_cache_adopt_deferred(heap); + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + span_t* span = heap->span_cache[iclass]; +#if ENABLE_GLOBAL_CACHE + while (span) { + assert(span->span_count == (iclass + 1)); + size_t release_count = (!iclass ? 
_memory_span_release_count : _memory_span_release_count_large); + span_t* next = _memory_span_list_split(span, (uint32_t)release_count); +#if ENABLE_STATISTICS + heap->thread_to_global += (size_t)span->list_size * span->span_count * _memory_span_size; + heap->span_use[iclass].spans_to_global += span->list_size; +#endif + _memory_global_cache_insert(span); + span = next; + } +#else + if (span) + _memory_unmap_span_list(span); +#endif + heap->span_cache[iclass] = 0; + } +#endif + + //Orphan the heap + void* raw_heap; + uintptr_t orphan_counter; + heap_t* last_heap; + do { + last_heap = (heap_t*)atomic_load_ptr(&_memory_orphan_heaps); + heap->next_orphan = (heap_t*)((uintptr_t)last_heap & ~(uintptr_t)0x1FF); + orphan_counter = (uintptr_t)atomic_incr32(&_memory_orphan_counter); + raw_heap = (void*)((uintptr_t)heap | (orphan_counter & (uintptr_t)0x1FF)); + } while (!atomic_cas_ptr(&_memory_orphan_heaps, raw_heap, last_heap)); + + set_thread_heap(0); + +#if ENABLE_STATISTICS + atomic_decr32(&_memory_active_heaps); + assert(atomic_load32(&_memory_active_heaps) >= 0); +#endif +} + +#if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) +#include <fibersapi.h> +static DWORD fls_key; +static void NTAPI +rp_thread_destructor(void* value) { + if (value) + rpmalloc_thread_finalize(); +} +#endif + +#if PLATFORM_POSIX +# include <sys/mman.h> +# include <sched.h> +# ifdef __FreeBSD__ +# include <sys/sysctl.h> +# define MAP_HUGETLB MAP_ALIGNED_SUPER +# endif +# ifndef MAP_UNINITIALIZED +# define MAP_UNINITIALIZED 0 +# endif +#endif +#include <errno.h> + +//! Initialize the allocator and setup global data +TRACY_API int +rpmalloc_initialize(void) { + if (_rpmalloc_initialized) { + rpmalloc_thread_initialize(); + return 0; + } + memset(&_memory_config, 0, sizeof(rpmalloc_config_t)); + return rpmalloc_initialize_config(0); +} + +int +rpmalloc_initialize_config(const rpmalloc_config_t* config) { + if (_rpmalloc_initialized) { + rpmalloc_thread_initialize(); + return 0; + } + _rpmalloc_initialized = 1; + + if (config) + memcpy(&_memory_config, config, sizeof(rpmalloc_config_t)); + + if (!_memory_config.memory_map || !_memory_config.memory_unmap) { + _memory_config.memory_map = _memory_map_os; + _memory_config.memory_unmap = _memory_unmap_os; + } + +#if RPMALLOC_CONFIGURABLE + _memory_page_size = _memory_config.page_size; +#else + _memory_page_size = 0; +#endif + _memory_huge_pages = 0; + _memory_map_granularity = _memory_page_size; + if (!_memory_page_size) { +#if PLATFORM_WINDOWS + SYSTEM_INFO system_info; + memset(&system_info, 0, sizeof(system_info)); + GetSystemInfo(&system_info); + _memory_page_size = system_info.dwPageSize; + _memory_map_granularity = system_info.dwAllocationGranularity; + if (config && config->enable_huge_pages) { + HANDLE token = 0; + size_t large_page_minimum = GetLargePageMinimum(); + if (large_page_minimum) + OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &token); + if (token) { + LUID luid; + if (LookupPrivilegeValue(0, SE_LOCK_MEMORY_NAME, &luid)) { + TOKEN_PRIVILEGES token_privileges; + memset(&token_privileges, 0, sizeof(token_privileges)); + token_privileges.PrivilegeCount = 1; + token_privileges.Privileges[0].Luid = luid; + token_privileges.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + if (AdjustTokenPrivileges(token, FALSE, &token_privileges, 0, 0, 0)) { + DWORD err = GetLastError(); + if (err == ERROR_SUCCESS) { + _memory_huge_pages = 1; + _memory_page_size = large_page_minimum; + _memory_map_granularity = large_page_minimum; + } + } + } +
CloseHandle(token); + } + } +#else + _memory_page_size = (size_t)sysconf(_SC_PAGESIZE); + _memory_map_granularity = _memory_page_size; + if (config && config->enable_huge_pages) { +#if defined(__linux__) + size_t huge_page_size = 0; + FILE* meminfo = fopen("/proc/meminfo", "r"); + if (meminfo) { + char line[128]; + while (!huge_page_size && fgets(line, sizeof(line) - 1, meminfo)) { + line[sizeof(line) - 1] = 0; + if (strstr(line, "Hugepagesize:")) + huge_page_size = (size_t)strtol(line + 13, 0, 10) * 1024; + } + fclose(meminfo); + } + if (huge_page_size) { + _memory_huge_pages = 1; + _memory_page_size = huge_page_size; + _memory_map_granularity = huge_page_size; + } +#elif defined(__FreeBSD__) + int rc; + size_t sz = sizeof(rc); + + if (sysctlbyname("vm.pmap.pg_ps_enabled", &rc, &sz, NULL, 0) == 0 && rc == 1) { + _memory_huge_pages = 1; + _memory_page_size = 2 * 1024 * 1024; + _memory_map_granularity = _memory_page_size; + } +#elif defined(__APPLE__) + _memory_huge_pages = 1; + _memory_page_size = 2 * 1024 * 1024; + _memory_map_granularity = _memory_page_size; +#endif + } +#endif + } else { + if (config && config->enable_huge_pages) + _memory_huge_pages = 1; + } + + //The ABA counter in heap orphan list is tied to using 512 (bitmask 0x1FF) + if (_memory_page_size < 512) + _memory_page_size = 512; + if (_memory_page_size > (64 * 1024 * 1024)) + _memory_page_size = (64 * 1024 * 1024); + _memory_page_size_shift = 0; + size_t page_size_bit = _memory_page_size; + while (page_size_bit != 1) { + ++_memory_page_size_shift; + page_size_bit >>= 1; + } + _memory_page_size = ((size_t)1 << _memory_page_size_shift); + +#if RPMALLOC_CONFIGURABLE + size_t span_size = _memory_config.span_size; + if (!span_size) + span_size = (64 * 1024); + if (span_size > (256 * 1024)) + span_size = (256 * 1024); + _memory_span_size = 4096; + _memory_span_size_shift = 12; + while (_memory_span_size < span_size) { + _memory_span_size <<= 1; + ++_memory_span_size_shift; + } + _memory_span_mask = ~(uintptr_t)(_memory_span_size - 1); +#endif + + _memory_span_map_count = ( _memory_config.span_map_count ? _memory_config.span_map_count : DEFAULT_SPAN_MAP_COUNT); + if ((_memory_span_size * _memory_span_map_count) < _memory_page_size) + _memory_span_map_count = (_memory_page_size / _memory_span_size); + if ((_memory_page_size >= _memory_span_size) && ((_memory_span_map_count * _memory_span_size) % _memory_page_size)) + _memory_span_map_count = (_memory_page_size / _memory_span_size); + + _memory_config.page_size = _memory_page_size; + _memory_config.span_size = _memory_span_size; + _memory_config.span_map_count = _memory_span_map_count; + _memory_config.enable_huge_pages = _memory_huge_pages; + + _memory_span_release_count = (_memory_span_map_count > 4 ? ((_memory_span_map_count < 64) ? _memory_span_map_count : 64) : 4); + _memory_span_release_count_large = (_memory_span_release_count > 8 ? 
(_memory_span_release_count / 4) : 2); + +#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD + if (pthread_key_create(&_memory_thread_heap, _memory_heap_finalize)) + return -1; +#endif +#if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) + fls_key = FlsAlloc(&rp_thread_destructor); +#endif + + atomic_store32(&_memory_heap_id, 0); + atomic_store32(&_memory_orphan_counter, 0); +#if ENABLE_STATISTICS + atomic_store32(&_memory_active_heaps, 0); + atomic_store32(&_reserved_spans, 0); + atomic_store32(&_mapped_pages, 0); + _mapped_pages_peak = 0; + atomic_store32(&_mapped_total, 0); + atomic_store32(&_unmapped_total, 0); + atomic_store32(&_mapped_pages_os, 0); + atomic_store32(&_huge_pages_current, 0); + _huge_pages_peak = 0; +#endif + + //Setup all small and medium size classes + size_t iclass = 0; + _memory_size_class[iclass].block_size = SMALL_GRANULARITY; + _memory_adjust_size_class(iclass); + for (iclass = 1; iclass < SMALL_CLASS_COUNT; ++iclass) { + size_t size = iclass * SMALL_GRANULARITY; + _memory_size_class[iclass].block_size = (uint32_t)size; + _memory_adjust_size_class(iclass); + } + //At least two blocks per span, then fall back to large allocations + _memory_medium_size_limit = (_memory_span_size - SPAN_HEADER_SIZE) >> 1; + if (_memory_medium_size_limit > MEDIUM_SIZE_LIMIT) + _memory_medium_size_limit = MEDIUM_SIZE_LIMIT; + for (iclass = 0; iclass < MEDIUM_CLASS_COUNT; ++iclass) { + size_t size = SMALL_SIZE_LIMIT + ((iclass + 1) * MEDIUM_GRANULARITY); + if (size > _memory_medium_size_limit) + break; + _memory_size_class[SMALL_CLASS_COUNT + iclass].block_size = (uint32_t)size; + _memory_adjust_size_class(SMALL_CLASS_COUNT + iclass); + } + + for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) + atomic_store_ptr(&_memory_heaps[list_idx], 0); + + //Initialize this thread + rpmalloc_thread_initialize(); + return 0; +} + +//! 
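The merge rule in `_memory_adjust_size_class` is easiest to see numerically: once block sizes grow relative to the span, neighbouring classes start yielding the same blocks-per-span count, and the smaller class is overwritten by the larger one. A standalone sketch (the header size and the step are assumed placeholder values, not rpmalloc's exact constants):

```cpp
#include <cstddef>
#include <cstdio>

int main()
{
    const size_t spanSize = 64 * 1024;
    const size_t headerSize = 96; // stand-in for SPAN_HEADER_SIZE

    size_t prevCount = 0;
    for (size_t blockSize = 12 * 1024; blockSize <= 32 * 1024; blockSize += 2 * 1024)
    {
        size_t blockCount = (spanSize - headerSize) / blockSize;
        // Equal counts mean the smaller class would gain nothing from its
        // smaller block size, so rpmalloc merges it into the larger class.
        printf("block %5zu -> %zu per span%s\n", blockSize, blockCount, blockCount == prevCount ? "  (merged with previous)" : "");
        prevCount = blockCount;
    }
    return 0;
}
```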
Finalize the allocator +TRACY_API void +rpmalloc_finalize(void) { + atomic_thread_fence_acquire(); + + rpmalloc_thread_finalize(); + //rpmalloc_dump_statistics(stderr); + + //Free all thread caches + for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { + heap_t* heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); + while (heap) { + if (heap->spans_reserved) { + span_t* span = _memory_map_spans(heap, heap->spans_reserved); + _memory_unmap_span(span); + } + + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + heap_class_t* heap_class = heap->span_class + iclass; + span_t* span = heap_class->partial_span; + while (span) { + span_t* next = span->next; + if (span->state == SPAN_STATE_ACTIVE) { + uint32_t used_blocks = span->block_count; + if (span->free_list_limit < span->block_count) + used_blocks = span->free_list_limit; + uint32_t free_blocks = 0; + void* block = heap_class->free_list; + while (block) { + ++free_blocks; + block = *((void**)block); + } + block = span->free_list; + while (block) { + ++free_blocks; + block = *((void**)block); + } + if (used_blocks == (free_blocks + span->list_size)) + _memory_heap_cache_insert(heap, span); + } else { + if (span->used_count == span->list_size) + _memory_heap_cache_insert(heap, span); + } + span = next; + } + } + +#if ENABLE_THREAD_CACHE + //Free span caches (other thread might have deferred after the thread using this heap finalized) + _memory_heap_cache_adopt_deferred(heap); + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + if (heap->span_cache[iclass]) + _memory_unmap_span_list(heap->span_cache[iclass]); + } +#endif + heap_t* next_heap = heap->next_heap; + size_t heap_size = (1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size; + _memory_unmap(heap, heap_size, heap->align_offset, heap_size); + heap = next_heap; + } + } + +#if ENABLE_GLOBAL_CACHE + //Free global caches + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) + _memory_cache_finalize(&_memory_span_cache[iclass]); +#endif + + atomic_store_ptr(&_memory_orphan_heaps, 0); + atomic_thread_fence_release(); + +#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD + pthread_key_delete(_memory_thread_heap); +#endif +#if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) + FlsFree(fls_key); +#endif + +#if ENABLE_STATISTICS + //If you hit these asserts you probably have memory leaks or double frees in your code + assert(!atomic_load32(&_mapped_pages)); + assert(!atomic_load32(&_reserved_spans)); + assert(!atomic_load32(&_mapped_pages_os)); +#endif + + _rpmalloc_initialized = 0; +} + +//! Initialize thread, assign heap +TRACY_API void +rpmalloc_thread_initialize(void) { + if (!get_thread_heap_raw()) { + heap_t* heap = _memory_allocate_heap(); + if (heap) { + atomic_thread_fence_acquire(); +#if ENABLE_STATISTICS + atomic_incr32(&_memory_active_heaps); +#endif + set_thread_heap(heap); +#if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) + FlsSetValue(fls_key, heap); +#endif + } + } +} + +//! Finalize thread, orphan heap +TRACY_API void +rpmalloc_thread_finalize(void) { + heap_t* heap = get_thread_heap_raw(); + if (heap) + _memory_heap_finalize(heap); +} + +int +rpmalloc_is_thread_initialized(void) { + return (get_thread_heap_raw() != 0) ? 1 : 0; +} + +const rpmalloc_config_t* +rpmalloc_config(void) { + return &_memory_config; +} + +//! 
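Between `rpmalloc_thread_initialize` and `rpmalloc_thread_finalize` sits the whole per-thread heap lifecycle, and the FLS/pthread-key destructors registered during initialization only cover the platforms guarded above. Where deterministic teardown is wanted, a thread can bracket the calls itself. A hedged usage sketch against the API declared in tracy_rpmalloc.hpp (the RAII guard and the include path are illustrative, not part of this patch):

```cpp
#include <thread>
#include "tracy_rpmalloc.hpp" // adjust path to the vendored header as needed

// Assigns a heap on construction, orphans it on destruction.
struct RpmallocThreadScope
{
    RpmallocThreadScope() { tracy::rpmalloc_thread_initialize(); }
    ~RpmallocThreadScope() { tracy::rpmalloc_thread_finalize(); }
};

int main()
{
    tracy::rpmalloc_initialize();
    std::thread worker([]
    {
        RpmallocThreadScope scope;
        void* p = tracy::rpmalloc(256);
        tracy::rpfree(p);
    });
    worker.join();
    tracy::rpmalloc_finalize();
    return 0;
}
```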
Map new pages to virtual memory +static void* +_memory_map_os(size_t size, size_t* offset) { + //Either size is a heap (a single page) or a (multiple) span - we only need to align spans, and only if larger than map granularity + size_t padding = ((size >= _memory_span_size) && (_memory_span_size > _memory_map_granularity)) ? _memory_span_size : 0; + assert(size >= _memory_page_size); +#if PLATFORM_WINDOWS + //Ok to MEM_COMMIT - according to MSDN, "actual physical pages are not allocated unless/until the virtual addresses are actually accessed" + void* ptr = VirtualAlloc(0, size + padding, (_memory_huge_pages ? MEM_LARGE_PAGES : 0) | MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); + if (!ptr) { + assert(!"Failed to map virtual memory block"); + return 0; + } +#else + int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED; +# if defined(__APPLE__) + int fd = (int)VM_MAKE_TAG(240U); + if (_memory_huge_pages) + fd |= VM_FLAGS_SUPERPAGE_SIZE_2MB; + void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, fd, 0); +# elif defined(MAP_HUGETLB) + void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, (_memory_huge_pages ? MAP_HUGETLB : 0) | flags, -1, 0); +# else + void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, -1, 0); +# endif + if ((ptr == MAP_FAILED) || !ptr) { + assert("Failed to map virtual memory block" == 0); + return 0; + } +#endif +#if ENABLE_STATISTICS + atomic_add32(&_mapped_pages_os, (int32_t)((size + padding) >> _memory_page_size_shift)); +#endif + if (padding) { + size_t final_padding = padding - ((uintptr_t)ptr & ~_memory_span_mask); + assert(final_padding <= _memory_span_size); + assert(final_padding <= padding); + assert(!(final_padding % 8)); + ptr = pointer_offset(ptr, final_padding); + *offset = final_padding >> 3; + } + assert((size < _memory_span_size) || !((uintptr_t)ptr & ~_memory_span_mask)); + return ptr; +} + +//! Unmap pages from virtual memory +static void +_memory_unmap_os(void* address, size_t size, size_t offset, size_t release) { + assert(release || (offset == 0)); + assert(!release || (release >= _memory_page_size)); + assert(size >= _memory_page_size); + if (release && offset) { + offset <<= 3; + address = pointer_offset(address, -(int32_t)offset); +#if PLATFORM_POSIX + //Padding is always one span size + release += _memory_span_size; +#endif + } +#if !DISABLE_UNMAP +#if PLATFORM_WINDOWS + if (!VirtualFree(address, release ? 0 : size, release ? 
MEM_RELEASE : MEM_DECOMMIT)) { + assert(!"Failed to unmap virtual memory block"); + } +#else + if (release) { + if (munmap(address, release)) { + assert("Failed to unmap virtual memory block" == 0); + } + } + else { +#if defined(POSIX_MADV_FREE) + if (posix_madvise(address, size, POSIX_MADV_FREE)) +#endif +#if defined(POSIX_MADV_DONTNEED) + if (posix_madvise(address, size, POSIX_MADV_DONTNEED)) { + assert("Failed to madvise virtual memory block as free" == 0); + } +#endif + } +#endif +#endif +#if ENABLE_STATISTICS + if (release) + atomic_add32(&_mapped_pages_os, -(int32_t)(release >> _memory_page_size_shift)); +#endif +} + +// Extern interface + +TRACY_API RPMALLOC_ALLOCATOR void* +rpmalloc(size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return 0; + } +#endif + heap_t* heap = get_thread_heap(); + return _memory_allocate(heap, size); +} + +TRACY_API void +rpfree(void* ptr) { + _memory_deallocate(ptr); +} + +extern inline RPMALLOC_ALLOCATOR void* +rpcalloc(size_t num, size_t size) { + size_t total; +#if ENABLE_VALIDATE_ARGS +#if PLATFORM_WINDOWS + int err = SizeTMult(num, size, &total); + if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#else + int err = __builtin_umull_overflow(num, size, &total); + if (err || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#endif +#else + total = num * size; +#endif + heap_t* heap = get_thread_heap(); + void* block = _memory_allocate(heap, total); + memset(block, 0, total); + return block; +} + +TRACY_API RPMALLOC_ALLOCATOR void* +rprealloc(void* ptr, size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return ptr; + } +#endif + return _memory_reallocate(ptr, size, 0, 0); +} + +extern RPMALLOC_ALLOCATOR void* +rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize, + unsigned int flags) { +#if ENABLE_VALIDATE_ARGS + if ((size + alignment < size) || (alignment > _memory_page_size)) { + errno = EINVAL; + return 0; + } +#endif + void* block; + if (alignment > 32) { + size_t usablesize = _memory_usable_size(ptr); + if ((usablesize >= size) && (size >= (usablesize / 2)) && !((uintptr_t)ptr & (alignment - 1))) + return ptr; + + block = rpaligned_alloc(alignment, size); + if (ptr) { + if (!oldsize) + oldsize = usablesize; + if (!(flags & RPMALLOC_NO_PRESERVE)) + memcpy(block, ptr, oldsize < size ? oldsize : size); + rpfree(ptr); + } + //Mark as having aligned blocks + span_t* span = (span_t*)((uintptr_t)block & _memory_span_mask); + span->flags |= SPAN_FLAG_ALIGNED_BLOCKS; + } else { + block = _memory_reallocate(ptr, size, oldsize, flags); + } + return block; +} + +extern RPMALLOC_ALLOCATOR void* +rpaligned_alloc(size_t alignment, size_t size) { + if (alignment <= 16) + return rpmalloc(size); + +#if ENABLE_VALIDATE_ARGS + if ((size + alignment) < size) { + errno = EINVAL; + return 0; + } + if (alignment & (alignment - 1)) { + errno = EINVAL; + return 0; + } +#endif + + void* ptr = 0; + size_t align_mask = alignment - 1; + if (alignment < _memory_page_size) { + ptr = rpmalloc(size + alignment); + if ((uintptr_t)ptr & align_mask) + ptr = (void*)(((uintptr_t)ptr & ~(uintptr_t)align_mask) + alignment); + //Mark as having aligned blocks + span_t* span = (span_t*)((uintptr_t)ptr & _memory_span_mask); + span->flags |= SPAN_FLAG_ALIGNED_BLOCKS; + return ptr; + } + + // Fallback to mapping new pages for this request. 
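For alignments below the page size, the branch just above over-allocates by `alignment` and rounds the returned pointer up to the next boundary. The arithmetic in isolation (this uses the standard round-up formula; rpmalloc's variant rounds down and adds `alignment`, which produces the same result for a misaligned pointer):

```cpp
#include <cstdint>
#include <cassert>

// Round ptr up to the next multiple of alignment (alignment must be a power of two).
static void* AlignUp(void* ptr, uintptr_t alignment)
{
    uintptr_t mask = alignment - 1;
    return (void*)(((uintptr_t)ptr + mask) & ~mask);
}

int main()
{
    alignas(16) char buffer[256 + 64]; // stand-in for rpmalloc(size + alignment)
    void* p = AlignUp(buffer + 3, 64); // deliberately misaligned interior pointer
    assert(((uintptr_t)p & 63) == 0);  // now on a 64-byte boundary
    return 0;
}
```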
Since pointers passed + // to rpfree must be able to reach the start of the span by bitmasking of + // the address with the span size, the returned aligned pointer from this + // function must be within a span size of the start of the mapped area. + // In the worst case this requires us to loop and map pages until we get a + // suitable memory address. It also means we can never align to span size + // or greater, since the span header will push alignment more than one + // span size away from span start (thus causing pointer mask to give us + // an invalid span start on free) + if (alignment & align_mask) { + errno = EINVAL; + return 0; + } + if (alignment >= _memory_span_size) { + errno = EINVAL; + return 0; + } + + size_t extra_pages = alignment / _memory_page_size; + + // Since each span has a header, we will at least need one extra memory page + size_t num_pages = 1 + (size / _memory_page_size); + if (size & (_memory_page_size - 1)) + ++num_pages; + + if (extra_pages > num_pages) + num_pages = 1 + extra_pages; + + size_t original_pages = num_pages; + size_t limit_pages = (_memory_span_size / _memory_page_size) * 2; + if (limit_pages < (original_pages * 2)) + limit_pages = original_pages * 2; + + size_t mapped_size, align_offset; + span_t* span; + +retry: + align_offset = 0; + mapped_size = num_pages * _memory_page_size; + + span = (span_t*)_memory_map(mapped_size, &align_offset); + if (!span) { + errno = ENOMEM; + return 0; + } + ptr = pointer_offset(span, SPAN_HEADER_SIZE); + + if ((uintptr_t)ptr & align_mask) + ptr = (void*)(((uintptr_t)ptr & ~(uintptr_t)align_mask) + alignment); + + if (((size_t)pointer_diff(ptr, span) >= _memory_span_size) || + (pointer_offset(ptr, size) > pointer_offset(span, mapped_size)) || + (((uintptr_t)ptr & _memory_span_mask) != (uintptr_t)span)) { + _memory_unmap(span, mapped_size, align_offset, mapped_size); + ++num_pages; + if (num_pages > limit_pages) { + errno = EINVAL; + return 0; + } + goto retry; + } + + //Store page count in span_count + span->size_class = (uint32_t)-1; + span->span_count = (uint32_t)num_pages; + span->align_offset = (uint32_t)align_offset; + _memory_statistics_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak); + + return ptr; +} + +extern inline RPMALLOC_ALLOCATOR void* +rpmemalign(size_t alignment, size_t size) { + return rpaligned_alloc(alignment, size); +} + +extern inline int +rpposix_memalign(void **memptr, size_t alignment, size_t size) { + if (memptr) + *memptr = rpaligned_alloc(alignment, size); + else + return EINVAL; + return *memptr ? 0 : ENOMEM; +} + +extern inline size_t +rpmalloc_usable_size(void* ptr) { + return (ptr ?
_memory_usable_size(ptr) : 0); +} + +extern inline void +rpmalloc_thread_collect(void) { +} + +void +rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats) { + memset(stats, 0, sizeof(rpmalloc_thread_statistics_t)); + heap_t* heap = get_thread_heap_raw(); + if (!heap) + return; + + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + size_class_t* size_class = _memory_size_class + iclass; + heap_class_t* heap_class = heap->span_class + iclass; + span_t* span = heap_class->partial_span; + while (span) { + atomic_thread_fence_acquire(); + size_t free_count = span->list_size; + if (span->state == SPAN_STATE_PARTIAL) + free_count += (size_class->block_count - span->used_count); + stats->sizecache = free_count * size_class->block_size; + span = span->next; + } + } + +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + if (heap->span_cache[iclass]) + stats->spancache = (size_t)heap->span_cache[iclass]->list_size * (iclass + 1) * _memory_span_size; + span_t* deferred_list = !iclass ? (span_t*)atomic_load_ptr(&heap->span_cache_deferred) : 0; + //TODO: Incorrect, for deferred lists the size is NOT stored in list_size + if (deferred_list) + stats->spancache = (size_t)deferred_list->list_size * (iclass + 1) * _memory_span_size; + } +#endif +#if ENABLE_STATISTICS + stats->thread_to_global = heap->thread_to_global; + stats->global_to_thread = heap->global_to_thread; + + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + stats->span_use[iclass].current = (size_t)atomic_load32(&heap->span_use[iclass].current); + stats->span_use[iclass].peak = (size_t)heap->span_use[iclass].high; + stats->span_use[iclass].to_global = (size_t)heap->span_use[iclass].spans_to_global; + stats->span_use[iclass].from_global = (size_t)heap->span_use[iclass].spans_from_global; + stats->span_use[iclass].to_cache = (size_t)heap->span_use[iclass].spans_to_cache; + stats->span_use[iclass].from_cache = (size_t)heap->span_use[iclass].spans_from_cache; + stats->span_use[iclass].to_reserved = (size_t)heap->span_use[iclass].spans_to_reserved; + stats->span_use[iclass].from_reserved = (size_t)heap->span_use[iclass].spans_from_reserved; + stats->span_use[iclass].map_calls = (size_t)heap->span_use[iclass].spans_map_calls; + } + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + stats->size_use[iclass].alloc_current = (size_t)atomic_load32(&heap->size_class_use[iclass].alloc_current); + stats->size_use[iclass].alloc_peak = (size_t)heap->size_class_use[iclass].alloc_peak; + stats->size_use[iclass].alloc_total = (size_t)heap->size_class_use[iclass].alloc_total; + stats->size_use[iclass].free_total = (size_t)atomic_load32(&heap->size_class_use[iclass].free_total); + stats->size_use[iclass].spans_to_cache = (size_t)heap->size_class_use[iclass].spans_to_cache; + stats->size_use[iclass].spans_from_cache = (size_t)heap->size_class_use[iclass].spans_from_cache; + stats->size_use[iclass].spans_from_reserved = (size_t)heap->size_class_use[iclass].spans_from_reserved; + stats->size_use[iclass].map_calls = (size_t)heap->size_class_use[iclass].spans_map_calls; + } +#endif +} + +void +rpmalloc_global_statistics(rpmalloc_global_statistics_t* stats) { + memset(stats, 0, sizeof(rpmalloc_global_statistics_t)); +#if ENABLE_STATISTICS + stats->mapped = (size_t)atomic_load32(&_mapped_pages) * _memory_page_size; + stats->mapped_peak = (size_t)_mapped_pages_peak * _memory_page_size; + stats->mapped_total = (size_t)atomic_load32(&_mapped_total) * _memory_page_size; + 
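A hedged consumption sketch for these statistics queries (the reporting helper is illustrative, and most counters stay zero unless the allocator is built with ENABLE_STATISTICS=1, as the struct comments in tracy_rpmalloc.hpp note):

```cpp
#include <cstdio>
#include "tracy_rpmalloc.hpp" // adjust path to the vendored header as needed

static void ReportAllocatorStats()
{
    tracy::rpmalloc_thread_statistics_t ts;
    tracy::rpmalloc_thread_statistics(&ts);
    printf("thread size-class cache: %zu bytes, span cache: %zu bytes\n", ts.sizecache, ts.spancache);

    tracy::rpmalloc_global_statistics_t gs;
    tracy::rpmalloc_global_statistics(&gs);
    printf("mapped: %zu bytes (peak %zu), global span cache: %zu bytes\n", gs.mapped, gs.mapped_peak, gs.cached);
}
```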
stats->unmapped_total = (size_t)atomic_load32(&_unmapped_total) * _memory_page_size; + stats->huge_alloc = (size_t)atomic_load32(&_huge_pages_current) * _memory_page_size; + stats->huge_alloc_peak = (size_t)_huge_pages_peak * _memory_page_size; +#endif +#if ENABLE_GLOBAL_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + stats->cached += (size_t)atomic_load32(&_memory_span_cache[iclass].size) * (iclass + 1) * _memory_span_size; + } +#endif +} + +void +rpmalloc_dump_statistics(void* file) { +#if ENABLE_STATISTICS + //If you hit this assert, you still have active threads or forgot to finalize some thread(s) + assert(atomic_load32(&_memory_active_heaps) == 0); + + for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { + heap_t* heap = atomic_load_ptr(&_memory_heaps[list_idx]); + while (heap) { + fprintf(file, "Heap %d stats:\n", heap->id); + fprintf(file, "Class CurAlloc PeakAlloc TotAlloc TotFree BlkSize BlkCount SpansCur SpansPeak PeakAllocMiB ToCacheMiB FromCacheMiB FromReserveMiB MmapCalls\n"); + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + if (!heap->size_class_use[iclass].alloc_total) { + assert(!atomic_load32(&heap->size_class_use[iclass].free_total)); + assert(!heap->size_class_use[iclass].spans_map_calls); + continue; + } + fprintf(file, "%3u: %10u %10u %10u %10u %8u %8u %8d %9d %13zu %11zu %12zu %14zu %9u\n", (uint32_t)iclass, + atomic_load32(&heap->size_class_use[iclass].alloc_current), + heap->size_class_use[iclass].alloc_peak, + heap->size_class_use[iclass].alloc_total, + atomic_load32(&heap->size_class_use[iclass].free_total), + _memory_size_class[iclass].block_size, + _memory_size_class[iclass].block_count, + heap->size_class_use[iclass].spans_current, + heap->size_class_use[iclass].spans_peak, + ((size_t)heap->size_class_use[iclass].alloc_peak * (size_t)_memory_size_class[iclass].block_size) / (size_t)(1024 * 1024), + ((size_t)heap->size_class_use[iclass].spans_to_cache * _memory_span_size) / (size_t)(1024 * 1024), + ((size_t)heap->size_class_use[iclass].spans_from_cache * _memory_span_size) / (size_t)(1024 * 1024), + ((size_t)heap->size_class_use[iclass].spans_from_reserved * _memory_span_size) / (size_t)(1024 * 1024), + heap->size_class_use[iclass].spans_map_calls); + } + fprintf(file, "Spans Current Peak PeakMiB Cached ToCacheMiB FromCacheMiB ToReserveMiB FromReserveMiB ToGlobalMiB FromGlobalMiB MmapCalls\n"); + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + if (!heap->span_use[iclass].high && !heap->span_use[iclass].spans_map_calls) + continue; + fprintf(file, "%4u: %8d %8u %8zu %7u %11zu %12zu %12zu %14zu %11zu %13zu %10u\n", (uint32_t)(iclass + 1), + atomic_load32(&heap->span_use[iclass].current), + heap->span_use[iclass].high, + ((size_t)heap->span_use[iclass].high * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024), + heap->span_cache[iclass] ? 
heap->span_cache[iclass]->list_size : 0, + ((size_t)heap->span_use[iclass].spans_to_cache * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), + ((size_t)heap->span_use[iclass].spans_from_cache * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), + ((size_t)heap->span_use[iclass].spans_to_reserved * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), + ((size_t)heap->span_use[iclass].spans_from_reserved * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), + ((size_t)heap->span_use[iclass].spans_to_global * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024), + ((size_t)heap->span_use[iclass].spans_from_global * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024), + heap->span_use[iclass].spans_map_calls); + } + fprintf(file, "ThreadToGlobalMiB GlobalToThreadMiB\n"); + fprintf(file, "%17zu %17zu\n", (size_t)heap->thread_to_global / (size_t)(1024 * 1024), (size_t)heap->global_to_thread / (size_t)(1024 * 1024)); + heap = heap->next_heap; + } + } + + fprintf(file, "Global stats:\n"); + size_t huge_current = (size_t)atomic_load32(&_huge_pages_current) * _memory_page_size; + size_t huge_peak = (size_t)_huge_pages_peak * _memory_page_size; + fprintf(file, "HugeCurrentMiB HugePeakMiB\n"); + fprintf(file, "%14zu %11zu\n", huge_current / (size_t)(1024 * 1024), huge_peak / (size_t)(1024 * 1024)); + + size_t mapped = (size_t)atomic_load32(&_mapped_pages) * _memory_page_size; + size_t mapped_os = (size_t)atomic_load32(&_mapped_pages_os) * _memory_page_size; + size_t mapped_peak = (size_t)_mapped_pages_peak * _memory_page_size; + size_t mapped_total = (size_t)atomic_load32(&_mapped_total) * _memory_page_size; + size_t unmapped_total = (size_t)atomic_load32(&_unmapped_total) * _memory_page_size; + size_t reserved_total = (size_t)atomic_load32(&_reserved_spans) * _memory_span_size; + fprintf(file, "MappedMiB MappedOSMiB MappedPeakMiB MappedTotalMiB UnmappedTotalMiB ReservedTotalMiB\n"); + fprintf(file, "%9zu %11zu %13zu %14zu %16zu %16zu\n", + mapped / (size_t)(1024 * 1024), + mapped_os / (size_t)(1024 * 1024), + mapped_peak / (size_t)(1024 * 1024), + mapped_total / (size_t)(1024 * 1024), + unmapped_total / (size_t)(1024 * 1024), + reserved_total / (size_t)(1024 * 1024)); + + fprintf(file, "\n"); +#else + (void)sizeof(file); +#endif +} + +} + +#endif diff --git a/Source/ThirdParty/tracy/client/tracy_rpmalloc.hpp b/Source/ThirdParty/tracy/client/tracy_rpmalloc.hpp new file mode 100644 index 000000000..3e8c4f1b5 --- /dev/null +++ b/Source/ThirdParty/tracy/client/tracy_rpmalloc.hpp @@ -0,0 +1,261 @@ +/* rpmalloc.h - Memory allocator - Public Domain - 2016 Mattias Jansson + * + * This library provides a cross-platform lock free thread caching malloc implementation in C11. + * The latest source code is always available at + * + * https://github.com/mjansson/rpmalloc + * + * This library is put in the public domain; you can redistribute it and/or modify it without any restrictions. 
+ * + */ + +#pragma once + +#include <stddef.h> +#include "../common/TracySystem.hpp" + +namespace tracy +{ + +#if defined(__clang__) || defined(__GNUC__) +# define RPMALLOC_EXPORT __attribute__((visibility("default"))) +# define RPMALLOC_ALLOCATOR +# define RPMALLOC_ATTRIB_MALLOC __attribute__((__malloc__)) +# if defined(__clang_major__) && (__clang_major__ < 4) +# define RPMALLOC_ATTRIB_ALLOC_SIZE(size) +# define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size) +# else +# define RPMALLOC_ATTRIB_ALLOC_SIZE(size) __attribute__((alloc_size(size))) +# define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size) __attribute__((alloc_size(count, size))) +# endif +# define RPMALLOC_CDECL +#elif defined(_MSC_VER) +# define RPMALLOC_EXPORT +# define RPMALLOC_ALLOCATOR __declspec(allocator) __declspec(restrict) +# define RPMALLOC_ATTRIB_MALLOC +# define RPMALLOC_ATTRIB_ALLOC_SIZE(size) +# define RPMALLOC_ATTRIB_ALLOC_SIZE2(count,size) +# define RPMALLOC_CDECL __cdecl +#else +# define RPMALLOC_EXPORT +# define RPMALLOC_ALLOCATOR +# define RPMALLOC_ATTRIB_MALLOC +# define RPMALLOC_ATTRIB_ALLOC_SIZE(size) +# define RPMALLOC_ATTRIB_ALLOC_SIZE2(count,size) +# define RPMALLOC_CDECL +#endif + +//! Define RPMALLOC_CONFIGURABLE to enable configuring sizes +#ifndef RPMALLOC_CONFIGURABLE +#define RPMALLOC_CONFIGURABLE 0 +#endif + +//! Flag to rpaligned_realloc to not preserve content in reallocation +#define RPMALLOC_NO_PRESERVE 1 + +typedef struct rpmalloc_global_statistics_t { + //! Current amount of virtual memory mapped, all of which might not have been committed (only if ENABLE_STATISTICS=1) + size_t mapped; + //! Peak amount of virtual memory mapped, all of which might not have been committed (only if ENABLE_STATISTICS=1) + size_t mapped_peak; + //! Current amount of memory in global caches for small and medium sizes (<32KiB) + size_t cached; + //! Current amount of memory allocated in huge allocations, i.e. larger than LARGE_SIZE_LIMIT which is 2MiB by default (only if ENABLE_STATISTICS=1) + size_t huge_alloc; + //! Peak amount of memory allocated in huge allocations, i.e. larger than LARGE_SIZE_LIMIT which is 2MiB by default (only if ENABLE_STATISTICS=1) + size_t huge_alloc_peak; + //! Total amount of memory mapped since initialization (only if ENABLE_STATISTICS=1) + size_t mapped_total; + //! Total amount of memory unmapped since initialization (only if ENABLE_STATISTICS=1) + size_t unmapped_total; +} rpmalloc_global_statistics_t; + +typedef struct rpmalloc_thread_statistics_t { + //! Current number of bytes available in thread size class caches for small and medium sizes (<32KiB) + size_t sizecache; + //! Current number of bytes available in thread span caches for small and medium sizes (<32KiB) + size_t spancache; + //! Total number of bytes transitioned from thread cache to global cache (only if ENABLE_STATISTICS=1) + size_t thread_to_global; + //! Total number of bytes transitioned from global cache to thread cache (only if ENABLE_STATISTICS=1) + size_t global_to_thread; + //! Per span count statistics (only if ENABLE_STATISTICS=1) + struct { + //! Currently used number of spans + size_t current; + //! High water mark of spans used + size_t peak; + //! Number of spans transitioned to global cache + size_t to_global; + //! Number of spans transitioned from global cache + size_t from_global; + //! Number of spans transitioned to thread cache + size_t to_cache; + //! Number of spans transitioned from thread cache + size_t from_cache; + //! Number of spans transitioned to reserved state + size_t to_reserved; + //!
Number of spans transitioned from reserved state + size_t from_reserved; + //! Number of raw memory map calls (not hitting the reserve spans but resulting in actual OS mmap calls) + size_t map_calls; + } span_use[32]; + //! Per size class statistics (only if ENABLE_STATISTICS=1) + struct { + //! Current number of allocations + size_t alloc_current; + //! Peak number of allocations + size_t alloc_peak; + //! Total number of allocations + size_t alloc_total; + //! Total number of frees + size_t free_total; + //! Number of spans transitioned to cache + size_t spans_to_cache; + //! Number of spans transitioned from cache + size_t spans_from_cache; + //! Number of spans transitioned from reserved state + size_t spans_from_reserved; + //! Number of raw memory map calls (not hitting the reserve spans but resulting in actual OS mmap calls) + size_t map_calls; + } size_use[128]; +} rpmalloc_thread_statistics_t; + +typedef struct rpmalloc_config_t { + //! Map memory pages for the given number of bytes. The returned address MUST be + // aligned to the rpmalloc span size, which will always be a power of two. + // Optionally the function can store an alignment offset in the offset variable + // in case it performs alignment and the returned pointer is offset from the + // actual start of the memory region due to this alignment. The alignment offset + // will be passed to the memory unmap function. The alignment offset MUST NOT be + // larger than 65535 (storable in an uint16_t), if it is you must use natural + // alignment to shift it into 16 bits. If you set a memory_map function, you + // must also set a memory_unmap function or else the default implementation will + // be used for both. + void* (*memory_map)(size_t size, size_t* offset); + //! Unmap the memory pages starting at address and spanning the given number of bytes. + // If release is set to non-zero, the unmap is for an entire span range as returned by + // a previous call to memory_map and that the entire range should be released. The + // release argument holds the size of the entire span range. If release is set to 0, + // the unmap is a partial decommit of a subset of the mapped memory range. + // If you set a memory_unmap function, you must also set a memory_map function or + // else the default implementation will be used for both. + void (*memory_unmap)(void* address, size_t size, size_t offset, size_t release); + //! Size of memory pages. The page size MUST be a power of two. All memory mapping + // requests to memory_map will be made with size set to a multiple of the page size. + // Used if RPMALLOC_CONFIGURABLE is defined to 1, otherwise system page size is used. + size_t page_size; + //! Size of a span of memory blocks. MUST be a power of two, and in [4096,262144] + // range (unless 0 - set to 0 to use the default span size). Used if RPMALLOC_CONFIGURABLE + // is defined to 1. + size_t span_size; + //! Number of spans to map at each request to map new virtual memory blocks. This can + // be used to minimize the system call overhead at the cost of virtual memory address + // space. The extra mapped pages will not be written until actually used, so physical + // committed memory should not be affected in the default implementation. Will be + // aligned to a multiple of spans that match memory page size in case of huge pages. + size_t span_map_count; + //! Enable use of large/huge pages. If this flag is set to non-zero and page size is + // zero, the allocator will try to enable huge pages and auto detect the configuration. 
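The two function pointers documented above are the entire OS interface of the allocator, so swapping in a custom mapper is a one-struct affair. A POSIX-only sketch of wiring one up through `rpmalloc_initialize_config`; the bodies are deliberately minimal, and keeping `*offset` at 0 assumes the OS returns suitably aligned memory, which a production mapper must instead guarantee per the span-alignment contract above:

```cpp
#include <sys/mman.h>
#include "tracy_rpmalloc.hpp" // adjust path to the vendored header as needed

static void* MyMap(size_t size, size_t* offset)
{
    *offset = 0; // simplification: assume no realignment was needed
    void* ptr = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    return ptr == MAP_FAILED ? nullptr : ptr;
}

static void MyUnmap(void* address, size_t size, size_t offset, size_t release)
{
    (void)offset;
    if (release)
        munmap(address, release); // entire span range is released
    else
        madvise(address, size, MADV_DONTNEED); // partial decommit
}

int main()
{
    tracy::rpmalloc_config_t config = {}; // zeroed fields fall back to defaults
    config.memory_map = MyMap;
    config.memory_unmap = MyUnmap;
    tracy::rpmalloc_initialize_config(&config);
    void* p = tracy::rpmalloc(1024);
    tracy::rpfree(p);
    tracy::rpmalloc_finalize();
    return 0;
}
```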
+ // If this is set to non-zero and page_size is also non-zero, the allocator will + // assume huge pages have been configured and enabled prior to initializing the + // allocator. + // For Windows, see https://docs.microsoft.com/en-us/windows/desktop/memory/large-page-support + // For Linux, see https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt + int enable_huge_pages; +} rpmalloc_config_t; + +//! Initialize allocator with default configuration +TRACY_API int +rpmalloc_initialize(void); + +//! Initialize allocator with given configuration +RPMALLOC_EXPORT int +rpmalloc_initialize_config(const rpmalloc_config_t* config); + +//! Get allocator configuration +RPMALLOC_EXPORT const rpmalloc_config_t* +rpmalloc_config(void); + +//! Finalize allocator +TRACY_API void +rpmalloc_finalize(void); + +//! Initialize allocator for calling thread +TRACY_API void +rpmalloc_thread_initialize(void); + +//! Finalize allocator for calling thread +TRACY_API void +rpmalloc_thread_finalize(void); + +//! Perform deferred deallocations pending for the calling thread heap +RPMALLOC_EXPORT void +rpmalloc_thread_collect(void); + +//! Query if allocator is initialized for calling thread +RPMALLOC_EXPORT int +rpmalloc_is_thread_initialized(void); + +//! Get per-thread statistics +RPMALLOC_EXPORT void +rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats); + +//! Get global statistics +RPMALLOC_EXPORT void +rpmalloc_global_statistics(rpmalloc_global_statistics_t* stats); + +//! Dump all statistics in human readable format to file (should be a FILE*) +RPMALLOC_EXPORT void +rpmalloc_dump_statistics(void* file); + +//! Allocate a memory block of at least the given size +TRACY_API RPMALLOC_ALLOCATOR void* +rpmalloc(size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(1); + +//! Free the given memory block +TRACY_API void +rpfree(void* ptr); + +//! Allocate a memory block of at least the given size and zero initialize it +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* +rpcalloc(size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE2(1, 2); + +//! Reallocate the given block to at least the given size +TRACY_API RPMALLOC_ALLOCATOR void* +rprealloc(void* ptr, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(2); + +//! Reallocate the given block to at least the given size and alignment, +// with optional control flags (see RPMALLOC_NO_PRESERVE). +// Alignment must be a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. A caveat of rpmalloc +// internals is that this must also be strictly less than the span size (default 64KiB) +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* +rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize, unsigned int flags) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(3); + +//! Allocate a memory block of at least the given size and alignment. +// Alignment must be a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. A caveat of rpmalloc +// internals is that this must also be strictly less than the span size (default 64KiB) +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* +rpaligned_alloc(size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(2); + +//! Allocate a memory block of at least the given size and alignment. +// Alignment must be a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. 
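The alignment preconditions stated in the doc comments below recur on every aligned entry point, so they are worth condensing into one checked call (an illustrative wrapper, not part of the rpmalloc API; the span size is assumed to be the 64KiB default):

```cpp
#include <cstddef>
#include "tracy_rpmalloc.hpp" // adjust path to the vendored header as needed

static void* CheckedAlignedAlloc(size_t alignment, size_t size)
{
    const size_t spanSize = 64 * 1024; // assumed default span size
    const bool powerOfTwo = alignment && !(alignment & (alignment - 1));
    const bool ptrMultiple = (alignment % sizeof(void*)) == 0;
    if (!powerOfTwo || !ptrMultiple || alignment >= spanSize)
        return nullptr; // violates the documented preconditions
    return tracy::rpaligned_alloc(alignment, size);
}
```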
A caveat of rpmalloc +// internals is that this must also be strictly less than the span size (default 64KiB) +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* +rpmemalign(size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(2); + +//! Allocate a memory block of at least the given size and alignment. +// Alignment must be a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. A caveat of rpmalloc +// internals is that this must also be strictly less than the span size (default 64KiB) +RPMALLOC_EXPORT int +rpposix_memalign(void **memptr, size_t alignment, size_t size); + +//! Query the usable size of the given memory block (from given pointer to the end of block) +RPMALLOC_EXPORT size_t +rpmalloc_usable_size(void* ptr); + +} diff --git a/Source/ThirdParty/tracy/common/TracyAlign.hpp b/Source/ThirdParty/tracy/common/TracyAlign.hpp new file mode 100644 index 000000000..730342df0 --- /dev/null +++ b/Source/ThirdParty/tracy/common/TracyAlign.hpp @@ -0,0 +1,25 @@ +#ifndef __TRACYALIGN_HPP__ +#define __TRACYALIGN_HPP__ + +#include <string.h> + +namespace tracy +{ + +template<typename T> +tracy_force_inline T MemRead( const void* ptr ) +{ + T val; + memcpy( &val, ptr, sizeof( T ) ); + return val; +} + +template<typename T> +tracy_force_inline void MemWrite( void* ptr, T val ) +{ + memcpy( ptr, &val, sizeof( T ) ); +} + +} + +#endif diff --git a/Source/ThirdParty/tracy/common/TracyAlloc.hpp b/Source/ThirdParty/tracy/common/TracyAlloc.hpp new file mode 100644 index 000000000..a3cbec057 --- /dev/null +++ b/Source/ThirdParty/tracy/common/TracyAlloc.hpp @@ -0,0 +1,42 @@ +#ifndef __TRACYALLOC_HPP__ +#define __TRACYALLOC_HPP__ + +#include <stdlib.h> + +#ifdef TRACY_ENABLE +# include "../client/tracy_rpmalloc.hpp" +#endif + +namespace tracy +{ + +static inline void* tracy_malloc( size_t size ) +{ +#ifdef TRACY_ENABLE + return rpmalloc( size ); +#else + return malloc( size ); +#endif +} + +static inline void tracy_free( void* ptr ) +{ +#ifdef TRACY_ENABLE + rpfree( ptr ); +#else + free( ptr ); +#endif +} + +static inline void* tracy_realloc( void* ptr, size_t size ) +{ +#ifdef TRACY_ENABLE + return rprealloc( ptr, size ); +#else + return realloc( ptr, size ); +#endif +} + +} + +#endif diff --git a/Source/ThirdParty/tracy/common/TracyMutex.hpp b/Source/ThirdParty/tracy/common/TracyMutex.hpp new file mode 100644 index 000000000..57fb01a0c --- /dev/null +++ b/Source/ThirdParty/tracy/common/TracyMutex.hpp @@ -0,0 +1,24 @@ +#ifndef __TRACYMUTEX_HPP__ +#define __TRACYMUTEX_HPP__ + +#if defined _MSC_VER + +# include <shared_mutex> + +namespace tracy +{ +using TracyMutex = std::shared_mutex; +} + +#else + +#include <mutex> + +namespace tracy +{ +using TracyMutex = std::mutex; +} + +#endif + +#endif diff --git a/Source/ThirdParty/tracy/common/TracyProtocol.hpp b/Source/ThirdParty/tracy/common/TracyProtocol.hpp new file mode 100644 index 000000000..2326a7f32 --- /dev/null +++ b/Source/ThirdParty/tracy/common/TracyProtocol.hpp @@ -0,0 +1,128 @@ +#ifndef __TRACYPROTOCOL_HPP__ +#define __TRACYPROTOCOL_HPP__ + +#include <limits> +#include <stdint.h> + +namespace tracy +{ + +constexpr unsigned Lz4CompressBound( unsigned isize ) { return isize + ( isize / 255 ) + 16; } + +enum : uint32_t { ProtocolVersion = 46 }; +enum : uint16_t { BroadcastVersion = 2 }; + +using lz4sz_t = uint32_t; + +enum { TargetFrameSize = 256 * 1024 }; +enum { LZ4Size = Lz4CompressBound( TargetFrameSize ) }; +static_assert( LZ4Size <= std::numeric_limits<lz4sz_t>::max(), "LZ4Size greater than lz4sz_t" ); +static_assert( TargetFrameSize * 2 >= 64 * 1024, "Not enough space for LZ4
stream buffer" ); + +enum { HandshakeShibbolethSize = 8 }; +static const char HandshakeShibboleth[HandshakeShibbolethSize] = { 'T', 'r', 'a', 'c', 'y', 'P', 'r', 'f' }; + +enum HandshakeStatus : uint8_t +{ + HandshakePending, + HandshakeWelcome, + HandshakeProtocolMismatch, + HandshakeNotAvailable, + HandshakeDropped +}; + +enum { WelcomeMessageProgramNameSize = 64 }; +enum { WelcomeMessageHostInfoSize = 1024 }; + +#pragma pack( 1 ) + +// Must increase left query space after handling! +enum ServerQuery : uint8_t +{ + ServerQueryTerminate, + ServerQueryString, + ServerQueryThreadString, + ServerQuerySourceLocation, + ServerQueryPlotName, + ServerQueryCallstackFrame, + ServerQueryFrameName, + ServerQueryDisconnect, + ServerQueryExternalName, + ServerQueryParameter, + ServerQuerySymbol, + ServerQuerySymbolCode, + ServerQueryCodeLocation, + ServerQuerySourceCode, + ServerQueryDataTransfer, + ServerQueryDataTransferPart +}; + +struct ServerQueryPacket +{ + ServerQuery type; + uint64_t ptr; + uint32_t extra; +}; + +enum { ServerQueryPacketSize = sizeof( ServerQueryPacket ) }; + + +enum CpuArchitecture : uint8_t +{ + CpuArchUnknown, + CpuArchX86, + CpuArchX64, + CpuArchArm32, + CpuArchArm64 +}; + + +struct WelcomeMessage +{ + double timerMul; + int64_t initBegin; + int64_t initEnd; + uint64_t delay; + uint64_t resolution; + uint64_t epoch; + uint64_t exectime; + uint64_t pid; + int64_t samplingPeriod; + uint8_t onDemand; + uint8_t isApple; + uint8_t cpuArch; + uint8_t codeTransfer; + char cpuManufacturer[12]; + uint32_t cpuId; + char programName[WelcomeMessageProgramNameSize]; + char hostInfo[WelcomeMessageHostInfoSize]; +}; + +enum { WelcomeMessageSize = sizeof( WelcomeMessage ) }; + + +struct OnDemandPayloadMessage +{ + uint64_t frames; + uint64_t currentTime; +}; + +enum { OnDemandPayloadMessageSize = sizeof( OnDemandPayloadMessage ) }; + + +struct BroadcastMessage +{ + uint16_t broadcastVersion; + uint16_t listenPort; + uint32_t protocolVersion; + int32_t activeTime; // in seconds + char programName[WelcomeMessageProgramNameSize]; +}; + +enum { BroadcastMessageSize = sizeof( BroadcastMessage ) }; + +#pragma pack() + +} + +#endif diff --git a/Source/ThirdParty/tracy/common/TracyQueue.hpp b/Source/ThirdParty/tracy/common/TracyQueue.hpp new file mode 100644 index 000000000..d99945013 --- /dev/null +++ b/Source/ThirdParty/tracy/common/TracyQueue.hpp @@ -0,0 +1,678 @@ +#ifndef __TRACYQUEUE_HPP__ +#define __TRACYQUEUE_HPP__ + +#include + +namespace tracy +{ + +enum class QueueType : uint8_t +{ + ZoneText, + ZoneName, + Message, + MessageColor, + MessageCallstack, + MessageColorCallstack, + MessageAppInfo, + ZoneBeginAllocSrcLoc, + ZoneBeginAllocSrcLocCallstack, + CallstackSerial, + Callstack, + CallstackAlloc, + CallstackSample, + FrameImage, + ZoneBegin, + ZoneBeginCallstack, + ZoneEnd, + LockWait, + LockObtain, + LockRelease, + LockSharedWait, + LockSharedObtain, + LockSharedRelease, + LockName, + MemAlloc, + MemAllocNamed, + MemFree, + MemFreeNamed, + MemAllocCallstack, + MemAllocCallstackNamed, + MemFreeCallstack, + MemFreeCallstackNamed, + GpuZoneBegin, + GpuZoneBeginCallstack, + GpuZoneBeginAllocSrcLoc, + GpuZoneBeginAllocSrcLocCallstack, + GpuZoneEnd, + GpuZoneBeginSerial, + GpuZoneBeginCallstackSerial, + GpuZoneBeginAllocSrcLocSerial, + GpuZoneBeginAllocSrcLocCallstackSerial, + GpuZoneEndSerial, + PlotData, + ContextSwitch, + ThreadWakeup, + GpuTime, + GpuContextName, + Terminate, + KeepAlive, + ThreadContext, + GpuCalibration, + Crash, + CrashReport, + ZoneValidation, + ZoneColor, + 
ZoneValue, + FrameMarkMsg, + FrameMarkMsgStart, + FrameMarkMsgEnd, + SourceLocation, + LockAnnounce, + LockTerminate, + LockMark, + MessageLiteral, + MessageLiteralColor, + MessageLiteralCallstack, + MessageLiteralColorCallstack, + GpuNewContext, + CallstackFrameSize, + CallstackFrame, + SymbolInformation, + CodeInformation, + SysTimeReport, + TidToPid, + PlotConfig, + ParamSetup, + AckServerQueryNoop, + AckSourceCodeNotAvailable, + CpuTopology, + SingleStringData, + SecondStringData, + MemNamePayload, + StringData, + ThreadName, + PlotName, + SourceLocationPayload, + CallstackPayload, + CallstackAllocPayload, + FrameName, + FrameImageData, + ExternalName, + ExternalThreadName, + SymbolCode, + SourceCode, + NUM_TYPES +}; + +#pragma pack( 1 ) + +struct QueueThreadContext +{ + uint64_t thread; +}; + +struct QueueZoneBeginLean +{ + int64_t time; +}; + +struct QueueZoneBegin : public QueueZoneBeginLean +{ + uint64_t srcloc; // ptr +}; + +struct QueueZoneEnd +{ + int64_t time; +}; + +struct QueueZoneValidation +{ + uint32_t id; +}; + +struct QueueZoneColor +{ + uint8_t r; + uint8_t g; + uint8_t b; +}; + +struct QueueZoneValue +{ + uint64_t value; +}; + +struct QueueStringTransfer +{ + uint64_t ptr; +}; + +struct QueueFrameMark +{ + int64_t time; + uint64_t name; // ptr +}; + +struct QueueFrameImage +{ + uint32_t frame; + uint16_t w; + uint16_t h; + uint8_t flip; +}; + +struct QueueFrameImageFat : public QueueFrameImage +{ + uint64_t image; // ptr +}; + +struct QueueSourceLocation +{ + uint64_t name; + uint64_t function; // ptr + uint64_t file; // ptr + uint32_t line; + uint8_t r; + uint8_t g; + uint8_t b; +}; + +struct QueueZoneTextFat +{ + uint64_t text; // ptr + uint16_t size; +}; + +enum class LockType : uint8_t +{ + Lockable, + SharedLockable +}; + +struct QueueLockAnnounce +{ + uint32_t id; + int64_t time; + uint64_t lckloc; // ptr + LockType type; +}; + +struct QueueLockTerminate +{ + uint32_t id; + int64_t time; +}; + +struct QueueLockWait +{ + uint64_t thread; + uint32_t id; + int64_t time; +}; + +struct QueueLockObtain +{ + uint64_t thread; + uint32_t id; + int64_t time; +}; + +struct QueueLockRelease +{ + uint64_t thread; + uint32_t id; + int64_t time; +}; + +struct QueueLockMark +{ + uint64_t thread; + uint32_t id; + uint64_t srcloc; // ptr +}; + +struct QueueLockName +{ + uint32_t id; +}; + +struct QueueLockNameFat : public QueueLockName +{ + uint64_t name; // ptr + uint16_t size; +}; + +enum class PlotDataType : uint8_t +{ + Float, + Double, + Int +}; + +struct QueuePlotData +{ + uint64_t name; // ptr + int64_t time; + PlotDataType type; + union + { + double d; + float f; + int64_t i; + } data; +}; + +struct QueueMessage +{ + int64_t time; +}; + +struct QueueMessageColor : public QueueMessage +{ + uint8_t r; + uint8_t g; + uint8_t b; +}; + +struct QueueMessageLiteral : public QueueMessage +{ + uint64_t text; // ptr +}; + +struct QueueMessageColorLiteral : public QueueMessageColor +{ + uint64_t text; // ptr +}; + +struct QueueMessageFat : public QueueMessage +{ + uint64_t text; // ptr + uint16_t size; +}; + +struct QueueMessageColorFat : public QueueMessageColor +{ + uint64_t text; // ptr + uint16_t size; +}; + +// Don't change order, only add new entries at the end, this is also used on trace dumps! 
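Everything in this queue sits behind `#pragma pack( 1 )`, which is why the transport code reads fields through the `MemRead`/`MemWrite` helpers from TracyAlign.hpp rather than dereferencing pointers into the byte stream. A self-contained illustration with a stand-in packed struct (the `Event` type is hypothetical; `MemRead` is reproduced here in the same shape as the TracyAlign.hpp version):

```cpp
#include <cstdint>
#include <cstring>
#include <cstdio>

template<typename T>
static T MemRead(const void* ptr) // memcpy-based read, as in TracyAlign.hpp
{
    T val;
    memcpy(&val, ptr, sizeof(T));
    return val;
}

#pragma pack(1)
struct Event
{
    uint8_t type;
    uint64_t timestamp; // at offset 1: misaligned on purpose
};
#pragma pack()

int main()
{
    Event e{ 7, 123456789ull };
    // Casting the byte at offset 1 to uint64_t* and dereferencing would be
    // undefined behavior on strict-alignment targets; memcpy is always well-defined.
    uint64_t ts = MemRead<uint64_t>(reinterpret_cast<const char*>(&e) + 1);
    printf("type=%u ts=%llu\n", (unsigned)e.type, (unsigned long long)ts);
    return 0;
}
```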
+enum class GpuContextType : uint8_t +{ + Invalid, + OpenGl, + Vulkan, + OpenCL, + Direct3D12 +}; + +enum GpuContextFlags : uint8_t +{ + GpuContextCalibration = 1 << 0 +}; + +struct QueueGpuNewContext +{ + int64_t cpuTime; + int64_t gpuTime; + uint64_t thread; + float period; + uint8_t context; + GpuContextFlags flags; + GpuContextType type; +}; + +struct QueueGpuZoneBeginLean +{ + int64_t cpuTime; + uint64_t thread; + uint16_t queryId; + uint8_t context; +}; + +struct QueueGpuZoneBegin : public QueueGpuZoneBeginLean +{ + uint64_t srcloc; +}; + +struct QueueGpuZoneEnd +{ + int64_t cpuTime; + uint64_t thread; + uint16_t queryId; + uint8_t context; +}; + +struct QueueGpuTime +{ + int64_t gpuTime; + uint16_t queryId; + uint8_t context; +}; + +struct QueueGpuCalibration +{ + int64_t gpuTime; + int64_t cpuTime; + int64_t cpuDelta; + uint8_t context; +}; + +struct QueueGpuContextName +{ + uint8_t context; +}; + +struct QueueGpuContextNameFat : public QueueGpuContextName +{ + uint64_t ptr; + uint16_t size; +}; + +struct QueueMemNamePayload +{ + uint64_t name; +}; + +struct QueueMemAlloc +{ + int64_t time; + uint64_t thread; + uint64_t ptr; + char size[6]; +}; + +struct QueueMemFree +{ + int64_t time; + uint64_t thread; + uint64_t ptr; +}; + +struct QueueCallstackFat +{ + uint64_t ptr; +}; + +struct QueueCallstackAllocFat +{ + uint64_t ptr; + uint64_t nativePtr; +}; + +struct QueueCallstackSample +{ + int64_t time; + uint64_t thread; +}; + +struct QueueCallstackSampleFat : public QueueCallstackSample +{ + uint64_t ptr; +}; + +struct QueueCallstackFrameSize +{ + uint64_t ptr; + uint8_t size; +}; + +struct QueueCallstackFrame +{ + uint32_t line; + uint64_t symAddr; + uint32_t symLen; +}; + +struct QueueSymbolInformation +{ + uint32_t line; + uint64_t symAddr; +}; + +struct QueueCodeInformation +{ + uint64_t ptr; + uint32_t line; +}; + +struct QueueCrashReport +{ + int64_t time; + uint64_t text; // ptr +}; + +struct QueueSysTime +{ + int64_t time; + float sysTime; +}; + +struct QueueContextSwitch +{ + int64_t time; + uint64_t oldThread; + uint64_t newThread; + uint8_t cpu; + uint8_t reason; + uint8_t state; +}; + +struct QueueThreadWakeup +{ + int64_t time; + uint64_t thread; +}; + +struct QueueTidToPid +{ + uint64_t tid; + uint64_t pid; +}; + +struct QueuePlotConfig +{ + uint64_t name; // ptr + uint8_t type; +}; + +struct QueueParamSetup +{ + uint32_t idx; + uint64_t name; // ptr + uint8_t isBool; + int32_t val; +}; + +struct QueueCpuTopology +{ + uint32_t package; + uint32_t core; + uint32_t thread; +}; + +struct QueueHeader +{ + union + { + QueueType type; + uint8_t idx; + }; +}; + +struct QueueItem +{ + QueueHeader hdr; + union + { + QueueThreadContext threadCtx; + QueueZoneBegin zoneBegin; + QueueZoneBeginLean zoneBeginLean; + QueueZoneEnd zoneEnd; + QueueZoneValidation zoneValidation; + QueueZoneColor zoneColor; + QueueZoneValue zoneValue; + QueueStringTransfer stringTransfer; + QueueFrameMark frameMark; + QueueFrameImage frameImage; + QueueFrameImageFat frameImageFat; + QueueSourceLocation srcloc; + QueueZoneTextFat zoneTextFat; + QueueLockAnnounce lockAnnounce; + QueueLockTerminate lockTerminate; + QueueLockWait lockWait; + QueueLockObtain lockObtain; + QueueLockRelease lockRelease; + QueueLockMark lockMark; + QueueLockName lockName; + QueueLockNameFat lockNameFat; + QueuePlotData plotData; + QueueMessage message; + QueueMessageColor messageColor; + QueueMessageLiteral messageLiteral; + QueueMessageColorLiteral messageColorLiteral; + QueueMessageFat messageFat; + QueueMessageColorFat 
messageColorFat; + QueueGpuNewContext gpuNewContext; + QueueGpuZoneBegin gpuZoneBegin; + QueueGpuZoneBeginLean gpuZoneBeginLean; + QueueGpuZoneEnd gpuZoneEnd; + QueueGpuTime gpuTime; + QueueGpuCalibration gpuCalibration; + QueueGpuContextName gpuContextName; + QueueGpuContextNameFat gpuContextNameFat; + QueueMemAlloc memAlloc; + QueueMemFree memFree; + QueueMemNamePayload memName; + QueueCallstackFat callstackFat; + QueueCallstackAllocFat callstackAllocFat; + QueueCallstackSample callstackSample; + QueueCallstackSampleFat callstackSampleFat; + QueueCallstackFrameSize callstackFrameSize; + QueueCallstackFrame callstackFrame; + QueueSymbolInformation symbolInformation; + QueueCodeInformation codeInformation; + QueueCrashReport crashReport; + QueueSysTime sysTime; + QueueContextSwitch contextSwitch; + QueueThreadWakeup threadWakeup; + QueueTidToPid tidToPid; + QueuePlotConfig plotConfig; + QueueParamSetup paramSetup; + QueueCpuTopology cpuTopology; + }; +}; +#pragma pack() + + +enum { QueueItemSize = sizeof( QueueItem ) }; + +static constexpr size_t QueueDataSize[] = { + sizeof( QueueHeader ), // zone text + sizeof( QueueHeader ), // zone name + sizeof( QueueHeader ) + sizeof( QueueMessage ), + sizeof( QueueHeader ) + sizeof( QueueMessageColor ), + sizeof( QueueHeader ) + sizeof( QueueMessage ), // callstack + sizeof( QueueHeader ) + sizeof( QueueMessageColor ), // callstack + sizeof( QueueHeader ) + sizeof( QueueMessage ), // app info + sizeof( QueueHeader ) + sizeof( QueueZoneBeginLean ), // allocated source location + sizeof( QueueHeader ) + sizeof( QueueZoneBeginLean ), // allocated source location, callstack + sizeof( QueueHeader ), // callstack memory + sizeof( QueueHeader ), // callstack + sizeof( QueueHeader ), // callstack alloc + sizeof( QueueHeader ) + sizeof( QueueCallstackSample ), + sizeof( QueueHeader ) + sizeof( QueueFrameImage ), + sizeof( QueueHeader ) + sizeof( QueueZoneBegin ), + sizeof( QueueHeader ) + sizeof( QueueZoneBegin ), // callstack + sizeof( QueueHeader ) + sizeof( QueueZoneEnd ), + sizeof( QueueHeader ) + sizeof( QueueLockWait ), + sizeof( QueueHeader ) + sizeof( QueueLockObtain ), + sizeof( QueueHeader ) + sizeof( QueueLockRelease ), + sizeof( QueueHeader ) + sizeof( QueueLockWait ), // shared + sizeof( QueueHeader ) + sizeof( QueueLockObtain ), // shared + sizeof( QueueHeader ) + sizeof( QueueLockRelease ), // shared + sizeof( QueueHeader ) + sizeof( QueueLockName ), + sizeof( QueueHeader ) + sizeof( QueueMemAlloc ), + sizeof( QueueHeader ) + sizeof( QueueMemAlloc ), // named + sizeof( QueueHeader ) + sizeof( QueueMemFree ), + sizeof( QueueHeader ) + sizeof( QueueMemFree ), // named + sizeof( QueueHeader ) + sizeof( QueueMemAlloc ), // callstack + sizeof( QueueHeader ) + sizeof( QueueMemAlloc ), // callstack, named + sizeof( QueueHeader ) + sizeof( QueueMemFree ), // callstack + sizeof( QueueHeader ) + sizeof( QueueMemFree ), // callstack, named + sizeof( QueueHeader ) + sizeof( QueueGpuZoneBegin ), + sizeof( QueueHeader ) + sizeof( QueueGpuZoneBegin ), // callstack + sizeof( QueueHeader ) + sizeof( QueueGpuZoneBeginLean ),// allocated source location + sizeof( QueueHeader ) + sizeof( QueueGpuZoneBeginLean ),// allocated source location, callstack + sizeof( QueueHeader ) + sizeof( QueueGpuZoneEnd ), + sizeof( QueueHeader ) + sizeof( QueueGpuZoneBegin ), // serial + sizeof( QueueHeader ) + sizeof( QueueGpuZoneBegin ), // serial, callstack + sizeof( QueueHeader ) + sizeof( QueueGpuZoneBeginLean ),// serial, allocated source location + sizeof( QueueHeader ) + 
sizeof( QueueGpuZoneBeginLean ),// serial, allocated source location, callstack + sizeof( QueueHeader ) + sizeof( QueueGpuZoneEnd ), // serial + sizeof( QueueHeader ) + sizeof( QueuePlotData ), + sizeof( QueueHeader ) + sizeof( QueueContextSwitch ), + sizeof( QueueHeader ) + sizeof( QueueThreadWakeup ), + sizeof( QueueHeader ) + sizeof( QueueGpuTime ), + sizeof( QueueHeader ) + sizeof( QueueGpuContextName ), + // above items must be first + sizeof( QueueHeader ), // terminate + sizeof( QueueHeader ), // keep alive + sizeof( QueueHeader ) + sizeof( QueueThreadContext ), + sizeof( QueueHeader ) + sizeof( QueueGpuCalibration ), + sizeof( QueueHeader ), // crash + sizeof( QueueHeader ) + sizeof( QueueCrashReport ), + sizeof( QueueHeader ) + sizeof( QueueZoneValidation ), + sizeof( QueueHeader ) + sizeof( QueueZoneColor ), + sizeof( QueueHeader ) + sizeof( QueueZoneValue ), + sizeof( QueueHeader ) + sizeof( QueueFrameMark ), // continuous frames + sizeof( QueueHeader ) + sizeof( QueueFrameMark ), // start + sizeof( QueueHeader ) + sizeof( QueueFrameMark ), // end + sizeof( QueueHeader ) + sizeof( QueueSourceLocation ), + sizeof( QueueHeader ) + sizeof( QueueLockAnnounce ), + sizeof( QueueHeader ) + sizeof( QueueLockTerminate ), + sizeof( QueueHeader ) + sizeof( QueueLockMark ), + sizeof( QueueHeader ) + sizeof( QueueMessageLiteral ), + sizeof( QueueHeader ) + sizeof( QueueMessageColorLiteral ), + sizeof( QueueHeader ) + sizeof( QueueMessageLiteral ), // callstack + sizeof( QueueHeader ) + sizeof( QueueMessageColorLiteral ), // callstack + sizeof( QueueHeader ) + sizeof( QueueGpuNewContext ), + sizeof( QueueHeader ) + sizeof( QueueCallstackFrameSize ), + sizeof( QueueHeader ) + sizeof( QueueCallstackFrame ), + sizeof( QueueHeader ) + sizeof( QueueSymbolInformation ), + sizeof( QueueHeader ) + sizeof( QueueCodeInformation ), + sizeof( QueueHeader ) + sizeof( QueueSysTime ), + sizeof( QueueHeader ) + sizeof( QueueTidToPid ), + sizeof( QueueHeader ) + sizeof( QueuePlotConfig ), + sizeof( QueueHeader ) + sizeof( QueueParamSetup ), + sizeof( QueueHeader ), // server query acknowledgement + sizeof( QueueHeader ), // source code not available + sizeof( QueueHeader ) + sizeof( QueueCpuTopology ), + sizeof( QueueHeader ), // single string data + sizeof( QueueHeader ), // second string data + sizeof( QueueHeader ) + sizeof( QueueMemNamePayload ), + // keep all QueueStringTransfer below + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // string data + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // thread name + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // plot name + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // allocated source location payload + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // callstack payload + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // callstack alloc payload + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // frame name + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // frame image data + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // external name + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // external thread name + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // symbol code + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // source code +}; + +static_assert( QueueItemSize == 32, "Queue item size not 32 bytes" ); +static_assert( sizeof( QueueDataSize ) / sizeof( size_t ) == (uint8_t)QueueType::NUM_TYPES, "QueueDataSize mismatch" ); +static_assert( sizeof( 
void* ) <= sizeof( uint64_t ), "Pointer size > 8 bytes" ); +static_assert( sizeof( void* ) == sizeof( uintptr_t ), "Pointer size != uintptr_t" ); + +} + +#endif diff --git a/Source/ThirdParty/tracy/common/TracySocket.cpp b/Source/ThirdParty/tracy/common/TracySocket.cpp new file mode 100644 index 000000000..f16569b06 --- /dev/null +++ b/Source/ThirdParty/tracy/common/TracySocket.cpp @@ -0,0 +1,748 @@ +#include <assert.h> +#include <inttypes.h> +#include <new> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/types.h> + +#include "TracyAlloc.hpp" +#include "TracySocket.hpp" + +#ifdef _WIN32 +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include <winsock2.h> +# include <ws2tcpip.h> +# ifdef _MSC_VER +# pragma warning(disable:4244) +# pragma warning(disable:4267) +# endif +# define poll WSAPoll +#else +# include <arpa/inet.h> +# include <sys/socket.h> +# include <sys/param.h> +# include <errno.h> +# include <fcntl.h> +# include <netinet/in.h> +# include <netdb.h> +# include <unistd.h> +# include <poll.h> +#endif + +#ifndef MSG_NOSIGNAL +# define MSG_NOSIGNAL 0 +#endif + +namespace tracy +{ + +#ifdef _WIN32 +typedef SOCKET socket_t; +#else +typedef int socket_t; +#endif + +#ifdef _WIN32 +struct __wsinit +{ + __wsinit() + { + WSADATA wsaData; + if( WSAStartup( MAKEWORD( 2, 2 ), &wsaData ) != 0 ) + { + fprintf( stderr, "Cannot init winsock.\n" ); + exit( 1 ); + } + } +}; + +void InitWinSock() +{ + static __wsinit init; +} +#endif + + +enum { BufSize = 128 * 1024 }; + +Socket::Socket() + : m_buf( (char*)tracy_malloc( BufSize ) ) + , m_bufPtr( nullptr ) + , m_sock( -1 ) + , m_bufLeft( 0 ) + , m_ptr( nullptr ) +{ +#ifdef _WIN32 + InitWinSock(); +#endif +} + +Socket::Socket( int sock ) + : m_buf( (char*)tracy_malloc( BufSize ) ) + , m_bufPtr( nullptr ) + , m_sock( sock ) + , m_bufLeft( 0 ) + , m_ptr( nullptr ) +{ +} + +Socket::~Socket() +{ + tracy_free( m_buf ); + if( m_sock.load( std::memory_order_relaxed ) != -1 ) + { + Close(); + } + if( m_ptr ) + { + freeaddrinfo( m_res ); +#ifdef _WIN32 + closesocket( m_connSock ); +#else + close( m_connSock ); +#endif + } +} + +bool Socket::Connect( const char* addr, uint16_t port ) +{ + assert( !IsValid() ); + + if( m_ptr ) + { + const auto c = connect( m_connSock, m_ptr->ai_addr, m_ptr->ai_addrlen ); + if( c == -1 ) + { +#if defined _WIN32 + const auto err = WSAGetLastError(); + if( err == WSAEALREADY || err == WSAEINPROGRESS ) return false; + if( err != WSAEISCONN ) + { + freeaddrinfo( m_res ); + closesocket( m_connSock ); + m_ptr = nullptr; + return false; + } +#else + const auto err = errno; + if( err == EALREADY || err == EINPROGRESS ) return false; + if( err != EISCONN ) + { + freeaddrinfo( m_res ); + close( m_connSock ); + m_ptr = nullptr; + return false; + } +#endif + } + +#if defined _WIN32 + u_long nonblocking = 0; + ioctlsocket( m_connSock, FIONBIO, &nonblocking ); +#else + int flags = fcntl( m_connSock, F_GETFL, 0 ); + fcntl( m_connSock, F_SETFL, flags & ~O_NONBLOCK ); +#endif + m_sock.store( m_connSock, std::memory_order_relaxed ); + freeaddrinfo( m_res ); + m_ptr = nullptr; + return true; + } + + struct addrinfo hints; + struct addrinfo *res, *ptr; + + memset( &hints, 0, sizeof( hints ) ); + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + + char portbuf[32]; + sprintf( portbuf, "%" PRIu16, port ); + + if( getaddrinfo( addr, portbuf, &hints, &res ) != 0 ) return false; + int sock = 0; + for( ptr = res; ptr; ptr = ptr->ai_next ) + { + if( ( sock = socket( ptr->ai_family, ptr->ai_socktype, ptr->ai_protocol ) ) == -1 ) continue; +#if defined __APPLE__ + int val = 1; + setsockopt( sock, SOL_SOCKET, SO_NOSIGPIPE, &val, sizeof( val ) ); +#endif +#if defined _WIN32 + u_long nonblocking = 1; + ioctlsocket( sock, 
FIONBIO, &nonblocking ); +#else + int flags = fcntl( sock, F_GETFL, 0 ); + fcntl( sock, F_SETFL, flags | O_NONBLOCK ); +#endif + if( connect( sock, ptr->ai_addr, ptr->ai_addrlen ) == 0 ) + { + break; + } + else + { +#if defined _WIN32 + const auto err = WSAGetLastError(); + if( err != WSAEWOULDBLOCK ) + { + closesocket( sock ); + continue; + } +#else + if( errno != EINPROGRESS ) + { + close( sock ); + continue; + } +#endif + } + m_res = res; + m_ptr = ptr; + m_connSock = sock; + return false; + } + freeaddrinfo( res ); + if( !ptr ) return false; + +#if defined _WIN32 + u_long nonblocking = 0; + ioctlsocket( sock, FIONBIO, &nonblocking ); +#else + int flags = fcntl( sock, F_GETFL, 0 ); + fcntl( sock, F_SETFL, flags & ~O_NONBLOCK ); +#endif + + m_sock.store( sock, std::memory_order_relaxed ); + return true; +} + +bool Socket::ConnectBlocking( const char* addr, uint16_t port ) +{ + assert( !IsValid() ); + assert( !m_ptr ); + + struct addrinfo hints; + struct addrinfo *res, *ptr; + + memset( &hints, 0, sizeof( hints ) ); + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + + char portbuf[32]; + sprintf( portbuf, "%" PRIu16, port ); + + if( getaddrinfo( addr, portbuf, &hints, &res ) != 0 ) return false; + int sock = 0; + for( ptr = res; ptr; ptr = ptr->ai_next ) + { + if( ( sock = socket( ptr->ai_family, ptr->ai_socktype, ptr->ai_protocol ) ) == -1 ) continue; +#if defined __APPLE__ + int val = 1; + setsockopt( sock, SOL_SOCKET, SO_NOSIGPIPE, &val, sizeof( val ) ); +#endif + if( connect( sock, ptr->ai_addr, ptr->ai_addrlen ) == -1 ) + { +#ifdef _WIN32 + closesocket( sock ); +#else + close( sock ); +#endif + continue; + } + break; + } + freeaddrinfo( res ); + if( !ptr ) return false; + + m_sock.store( sock, std::memory_order_relaxed ); + return true; +} + +void Socket::Close() +{ + const auto sock = m_sock.load( std::memory_order_relaxed ); + assert( sock != -1 ); +#ifdef _WIN32 + closesocket( sock ); +#else + close( sock ); +#endif + m_sock.store( -1, std::memory_order_relaxed ); +} + +int Socket::Send( const void* _buf, int len ) +{ + const auto sock = m_sock.load( std::memory_order_relaxed ); + auto buf = (const char*)_buf; + assert( sock != -1 ); + auto start = buf; + while( len > 0 ) + { + auto ret = send( sock, buf, len, MSG_NOSIGNAL ); + if( ret == -1 ) return -1; + len -= ret; + buf += ret; + } + return int( buf - start ); +} + +int Socket::GetSendBufSize() +{ + const auto sock = m_sock.load( std::memory_order_relaxed ); + int bufSize; +#if defined _WIN32 + int sz = sizeof( bufSize ); + getsockopt( sock, SOL_SOCKET, SO_SNDBUF, (char*)&bufSize, &sz ); +#else + socklen_t sz = sizeof( bufSize ); + getsockopt( sock, SOL_SOCKET, SO_SNDBUF, &bufSize, &sz ); +#endif + return bufSize; +} + +int Socket::RecvBuffered( void* buf, int len, int timeout ) +{ + if( len <= m_bufLeft ) + { + memcpy( buf, m_bufPtr, len ); + m_bufPtr += len; + m_bufLeft -= len; + return len; + } + + if( m_bufLeft > 0 ) + { + memcpy( buf, m_bufPtr, m_bufLeft ); + const auto ret = m_bufLeft; + m_bufLeft = 0; + return ret; + } + + if( len >= BufSize ) return Recv( buf, len, timeout ); + + m_bufLeft = Recv( m_buf, BufSize, timeout ); + if( m_bufLeft <= 0 ) return m_bufLeft; + + const auto sz = len < m_bufLeft ? 
len : m_bufLeft; + memcpy( buf, m_buf, sz ); + m_bufPtr = m_buf + sz; + m_bufLeft -= sz; + return sz; +} + +int Socket::Recv( void* _buf, int len, int timeout ) +{ + const auto sock = m_sock.load( std::memory_order_relaxed ); + auto buf = (char*)_buf; + + struct pollfd fd; + fd.fd = (socket_t)sock; + fd.events = POLLIN; + + if( poll( &fd, 1, timeout ) > 0 ) + { + return recv( sock, buf, len, 0 ); + } + else + { + return -1; + } +} + +int Socket::ReadUpTo( void* _buf, int len, int timeout ) +{ + const auto sock = m_sock.load( std::memory_order_relaxed ); + auto buf = (char*)_buf; + + int rd = 0; + while( len > 0 ) + { + const auto res = recv( sock, buf, len, 0 ); + if( res == 0 ) break; + if( res == -1 ) return -1; + len -= res; + rd += res; + buf += res; + } + return rd; +} + +bool Socket::Read( void* buf, int len, int timeout ) +{ + auto cbuf = (char*)buf; + while( len > 0 ) + { + if( !ReadImpl( cbuf, len, timeout ) ) return false; + } + return true; +} + +bool Socket::ReadImpl( char*& buf, int& len, int timeout ) +{ + const auto sz = RecvBuffered( buf, len, timeout ); + switch( sz ) + { + case 0: + return false; + case -1: +#ifdef _WIN32 + { + auto err = WSAGetLastError(); + if( err == WSAECONNABORTED || err == WSAECONNRESET ) return false; + } +#endif + break; + default: + len -= sz; + buf += sz; + break; + } + return true; +} + +bool Socket::ReadRaw( void* _buf, int len, int timeout ) +{ + auto buf = (char*)_buf; + while( len > 0 ) + { + const auto sz = Recv( buf, len, timeout ); + if( sz <= 0 ) return false; + len -= sz; + buf += sz; + } + return true; +} + +bool Socket::HasData() +{ + const auto sock = m_sock.load( std::memory_order_relaxed ); + if( m_bufLeft > 0 ) return true; + + struct pollfd fd; + fd.fd = (socket_t)sock; + fd.events = POLLIN; + + return poll( &fd, 1, 0 ) > 0; +} + +bool Socket::IsValid() const +{ + return m_sock.load( std::memory_order_relaxed ) >= 0; +} + + +ListenSocket::ListenSocket() + : m_sock( -1 ) +{ +#ifdef _WIN32 + InitWinSock(); +#endif +} + +ListenSocket::~ListenSocket() +{ + if( m_sock != -1 ) Close(); +} + +static int addrinfo_and_socket_for_family( uint16_t port, int ai_family, struct addrinfo** res ) +{ + struct addrinfo hints; + memset( &hints, 0, sizeof( hints ) ); + hints.ai_family = ai_family; + hints.ai_socktype = SOCK_STREAM; +#ifndef TRACY_ONLY_LOCALHOST + const char* onlyLocalhost = getenv( "TRACY_ONLY_LOCALHOST" ); + if( !onlyLocalhost || onlyLocalhost[0] != '1' ) + { + hints.ai_flags = AI_PASSIVE; + } +#endif + char portbuf[32]; + sprintf( portbuf, "%" PRIu16, port ); + if( getaddrinfo( nullptr, portbuf, &hints, res ) != 0 ) return -1; + int sock = socket( (*res)->ai_family, (*res)->ai_socktype, (*res)->ai_protocol ); + if (sock == -1) freeaddrinfo( *res ); + return sock; +} + +bool ListenSocket::Listen( uint16_t port, int backlog ) +{ + assert( m_sock == -1 ); + + struct addrinfo* res = nullptr; + +#if !defined TRACY_ONLY_IPV4 && !defined TRACY_ONLY_LOCALHOST + const char* onlyIPv4 = getenv( "TRACY_ONLY_IPV4" ); + if( !onlyIPv4 || onlyIPv4[0] != '1' ) + { + m_sock = addrinfo_and_socket_for_family( port, AF_INET6, &res ); + } +#endif + if (m_sock == -1) + { + // IPV6 protocol may not be available/is disabled. 
Try to create a socket + // with the IPV4 protocol + m_sock = addrinfo_and_socket_for_family( port, AF_INET, &res ); + if( m_sock == -1 ) return false; + } +#if defined _WIN32 || defined __CYGWIN__ + unsigned long val = 0; + setsockopt( m_sock, IPPROTO_IPV6, IPV6_V6ONLY, (const char*)&val, sizeof( val ) ); +#elif defined BSD + int val = 0; + setsockopt( m_sock, IPPROTO_IPV6, IPV6_V6ONLY, (const char*)&val, sizeof( val ) ); + val = 1; + setsockopt( m_sock, SOL_SOCKET, SO_REUSEADDR, &val, sizeof( val ) ); +#else + int val = 1; + setsockopt( m_sock, SOL_SOCKET, SO_REUSEADDR, &val, sizeof( val ) ); +#endif + if( bind( m_sock, res->ai_addr, res->ai_addrlen ) == -1 ) { freeaddrinfo( res ); Close(); return false; } + if( listen( m_sock, backlog ) == -1 ) { freeaddrinfo( res ); Close(); return false; } + freeaddrinfo( res ); + return true; +} + +Socket* ListenSocket::Accept() +{ + struct sockaddr_storage remote; + socklen_t sz = sizeof( remote ); + + struct pollfd fd; + fd.fd = (socket_t)m_sock; + fd.events = POLLIN; + + if( poll( &fd, 1, 10 ) > 0 ) + { + int sock = accept( m_sock, (sockaddr*)&remote, &sz); + if( sock == -1 ) return nullptr; + +#if defined __APPLE__ + int val = 1; + setsockopt( sock, SOL_SOCKET, SO_NOSIGPIPE, &val, sizeof( val ) ); +#endif + + auto ptr = (Socket*)tracy_malloc( sizeof( Socket ) ); + new(ptr) Socket( sock ); + return ptr; + } + else + { + return nullptr; + } +} + +void ListenSocket::Close() +{ + assert( m_sock != -1 ); +#ifdef _WIN32 + closesocket( m_sock ); +#else + close( m_sock ); +#endif + m_sock = -1; +} + +UdpBroadcast::UdpBroadcast() + : m_sock( -1 ) +{ +#ifdef _WIN32 + InitWinSock(); +#endif +} + +UdpBroadcast::~UdpBroadcast() +{ + if( m_sock != -1 ) Close(); +} + +bool UdpBroadcast::Open( const char* addr, uint16_t port ) +{ + assert( m_sock == -1 ); + + struct addrinfo hints; + struct addrinfo *res, *ptr; + + memset( &hints, 0, sizeof( hints ) ); + hints.ai_family = AF_INET; + hints.ai_socktype = SOCK_DGRAM; + + char portbuf[32]; + sprintf( portbuf, "%" PRIu16, port ); + + if( getaddrinfo( addr, portbuf, &hints, &res ) != 0 ) return false; + int sock = 0; + for( ptr = res; ptr; ptr = ptr->ai_next ) + { + if( ( sock = socket( ptr->ai_family, ptr->ai_socktype, ptr->ai_protocol ) ) == -1 ) continue; +#if defined __APPLE__ + int val = 1; + setsockopt( sock, SOL_SOCKET, SO_NOSIGPIPE, &val, sizeof( val ) ); +#endif +#if defined _WIN32 + unsigned long broadcast = 1; + if( setsockopt( sock, SOL_SOCKET, SO_BROADCAST, (const char*)&broadcast, sizeof( broadcast ) ) == -1 ) +#else + int broadcast = 1; + if( setsockopt( sock, SOL_SOCKET, SO_BROADCAST, &broadcast, sizeof( broadcast ) ) == -1 ) +#endif + { +#ifdef _WIN32 + closesocket( sock ); +#else + close( sock ); +#endif + continue; + } + break; + } + freeaddrinfo( res ); + if( !ptr ) return false; + + m_sock = sock; + inet_pton( AF_INET, addr, &m_addr ); + return true; +} + +void UdpBroadcast::Close() +{ + assert( m_sock != -1 ); +#ifdef _WIN32 + closesocket( m_sock ); +#else + close( m_sock ); +#endif + m_sock = -1; +} + +int UdpBroadcast::Send( uint16_t port, const void* data, int len ) +{ + assert( m_sock != -1 ); + struct sockaddr_in addr; + addr.sin_family = AF_INET; + addr.sin_port = htons( port ); + addr.sin_addr.s_addr = m_addr; + return sendto( m_sock, (const char*)data, len, MSG_NOSIGNAL, (sockaddr*)&addr, sizeof( addr ) ); +} + +IpAddress::IpAddress() + : m_number( 0 ) +{ + *m_text = '\0'; +} + +IpAddress::~IpAddress() +{ +} + +void IpAddress::Set( const struct sockaddr& addr ) +{ +#if defined _WIN32 && ( 
!defined NTDDI_WIN10 || NTDDI_VERSION < NTDDI_WIN10 ) + struct sockaddr_in tmp; + memcpy( &tmp, &addr, sizeof( tmp ) ); + auto ai = &tmp; +#else + auto ai = (const struct sockaddr_in*)&addr; +#endif + inet_ntop( AF_INET, &ai->sin_addr, m_text, 17 ); + m_number = ai->sin_addr.s_addr; +} + +UdpListen::UdpListen() + : m_sock( -1 ) +{ +#ifdef _WIN32 + InitWinSock(); +#endif +} + +UdpListen::~UdpListen() +{ + if( m_sock != -1 ) Close(); +} + +bool UdpListen::Listen( uint16_t port ) +{ + assert( m_sock == -1 ); + + int sock; + if( ( sock = socket( AF_INET, SOCK_DGRAM, 0 ) ) == -1 ) return false; + +#if defined __APPLE__ + int val = 1; + setsockopt( sock, SOL_SOCKET, SO_NOSIGPIPE, &val, sizeof( val ) ); +#endif +#if defined _WIN32 + unsigned long reuse = 1; + setsockopt( sock, SOL_SOCKET, SO_REUSEADDR, (const char*)&reuse, sizeof( reuse ) ); +#else + int reuse = 1; + setsockopt( sock, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof( reuse ) ); +#endif +#if defined _WIN32 + unsigned long broadcast = 1; + if( setsockopt( sock, SOL_SOCKET, SO_BROADCAST, (const char*)&broadcast, sizeof( broadcast ) ) == -1 ) +#else + int broadcast = 1; + if( setsockopt( sock, SOL_SOCKET, SO_BROADCAST, &broadcast, sizeof( broadcast ) ) == -1 ) +#endif + { +#ifdef _WIN32 + closesocket( sock ); +#else + close( sock ); +#endif + return false; + } + + struct sockaddr_in addr; + addr.sin_family = AF_INET; + addr.sin_port = htons( port ); + addr.sin_addr.s_addr = INADDR_ANY; + + if( bind( sock, (sockaddr*)&addr, sizeof( addr ) ) == -1 ) + { +#ifdef _WIN32 + closesocket( sock ); +#else + close( sock ); +#endif + return false; + } + + m_sock = sock; + return true; +} + +void UdpListen::Close() +{ + assert( m_sock != -1 ); +#ifdef _WIN32 + closesocket( m_sock ); +#else + close( m_sock ); +#endif + m_sock = -1; +} + +const char* UdpListen::Read( size_t& len, IpAddress& addr, int timeout ) +{ + static char buf[2048]; + + struct pollfd fd; + fd.fd = (socket_t)m_sock; + fd.events = POLLIN; + if( poll( &fd, 1, timeout ) <= 0 ) return nullptr; + + sockaddr sa; + socklen_t salen = sizeof( struct sockaddr ); + len = (size_t)recvfrom( m_sock, buf, 2048, 0, &sa, &salen ); + addr.Set( sa ); + + return buf; +} + +}
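The UdpBroadcast and UdpListen pair above implements Tracy's client discovery: the profiled application periodically broadcasts a small announcement datagram, and a listening server receives it together with the sender's address. A minimal server-side sketch of that loop, assuming upstream Tracy's broadcast port (8086) and an arbitrary 100 ms poll interval; both values are illustrative assumptions, not taken from this patch:

    #include <stdio.h>
    #include "TracySocket.hpp"

    void DiscoverClients()
    {
        tracy::UdpListen listen;
        if( !listen.Listen( 8086 ) ) return; // assumed broadcast port
        for( ;; )
        {
            size_t len;
            tracy::IpAddress addr;
            // Polls for up to 100 ms; returns nullptr when nothing arrived
            const char* msg = listen.Read( len, addr, 100 );
            if( !msg ) continue;
            // addr now holds the announcing client's IPv4 address
            printf( "announce (%d bytes) from %s\n", (int)len, addr.GetText() );
        }
    }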
diff --git a/Source/ThirdParty/tracy/common/TracySocket.hpp b/Source/ThirdParty/tracy/common/TracySocket.hpp new file mode 100644 index 000000000..4fbb3278a --- /dev/null +++ b/Source/ThirdParty/tracy/common/TracySocket.hpp @@ -0,0 +1,154 @@ +#ifndef __TRACYSOCKET_HPP__ +#define __TRACYSOCKET_HPP__ + +#include <atomic> +#include <stdint.h> + +struct addrinfo; +struct sockaddr; + +namespace tracy +{ + +#ifdef _WIN32 +void InitWinSock(); +#endif + +class Socket +{ +public: + Socket(); + Socket( int sock ); + ~Socket(); + + bool Connect( const char* addr, uint16_t port ); + bool ConnectBlocking( const char* addr, uint16_t port ); + void Close(); + + int Send( const void* buf, int len ); + int GetSendBufSize(); + + int ReadUpTo( void* buf, int len, int timeout ); + bool Read( void* buf, int len, int timeout ); + + template<typename ShouldExit> + bool Read( void* buf, int len, int timeout, ShouldExit exitCb ) + { + auto cbuf = (char*)buf; + while( len > 0 ) + { + if( exitCb() ) return false; + if( !ReadImpl( cbuf, len, timeout ) ) return false; + } + return true; + } + + bool ReadRaw( void* buf, int len, int timeout ); + bool HasData(); + bool IsValid() const; + + Socket( const Socket& ) = delete; + Socket( Socket&& ) = delete; + Socket& operator=( const Socket& ) = delete; + Socket& operator=( Socket&& ) = delete; + +private: + int RecvBuffered( void* buf, int len, int timeout ); + int Recv( void* buf, int len, int timeout ); + + bool ReadImpl( char*& buf, int& len, int timeout ); + + char* m_buf; + char* m_bufPtr; + std::atomic<int> m_sock; + int m_bufLeft; + + struct addrinfo *m_res; + struct addrinfo *m_ptr; + int m_connSock; +}; + +class ListenSocket +{ +public: + ListenSocket(); + ~ListenSocket(); + + bool Listen( uint16_t port, int backlog ); + Socket* Accept(); + void Close(); + + ListenSocket( const ListenSocket& ) = delete; + ListenSocket( ListenSocket&& ) = delete; + ListenSocket& operator=( const ListenSocket& ) = delete; + ListenSocket& operator=( ListenSocket&& ) = delete; + +private: + int m_sock; +}; + +class UdpBroadcast +{ +public: + UdpBroadcast(); + ~UdpBroadcast(); + + bool Open( const char* addr, uint16_t port ); + void Close(); + + int Send( uint16_t port, const void* data, int len ); + + UdpBroadcast( const UdpBroadcast& ) = delete; + UdpBroadcast( UdpBroadcast&& ) = delete; + UdpBroadcast& operator=( const UdpBroadcast& ) = delete; + UdpBroadcast& operator=( UdpBroadcast&& ) = delete; + +private: + int m_sock; + uint32_t m_addr; +}; + +class IpAddress +{ +public: + IpAddress(); + ~IpAddress(); + + void Set( const struct sockaddr& addr ); + + uint32_t GetNumber() const { return m_number; } + const char* GetText() const { return m_text; } + + IpAddress( const IpAddress& ) = delete; + IpAddress( IpAddress&& ) = delete; + IpAddress& operator=( const IpAddress& ) = delete; + IpAddress& operator=( IpAddress&& ) = delete; + +private: + uint32_t m_number; + char m_text[17]; +}; + +class UdpListen +{ +public: + UdpListen(); + ~UdpListen(); + + bool Listen( uint16_t port ); + void Close(); + + const char* Read( size_t& len, IpAddress& addr, int timeout ); + + UdpListen( const UdpListen& ) = delete; + UdpListen( UdpListen&& ) = delete; + UdpListen& operator=( const UdpListen& ) = delete; + UdpListen& operator=( UdpListen&& ) = delete; + +private: + int m_sock; +}; + +} + +#endif
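Socket::Read above is the workhorse of the protocol layer: it loops over the internal buffered receive until exactly len bytes have arrived or the peer disconnects, so callers can treat the TCP stream as a sequence of fixed-size records. A sketch of how a server might accept one connection and read such a record; the port, record size, and timeout are illustrative assumptions:

    #include "TracyAlloc.hpp"
    #include "TracySocket.hpp"

    bool ReadOneRecord( uint16_t port )
    {
        tracy::ListenSocket listen;
        if( !listen.Listen( port, 4 ) ) return false;
        tracy::Socket* sock = nullptr;
        while( !sock ) sock = listen.Accept(); // Accept() polls in 10 ms slices
        char record[64];
        // Returns false if the connection drops before all 64 bytes arrive
        const bool ok = sock->Read( record, sizeof( record ), 10000 );
        // Accept() allocates the Socket with tracy_malloc, so release it the same way
        sock->~Socket();
        tracy_free( sock );
        return ok;
    }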
diff --git a/Source/ThirdParty/tracy/common/TracySystem.cpp b/Source/ThirdParty/tracy/common/TracySystem.cpp new file mode 100644 index 000000000..25ccf9f8a --- /dev/null +++ b/Source/ThirdParty/tracy/common/TracySystem.cpp @@ -0,0 +1,239 @@ +#if defined _MSC_VER || defined __CYGWIN__ || defined _WIN32 +# ifndef WIN32_LEAN_AND_MEAN +# define WIN32_LEAN_AND_MEAN +# endif +# ifndef NOMINMAX +# define NOMINMAX +# endif +#endif +#ifdef _MSC_VER +# pragma warning(disable:4996) +#endif +#if defined _WIN32 || defined __CYGWIN__ +# include <windows.h> +#else +# include <pthread.h> +# include <string.h> +# include <unistd.h> +#endif + +#ifdef __linux__ +# ifdef __ANDROID__ +# include <sys/types.h> +# else +# include <sys/syscall.h> +# endif +# include <fcntl.h> +#elif defined __FreeBSD__ +# include <sys/thr.h> +#elif defined __NetBSD__ || defined __DragonFly__ +# include <sys/lwp.h> +#endif + +#ifdef __MINGW32__ +# define __STDC_FORMAT_MACROS +#endif +#include <inttypes.h> +#include <stdio.h> +#include <string.h> + +#include "TracySystem.hpp" + +#if defined _WIN32 || defined __CYGWIN__ +extern "C" typedef HRESULT (WINAPI *t_SetThreadDescription)( HANDLE, PCWSTR ); +extern "C" typedef HRESULT (WINAPI *t_GetThreadDescription)( HANDLE, PWSTR* ); +#endif + +#ifdef TRACY_ENABLE +# include <atomic> +# include "TracyAlloc.hpp" +#endif + +namespace tracy +{ + +namespace detail +{ + +TRACY_API uint64_t GetThreadHandleImpl() +{ +#if defined _WIN32 || defined __CYGWIN__ + static_assert( sizeof( decltype( GetCurrentThreadId() ) ) <= sizeof( uint64_t ), "Thread handle too big to fit in protocol" ); + return uint64_t( GetCurrentThreadId() ); +#elif defined __APPLE__ + uint64_t id; + pthread_threadid_np( pthread_self(), &id ); + return id; +#elif defined __ANDROID__ + return (uint64_t)gettid(); +#elif defined __linux__ + return (uint64_t)syscall( SYS_gettid ); +#elif defined __FreeBSD__ + long id; + thr_self( &id ); + return id; +#elif defined __NetBSD__ + return _lwp_self(); +#elif defined __DragonFly__ + return lwp_gettid(); +#elif defined __OpenBSD__ + return getthrid(); +#else + static_assert( sizeof( decltype( pthread_self() ) ) <= sizeof( uint64_t ), "Thread handle too big to fit in protocol" ); + return uint64_t( pthread_self() ); +#endif + +} + +} + +#ifdef TRACY_ENABLE +struct ThreadNameData +{ + uint64_t id; + const char* name; + ThreadNameData* next; +}; +std::atomic<ThreadNameData*>& GetThreadNameData(); +TRACY_API void InitRPMallocThread(); +#endif + +TRACY_API void SetThreadName( const char* name ) +{ +#if defined _WIN32 || defined __CYGWIN__ + static auto _SetThreadDescription = (t_SetThreadDescription)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "SetThreadDescription" ); + if( _SetThreadDescription ) + { + wchar_t buf[256]; + mbstowcs( buf, name, 256 ); + _SetThreadDescription( GetCurrentThread(), buf ); + } + else + { +# if defined _MSC_VER + const DWORD MS_VC_EXCEPTION=0x406D1388; +# pragma pack( push, 8 ) + struct THREADNAME_INFO + { + DWORD dwType; + LPCSTR szName; + DWORD dwThreadID; + DWORD dwFlags; + }; +# pragma pack(pop) + + DWORD ThreadId = GetCurrentThreadId(); + THREADNAME_INFO info; + info.dwType = 0x1000; + info.szName = name; + info.dwThreadID = ThreadId; + info.dwFlags = 0; + + __try + { + RaiseException( MS_VC_EXCEPTION, 0, sizeof(info)/sizeof(ULONG_PTR), (ULONG_PTR*)&info ); + } + __except(EXCEPTION_EXECUTE_HANDLER) + { + } +# endif + } +#elif defined _GNU_SOURCE && !defined __EMSCRIPTEN__ && !defined __CYGWIN__ + { + const auto sz = strlen( name ); + if( sz <= 15 ) + { + pthread_setname_np( pthread_self(), name ); + } + else + { + char buf[16]; + memcpy( buf, name, 15 ); + buf[15] = '\0'; + pthread_setname_np( pthread_self(), buf ); + } + } +#endif +#ifdef TRACY_ENABLE + { + InitRPMallocThread(); + const auto sz = strlen( name ); + char* buf = (char*)tracy_malloc( sz+1 ); + memcpy( buf, name, sz ); + buf[sz] = '\0'; + auto data = (ThreadNameData*)tracy_malloc( sizeof( ThreadNameData ) ); + data->id = detail::GetThreadHandleImpl(); + data->name = buf; + data->next = GetThreadNameData().load( std::memory_order_relaxed ); + while( !GetThreadNameData().compare_exchange_weak( data->next, data, std::memory_order_release, std::memory_order_relaxed ) ) {} + } +#endif +}
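The compare_exchange_weak loop that closes SetThreadName is a lock-free list prepend: data->next is seeded with the current head, and the CAS publishes the node only if no other thread has moved the head in the meantime; on failure the intrinsic reloads data->next with the new head and the loop retries. Concurrent registrations therefore cannot lose entries, and GetThreadName below can walk the list without taking a lock. The same pattern in isolation, with illustrative names:

    #include <atomic>

    struct Node { int value; Node* next; };
    static std::atomic<Node*> g_head { nullptr };

    void Push( Node* n )
    {
        n->next = g_head.load( std::memory_order_relaxed );
        // Retry until the head still equals n->next at the moment of the swap
        while( !g_head.compare_exchange_weak( n->next, n,
            std::memory_order_release, std::memory_order_relaxed ) ) {}
    }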
 +TRACY_API const char* GetThreadName( uint64_t id ) +{ + static char buf[256]; +#ifdef TRACY_ENABLE + auto ptr = GetThreadNameData().load( std::memory_order_relaxed ); + while( ptr ) + { + if( ptr->id == id ) + { + return ptr->name; + } + ptr = ptr->next; + } +#else +# if defined _WIN32 || defined __CYGWIN__ + static auto _GetThreadDescription = (t_GetThreadDescription)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "GetThreadDescription" ); + if( _GetThreadDescription ) + { + auto hnd = OpenThread( THREAD_QUERY_LIMITED_INFORMATION, FALSE, (DWORD)id ); + if( hnd != 0 ) + { + PWSTR tmp; + _GetThreadDescription( hnd, &tmp ); + auto ret = wcstombs( buf, tmp, 256 ); + CloseHandle( hnd ); + if( ret != 0 ) + { + return buf; + } + } + } +# elif defined __linux__ + int cs, fd; + char path[32]; +# ifdef __ANDROID__ + int tid = gettid(); +# else + int tid = (int) syscall( SYS_gettid ); +# endif + snprintf( path, sizeof( path ), "/proc/self/task/%d/comm", tid ); + sprintf( buf, "%" PRIu64, id ); +# ifndef __ANDROID__ + pthread_setcancelstate( PTHREAD_CANCEL_DISABLE, &cs ); +# endif + if ( ( fd = open( path, O_RDONLY ) ) > 0) { + int len = read( fd, buf, 255 ); + if( len > 0 ) + { + buf[len] = 0; + if( len > 1 && buf[len-1] == '\n' ) + { + buf[len-1] = 0; + } + } + close( fd ); + } +# ifndef __ANDROID__ + pthread_setcancelstate( cs, 0 ); +# endif + return buf; +# endif +#endif + sprintf( buf, "%" PRIu64, id ); + return buf; +} + +} diff --git a/Source/ThirdParty/tracy/common/TracySystem.hpp b/Source/ThirdParty/tracy/common/TracySystem.hpp new file mode 100644 index 000000000..f285b762a --- /dev/null +++ b/Source/ThirdParty/tracy/common/TracySystem.hpp @@ -0,0 +1,95 @@ +#ifndef __TRACYSYSTEM_HPP__ +#define __TRACYSYSTEM_HPP__ + +#include <stdint.h> + +// Tracy -> Flax integration: +// - use LZ4 from Flax +// - use engine symbols export +// - use engine types and macros +// - remove AddVectoredExceptionHandler from win32 to prevent messing with Flax crashes reporting +// - hide implementation from includers to reduce compilation overhead +// - optimize includes (faster compilation) +// - remove some features (colors, frame image, dxt1 compression) +#include "Engine/Core/Types/BaseTypes.h" +#define TRACY_API FLAXENGINE_API +#define tracy_force_inline FORCE_INLINE +#define tracy_no_inline FORCE_NOINLINE + +#ifndef TracyConcat +# define TracyConcat(x,y) TracyConcatIndirect(x,y) +#endif +#ifndef TracyConcatIndirect +# define TracyConcatIndirect(x,y) x##y +#endif + +namespace tracy +{ +enum class PlotFormatType : uint8_t +{ + Number, + Memory, + Percentage +}; + +typedef void(*ParameterCallback)( uint32_t idx, int32_t val ); + +struct TRACY_API SourceLocationData +{ + const char* name; + const char* function; + const char* file; + uint32_t line; + uint32_t color; +}; + +class TRACY_API ScopedZone +{ +public: + ScopedZone( const ScopedZone& ) = delete; + ScopedZone( ScopedZone&& ) = delete; + ScopedZone& operator=( const ScopedZone& ) = delete; + ScopedZone& operator=( ScopedZone&& ) = delete; + + ScopedZone( const SourceLocationData* srcloc, bool is_active = true ); + ScopedZone( const SourceLocationData* srcloc, int depth, bool is_active = true ); + ScopedZone( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, bool is_active = true ); + ScopedZone( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, int depth, bool is_active = true ); + + ~ScopedZone(); + + void Text( const char* txt, size_t size ); + void Name( const char* txt, size_t size ); + void Name( const Char* txt, size_t size ); + void Color( uint32_t color ); + void Value( uint64_t value ); + bool IsActive() const; + +private: + const bool m_active; + +#ifdef TRACY_ON_DEMAND + uint64_t m_connectionId; +#endif +}; + +namespace detail +{ +TRACY_API uint64_t GetThreadHandleImpl(); +} + +#ifdef TRACY_ENABLE +TRACY_API uint64_t GetThreadHandle(); +#else +static inline uint64_t GetThreadHandle() +{ + return detail::GetThreadHandleImpl(); +} +#endif + +TRACY_API void SetThreadName( const char* name ); +TRACY_API const char* GetThreadName( uint64_t id ); + +} + +#endif
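ScopedZone above is the RAII type behind Tracy's zone macros: the constructor emits a zone-begin event tied to a static SourceLocationData, the destructor emits the matching zone-end, and Name/Text/Value attach metadata while the zone is open. A hand-written equivalent of what such a macro generates; this is a sketch, and the function and value are made up for illustration:

    void UpdatePhysics()
    {
        static constexpr tracy::SourceLocationData srcloc
            { "UpdatePhysics", __FUNCTION__, __FILE__, __LINE__, 0 };
        tracy::ScopedZone zone( &srcloc );
        zone.Value( 128 ); // e.g. number of bodies stepped this frame
        // ... measured work; the end event is emitted by ~ScopedZone()
    }

The source location struct is static so its address stays valid for the whole profiling session; the protocol sends only the pointer and resolves the strings once.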
diff --git a/Source/ThirdParty/tracy/tracy.Build.cs b/Source/ThirdParty/tracy/tracy.Build.cs new file mode 100644 index 000000000..aa10c514f --- /dev/null +++ b/Source/ThirdParty/tracy/tracy.Build.cs @@ -0,0 +1,47 @@ +// Copyright (c) 2012-2021 Wojciech Figat. All rights reserved. + +using System.Collections.Generic; +using System.IO; +using Flax.Build; +using Flax.Build.NativeCpp; + +/// <summary> +/// https://github.com/wolfpld/tracy +/// </summary> +public class tracy : ThirdPartyModule +{ + /// <inheritdoc /> + public override void Init() + { + base.Init(); + + LicenseType = LicenseTypes.BSD3Clause; + LicenseFilePath = "LICENSE"; + + // Merge third-party modules into engine binary + BinaryModuleName = "FlaxEngine"; + } + + /// <inheritdoc /> + public override void Setup(BuildOptions options) + { + base.Setup(options); + + options.SourcePaths.Clear(); + options.SourceFiles.Clear(); + options.SourceFiles.Add(Path.Combine(FolderPath, "Tracy.h")); + options.SourceFiles.Add(Path.Combine(FolderPath, "TracyClient.cpp")); + + options.PublicDefinitions.Add("TRACY_ENABLE"); + } + + /// <inheritdoc /> + public override void GetFilesToDeploy(List<string> files) + { + base.GetFilesToDeploy(files); + + files.Add(Path.Combine(FolderPath, "Tracy.h")); + files.Add(Path.Combine(FolderPath, "common", "TracySystem.hpp")); + files.Add(Path.Combine(FolderPath, "client", "TracyCallstack.h")); + } +}
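Because Setup() publishes TRACY_ENABLE via PublicDefinitions, every module that links against this one sees the define and can guard its instrumentation with the same preprocessor check used throughout this patch. A minimal consumer-side sketch; the include path is an assumption based on the deployed Tracy.h listed above:

    #if TRACY_ENABLE
    #include <ThirdParty/tracy/Tracy.h> // assumed include path for the deployed header
    #endif

    void DoWork()
    {
    #if TRACY_ENABLE
        ZoneScoped; // compiles away entirely when the definition is absent
    #endif
        // ... actual work ...
    }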