diff --git a/Source/Engine/Engine/EngineService.cpp b/Source/Engine/Engine/EngineService.cpp
index ac58ba40c..9a01b08b5 100644
--- a/Source/Engine/Engine/EngineService.cpp
+++ b/Source/Engine/Engine/EngineService.cpp
@@ -4,7 +4,7 @@
 #include "Engine/Core/Log.h"
 #include "Engine/Core/Collections/Array.h"
 #include "Engine/Core/Collections/Sorting.h"
-#include
+#include
 
 static bool CompareEngineServices(EngineService* const& a, EngineService* const& b)
 {
diff --git a/Source/Engine/Platform/Base/ThreadBase.cpp b/Source/Engine/Platform/Base/ThreadBase.cpp
index 5739dbf3e..be45ceeab 100644
--- a/Source/Engine/Platform/Base/ThreadBase.cpp
+++ b/Source/Engine/Platform/Base/ThreadBase.cpp
@@ -7,7 +7,7 @@
 #include "Engine/Scripting/ManagedCLR/MCore.h"
 #if TRACY_ENABLE
 #include "Engine/Core/Math/Math.h"
-#include
+#include
 #endif
 
 Delegate ThreadBase::ThreadStarting;
diff --git a/Source/Engine/Profiler/ProfilerCPU.h b/Source/Engine/Profiler/ProfilerCPU.h
index 4525b6714..000e41ccb 100644
--- a/Source/Engine/Profiler/ProfilerCPU.h
+++ b/Source/Engine/Profiler/ProfilerCPU.h
@@ -8,7 +8,7 @@
 #include "Engine/Core/Math/Math.h"
 #include "Engine/Core/Collections/Array.h"
 #include "Engine/Scripting/ScriptingType.h"
-#include
+#include
 
 #if COMPILE_WITH_PROFILER
diff --git a/Source/Engine/Scripting/Plugins/PluginManager.cpp b/Source/Engine/Scripting/Plugins/PluginManager.cpp
index 4d205d6d6..16b0c5483 100644
--- a/Source/Engine/Scripting/Plugins/PluginManager.cpp
+++ b/Source/Engine/Scripting/Plugins/PluginManager.cpp
@@ -127,7 +127,7 @@ void PluginManagerService::InvokeDeinitialize(Plugin* plugin)
         return;
     StringAnsiView typeName = plugin->GetType().GetName();
     PROFILE_CPU();
-    ZoneName(typeName.Get(), typeName.Length())
+    ZoneName(typeName.Get(), typeName.Length());
 
     LOG(Info, "Unloading plugin {}", plugin->ToString());
diff --git a/Source/ThirdParty/tracy/TracyClient.cpp b/Source/ThirdParty/tracy/TracyClient.cpp
index 7e170e5de..3548c5752 100644
--- a/Source/ThirdParty/tracy/TracyClient.cpp
+++ b/Source/ThirdParty/tracy/TracyClient.cpp
@@ -26,6 +26,8 @@
 #include "client/TracySysTrace.cpp"
 #include "common/TracySocket.cpp"
 #include "client/tracy_rpmalloc.cpp"
+#include "client/TracyAlloc.cpp"
+
 #if TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 3 || TRACY_HAS_CALLSTACK == 4 || TRACY_HAS_CALLSTACK == 6
 #  include "libbacktrace/alloc.cpp"
@@ -40,6 +42,7 @@
 #  else
 #    include "libbacktrace/elf.cpp"
 #  endif
+#  include "common/TracyStackFrames.cpp"
 #endif
 
 #ifdef _MSC_VER
diff --git a/Source/ThirdParty/tracy/client/TracyAlloc.cpp b/Source/ThirdParty/tracy/client/TracyAlloc.cpp
new file mode 100644
index 000000000..545a6062b
--- /dev/null
+++ b/Source/ThirdParty/tracy/client/TracyAlloc.cpp
@@ -0,0 +1,42 @@
+#include "../common/TracyAlloc.hpp"
+
+#ifdef TRACY_USE_RPMALLOC
+
+#include <atomic>
+
+#include "../common/TracyYield.hpp"
+
+namespace tracy
+{
+
+extern thread_local bool RpThreadInitDone;
+extern std::atomic<int> RpInitDone;
+extern std::atomic<int> RpInitLock;
+
+tracy_no_inline static void InitRpmallocPlumbing()
+{
+    const auto done = RpInitDone.load( std::memory_order_acquire );
+    if( !done )
+    {
+        int expected = 0;
+        while( !RpInitLock.compare_exchange_weak( expected, 1, std::memory_order_release, std::memory_order_relaxed ) ) { expected = 0; YieldThread(); }
+        const auto done = RpInitDone.load( std::memory_order_acquire );
+        if( !done )
+        {
+            rpmalloc_initialize();
+            RpInitDone.store( 1, std::memory_order_release );
+        }
+        RpInitLock.store( 0, std::memory_order_release );
+    }
+    rpmalloc_thread_initialize();
RpThreadInitDone = true; +} + +TRACY_API void InitRpmalloc() +{ + if( !RpThreadInitDone ) InitRpmallocPlumbing(); +} + +} + +#endif diff --git a/Source/ThirdParty/tracy/client/TracyArmCpuTable.hpp b/Source/ThirdParty/tracy/client/TracyArmCpuTable.hpp index ff7d976c8..2b4459764 100644 --- a/Source/ThirdParty/tracy/client/TracyArmCpuTable.hpp +++ b/Source/ThirdParty/tracy/client/TracyArmCpuTable.hpp @@ -38,7 +38,7 @@ static const char* DecodeArmPart( uint32_t impl, uint32_t part ) static char buf[16]; switch( impl ) { - case 0x41: + case 0x41: // ARM switch( part ) { case 0x810: return "810"; @@ -61,8 +61,8 @@ static const char* DecodeArmPart( uint32_t impl, uint32_t part ) case 0xc09: return " Cortex-A9"; case 0xc0c: return " Cortex-A12"; case 0xc0d: return " Rockchip RK3288"; - case 0xc0f: return " Cortex-A15"; case 0xc0e: return " Cortex-A17"; + case 0xc0f: return " Cortex-A15"; case 0xc14: return " Cortex-R4"; case 0xc15: return " Cortex-R5"; case 0xc17: return " Cortex-R7"; @@ -92,14 +92,21 @@ static const char* DecodeArmPart( uint32_t impl, uint32_t part ) case 0xd13: return " Cortex-R52"; case 0xd20: return " Cortex-M23"; case 0xd21: return " Cortex-M33"; - case 0xd40: return " Zeus"; + case 0xd22: return " Cortex-M55"; + case 0xd40: return " Neoverse V1"; case 0xd41: return " Cortex-A78"; + case 0xd42: return " Cortex-A78AE"; case 0xd43: return " Cortex-A65AE"; case 0xd44: return " Cortex-X1"; + case 0xd47: return " Cortex-A710"; + case 0xd48: return " Cortex-X2"; + case 0xd49: return " Neoverse N2"; case 0xd4a: return " Neoverse E1"; + case 0xd4b: return " Cortex-A78C"; + case 0xd4c: return " Cortex-X1C"; default: break; } - case 0x42: + case 0x42: // Broadcom switch( part ) { case 0xf: return " Brahma B15"; @@ -107,7 +114,7 @@ static const char* DecodeArmPart( uint32_t impl, uint32_t part ) case 0x516: return " ThunderX2"; default: break; } - case 0x43: + case 0x43: // Cavium switch( part ) { case 0xa0: return " ThunderX"; @@ -121,30 +128,31 @@ static const char* DecodeArmPart( uint32_t impl, uint32_t part ) case 0xb3: return " OcteonTX2 F95"; case 0xb4: return " OcteonTX2 F95N"; case 0xb5: return " OcteonTX2 F95MM"; + case 0xb6: return " OcteonTX2 F95O"; case 0xb8: return " ThunderX3 T110"; default: break; } - case 0x44: + case 0x44: // DEC switch( part ) { case 0xa10: return " SA110"; case 0xa11: return " SA1100"; default: break; } - case 0x46: + case 0x46: // Fujitsu switch( part ) { case 0x1: return " A64FX"; default: break; } - case 0x48: + case 0x48: // HiSilicon switch( part ) { case 0xd01: return " TSV100"; case 0xd40: return " Kirin 980"; default: break; } - case 0x4e: + case 0x4e: // Nvidia switch( part ) { case 0x0: return " Denver"; @@ -152,13 +160,13 @@ static const char* DecodeArmPart( uint32_t impl, uint32_t part ) case 0x4: return " Carmel"; default: break; } - case 0x50: + case 0x50: // Applied Micro switch( part ) { case 0x0: return " X-Gene"; default: break; } - case 0x51: + case 0x51: // Qualcomm switch( part ) { case 0xf: return " Scorpion"; @@ -174,18 +182,27 @@ static const char* DecodeArmPart( uint32_t impl, uint32_t part ) case 0x802: return " Kryo 385 Gold"; case 0x803: return " Kryo 385 Silver"; case 0x804: return " Kryo 485 Gold"; + case 0x805: return " Kryo 4xx/5xx Silver"; case 0xc00: return " Falkor"; case 0xc01: return " Saphira"; default: break; } - case 0x53: + case 0x53: // Samsung switch( part ) { case 0x1: return " Exynos M1/M2"; case 0x2: return " Exynos M3"; + case 0x3: return " Exynos M4"; + case 0x4: return " Exynos M5"; default: break; } - 
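Note: DecodeArmPart() keys on the CPU implementer and part-number pair whose vendors the new comments spell out (0x41 = ARM, 0x61 = Apple, 0xc0 = Ampere Computing, and so on). As a minimal sketch, assuming the standard Arm MIDR_EL1 field layout, this is how such a pair is typically extracted from a raw MIDR value before a table lookup like the one above; the SplitMidr helper is illustrative and not part of this patch:

    #include <cstdint>

    // MIDR_EL1: bits [31:24] = implementer, bits [15:4] = primary part number.
    static void SplitMidr( uint64_t midr, uint32_t& impl, uint32_t& part )
    {
        impl = uint32_t( ( midr >> 24 ) & 0xFF );  // e.g. 0x41 for ARM Ltd, 0x61 for Apple
        part = uint32_t( ( midr >> 4 ) & 0xFFF );  // e.g. 0xd41 for Cortex-A78
    }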
case 0x56: + case 0x54: // Texas Instruments + switch( part ) + { + case 0x925: return " TI925"; + default: break; + } + case 0x56: // Marvell switch( part ) { case 0x131: return " Feroceon 88FR131"; @@ -193,7 +210,7 @@ static const char* DecodeArmPart( uint32_t impl, uint32_t part ) case 0x584: return " PJ4B-MP / PJ4C"; default: break; } - case 0x61: + case 0x61: // Apple switch( part ) { case 0x1: return " Cyclone"; @@ -203,21 +220,33 @@ static const char* DecodeArmPart( uint32_t impl, uint32_t part ) case 0x5: return " Twister/Elba/Malta"; case 0x6: return " Hurricane"; case 0x7: return " Hurricane/Myst"; + case 0x22: return " M1 Icestorm"; + case 0x23: return " M1 Firestorm"; + case 0x24: return " M1 Icestorm Pro"; + case 0x25: return " M1 Firestorm Pro"; + case 0x28: return " M1 Icestorm Max"; + case 0x29: return " M1 Firestorm Max"; default: break; } - case 0x66: + case 0x66: // Faraday switch( part ) { case 0x526: return " FA526"; case 0x626: return " FA626"; default: break; } - case 0x68: + case 0x68: // HXT switch( part ) { case 0x0: return " Phecda"; default: break; } + case 0xc0: // Ampere Computing + switch( part ) + { + case 0xac3: return " Ampere1"; + default: break; + } default: break; } sprintf( buf, " 0x%x", part ); @@ -267,6 +296,15 @@ static const char* DecodeIosDevice( const char* id ) "iPhone12,3", "iPhone 11 Pro", "iPhone12,5", "iPhone 11 Pro Max", "iPhone12,8", "iPhone SE 2nd Gen", + "iPhone13,1", "iPhone 12 Mini", + "iPhone13,2", "iPhone 12", + "iPhone13,3", "iPhone 12 Pro", + "iPhone13,4", "iPhone 12 Pro Max", + "iPhone14,2", "iPhone 13 Pro", + "iPhone14,3", "iPhone 13 Pro Max", + "iPhone14,4", "iPhone 13 Mini", + "iPhone14,5", "iPhone 13", + "iPhone14,6", "iPhone SE 3rd Gen", "iPad1,1", "iPad (A1219/A1337)", "iPad2,1", "iPad 2 (A1395)", "iPad2,2", "iPad 2 (A1396)", @@ -325,6 +363,20 @@ static const char* DecodeIosDevice( const char* id ) "iPad11,2", "iPad Mini 5th gen (A2124/A2125/A2126)", "iPad11,3", "iPad Air 3rd gen (A2152)", "iPad11,4", "iPad Air 3rd gen (A2123/A2153/A2154)", + "iPad11,6", "iPad 8th gen (WiFi)", + "iPad11,7", "iPad 8th gen (WiFi+Cellular)", + "iPad13,1", "iPad Air 4th gen (WiFi)", + "iPad13,2", "iPad Air 4th gen (WiFi+Cellular)", + "iPad13,4", "iPad Pro 11\" 3rd gen", + "iPad13,5", "iPad Pro 11\" 3rd gen", + "iPad13,6", "iPad Pro 11\" 3rd gen", + "iPad13,7", "iPad Pro 11\" 3rd gen", + "iPad13,8", "iPad Pro 12.9\" 5th gen", + "iPad13,9", "iPad Pro 12.9\" 5th gen", + "iPad13,10", "iPad Pro 12.9\" 5th gen", + "iPad13,11", "iPad Pro 12.9\" 5th gen", + "iPad13,16", "iPad Air 5th Gen (WiFi)", + "iPad13,17", "iPad Air 5th Gen (WiFi+Cellular)", "iPod1,1", "iPod Touch", "iPod2,1", "iPod Touch 2nd gen", "iPod3,1", "iPod Touch 3rd gen", diff --git a/Source/ThirdParty/tracy/client/TracyCallstack.cpp b/Source/ThirdParty/tracy/client/TracyCallstack.cpp index 10698cb19..ca19a543b 100644 --- a/Source/ThirdParty/tracy/client/TracyCallstack.cpp +++ b/Source/ThirdParty/tracy/client/TracyCallstack.cpp @@ -1,9 +1,12 @@ +#include #include #include #include #include "TracyCallstack.hpp" #include "TracyFastVector.hpp" +#include "TracyStringHelpers.hpp" #include "../common/TracyAlloc.hpp" +#include "TracyDebug.hpp" #ifdef TRACY_HAS_CALLSTACK @@ -13,6 +16,7 @@ # endif # include # include +# include # ifdef _MSC_VER # pragma warning( push ) # pragma warning( disable : 4091 ) @@ -23,8 +27,11 @@ # endif #elif TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 3 || TRACY_HAS_CALLSTACK == 4 || TRACY_HAS_CALLSTACK == 6 # include "../libbacktrace/backtrace.hpp" +# include # 
include # include +# include +# include "TracyFastVector.hpp" #elif TRACY_HAS_CALLSTACK == 5 # include # include @@ -45,31 +52,50 @@ extern "C" }; #endif +#if TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 3 || TRACY_HAS_CALLSTACK == 4 || TRACY_HAS_CALLSTACK == 5 || TRACY_HAS_CALLSTACK == 6 +// If you want to use your own demangling functionality (e.g. for another language), +// define TRACY_DEMANGLE and provide your own implementation of the __tracy_demangle +// function. The input parameter is a function name. The demangle function must +// identify whether this name is mangled, and fail if it is not. Failure is indicated +// by returning nullptr. If demangling succeeds, a pointer to the C string containing +// demangled function must be returned. The demangling function is responsible for +// managing memory for this string. It is expected that it will be internally reused. +// When a call to ___tracy_demangle is made, previous contents of the string memory +// do not need to be preserved. Function may return string of any length, but the +// profiler can choose to truncate it. +extern "C" const char* ___tracy_demangle( const char* mangled ); + +#ifndef TRACY_DEMANGLE +constexpr size_t ___tracy_demangle_buffer_len = 1024*1024; +char* ___tracy_demangle_buffer; + +void ___tracy_init_demangle_buffer() +{ + ___tracy_demangle_buffer = (char*)tracy::tracy_malloc( ___tracy_demangle_buffer_len ); +} + +void ___tracy_free_demangle_buffer() +{ + tracy::tracy_free( ___tracy_demangle_buffer ); +} + +extern "C" const char* ___tracy_demangle( const char* mangled ) +{ + if( !mangled || mangled[0] != '_' ) return nullptr; + if( strlen( mangled ) > ___tracy_demangle_buffer_len ) return nullptr; + int status; + size_t len = ___tracy_demangle_buffer_len; + return abi::__cxa_demangle( mangled, ___tracy_demangle_buffer, &len, &status ); +} +#endif +#endif + namespace tracy { -static inline char* CopyString( const char* src, size_t sz ) -{ - assert( strlen( src ) == sz ); - auto dst = (char*)tracy_malloc( sz + 1 ); - memcpy( dst, src, sz ); - dst[sz] = '\0'; - return dst; -} - -static inline char* CopyString( const char* src ) -{ - const auto sz = strlen( src ); - auto dst = (char*)tracy_malloc( sz + 1 ); - memcpy( dst, src, sz ); - dst[sz] = '\0'; - return dst; -} - - #if TRACY_HAS_CALLSTACK == 1 -enum { MaxCbTrace = 16 }; +enum { MaxCbTrace = 64 }; enum { MaxNameSize = 8*1024 }; int cb_num; @@ -77,24 +103,19 @@ CallstackEntry cb_data[MaxCbTrace]; extern "C" { - typedef unsigned long (__stdcall *t_RtlWalkFrameChain)( void**, unsigned long, unsigned long ); - t_RtlWalkFrameChain RtlWalkFrameChain = 0; + typedef DWORD (__stdcall *t_SymAddrIncludeInlineTrace)( HANDLE hProcess, DWORD64 Address ); + typedef BOOL (__stdcall *t_SymQueryInlineTrace)( HANDLE hProcess, DWORD64 StartAddress, DWORD StartContext, DWORD64 StartRetAddress, DWORD64 CurAddress, LPDWORD CurContext, LPDWORD CurFrameIndex ); + typedef BOOL (__stdcall *t_SymFromInlineContext)( HANDLE hProcess, DWORD64 Address, ULONG InlineContext, PDWORD64 Displacement, PSYMBOL_INFO Symbol ); + typedef BOOL (__stdcall *t_SymGetLineFromInlineContext)( HANDLE hProcess, DWORD64 qwAddr, ULONG InlineContext, DWORD64 qwModuleBaseAddress, PDWORD pdwDisplacement, PIMAGEHLP_LINE64 Line64 ); + + TRACY_API ___tracy_t_RtlWalkFrameChain ___tracy_RtlWalkFrameChain = 0; + t_SymAddrIncludeInlineTrace _SymAddrIncludeInlineTrace = 0; + t_SymQueryInlineTrace _SymQueryInlineTrace = 0; + t_SymFromInlineContext _SymFromInlineContext = 0; + t_SymGetLineFromInlineContext 
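Note: the comment block introduced above documents the new demangling hook: a build can define TRACY_DEMANGLE and supply its own ___tracy_demangle(), returning nullptr for names it does not recognize and otherwise a pointer to an internally managed, reusable string. A hedged sketch of such an override follows; the "mylang$" prefix and the buffer size are invented purely for illustration:

    #include <cstring>

    // Compile the Tracy client with TRACY_DEMANGLE defined so the built-in
    // __cxa_demangle-based fallback shown above is not compiled in.
    extern "C" const char* ___tracy_demangle( const char* mangled )
    {
        static char buf[64*1024];   // reused between calls, as the contract above allows
        if( !mangled || strncmp( mangled, "mylang$", 7 ) != 0 ) return nullptr;  // not ours -> fail
        strncpy( buf, mangled + 7, sizeof( buf ) - 1 );
        buf[sizeof( buf ) - 1] = '\0';
        return buf;                 // the profiler may truncate this string
    }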
_SymGetLineFromInlineContext = 0; } -#if defined __MINGW32__ && API_VERSION_NUMBER < 12 -extern "C" { -// Actual required API_VERSION_NUMBER is unknown because it is undocumented. These functions are not present in at least v11. -DWORD IMAGEAPI SymAddrIncludeInlineTrace(HANDLE hProcess, DWORD64 Address); -BOOL IMAGEAPI SymQueryInlineTrace(HANDLE hProcess, DWORD64 StartAddress, DWORD StartContext, DWORD64 StartRetAddress, - DWORD64 CurAddress, LPDWORD CurContext, LPDWORD CurFrameIndex); -BOOL IMAGEAPI SymFromInlineContext(HANDLE hProcess, DWORD64 Address, ULONG InlineContext, PDWORD64 Displacement, - PSYMBOL_INFO Symbol); -BOOL IMAGEAPI SymGetLineFromInlineContext(HANDLE hProcess, DWORD64 qwAddr, ULONG InlineContext, - DWORD64 qwModuleBaseAddress, PDWORD pdwDisplacement, PIMAGEHLP_LINE64 Line64); -}; -#endif -#ifndef __CYGWIN__ struct ModuleCache { uint64_t start; @@ -103,11 +124,30 @@ struct ModuleCache }; static FastVector* s_modCache; -#endif + + +struct KernelDriver +{ + uint64_t addr; + const char* mod; + const char* path; +}; + +KernelDriver* s_krnlCache = nullptr; +size_t s_krnlCacheCnt; + + +void InitCallstackCritical() +{ + ___tracy_RtlWalkFrameChain = (___tracy_t_RtlWalkFrameChain)GetProcAddress( GetModuleHandleA( "ntdll.dll" ), "RtlWalkFrameChain" ); +} void InitCallstack() { - RtlWalkFrameChain = (t_RtlWalkFrameChain)GetProcAddress( GetModuleHandleA( "ntdll.dll" ), "RtlWalkFrameChain" ); + _SymAddrIncludeInlineTrace = (t_SymAddrIncludeInlineTrace)GetProcAddress( GetModuleHandleA( "dbghelp.dll" ), "SymAddrIncludeInlineTrace" ); + _SymQueryInlineTrace = (t_SymQueryInlineTrace)GetProcAddress( GetModuleHandleA( "dbghelp.dll" ), "SymQueryInlineTrace" ); + _SymFromInlineContext = (t_SymFromInlineContext)GetProcAddress( GetModuleHandleA( "dbghelp.dll" ), "SymFromInlineContext" ); + _SymGetLineFromInlineContext = (t_SymGetLineFromInlineContext)GetProcAddress( GetModuleHandleA( "dbghelp.dll" ), "SymGetLineFromInlineContext" ); #ifdef TRACY_DBGHELP_LOCK DBGHELP_INIT; @@ -118,14 +158,63 @@ void InitCallstack() SymInitialize( GetCurrentProcess(), nullptr, true ); SymSetOptions( SYMOPT_LOAD_LINES ); -#ifndef __CYGWIN__ - HMODULE mod[1024]; DWORD needed; - HANDLE proc = GetCurrentProcess(); + LPVOID dev[4096]; + if( EnumDeviceDrivers( dev, sizeof(dev), &needed ) != 0 ) + { + char windir[MAX_PATH]; + if( !GetWindowsDirectoryA( windir, sizeof( windir ) ) ) memcpy( windir, "c:\\windows", 11 ); + const auto windirlen = strlen( windir ); + + const auto sz = needed / sizeof( LPVOID ); + s_krnlCache = (KernelDriver*)tracy_malloc( sizeof(KernelDriver) * sz ); + int cnt = 0; + for( size_t i=0; i", 2 ); + s_krnlCache[cnt] = KernelDriver { (uint64_t)dev[i], buf }; + + const auto len = GetDeviceDriverFileNameA( dev[i], fn, sizeof( fn ) ); + if( len != 0 ) + { + char full[MAX_PATH]; + char* path = fn; + + if( memcmp( fn, "\\SystemRoot\\", 12 ) == 0 ) + { + memcpy( full, windir, windirlen ); + strcpy( full + windirlen, fn + 11 ); + path = full; + } + + SymLoadModuleEx( GetCurrentProcess(), nullptr, path, nullptr, (DWORD64)dev[i], 0, nullptr, 0 ); + + const auto psz = strlen( path ); + auto pptr = (char*)tracy_malloc_fast( psz+1 ); + memcpy( pptr, path, psz ); + pptr[psz] = '\0'; + s_krnlCache[cnt].path = pptr; + } + + cnt++; + } + } + s_krnlCacheCnt = cnt; + std::sort( s_krnlCache, s_krnlCache + s_krnlCacheCnt, []( const KernelDriver& lhs, const KernelDriver& rhs ) { return lhs.addr > rhs.addr; } ); + } s_modCache = (FastVector*)tracy_malloc( sizeof( FastVector ) ); new(s_modCache) FastVector( 512 ); + 
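Note: s_krnlCache is filled here and sorted by descending start address, which is why the later lookups (GetKernelModulePath(), GetModuleNameAndPrepareSymbols()) can use std::lower_bound with a reversed comparator to find the driver owning an address. A standalone sketch of that pattern, using a hypothetical Entry type in place of the real KernelDriver record:

    #include <algorithm>
    #include <cstdint>

    struct Entry { uint64_t addr; const char* name; };

    // begin..end must be sorted by descending addr; returns the entry with the
    // highest start address that is <= query, i.e. the module the address falls into.
    static const Entry* FindByAddress( const Entry* begin, const Entry* end, uint64_t query )
    {
        auto it = std::lower_bound( begin, end, query,
            []( const Entry& lhs, uint64_t rhs ) { return lhs.addr > rhs; } );
        return it == end ? nullptr : &*it;
    }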
HANDLE proc = GetCurrentProcess(); + HMODULE mod[1024]; if( EnumProcessModules( proc, mod, sizeof( mod ), &needed ) != 0 ) { const auto sz = needed / sizeof( HMODULE ); @@ -146,7 +235,7 @@ void InitCallstack() auto cache = s_modCache->push_next(); cache->start = base; cache->end = base + info.SizeOfImage; - cache->name = (char*)tracy_malloc( namelen+3 ); + cache->name = (char*)tracy_malloc_fast( namelen+3 ); cache->name[0] = '['; memcpy( cache->name+1, ptr, namelen ); cache->name[namelen+1] = ']'; @@ -155,19 +244,14 @@ void InitCallstack() } } } -#endif #ifdef TRACY_DBGHELP_LOCK DBGHELP_UNLOCK; #endif } -TRACY_API uintptr_t* CallTrace( int depth ) +void EndCallstack() { - auto trace = (uintptr_t*)tracy_malloc( ( 1 + depth ) * sizeof( uintptr_t ) ); - const auto num = RtlWalkFrameChain( (void**)( trace + 1 ), depth, 0 ); - *trace = num; - return trace; } const char* DecodeCallstackPtrFast( uint64_t ptr ) @@ -198,11 +282,30 @@ const char* DecodeCallstackPtrFast( uint64_t ptr ) return ret; } -static const char* GetModuleName( uint64_t addr ) +const char* GetKernelModulePath( uint64_t addr ) { - if( ( addr & 0x8000000000000000 ) != 0 ) return "[kernel]"; + assert( addr >> 63 != 0 ); + if( !s_krnlCache ) return nullptr; + auto it = std::lower_bound( s_krnlCache, s_krnlCache + s_krnlCacheCnt, addr, []( const KernelDriver& lhs, const uint64_t& rhs ) { return lhs.addr > rhs; } ); + if( it == s_krnlCache + s_krnlCacheCnt ) return nullptr; + return it->path; +} + +static const char* GetModuleNameAndPrepareSymbols( uint64_t addr ) +{ + if( ( addr >> 63 ) != 0 ) + { + if( s_krnlCache ) + { + auto it = std::lower_bound( s_krnlCache, s_krnlCache + s_krnlCacheCnt, addr, []( const KernelDriver& lhs, const uint64_t& rhs ) { return lhs.addr > rhs; } ); + if( it != s_krnlCache + s_krnlCacheCnt ) + { + return it->mod; + } + } + return ""; + } -#ifndef __CYGWIN__ for( auto& v : *s_modCache ) { if( addr >= v.start && addr < v.end ) @@ -215,6 +318,7 @@ static const char* GetModuleName( uint64_t addr ) DWORD needed; HANDLE proc = GetCurrentProcess(); + InitRpmalloc(); if( EnumProcessModules( proc, mod, sizeof( mod ), &needed ) != 0 ) { const auto sz = needed / sizeof( HMODULE ); @@ -230,6 +334,8 @@ static const char* GetModuleName( uint64_t addr ) const auto res = GetModuleFileNameA( mod[i], name, 1021 ); if( res > 0 ) { + // since this is the first time we encounter this module, load its symbols (needed for modules loaded after SymInitialize) + SymLoadModuleEx(proc, NULL, name, NULL, (DWORD64)info.lpBaseOfDll, info.SizeOfImage, NULL, 0); auto ptr = name + res; while( ptr > name && *ptr != '\\' && *ptr != '/' ) ptr--; if( ptr > name ) ptr++; @@ -237,7 +343,7 @@ static const char* GetModuleName( uint64_t addr ) auto cache = s_modCache->push_next(); cache->start = base; cache->end = base + info.SizeOfImage; - cache->name = (char*)tracy_malloc( namelen+3 ); + cache->name = (char*)tracy_malloc_fast( namelen+3 ); cache->name[0] = '['; memcpy( cache->name+1, ptr, namelen ); cache->name[namelen+1] = ']'; @@ -248,8 +354,6 @@ static const char* GetModuleName( uint64_t addr ) } } } -#endif - return "[unknown]"; } @@ -263,69 +367,21 @@ CallstackSymbolData DecodeSymbolAddress( uint64_t ptr ) DBGHELP_LOCK; #endif const auto res = SymGetLineFromAddr64( GetCurrentProcess(), ptr, &displacement, &line ); -#ifdef TRACY_DBGHELP_LOCK - DBGHELP_UNLOCK; -#endif - if( res == 0 ) + if( res == 0 || line.LineNumber >= 0xF00000 ) { sym.file = "[unknown]"; sym.line = 0; + sym.needFree = false; } else { - sym.file = line.FileName; + sym.file 
= CopyString( line.FileName ); sym.line = line.LineNumber; - } - sym.needFree = false; - return sym; -} - -CallstackSymbolData DecodeCodeAddress( uint64_t ptr ) -{ - CallstackSymbolData sym; - const auto proc = GetCurrentProcess(); - bool done = false; - - IMAGEHLP_LINE64 line; - DWORD displacement = 0; - line.SizeOfStruct = sizeof(IMAGEHLP_LINE64); - -#ifdef TRACY_DBGHELP_LOCK - DBGHELP_LOCK; -#endif -#ifndef __CYGWIN__ - DWORD inlineNum = SymAddrIncludeInlineTrace( proc, ptr ); - DWORD ctx = 0; - DWORD idx; - BOOL doInline = FALSE; - if( inlineNum != 0 ) doInline = SymQueryInlineTrace( proc, ptr, 0, ptr, ptr, &ctx, &idx ); - if( doInline ) - { - if( SymGetLineFromInlineContext( proc, ptr, ctx, 0, &displacement, &line ) != 0 ) - { - sym.file = line.FileName; - sym.line = line.LineNumber; - done = true; - } - } -#endif - if( !done ) - { - if( SymGetLineFromAddr64( proc, ptr, &displacement, &line ) == 0 ) - { - sym.file = "[unknown]"; - sym.line = 0; - } - else - { - sym.file = line.FileName; - sym.line = line.LineNumber; - } + sym.needFree = true; } #ifdef TRACY_DBGHELP_LOCK DBGHELP_UNLOCK; #endif - sym.needFree = false; return sym; } @@ -333,16 +389,25 @@ CallstackEntryData DecodeCallstackPtr( uint64_t ptr ) { int write; const auto proc = GetCurrentProcess(); + InitRpmalloc(); + #ifdef TRACY_DBGHELP_LOCK DBGHELP_LOCK; #endif -#ifndef __CYGWIN__ - DWORD inlineNum = SymAddrIncludeInlineTrace( proc, ptr ); - if( inlineNum > MaxCbTrace - 1 ) inlineNum = MaxCbTrace - 1; - DWORD ctx = 0; - DWORD idx; + + const auto moduleName = GetModuleNameAndPrepareSymbols(ptr); + +#if !defined TRACY_NO_CALLSTACK_INLINES BOOL doInline = FALSE; - if( inlineNum != 0 ) doInline = SymQueryInlineTrace( proc, ptr, 0, ptr, ptr, &ctx, &idx ); + DWORD ctx = 0; + DWORD inlineNum = 0; + if( _SymAddrIncludeInlineTrace ) + { + inlineNum = _SymAddrIncludeInlineTrace( proc, ptr ); + if( inlineNum > MaxCbTrace - 1 ) inlineNum = MaxCbTrace - 1; + DWORD idx; + if( inlineNum != 0 ) doInline = _SymQueryInlineTrace( proc, ptr, 0, ptr, ptr, &ctx, &idx ); + } if( doInline ) { write = inlineNum; @@ -360,7 +425,6 @@ CallstackEntryData DecodeCallstackPtr( uint64_t ptr ) si->SizeOfStruct = sizeof( SYMBOL_INFO ); si->MaxNameLen = MaxNameSize; - const auto moduleName = GetModuleName( ptr ); const auto symValid = SymFromAddr( proc, ptr, nullptr, si ) != 0; IMAGEHLP_LINE64 line; @@ -369,7 +433,8 @@ CallstackEntryData DecodeCallstackPtr( uint64_t ptr ) { const char* filename; - if( SymGetLineFromAddr64( proc, ptr, &displacement, &line ) == 0 ) + const auto res = SymGetLineFromAddr64( proc, ptr, &displacement, &line ); + if( res == 0 || line.LineNumber >= 0xF00000 ) { filename = "[unknown]"; cb_data[write].line = 0; @@ -380,8 +445,8 @@ CallstackEntryData DecodeCallstackPtr( uint64_t ptr ) cb_data[write].line = line.LineNumber; } - cb_data[write].name = symValid ? CopyString( si->Name, si->NameLen ) : CopyString( moduleName ); - cb_data[write].file = CopyString( filename ); + cb_data[write].name = symValid ? CopyStringFast( si->Name, si->NameLen ) : CopyStringFast( moduleName ); + cb_data[write].file = CopyStringFast( filename ); if( symValid ) { cb_data[write].symLen = si->Size; @@ -394,15 +459,15 @@ CallstackEntryData DecodeCallstackPtr( uint64_t ptr ) } } -#ifndef __CYGWIN__ +#if !defined TRACY_NO_CALLSTACK_INLINES if( doInline ) { for( DWORD i=0; iName, si->NameLen ) : CopyString( moduleName ); - cb.file = CopyString( filename ); + cb.name = symInlineValid ? 
CopyStringFast( si->Name, si->NameLen ) : CopyStringFast( moduleName ); + cb.file = CopyStringFast( filename ); if( symInlineValid ) { cb.symLen = si->Size; @@ -439,54 +504,285 @@ CallstackEntryData DecodeCallstackPtr( uint64_t ptr ) #elif TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 3 || TRACY_HAS_CALLSTACK == 4 || TRACY_HAS_CALLSTACK == 6 -enum { MaxCbTrace = 16 }; +enum { MaxCbTrace = 64 }; struct backtrace_state* cb_bts; int cb_num; CallstackEntry cb_data[MaxCbTrace]; int cb_fixup; -void InitCallstack() -{ - cb_bts = backtrace_create_state( nullptr, 0, nullptr, nullptr ); -} +#ifdef TRACY_DEBUGINFOD +debuginfod_client* s_debuginfod; -static int FastCallstackDataCb( void* data, uintptr_t pc, uintptr_t lowaddr, const char* fn, int lineno, const char* function ) +struct DebugInfo { - if( function ) + uint8_t* buildid; + size_t buildid_size; + char* filename; + int fd; +}; + +FastVector s_di_known( 16 ); +#endif + +#ifdef __linux +struct KernelSymbol +{ + uint64_t addr; + const char* name; + const char* mod; +}; + +KernelSymbol* s_kernelSym = nullptr; +size_t s_kernelSymCnt; + +static void InitKernelSymbols() +{ + FILE* f = fopen( "/proc/kallsyms", "rb" ); + if( !f ) return; + tracy::FastVector tmpSym( 1024 ); + size_t linelen = 16 * 1024; // linelen must be big enough to prevent reallocs in getline() + auto linebuf = (char*)tracy_malloc( linelen ); + ssize_t sz; + while( ( sz = getline( &linebuf, &linelen, f ) ) != -1 ) { - strcpy( (char*)data, function ); + auto ptr = linebuf; + uint64_t addr = 0; + while( *ptr != ' ' ) + { + auto v = *ptr; + if( v >= '0' && v <= '9' ) + { + v -= '0'; + } + else if( v >= 'a' && v <= 'f' ) + { + v -= 'a'; + v += 10; + } + else if( v >= 'A' && v <= 'F' ) + { + v -= 'A'; + v += 10; + } + else + { + assert( false ); + } + assert( ( v & ~0xF ) == 0 ); + addr <<= 4; + addr |= v; + ptr++; + } + if( addr == 0 ) continue; + ptr++; + if( *ptr != 'T' && *ptr != 't' ) continue; + ptr += 2; + const auto namestart = ptr; + while( *ptr != '\t' && *ptr != '\n' ) ptr++; + const auto nameend = ptr; + const char* modstart = nullptr; + const char* modend; + if( *ptr == '\t' ) + { + ptr += 2; + modstart = ptr; + while( *ptr != ']' ) ptr++; + modend = ptr; + } + + auto strname = (char*)tracy_malloc_fast( nameend - namestart + 1 ); + memcpy( strname, namestart, nameend - namestart ); + strname[nameend-namestart] = '\0'; + + char* strmod = nullptr; + if( modstart ) + { + strmod = (char*)tracy_malloc_fast( modend - modstart + 1 ); + memcpy( strmod, modstart, modend - modstart ); + strmod[modend-modstart] = '\0'; + } + + auto sym = tmpSym.push_next(); + sym->addr = addr; + sym->name = strname; + sym->mod = strmod; + } + tracy_free_fast( linebuf ); + fclose( f ); + if( tmpSym.empty() ) return; + + std::sort( tmpSym.begin(), tmpSym.end(), []( const KernelSymbol& lhs, const KernelSymbol& rhs ) { return lhs.addr > rhs.addr; } ); + s_kernelSymCnt = tmpSym.size(); + s_kernelSym = (KernelSymbol*)tracy_malloc_fast( sizeof( KernelSymbol ) * s_kernelSymCnt ); + memcpy( s_kernelSym, tmpSym.data(), sizeof( KernelSymbol ) * s_kernelSymCnt ); + TracyDebug( "Loaded %zu kernel symbols\n", s_kernelSymCnt ); +} +#endif + +char* NormalizePath( const char* path ) +{ + if( path[0] != '/' ) return nullptr; + + const char* ptr = path; + const char* end = path; + while( *end ) end++; + + char* res = (char*)tracy_malloc( end - ptr + 1 ); + size_t rsz = 0; + + while( ptr < end ) + { + const char* next = ptr; + while( next < end && *next != '/' ) next++; + size_t lsz = next - ptr; + switch( lsz ) + { 
+ case 2: + if( memcmp( ptr, "..", 2 ) == 0 ) + { + const char* back = res + rsz - 1; + while( back > res && *back != '/' ) back--; + rsz = back - res; + ptr = next + 1; + continue; + } + break; + case 1: + if( *ptr == '.' ) + { + ptr = next + 1; + continue; + } + break; + case 0: + ptr = next + 1; + continue; + } + if( rsz != 1 ) res[rsz++] = '/'; + memcpy( res+rsz, ptr, lsz ); + rsz += lsz; + ptr = next + 1; + } + + if( rsz == 0 ) + { + memcpy( res, "/", 2 ); } else { - const char* symname = nullptr; - auto vptr = (void*)pc; - Dl_info dlinfo; - if( dladdr( vptr, &dlinfo ) ) - { - symname = dlinfo.dli_sname; - } - if( symname ) - { - strcpy( (char*)data, symname ); - } - else - { - *(char*)data = '\0'; - } + res[rsz] = '\0'; } - return 1; + return res; } -static void FastCallstackErrorCb( void* data, const char* /*msg*/, int /*errnum*/ ) +void InitCallstackCritical() { - *(char*)data = '\0'; +} + +void InitCallstack() +{ + cb_bts = backtrace_create_state( nullptr, 0, nullptr, nullptr ); + ___tracy_init_demangle_buffer(); + +#ifdef __linux + InitKernelSymbols(); +#endif +#ifdef TRACY_DEBUGINFOD + s_debuginfod = debuginfod_begin(); +#endif +} + +#ifdef TRACY_DEBUGINFOD +void ClearDebugInfoVector( FastVector& vec ) +{ + for( auto& v : vec ) + { + tracy_free( v.buildid ); + tracy_free( v.filename ); + if( v.fd >= 0 ) close( v.fd ); + } + vec.clear(); +} + +DebugInfo* FindDebugInfo( FastVector& vec, const uint8_t* buildid_data, size_t buildid_size ) +{ + for( auto& v : vec ) + { + if( v.buildid_size == buildid_size && memcmp( v.buildid, buildid_data, buildid_size ) == 0 ) + { + return &v; + } + } + return nullptr; +} + +int GetDebugInfoDescriptor( const char* buildid_data, size_t buildid_size, const char* filename ) +{ + auto buildid = (uint8_t*)buildid_data; + auto it = FindDebugInfo( s_di_known, buildid, buildid_size ); + if( it ) return it->fd >= 0 ? dup( it->fd ) : -1; + + int fd = debuginfod_find_debuginfo( s_debuginfod, buildid, buildid_size, nullptr ); + it = s_di_known.push_next(); + it->buildid_size = buildid_size; + it->buildid = (uint8_t*)tracy_malloc( buildid_size ); + memcpy( it->buildid, buildid, buildid_size ); + const auto fnsz = strlen( filename ) + 1; + it->filename = (char*)tracy_malloc( fnsz ); + memcpy( it->filename, filename, fnsz ); + it->fd = fd >= 0 ? 
fd : -1; + TracyDebug( "DebugInfo descriptor query: %i, fn: %s\n", fd, filename ); + return it->fd; +} + +const uint8_t* GetBuildIdForImage( const char* image, size_t& size ) +{ + assert( image ); + for( auto& v : s_di_known ) + { + if( strcmp( image, v.filename ) == 0 ) + { + size = v.buildid_size; + return v.buildid; + } + } + return nullptr; +} + +debuginfod_client* GetDebuginfodClient() +{ + return s_debuginfod; +} +#endif + +void EndCallstack() +{ + ___tracy_free_demangle_buffer(); +#ifdef TRACY_DEBUGINFOD + ClearDebugInfoVector( s_di_known ); + debuginfod_end( s_debuginfod ); +#endif } const char* DecodeCallstackPtrFast( uint64_t ptr ) { static char ret[1024]; - backtrace_pcinfo( cb_bts, ptr, FastCallstackDataCb, FastCallstackErrorCb, ret ); + auto vptr = (void*)ptr; + const char* symname = nullptr; + Dl_info dlinfo; + if( dladdr( vptr, &dlinfo ) && dlinfo.dli_sname ) + { + symname = dlinfo.dli_sname; + } + if( symname ) + { + strcpy( ret, symname ); + } + else + { + *ret = '\0'; + } return ret; } @@ -501,7 +797,8 @@ static int SymbolAddressDataCb( void* data, uintptr_t pc, uintptr_t lowaddr, con } else { - sym.file = CopyString( fn ); + sym.file = NormalizePath( fn ); + if( !sym.file ) sym.file = CopyString( fn ); sym.line = lineno; sym.needFree = true; } @@ -524,16 +821,8 @@ CallstackSymbolData DecodeSymbolAddress( uint64_t ptr ) return sym; } -CallstackSymbolData DecodeCodeAddress( uint64_t ptr ) -{ - return DecodeSymbolAddress( ptr ); -} - static int CallstackDataCb( void* /*data*/, uintptr_t pc, uintptr_t lowaddr, const char* fn, int lineno, const char* function ) { - enum { DemangleBufLen = 64*1024 }; - char demangled[DemangleBufLen]; - cb_data[cb_num].symLen = 0; cb_data[cb_num].symAddr = (uint64_t)lowaddr; @@ -548,38 +837,30 @@ static int CallstackDataCb( void* /*data*/, uintptr_t pc, uintptr_t lowaddr, con { symname = dlinfo.dli_sname; symoff = (char*)pc - (char*)dlinfo.dli_saddr; - - if( symname && symname[0] == '_' ) - { - size_t len = DemangleBufLen; - int status; - abi::__cxa_demangle( symname, demangled, &len, &status ); - if( status == 0 ) - { - symname = demangled; - } - } + const char* demangled = ___tracy_demangle( symname ); + if( demangled ) symname = demangled; } if( !symname ) symname = "[unknown]"; if( symoff == 0 ) { - cb_data[cb_num].name = CopyString( symname ); + const auto len = std::min( strlen( symname ), std::numeric_limits::max() ); + cb_data[cb_num].name = CopyStringFast( symname, len ); } else { char buf[32]; const auto offlen = sprintf( buf, " + %td", symoff ); - const auto namelen = strlen( symname ); - auto name = (char*)tracy_malloc( namelen + offlen + 1 ); + const auto namelen = std::min( strlen( symname ), std::numeric_limits::max() - offlen ); + auto name = (char*)tracy_malloc_fast( namelen + offlen + 1 ); memcpy( name, symname, namelen ); memcpy( name + namelen, buf, offlen ); name[namelen + offlen] = '\0'; cb_data[cb_num].name = name; } - cb_data[cb_num].file = CopyString( "[unknown]" ); + cb_data[cb_num].file = CopyStringFast( "[unknown]" ); cb_data[cb_num].line = 0; } else @@ -591,20 +872,14 @@ static int CallstackDataCb( void* /*data*/, uintptr_t pc, uintptr_t lowaddr, con } else { - if( function[0] == '_' ) - { - size_t len = DemangleBufLen; - int status; - abi::__cxa_demangle( function, demangled, &len, &status ); - if( status == 0 ) - { - function = demangled; - } - } + const char* demangled = ___tracy_demangle( function ); + if( demangled ) function = demangled; } - cb_data[cb_num].name = CopyString( function ); - cb_data[cb_num].file 
= CopyString( fn ); + const auto len = std::min( strlen( function ), std::numeric_limits::max() ); + cb_data[cb_num].name = CopyStringFast( function, len ); + cb_data[cb_num].file = NormalizePath( fn ); + if( !cb_data[cb_num].file ) cb_data[cb_num].file = CopyStringFast( fn ); cb_data[cb_num].line = lineno; } @@ -622,12 +897,12 @@ static void CallstackErrorCb( void* /*data*/, const char* /*msg*/, int /*errnum* { for( int i=0; i 0 ); + InitRpmalloc(); + if( ptr >> 63 == 0 ) + { + cb_num = 0; + backtrace_pcinfo( cb_bts, ptr, CallstackDataCb, CallstackErrorCb, nullptr ); + assert( cb_num > 0 ); - backtrace_syminfo( cb_bts, ptr, SymInfoCallback, SymInfoError, nullptr ); + backtrace_syminfo( cb_bts, ptr, SymInfoCallback, SymInfoError, nullptr ); - const char* symloc = nullptr; - Dl_info dlinfo; - if( dladdr( (void*)ptr, &dlinfo ) ) symloc = dlinfo.dli_fname; + const char* symloc = nullptr; + Dl_info dlinfo; + if( dladdr( (void*)ptr, &dlinfo ) ) symloc = dlinfo.dli_fname; - return { cb_data, uint8_t( cb_num ), symloc ? symloc : "[unknown]" }; + return { cb_data, uint8_t( cb_num ), symloc ? symloc : "[unknown]" }; + } +#ifdef __linux + else if( s_kernelSym ) + { + auto it = std::lower_bound( s_kernelSym, s_kernelSym + s_kernelSymCnt, ptr, []( const KernelSymbol& lhs, const uint64_t& rhs ) { return lhs.addr > rhs; } ); + if( it != s_kernelSym + s_kernelSymCnt ) + { + cb_data[0].name = CopyStringFast( it->name ); + cb_data[0].file = CopyStringFast( "" ); + cb_data[0].line = 0; + cb_data[0].symLen = 0; + cb_data[0].symAddr = it->addr; + return { cb_data, 1, it->mod ? it->mod : "" }; + } + } +#endif + + cb_data[0].name = CopyStringFast( "[unknown]" ); + cb_data[0].file = CopyStringFast( "" ); + cb_data[0].line = 0; + cb_data[0].symLen = 0; + cb_data[0].symAddr = 0; + return { cb_data, 1, "" }; } #elif TRACY_HAS_CALLSTACK == 5 +void InitCallstackCritical() +{ +} + void InitCallstack() { + ___tracy_init_demangle_buffer(); +} + +void EndCallstack() +{ + ___tracy_free_demangle_buffer(); } const char* DecodeCallstackPtrFast( uint64_t ptr ) @@ -693,12 +1004,7 @@ CallstackSymbolData DecodeSymbolAddress( uint64_t ptr ) Dl_info dlinfo; if( dladdr( (void*)ptr, &dlinfo ) ) symloc = dlinfo.dli_fname; if( !symloc ) symloc = "[unknown]"; - return CallstackSymbolData { symloc, 0, false }; -} - -CallstackSymbolData DecodeCodeAddress( uint64_t ptr ) -{ - return DecodeSymbolAddress( ptr ); + return CallstackSymbolData { symloc, 0, false, 0 }; } CallstackEntryData DecodeCallstackPtr( uint64_t ptr ) @@ -706,7 +1012,6 @@ CallstackEntryData DecodeCallstackPtr( uint64_t ptr ) static CallstackEntry cb; cb.line = 0; - char* demangled = nullptr; const char* symname = nullptr; const char* symloc = nullptr; auto vptr = (void*)ptr; @@ -720,17 +1025,8 @@ CallstackEntryData DecodeCallstackPtr( uint64_t ptr ) symname = dlinfo.dli_sname; symoff = (char*)ptr - (char*)dlinfo.dli_saddr; symaddr = dlinfo.dli_saddr; - - if( symname && symname[0] == '_' ) - { - size_t len = 0; - int status; - demangled = abi::__cxa_demangle( symname, nullptr, &len, &status ); - if( status == 0 ) - { - symname = demangled; - } - } + const char* demangled = ___tracy_demangle( symname ); + if( demangled ) symname = demangled; } if( !symname ) symname = "[unknown]"; @@ -738,13 +1034,14 @@ CallstackEntryData DecodeCallstackPtr( uint64_t ptr ) if( symoff == 0 ) { - cb.name = CopyString( symname ); + const auto len = std::min( strlen( symname ), std::numeric_limits::max() ); + cb.name = CopyString( symname, len ); } else { char buf[32]; const auto offlen = 
sprintf( buf, " + %td", symoff ); - const auto namelen = strlen( symname ); + const auto namelen = std::min( strlen( symname ), std::numeric_limits::max() - offlen ); auto name = (char*)tracy_malloc( namelen + offlen + 1 ); memcpy( name, symname, namelen ); memcpy( name + namelen, buf, offlen ); @@ -756,8 +1053,6 @@ CallstackEntryData DecodeCallstackPtr( uint64_t ptr ) cb.symLen = 0; cb.symAddr = (uint64_t)symaddr; - if( demangled ) free( demangled ); - return { &cb, 1, symloc }; } diff --git a/Source/ThirdParty/tracy/client/TracyCallstack.h b/Source/ThirdParty/tracy/client/TracyCallstack.h index 87d8ce721..2c7ecad9f 100644 --- a/Source/ThirdParty/tracy/client/TracyCallstack.h +++ b/Source/ThirdParty/tracy/client/TracyCallstack.h @@ -1,28 +1,35 @@ #ifndef __TRACYCALLSTACK_H__ #define __TRACYCALLSTACK_H__ -#if !defined _WIN32 && !defined __CYGWIN__ -# include -#endif +#ifndef TRACY_NO_CALLSTACK -#if defined _WIN32 || defined __CYGWIN__ -# define TRACY_HAS_CALLSTACK 1 -#elif defined __ANDROID__ -# if !defined __arm__ || __ANDROID_API__ >= 21 -# define TRACY_HAS_CALLSTACK 2 -# else -# define TRACY_HAS_CALLSTACK 5 +# if !defined _WIN32 +# include # endif -#elif defined __linux -# if defined _GNU_SOURCE && defined __GLIBC__ -# define TRACY_HAS_CALLSTACK 3 -# else -# define TRACY_HAS_CALLSTACK 2 + +# if defined _WIN32 +# include "../common/TracyUwp.hpp" +# ifndef TRACY_UWP +# define TRACY_HAS_CALLSTACK 1 +# endif +# elif defined __ANDROID__ +# if !defined __arm__ || __ANDROID_API__ >= 21 +# define TRACY_HAS_CALLSTACK 2 +# else +# define TRACY_HAS_CALLSTACK 5 +# endif +# elif defined __linux +# if defined _GNU_SOURCE && defined __GLIBC__ +# define TRACY_HAS_CALLSTACK 3 +# else +# define TRACY_HAS_CALLSTACK 2 +# endif +# elif defined __APPLE__ +# define TRACY_HAS_CALLSTACK 4 +# elif defined BSD +# define TRACY_HAS_CALLSTACK 6 # endif -#elif defined __APPLE__ -# define TRACY_HAS_CALLSTACK 4 -#elif defined BSD -# define TRACY_HAS_CALLSTACK 6 + #endif #endif diff --git a/Source/ThirdParty/tracy/client/TracyCallstack.hpp b/Source/ThirdParty/tracy/client/TracyCallstack.hpp index 923eccc04..8cfede8fb 100644 --- a/Source/ThirdParty/tracy/client/TracyCallstack.hpp +++ b/Source/ThirdParty/tracy/client/TracyCallstack.hpp @@ -12,6 +12,10 @@ #ifdef TRACY_HAS_CALLSTACK +#ifdef TRACY_DEBUGINFOD +# include +#endif + #include #include @@ -25,6 +29,7 @@ struct CallstackSymbolData const char* file; uint32_t line; bool needFree; + uint64_t symAddr; }; struct CallstackEntry @@ -44,19 +49,33 @@ struct CallstackEntryData }; CallstackSymbolData DecodeSymbolAddress( uint64_t ptr ); -CallstackSymbolData DecodeCodeAddress( uint64_t ptr ); const char* DecodeCallstackPtrFast( uint64_t ptr ); CallstackEntryData DecodeCallstackPtr( uint64_t ptr ); void InitCallstack(); +void InitCallstackCritical(); +void EndCallstack(); +const char* GetKernelModulePath( uint64_t addr ); + +#ifdef TRACY_DEBUGINFOD +const uint8_t* GetBuildIdForImage( const char* image, size_t& size ); +debuginfod_client* GetDebuginfodClient(); +#endif #if TRACY_HAS_CALLSTACK == 1 -TRACY_API uintptr_t* CallTrace( int depth ); +extern "C" +{ + typedef unsigned long (__stdcall *___tracy_t_RtlWalkFrameChain)( void**, unsigned long, unsigned long ); + TRACY_API extern ___tracy_t_RtlWalkFrameChain ___tracy_RtlWalkFrameChain; +} static tracy_force_inline void* Callstack( int depth ) { assert( depth >= 1 && depth < 63 ); - return CallTrace( depth ); + auto trace = (uintptr_t*)tracy_malloc( ( 1 + depth ) * sizeof( uintptr_t ) ); + const auto num = 
___tracy_RtlWalkFrameChain( (void**)( trace + 1 ), depth, 0 ); + *trace = num; + return trace; } #elif TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 5 diff --git a/Source/ThirdParty/tracy/client/TracyCpuid.hpp b/Source/ThirdParty/tracy/client/TracyCpuid.hpp new file mode 100644 index 000000000..9820be00b --- /dev/null +++ b/Source/ThirdParty/tracy/client/TracyCpuid.hpp @@ -0,0 +1,12 @@ +#ifndef __TRACYCPUID_HPP__ +#define __TRACYCPUID_HPP__ + +// Prior to GCC 11 the cpuid.h header did not have any include guards and thus +// including it more than once would cause a compiler error due to symbol +// redefinitions. In order to support older GCC versions, we have to wrap this +// include between custom include guards to prevent this issue. +// See also https://github.com/wolfpld/tracy/issues/452 + +#include + +#endif diff --git a/Source/ThirdParty/tracy/client/TracyDebug.hpp b/Source/ThirdParty/tracy/client/TracyDebug.hpp new file mode 100644 index 000000000..8723356f4 --- /dev/null +++ b/Source/ThirdParty/tracy/client/TracyDebug.hpp @@ -0,0 +1,11 @@ +#ifndef __TRACYPRINT_HPP__ +#define __TRACYPRINT_HPP__ + +#ifdef TRACY_VERBOSE +# include +# define TracyDebug(...) fprintf( stderr, __VA_ARGS__ ); +#else +# define TracyDebug(...) +#endif + +#endif diff --git a/Source/ThirdParty/tracy/client/TracyFastVector.hpp b/Source/ThirdParty/tracy/client/TracyFastVector.hpp index fc4108016..8cf1651eb 100644 --- a/Source/ThirdParty/tracy/client/TracyFastVector.hpp +++ b/Source/ThirdParty/tracy/client/TracyFastVector.hpp @@ -101,7 +101,7 @@ private: const auto size = size_t( m_write - m_ptr ); T* ptr = (T*)tracy_malloc( sizeof( T ) * cap ); memcpy( ptr, m_ptr, size * sizeof( T ) ); - tracy_free( m_ptr ); + tracy_free_fast( m_ptr ); m_ptr = ptr; m_write = m_ptr + size; m_end = m_ptr + cap; diff --git a/Source/ThirdParty/tracy/client/TracyLock.hpp b/Source/ThirdParty/tracy/client/TracyLock.hpp index e513cdc5d..296a41ba1 100644 --- a/Source/ThirdParty/tracy/client/TracyLock.hpp +++ b/Source/ThirdParty/tracy/client/TracyLock.hpp @@ -98,7 +98,6 @@ public: auto item = Profiler::QueueSerial(); MemWrite( &item->hdr.type, QueueType::LockRelease ); - MemWrite( &item->lockRelease.thread, GetThreadHandle() ); MemWrite( &item->lockRelease.id, m_id ); MemWrite( &item->lockRelease.time, Profiler::GetTime() ); Profiler::QueueSerialFinish(); @@ -313,7 +312,6 @@ public: auto item = Profiler::QueueSerial(); MemWrite( &item->hdr.type, QueueType::LockRelease ); - MemWrite( &item->lockRelease.thread, GetThreadHandle() ); MemWrite( &item->lockRelease.id, m_id ); MemWrite( &item->lockRelease.time, Profiler::GetTime() ); Profiler::QueueSerialFinish(); @@ -395,9 +393,9 @@ public: auto item = Profiler::QueueSerial(); MemWrite( &item->hdr.type, QueueType::LockSharedRelease ); - MemWrite( &item->lockRelease.thread, GetThreadHandle() ); - MemWrite( &item->lockRelease.id, m_id ); - MemWrite( &item->lockRelease.time, Profiler::GetTime() ); + MemWrite( &item->lockReleaseShared.thread, GetThreadHandle() ); + MemWrite( &item->lockReleaseShared.id, m_id ); + MemWrite( &item->lockReleaseShared.time, Profiler::GetTime() ); Profiler::QueueSerialFinish(); } diff --git a/Source/ThirdParty/tracy/client/TracyProfiler.cpp b/Source/ThirdParty/tracy/client/TracyProfiler.cpp index 5462a3573..46a9d36e4 100644 --- a/Source/ThirdParty/tracy/client/TracyProfiler.cpp +++ b/Source/ThirdParty/tracy/client/TracyProfiler.cpp @@ -9,24 +9,18 @@ # include # include # include +# include "../common/TracyUwp.hpp" #else # include # include #endif -#ifdef 
__CYGWIN__ -# include -# include -# include -#endif - #ifdef _GNU_SOURCE # include #endif #ifdef __linux__ # include -# include # include # include # include @@ -44,6 +38,7 @@ #ifdef __ANDROID__ # include +# include # include # include # include @@ -62,16 +57,20 @@ #include #include "../common/TracyAlign.hpp" +#include "../common/TracyAlloc.hpp" #include "../common/TracySocket.hpp" #include "../common/TracySystem.hpp" +#include "../common/TracyYield.hpp" #include "tracy_rpmalloc.hpp" #include "TracyCallstack.hpp" +#include "TracyDebug.hpp" #include "TracyScoped.hpp" #include "TracyProfiler.hpp" #include "TracyThread.hpp" #include "TracyArmCpuTable.hpp" #include "TracySysTrace.hpp" + #ifdef TRACY_PORT # ifndef TRACY_DATA_PORT # define TRACY_DATA_PORT TRACY_PORT @@ -91,7 +90,7 @@ # endif #endif -#if defined _WIN32 || defined __CYGWIN__ +#if defined _WIN32 # include extern "C" typedef LONG (WINAPI *t_RtlGetVersion)( PRTL_OSVERSIONINFOW ); extern "C" typedef BOOL (WINAPI *t_GetLogicalProcessorInformationEx)( LOGICAL_PROCESSOR_RELATIONSHIP, PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, PDWORD ); @@ -104,55 +103,156 @@ extern "C" typedef BOOL (WINAPI *t_GetLogicalProcessorInformationEx)( LOGICAL_PR # include #endif -#if !defined _WIN32 && !defined __CYGWIN__ && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) -# include +#if !defined _WIN32 && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) +# include "TracyCpuid.hpp" #endif -#if !( ( ( defined _WIN32 || defined __CYGWIN__ ) && _WIN32_WINNT >= _WIN32_WINNT_VISTA ) || defined __linux__ ) +#if !( ( defined _WIN32 && _WIN32_WINNT >= _WIN32_WINNT_VISTA ) || defined __linux__ ) # include #endif namespace tracy { -namespace +#ifdef __ANDROID__ +// Implementation helpers of EnsureReadable(address). +// This is so far only needed on Android, where it is common for libraries to be mapped +// with only executable, not readable, permissions. Typical example (line from /proc/self/maps): +/* +746b63b000-746b6dc000 --xp 00042000 07:48 35 /apex/com.android.runtime/lib64/bionic/libc.so +*/ +// See https://github.com/wolfpld/tracy/issues/125 . +// To work around this, we parse /proc/self/maps and we use mprotect to set read permissions +// on any mappings that contain symbols addresses hit by HandleSymbolCodeQuery. + +namespace { +// Holds some information about a single memory mapping. +struct MappingInfo { + // Start of address range. Inclusive. + uintptr_t start_address; + // End of address range. Exclusive, so the mapping is the half-open interval + // [start, end) and its length in bytes is `end - start`. As in /proc/self/maps. + uintptr_t end_address; + // Read/Write/Executable permissions. + bool perm_r, perm_w, perm_x; +}; +} // anonymous namespace + + // Internal implementation helper for LookUpMapping(address). + // + // Parses /proc/self/maps returning a vector. + // /proc/self/maps is assumed to be sorted by ascending address, so the resulting + // vector is sorted by ascending address too. 
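Note: the helpers that follow implement the EnsureReadable(address) scheme described in the comments above. A sketch of the intended call pattern, assuming the caller wants to copy instruction bytes for a symbol-code query; CopySymbolCode is a hypothetical wrapper, and the real caller (HandleSymbolCodeQuery) is outside this hunk:

    #include <cstdint>
    #include <cstring>

    static bool CopySymbolCode( uint64_t symAddr, size_t symLen, char* out )
    {
        // Without this, reading from an execute-only mapping (common for Android
        // system libraries) would fault inside the profiler.
        if( !EnsureReadable( (uintptr_t)symAddr ) ) return false;
        memcpy( out, (const void*)(uintptr_t)symAddr, symLen );
        return true;
    }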
+static std::vector ParseMappings() { -# if ( defined _WIN32 || defined __CYGWIN__ ) && _WIN32_WINNT >= _WIN32_WINNT_VISTA - BOOL CALLBACK InitOnceCallback( PINIT_ONCE /*initOnce*/, PVOID /*Parameter*/, PVOID* /*Context*/) + std::vector result; + FILE* file = fopen( "/proc/self/maps", "r" ); + if( !file ) return result; + char line[1024]; + while( fgets( line, sizeof( line ), file ) ) { - rpmalloc_initialize(); - return TRUE; + uintptr_t start_addr; + uintptr_t end_addr; + if( sscanf( line, "%lx-%lx", &start_addr, &end_addr ) != 2 ) continue; + char* first_space = strchr( line, ' ' ); + if( !first_space ) continue; + char* perm = first_space + 1; + char* second_space = strchr( perm, ' ' ); + if( !second_space || second_space - perm != 4 ) continue; + result.emplace_back(); + auto& mapping = result.back(); + mapping.start_address = start_addr; + mapping.end_address = end_addr; + mapping.perm_r = perm[0] == 'r'; + mapping.perm_w = perm[1] == 'w'; + mapping.perm_x = perm[2] == 'x'; } - INIT_ONCE InitOnce = INIT_ONCE_STATIC_INIT; -# elif defined __linux__ - void InitOnceCallback() - { - rpmalloc_initialize(); - } - pthread_once_t once_control = PTHREAD_ONCE_INIT; -# else - void InitOnceCallback() - { - rpmalloc_initialize(); - } - std::once_flag once_flag; -# endif + fclose( file ); + return result; } -struct RPMallocInit +// Internal implementation helper for LookUpMapping(address). +// +// Takes as input an `address` and a known vector `mappings`, assumed to be +// sorted by increasing addresses, as /proc/self/maps seems to be. +// Returns a pointer to the MappingInfo describing the mapping that this +// address belongs to, or nullptr if the address isn't in `mappings`. +static MappingInfo* LookUpMapping(std::vector& mappings, uintptr_t address) { - RPMallocInit() - { -# if ( defined _WIN32 || defined __CYGWIN__ ) && _WIN32_WINNT >= _WIN32_WINNT_VISTA - InitOnceExecuteOnce( &InitOnce, InitOnceCallback, nullptr, nullptr ); -# elif defined __linux__ - pthread_once( &once_control, InitOnceCallback ); -# else - std::call_once( once_flag, InitOnceCallback ); -# endif - rpmalloc_thread_initialize(); + // Comparison function for std::lower_bound. Returns true if all addresses in `m1` + // are lower than `addr`. + auto Compare = []( const MappingInfo& m1, uintptr_t addr ) { + // '<=' because the address ranges are half-open intervals, [start, end). + return m1.end_address <= addr; + }; + auto iter = std::lower_bound( mappings.begin(), mappings.end(), address, Compare ); + if( iter == mappings.end() || iter->start_address > address) { + return nullptr; } -}; + return &*iter; +} + +// Internal implementation helper for EnsureReadable(address). +// +// Takes as input an `address` and returns a pointer to a MappingInfo +// describing the mapping that this address belongs to, or nullptr if +// the address isn't in any known mapping. +// +// This function is stateful and not reentrant (assumes to be called from +// only one thread). It holds a vector of mappings parsed from /proc/self/maps. +// +// Attempts to react to mappings changes by re-parsing /proc/self/maps. +static MappingInfo* LookUpMapping(uintptr_t address) +{ + // Static state managed by this function. Not constant, we mutate that state as + // we turn some mappings readable. Initially parsed once here, updated as needed below. + static std::vector s_mappings = ParseMappings(); + MappingInfo* mapping = LookUpMapping( s_mappings, address ); + if( mapping ) return mapping; + + // This address isn't in any known mapping. 
Try parsing again, maybe + // mappings changed. + s_mappings = ParseMappings(); + return LookUpMapping( s_mappings, address ); +} + +// Internal implementation helper for EnsureReadable(address). +// +// Attempts to make the specified `mapping` readable if it isn't already. +// Returns true if and only if the mapping is readable. +static bool EnsureReadable( MappingInfo& mapping ) +{ + if( mapping.perm_r ) + { + // The mapping is already readable. + return true; + } + int prot = PROT_READ; + if( mapping.perm_w ) prot |= PROT_WRITE; + if( mapping.perm_x ) prot |= PROT_EXEC; + if( mprotect( reinterpret_cast( mapping.start_address ), + mapping.end_address - mapping.start_address, prot ) == -1 ) + { + // Failed to make the mapping readable. Shouldn't happen, hasn't + // been observed yet. If it happened in practice, we should consider + // adding a bool to MappingInfo to track this to avoid retrying mprotect + // everytime on such mappings. + return false; + } + // The mapping is now readable. Update `mapping` so the next call will be fast. + mapping.perm_r = true; + return true; +} + +// Attempts to set the read permission on the entire mapping containing the +// specified address. Returns true if and only if the mapping is now readable. +static bool EnsureReadable( uintptr_t address ) +{ + MappingInfo* mapping = LookUpMapping(address); + return mapping && EnsureReadable( *mapping ); +} + +#endif // defined __ANDROID__ #ifndef TRACY_DELAYED_INIT @@ -168,7 +268,7 @@ struct ProducerWrapper struct ThreadHandleWrapper { - uint64_t val; + uint32_t val; }; #endif @@ -177,7 +277,7 @@ struct ThreadHandleWrapper static inline void CpuId( uint32_t* regs, uint32_t leaf ) { memset(regs, 0, sizeof(uint32_t) * 4); -#if defined _WIN32 || defined __CYGWIN__ +#if defined _WIN32 __cpuidex( (int*)regs, leaf, 0 ); #else __get_cpuid( leaf, regs, regs+1, regs+2, regs+3 ); @@ -186,7 +286,7 @@ static inline void CpuId( uint32_t* regs, uint32_t leaf ) static void InitFailure( const char* msg ) { -#if defined _WIN32 || defined __CYGWIN__ +#if defined _WIN32 bool hasConsole = false; bool reopen = false; const auto attached = AttachConsole( ATTACH_PARENT_PROCESS ); @@ -214,33 +314,59 @@ static void InitFailure( const char* msg ) } else { +# ifndef TRACY_UWP MessageBoxA( nullptr, msg, "Tracy Profiler initialization failure", MB_ICONSTOP ); +# endif } #else fprintf( stderr, "Tracy Profiler initialization failure: %s\n", msg ); #endif - exit( 0 ); + exit( 1 ); } +static bool CheckHardwareSupportsInvariantTSC() +{ +#if defined TRACY_NO_INVARIANT_CHECK + return true; +#else + const char* noCheck = GetEnvVar( "TRACY_NO_INVARIANT_CHECK" ); + if( noCheck && noCheck[0] == '1' ) return true; + + uint32_t regs[4]; + CpuId( regs, 1 ); + if( !( regs[3] & ( 1 << 4 ) ) ) + { +#if !defined TRACY_TIMER_QPC && !defined TRACY_TIMER_FALLBACK + InitFailure( "CPU doesn't support RDTSC instruction." ); +#else + return false; +#endif + } + CpuId( regs, 0x80000007 ); + if( regs[3] & ( 1 << 8 ) ) return true; + + return false; +#endif +} + +#if defined TRACY_TIMER_FALLBACK && defined TRACY_HW_TIMER +bool HardwareSupportsInvariantTSC() +{ + static bool cachedResult = CheckHardwareSupportsInvariantTSC(); + return cachedResult; +} +#endif + static int64_t SetupHwTimer() { #if !defined TRACY_TIMER_QPC && !defined TRACY_TIMER_FALLBACK - uint32_t regs[4]; - CpuId( regs, 1 ); - if( !( regs[3] & ( 1 << 4 ) ) ) InitFailure( "CPU doesn't support RDTSC instruction." 
); -#if !defined TRACY_NO_INVARIANT_CHECK - CpuId( regs, 0x80000007 ); - if( !( regs[3] & ( 1 << 8 ) ) ) + if( !CheckHardwareSupportsInvariantTSC() ) { - const char* noCheck = getenv( "TRACY_NO_INVARIANT_CHECK" ); - if( !noCheck || noCheck[0] != '1' ) - { -#if defined _WIN32 || defined __CYGWIN__ - InitFailure( "CPU doesn't support invariant TSC.\nDefine TRACY_NO_INVARIANT_CHECK=1 to ignore this error, *if you know what you are doing*.\nAlternatively you may rebuild the application with the TRACY_TIMER_QPC or TRACY_TIMER_FALLBACK define to use lower resolution timer." ); +#if defined _WIN32 + InitFailure( "CPU doesn't support invariant TSC.\nDefine TRACY_NO_INVARIANT_CHECK=1 to ignore this error, *if you know what you are doing*.\nAlternatively you may rebuild the application with the TRACY_TIMER_QPC or TRACY_TIMER_FALLBACK define to use lower resolution timer." ); #else - InitFailure( "CPU doesn't support invariant TSC.\nDefine TRACY_NO_INVARIANT_CHECK=1 to ignore this error, *if you know what you are doing*.\nAlternatively you may rebuild the application with the TRACY_TIMER_FALLBACK define to use lower resolution timer." ); + InitFailure( "CPU doesn't support invariant TSC.\nDefine TRACY_NO_INVARIANT_CHECK=1 to ignore this error, *if you know what you are doing*.\nAlternatively you may rebuild the application with the TRACY_TIMER_FALLBACK define to use lower resolution timer." ); #endif - } } #endif #endif @@ -270,7 +396,7 @@ static const char* GetProcessName() auto buf = getprogname(); if( buf ) processName = buf; # endif -#elif defined _GNU_SOURCE || defined __CYGWIN__ +#elif defined __linux__ && defined _GNU_SOURCE if( program_invocation_short_name ) processName = program_invocation_short_name; #elif defined __APPLE__ || defined BSD auto buf = getprogname(); @@ -287,7 +413,7 @@ static const char* GetProcessExecutablePath() return buf; #elif defined __ANDROID__ return nullptr; -#elif defined _GNU_SOURCE || defined __CYGWIN__ +#elif defined __linux__ && defined _GNU_SOURCE return program_invocation_name; #elif defined __APPLE__ static char buf[1024]; @@ -341,13 +467,15 @@ static const char* GetHostInfo() { static char buf[1024]; auto ptr = buf; -#if defined _WIN32 || defined __CYGWIN__ - t_RtlGetVersion RtlGetVersion = (t_RtlGetVersion)GetProcAddress( GetModuleHandleA( "ntdll.dll" ), "RtlGetVersion" ); - if( !RtlGetVersion ) +#if defined _WIN32 +# ifdef TRACY_UWP + auto GetVersion = &::GetVersionEx; +# else + auto GetVersion = (t_RtlGetVersion)GetProcAddress( GetModuleHandleA( "ntdll.dll" ), "RtlGetVersion" ); +# endif + if( !GetVersion ) { -# ifdef __CYGWIN__ - ptr += sprintf( ptr, "OS: Windows (Cygwin)\n" ); -# elif defined __MINGW32__ +# ifdef __MINGW32__ ptr += sprintf( ptr, "OS: Windows (MingW)\n" ); # else ptr += sprintf( ptr, "OS: Windows\n" ); @@ -356,11 +484,9 @@ static const char* GetHostInfo() else { RTL_OSVERSIONINFOW ver = { sizeof( RTL_OSVERSIONINFOW ) }; - RtlGetVersion( &ver ); + GetVersion( &ver ); -# ifdef __CYGWIN__ - ptr += sprintf( ptr, "OS: Windows %i.%i.%i (Cygwin)\n", ver.dwMajorVersion, ver.dwMinorVersion, ver.dwBuildNumber ); -# elif defined __MINGW32__ +# ifdef __MINGW32__ ptr += sprintf( ptr, "OS: Windows %i.%i.%i (MingW)\n", (int)ver.dwMajorVersion, (int)ver.dwMinorVersion, (int)ver.dwBuildNumber ); # else ptr += sprintf( ptr, "OS: Windows %i.%i.%i\n", ver.dwMajorVersion, ver.dwMinorVersion, ver.dwBuildNumber ); @@ -403,21 +529,24 @@ static const char* GetHostInfo() #elif defined __clang__ ptr += sprintf( ptr, "Compiler: clang %i.%i.%i\n", __clang_major__, 
__clang_minor__, __clang_patchlevel__ ); #elif defined __GNUC__ - ptr += sprintf( ptr, "Compiler: gcc %i.%i\n", __GNUC__, __GNUC_MINOR__ ); + ptr += sprintf( ptr, "Compiler: gcc %i.%i.%i\n", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__ ); #else ptr += sprintf( ptr, "Compiler: unknown\n" ); #endif -#if defined _WIN32 || defined __CYGWIN__ -# ifndef __CYGWIN__ +#if defined _WIN32 InitWinSock(); -# endif + char hostname[512]; gethostname( hostname, 512 ); +# ifdef TRACY_UWP + const char* user = ""; +# else DWORD userSz = UNLEN+1; char user[UNLEN+1]; GetUserNameA( user, &userSz ); +# endif ptr += sprintf( ptr, "User: %s@%s\n", user, hostname ); #else @@ -456,7 +585,7 @@ static const char* GetHostInfo() #if defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 uint32_t regs[4]; - char cpuModel[4*4*3]; + char cpuModel[4*4*3+1] = {}; auto modelPtr = cpuModel; for( uint32_t i=0x80000002; i<0x80000005; ++i ) { @@ -526,10 +655,17 @@ static const char* GetHostInfo() #else ptr += sprintf( ptr, "CPU: unknown\n" ); #endif +#ifdef __ANDROID__ + char deviceModel[PROP_VALUE_MAX+1]; + char deviceManufacturer[PROP_VALUE_MAX+1]; + __system_property_get( "ro.product.model", deviceModel ); + __system_property_get( "ro.product.manufacturer", deviceManufacturer ); + ptr += sprintf( ptr, "Device: %s %s\n", deviceManufacturer, deviceModel ); +#endif ptr += sprintf( ptr, "CPU cores: %i\n", std::thread::hardware_concurrency() ); -#if defined _WIN32 || defined __CYGWIN__ +#if defined _WIN32 MEMORYSTATUSEX statex; statex.dwLength = sizeof( statex ); GlobalMemoryStatusEx( &statex ); @@ -561,7 +697,7 @@ static const char* GetHostInfo() static uint64_t GetPid() { -#if defined _WIN32 || defined __CYGWIN__ +#if defined _WIN32 return uint64_t( GetCurrentProcessId() ); #else return uint64_t( getpid() ); @@ -576,12 +712,12 @@ void Profiler::AckServerQuery() AppendDataUnsafe( &item, QueueDataSize[(int)QueueType::AckServerQueryNoop] ); } -void Profiler::AckSourceCodeNotAvailable() +void Profiler::AckSymbolCodeNotAvailable() { QueueItem item; - MemWrite( &item.hdr.type, QueueType::AckSourceCodeNotAvailable ); - NeedDataSize( QueueDataSize[(int)QueueType::AckSourceCodeNotAvailable] ); - AppendDataUnsafe( &item, QueueDataSize[(int)QueueType::AckSourceCodeNotAvailable] ); + MemWrite( &item.hdr.type, QueueType::AckSymbolCodeNotAvailable ); + NeedDataSize( QueueDataSize[(int)QueueType::AckSymbolCodeNotAvailable] ); + AppendDataUnsafe( &item, QueueDataSize[(int)QueueType::AckSymbolCodeNotAvailable] ); } static BroadcastMessage& GetBroadcastMessage( const char* procname, size_t pnsz, int& len, int port ) @@ -591,6 +727,7 @@ static BroadcastMessage& GetBroadcastMessage( const char* procname, size_t pnsz, msg.broadcastVersion = BroadcastVersion; msg.protocolVersion = ProtocolVersion; msg.listenPort = port; + msg.pid = GetPid(); memcpy( msg.programName, procname, pnsz ); memset( msg.programName + pnsz, 0, WelcomeMessageProgramNameSize - pnsz ); @@ -599,8 +736,9 @@ static BroadcastMessage& GetBroadcastMessage( const char* procname, size_t pnsz, return msg; } -#if defined _WIN32 || defined __CYGWIN__ +#if defined _WIN32 && !defined TRACY_UWP && !defined TRACY_NO_CRASH_HANDLER static DWORD s_profilerThreadId = 0; +static DWORD s_symbolThreadId = 0; static char s_crashText[1024]; LONG WINAPI CrashFilter( PEXCEPTION_POINTERS pExp ) @@ -659,10 +797,10 @@ LONG WINAPI CrashFilter( PEXCEPTION_POINTERS pExp ) { GetProfiler().SendCallstack( 60, "KiUserExceptionDispatcher" ); - TracyLfqPrepare( QueueType::CrashReport ); + 
TracyQueuePrepare( QueueType::CrashReport ); item->crashReport.time = Profiler::GetTime(); item->crashReport.text = (uint64_t)s_crashText; - TracyLfqCommit; + TracyQueueCommit( crashReportThread ); } HANDLE h = CreateToolhelp32Snapshot( TH32CS_SNAPTHREAD, 0 ); @@ -680,7 +818,7 @@ LONG WINAPI CrashFilter( PEXCEPTION_POINTERS pExp ) do { - if( te.th32OwnerProcessID == pid && te.th32ThreadID != tid && te.th32ThreadID != s_profilerThreadId ) + if( te.th32OwnerProcessID == pid && te.th32ThreadID != tid && te.th32ThreadID != s_profilerThreadId && te.th32ThreadID != s_symbolThreadId ) { HANDLE th = OpenThread( THREAD_SUSPEND_RESUME, FALSE, te.th32ThreadID ); if( th != INVALID_HANDLE_VALUE ) @@ -702,14 +840,30 @@ LONG WINAPI CrashFilter( PEXCEPTION_POINTERS pExp ) GetProfiler().RequestShutdown(); while( !GetProfiler().HasShutdownFinished() ) { std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); }; - TerminateProcess( GetCurrentProcess(), 1 ); - return EXCEPTION_CONTINUE_SEARCH; } #endif -#ifdef __linux__ +static Profiler* s_instance = nullptr; +static Thread* s_thread; +#ifndef TRACY_NO_FRAME_IMAGE +static Thread* s_compressThread; +#endif +#ifdef TRACY_HAS_CALLSTACK +static Thread* s_symbolThread; +std::atomic s_symbolThreadGone { false }; +#endif +#ifdef TRACY_HAS_SYSTEM_TRACING +static Thread* s_sysTraceThread = nullptr; +#endif + +#if defined __linux__ && !defined TRACY_NO_CRASH_HANDLER +# ifndef TRACY_CRASH_SIGNAL +# define TRACY_CRASH_SIGNAL SIGPWR +# endif + static long s_profilerTid = 0; +static long s_symbolTid = 0; static char s_crashText[1024]; static std::atomic s_alreadyCrashed( false ); @@ -898,10 +1052,10 @@ static void CrashHandler( int signal, siginfo_t* info, void* /*ucontext*/ ) { GetProfiler().SendCallstack( 60, "__kernel_rt_sigreturn" ); - TracyLfqPrepare( QueueType::CrashReport ); + TracyQueuePrepare( QueueType::CrashReport ); item->crashReport.time = Profiler::GetTime(); item->crashReport.text = (uint64_t)s_crashText; - TracyLfqCommit; + TracyQueueCommit( crashReportThread ); } DIR* dp = opendir( "/proc/self/task" ); @@ -914,17 +1068,17 @@ static void CrashHandler( int signal, siginfo_t* info, void* /*ucontext*/ ) { if( ep->d_name[0] == '.' 
) continue; int tid = atoi( ep->d_name ); - if( tid != selfTid && tid != s_profilerTid ) + if( tid != selfTid && tid != s_profilerTid && tid != s_symbolTid ) { - syscall( SYS_tkill, tid, SIGPWR ); + syscall( SYS_tkill, tid, TRACY_CRASH_SIGNAL ); } } closedir( dp ); - { - TracyLfqPrepare( QueueType::Crash ); - TracyLfqCommit; - } + if( selfTid == s_symbolTid ) s_symbolThreadGone.store( true, std::memory_order_release ); + + TracyLfqPrepare( QueueType::Crash ); + TracyLfqCommit; std::this_thread::sleep_for( std::chrono::milliseconds( 500 ) ); GetProfiler().RequestShutdown(); @@ -937,18 +1091,9 @@ static void CrashHandler( int signal, siginfo_t* info, void* /*ucontext*/ ) enum { QueuePrealloc = 256 * 1024 }; -static Profiler* s_instance = nullptr; -static Thread* s_thread; - -#ifdef TRACY_HAS_SYSTEM_TRACING -static Thread* s_sysTraceThread = nullptr; -#endif - -TRACY_API bool ProfilerAvailable() { return s_instance != nullptr; } - TRACY_API int64_t GetFrequencyQpc() { -#if defined _WIN32 || defined __CYGWIN__ +#if defined _WIN32 LARGE_INTEGER t; QueryPerformanceFrequency( &t ); return t.QuadPart; @@ -960,18 +1105,10 @@ TRACY_API int64_t GetFrequencyQpc() #ifdef TRACY_DELAYED_INIT struct ThreadNameData; TRACY_API moodycamel::ConcurrentQueue& GetQueue(); -TRACY_API void InitRPMallocThread(); - -void InitRPMallocThread() -{ - RPMallocInit rpinit; - rpmalloc_thread_initialize(); -} struct ProfilerData { int64_t initTime = SetupHwTimer(); - RPMallocInit rpmalloc_init; moodycamel::ConcurrentQueue queue; Profiler profiler; std::atomic lockCounter { 0 }; @@ -989,7 +1126,6 @@ struct ProducerWrapper struct ProfilerThreadData { ProfilerThreadData( ProfilerData& data ) : token( data ), gpuCtx( { nullptr } ) {} - RPMallocInit rpmalloc_init; ProducerWrapper token; GpuCtxWrapper gpuCtx; # ifdef TRACY_ON_DEMAND @@ -997,23 +1133,34 @@ struct ProfilerThreadData # endif }; +std::atomic RpInitDone { 0 }; +std::atomic RpInitLock { 0 }; +thread_local bool RpThreadInitDone = false; +thread_local bool RpThreadShutdown = false; + # ifdef TRACY_MANUAL_LIFETIME ProfilerData* s_profilerData = nullptr; +static ProfilerThreadData& GetProfilerThreadData(); TRACY_API void StartupProfiler() { - s_profilerData = new ProfilerData; + s_profilerData = (ProfilerData*)tracy_malloc( sizeof( ProfilerData ) ); + new (s_profilerData) ProfilerData(); s_profilerData->profiler.SpawnWorkerThreads(); + GetProfilerThreadData().token = ProducerWrapper( *s_profilerData ); } static ProfilerData& GetProfilerData() { - assert(s_profilerData); + assert( s_profilerData ); return *s_profilerData; } TRACY_API void ShutdownProfiler() { - delete s_profilerData; + s_profilerData->~ProfilerData(); + tracy_free( s_profilerData ); s_profilerData = nullptr; rpmalloc_finalize(); + RpThreadInitDone = false; + RpInitDone.store( 0, std::memory_order_release ); } # else static std::atomic profilerDataLock { 0 }; @@ -1025,11 +1172,11 @@ static ProfilerData& GetProfilerData() if( !ptr ) { int expected = 0; - while( !profilerDataLock.compare_exchange_strong( expected, 1, std::memory_order_release, std::memory_order_relaxed ) ) { expected = 0; } + while( !profilerDataLock.compare_exchange_weak( expected, 1, std::memory_order_release, std::memory_order_relaxed ) ) { expected = 0; YieldThread(); } ptr = profilerData.load( std::memory_order_acquire ); if( !ptr ) { - ptr = (ProfilerData*)malloc( sizeof( ProfilerData ) ); + ptr = (ProfilerData*)tracy_malloc( sizeof( ProfilerData ) ); new (ptr) ProfilerData(); profilerData.store( ptr, std::memory_order_release ); } @@ 
-1039,11 +1186,60 @@ static ProfilerData& GetProfilerData() } # endif +// GCC prior to 8.4 had a bug with function-inline thread_local variables. Versions of glibc beginning with +// 2.18 may attempt to work around this issue, which manifests as a crash while running static destructors +// if this function is compiled into a shared object. Unfortunately, centos7 ships with glibc 2.17. If running +// on old GCC, use the old-fashioned way as a workaround +// See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85400 +#if !defined(__clang__) && defined(__GNUC__) && ((__GNUC__ < 8) || ((__GNUC__ == 8) && (__GNUC_MINOR__ < 4))) +struct ProfilerThreadDataKey +{ +public: + ProfilerThreadDataKey() + { + int val = pthread_key_create(&m_key, sDestructor); + static_cast(val); // unused + assert(val == 0); + } + ~ProfilerThreadDataKey() + { + int val = pthread_key_delete(m_key); + static_cast(val); // unused + assert(val == 0); + } + ProfilerThreadData& get() + { + void* p = pthread_getspecific(m_key); + if (!p) + { + p = (ProfilerThreadData*)tracy_malloc( sizeof( ProfilerThreadData ) ); + new (p) ProfilerThreadData(GetProfilerData()); + pthread_setspecific(m_key, p); + } + return *static_cast(p); + } +private: + pthread_key_t m_key; + + static void sDestructor(void* p) + { + ((ProfilerThreadData*)p)->~ProfilerThreadData(); + tracy_free(p); + } +}; + +static ProfilerThreadData& GetProfilerThreadData() +{ + static ProfilerThreadDataKey key; + return key.get(); +} +#else static ProfilerThreadData& GetProfilerThreadData() { thread_local ProfilerThreadData data( GetProfilerData() ); return data; } +#endif TRACY_API moodycamel::ConcurrentQueue::ExplicitProducer* GetToken() { return GetProfilerThreadData().token.ptr; } TRACY_API Profiler& GetProfiler() { return GetProfilerData().profiler; } @@ -1052,7 +1248,7 @@ TRACY_API int64_t GetInitTime() { return GetProfilerData().initTime; } TRACY_API std::atomic& GetLockCounter() { return GetProfilerData().lockCounter; } TRACY_API std::atomic& GetGpuCtxCounter() { return GetProfilerData().gpuCtxCounter; } TRACY_API GpuCtxWrapper& GetGpuCtx() { return GetProfilerThreadData().gpuCtx; } -TRACY_API uint64_t GetThreadHandle() { return detail::GetThreadHandleImpl(); } +TRACY_API uint32_t GetThreadHandle() { return detail::GetThreadHandleImpl(); } std::atomic& GetThreadNameData() { return GetProfilerData().threadNameData; } # ifdef TRACY_ON_DEMAND @@ -1067,18 +1263,12 @@ namespace # endif #else -TRACY_API void InitRPMallocThread() -{ - rpmalloc_thread_initialize(); -} // MSVC static initialization order solution. gcc/clang uses init_order() to avoid all this. // 1a. But s_queue is needed for initialization of variables in point 2. extern moodycamel::ConcurrentQueue s_queue; -thread_local RPMallocInit init_order(106) s_rpmalloc_thread_init; - // 2. If these variables would be in the .CRT$XCB section, they would be initialized only in main thread. 
thread_local moodycamel::ProducerToken init_order(107) s_token_detail( s_queue ); thread_local ProducerWrapper init_order(108) s_token { s_queue.get_explicit_producer( s_token_detail ) }; @@ -1091,7 +1281,10 @@ thread_local ThreadHandleWrapper init_order(104) s_threadHandle { detail::GetThr # endif static InitTimeWrapper init_order(101) s_initTime { SetupHwTimer() }; -static RPMallocInit init_order(102) s_rpmalloc_init; +std::atomic init_order(102) RpInitDone( 0 ); +std::atomic init_order(102) RpInitLock( 0 ); +thread_local bool RpThreadInitDone = false; +thread_local bool RpThreadShutdown = false; moodycamel::ConcurrentQueue init_order(103) s_queue( QueuePrealloc ); std::atomic init_order(104) s_lockCounter( 0 ); std::atomic init_order(104) s_gpuCtxCounter( 0 ); @@ -1115,12 +1308,7 @@ TRACY_API int64_t GetInitTime() { return s_initTime.val; } TRACY_API std::atomic& GetLockCounter() { return s_lockCounter; } TRACY_API std::atomic& GetGpuCtxCounter() { return s_gpuCtxCounter; } TRACY_API GpuCtxWrapper& GetGpuCtx() { return s_gpuCtx; } -# ifdef __CYGWIN__ -// Hackfix for cygwin reporting memory frees without matching allocations. WTF? -TRACY_API uint64_t GetThreadHandle() { return detail::GetThreadHandleImpl(); } -# else -TRACY_API uint64_t GetThreadHandle() { return s_threadHandle.val; } -# endif +TRACY_API uint32_t GetThreadHandle() { return s_threadHandle.val; } std::atomic& GetThreadNameData() { return s_threadNameData; } @@ -1129,6 +1317,9 @@ TRACY_API LuaZoneState& GetLuaZoneState() { return s_luaZoneState; } # endif #endif +TRACY_API bool ProfilerAvailable() { return s_instance != nullptr; } +TRACY_API bool ProfilerAllocatorAvailable() { return !RpThreadShutdown; } + Profiler::Profiler() : m_timeBegin( 0 ) , m_mainThread( detail::GetThreadHandleImpl() ) @@ -1149,6 +1340,11 @@ Profiler::Profiler() , m_lz4Buf( (char*)tracy_malloc( LZ4Size + sizeof( lz4sz_t ) ) ) , m_serialQueue( 1024*1024 ) , m_serialDequeue( 1024*1024 ) +#ifndef TRACY_NO_FRAME_IMAGE + , m_fiQueue( 16 ) + , m_fiDequeue( 16 ) +#endif + , m_symbolQueue( 8*1024 ) , m_frameCount( 0 ) , m_isConnected( false ) #ifdef TRACY_ON_DEMAND @@ -1156,7 +1352,10 @@ Profiler::Profiler() , m_deferredQueue( 64*1024 ) #endif , m_paramCallback( nullptr ) + , m_sourceCallback( nullptr ) + , m_queryImage( nullptr ) , m_queryData( nullptr ) + , m_crashHandlerInstalled( false ) { assert( !s_instance ); s_instance = this; @@ -1175,14 +1374,14 @@ Profiler::Profiler() ReportTopology(); #ifndef TRACY_NO_EXIT - const char* noExitEnv = getenv( "TRACY_NO_EXIT" ); + const char* noExitEnv = GetEnvVar( "TRACY_NO_EXIT" ); if( noExitEnv && noExitEnv[0] == '1' ) { m_noExit = true; } #endif - const char* userPort = getenv( "TRACY_PORT" ); + const char* userPort = GetEnvVar( "TRACY_PORT" ); if( userPort ) { m_userPort = atoi( userPort ); @@ -1195,9 +1394,6 @@ Profiler::Profiler() void Profiler::SpawnWorkerThreads() { - s_thread = (Thread*)tracy_malloc( sizeof( Thread ) ); - new(s_thread) Thread( LaunchWorker, this ); - #ifdef TRACY_HAS_SYSTEM_TRACING if( SysTraceStart( m_samplingPeriod ) ) { @@ -1207,29 +1403,47 @@ void Profiler::SpawnWorkerThreads() } #endif -#if defined _WIN32 || defined __CYGWIN__ - s_profilerThreadId = GetThreadId( s_thread->Handle() ); - AddVectoredExceptionHandler( 1, CrashFilter ); + s_thread = (Thread*)tracy_malloc( sizeof( Thread ) ); + new(s_thread) Thread( LaunchWorker, this ); + +#ifndef TRACY_NO_FRAME_IMAGE + s_compressThread = (Thread*)tracy_malloc( sizeof( Thread ) ); + new(s_compressThread) Thread( LaunchCompressWorker, this 
); #endif -#ifdef __linux__ +#ifdef TRACY_HAS_CALLSTACK + s_symbolThread = (Thread*)tracy_malloc( sizeof( Thread ) ); + new(s_symbolThread) Thread( LaunchSymbolWorker, this ); +#endif + +#if defined _WIN32 && !defined TRACY_UWP && !defined TRACY_NO_CRASH_HANDLER + s_profilerThreadId = GetThreadId( s_thread->Handle() ); + s_symbolThreadId = GetThreadId( s_symbolThread->Handle() ); + m_exceptionHandler = AddVectoredExceptionHandler( 1, CrashFilter ); +#endif + +#if defined __linux__ && !defined TRACY_NO_CRASH_HANDLER struct sigaction threadFreezer = {}; threadFreezer.sa_handler = ThreadFreezer; - sigaction( SIGPWR, &threadFreezer, nullptr ); + sigaction( TRACY_CRASH_SIGNAL, &threadFreezer, &m_prevSignal.pwr ); struct sigaction crashHandler = {}; crashHandler.sa_sigaction = CrashHandler; crashHandler.sa_flags = SA_SIGINFO; - sigaction( SIGILL, &crashHandler, nullptr ); - sigaction( SIGFPE, &crashHandler, nullptr ); - sigaction( SIGSEGV, &crashHandler, nullptr ); - sigaction( SIGPIPE, &crashHandler, nullptr ); - sigaction( SIGBUS, &crashHandler, nullptr ); - sigaction( SIGABRT, &crashHandler, nullptr ); + sigaction( SIGILL, &crashHandler, &m_prevSignal.ill ); + sigaction( SIGFPE, &crashHandler, &m_prevSignal.fpe ); + sigaction( SIGSEGV, &crashHandler, &m_prevSignal.segv ); + sigaction( SIGPIPE, &crashHandler, &m_prevSignal.pipe ); + sigaction( SIGBUS, &crashHandler, &m_prevSignal.bus ); + sigaction( SIGABRT, &crashHandler, &m_prevSignal.abrt ); +#endif + +#ifndef TRACY_NO_CRASH_HANDLER + m_crashHandlerInstalled = true; #endif #ifdef TRACY_HAS_CALLSTACK - InitCallstack(); + InitCallstackCritical(); #endif m_timeBegin.store( GetTime(), std::memory_order_relaxed ); @@ -1239,6 +1453,23 @@ Profiler::~Profiler() { m_shutdown.store( true, std::memory_order_relaxed ); +#if defined _WIN32 && !defined TRACY_UWP + if( m_crashHandlerInstalled ) RemoveVectoredExceptionHandler( m_exceptionHandler ); +#endif + +#ifdef __linux__ + if( m_crashHandlerInstalled ) + { + sigaction( TRACY_CRASH_SIGNAL, &m_prevSignal.pwr, nullptr ); + sigaction( SIGILL, &m_prevSignal.ill, nullptr ); + sigaction( SIGFPE, &m_prevSignal.fpe, nullptr ); + sigaction( SIGSEGV, &m_prevSignal.segv, nullptr ); + sigaction( SIGPIPE, &m_prevSignal.pipe, nullptr ); + sigaction( SIGBUS, &m_prevSignal.bus, nullptr ); + sigaction( SIGABRT, &m_prevSignal.abrt, nullptr ); + } +#endif + #ifdef TRACY_HAS_SYSTEM_TRACING if( s_sysTraceThread ) { @@ -1248,9 +1479,23 @@ Profiler::~Profiler() } #endif +#ifdef TRACY_HAS_CALLSTACK + s_symbolThread->~Thread(); + tracy_free( s_symbolThread ); +#endif + +#ifndef TRACY_NO_FRAME_IMAGE + s_compressThread->~Thread(); + tracy_free( s_compressThread ); +#endif + s_thread->~Thread(); tracy_free( s_thread ); +#ifdef TRACY_HAS_CALLSTACK + EndCallstack(); +#endif + tracy_free( m_lz4Buf ); tracy_free( m_buffer ); LZ4_freeStream( (LZ4_stream_t*)m_stream ); @@ -1301,7 +1546,9 @@ void Profiler::Worker() while( m_timeBegin.load( std::memory_order_relaxed ) == 0 ) std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); +#ifdef TRACY_USE_RPMALLOC rpmalloc_thread_initialize(); +#endif m_exectime = 0; const auto execname = GetProcessExecutablePath(); @@ -1322,16 +1569,22 @@ void Profiler::Worker() const uint64_t pid = GetPid(); -#ifdef TRACY_ON_DEMAND - uint8_t onDemand = 1; -#else - uint8_t onDemand = 0; -#endif + uint8_t flags = 0; +#ifdef TRACY_ON_DEMAND + flags |= WelcomeFlag::OnDemand; +#endif #ifdef __APPLE__ - uint8_t isApple = 1; -#else - uint8_t isApple = 0; + flags |= WelcomeFlag::IsApple; +#endif +#ifndef 
TRACY_NO_CODE_TRANSFER + flags |= WelcomeFlag::CodeTransfer; +#endif +#ifdef _WIN32 + flags |= WelcomeFlag::CombineSamples; +# ifndef TRACY_NO_CONTEXT_SWITCH + flags |= WelcomeFlag::IdentifySamples; +# endif #endif #if defined __i386 || defined _M_IX86 @@ -1346,12 +1599,6 @@ void Profiler::Worker() uint8_t cpuArch = CpuArchUnknown; #endif -#ifdef TRACY_NO_CODE_TRANSFER - uint8_t codeTransfer = 0; -#else - uint8_t codeTransfer = 1; -#endif - #if defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 uint32_t regs[4]; char manufacturer[12]; @@ -1377,10 +1624,8 @@ void Profiler::Worker() MemWrite( &welcome.exectime, m_exectime ); MemWrite( &welcome.pid, pid ); MemWrite( &welcome.samplingPeriod, m_samplingPeriod ); - MemWrite( &welcome.onDemand, onDemand ); - MemWrite( &welcome.isApple, isApple ); + MemWrite( &welcome.flags, flags ); MemWrite( &welcome.cpuArch, cpuArch ); - MemWrite( &welcome.codeTransfer, codeTransfer ); memcpy( welcome.cpuManufacturer, manufacturer, 12 ); MemWrite( &welcome.cpuId, cpuId ); memcpy( welcome.programName, procname, pnsz ); @@ -1606,7 +1851,7 @@ void Profiler::Worker() keepAlive = 0; } - else + else if( !m_sock->HasData() ) { keepAlive++; std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); @@ -1618,9 +1863,10 @@ void Profiler::Worker() } bool connActive = true; - while( m_sock->HasData() && connActive ) + while( m_sock->HasData() ) { connActive = HandleServerQuery(); + if( !connActive ) break; } if( !connActive ) break; } @@ -1681,6 +1927,11 @@ void Profiler::Worker() } // End of connections loop + // Wait for symbols thread to terminate. Symbol resolution will continue in this thread. +#ifdef TRACY_HAS_CALLSTACK + while( s_symbolThreadGone.load() == false ) { YieldThread(); } +#endif + // Client is exiting. Send items remaining in queues. 
for(;;) { @@ -1705,6 +1956,16 @@ void Profiler::Worker() return; } } + +#ifdef TRACY_HAS_CALLSTACK + for(;;) + { + auto si = m_symbolQueue.front(); + if( !si ) break; + HandleSymbolQueueItem( *si ); + m_symbolQueue.pop(); + } +#endif } // Send client termination notice to the server @@ -1718,35 +1979,113 @@ void Profiler::Worker() // Handle remaining server queries for(;;) { - if( m_sock->HasData() ) + while( m_sock->HasData() ) { - while( m_sock->HasData() ) + if( !HandleServerQuery() ) { - if( !HandleServerQuery() ) - { - m_shutdownFinished.store( true, std::memory_order_relaxed ); - return; - } - } - while( Dequeue( token ) == DequeueStatus::DataDequeued ) {} - while( DequeueSerial() == DequeueStatus::DataDequeued ) {} - if( m_bufferOffset != m_bufferStart ) - { - if( !CommitData() ) - { - m_shutdownFinished.store( true, std::memory_order_relaxed ); - return; - } + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; } } - else +#ifdef TRACY_HAS_CALLSTACK + for(;;) { - if( m_bufferOffset != m_bufferStart ) CommitData(); - std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + auto si = m_symbolQueue.front(); + if( !si ) break; + HandleSymbolQueueItem( *si ); + m_symbolQueue.pop(); + } +#endif + const auto status = Dequeue( token ); + const auto serialStatus = DequeueSerial(); + if( status == DequeueStatus::ConnectionLost || serialStatus == DequeueStatus::ConnectionLost ) + { + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; + } + if( m_bufferOffset != m_bufferStart ) + { + if( !CommitData() ) + { + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; + } } } } +#ifndef TRACY_NO_FRAME_IMAGE +void Profiler::CompressWorker() +{ + ThreadExitHandler threadExitHandler; + SetThreadName( "Tracy DXT1" ); + while( m_timeBegin.load( std::memory_order_relaxed ) == 0 ) std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + +#ifdef TRACY_USE_RPMALLOC + rpmalloc_thread_initialize(); +#endif + + for(;;) + { + const auto shouldExit = ShouldExit(); + + { + bool lockHeld = true; + while( !m_fiLock.try_lock() ) + { + if( m_shutdownManual.load( std::memory_order_relaxed ) ) + { + lockHeld = false; + break; + } + } + if( !m_fiQueue.empty() ) m_fiQueue.swap( m_fiDequeue ); + if( lockHeld ) + { + m_fiLock.unlock(); + } + } + + const auto sz = m_fiDequeue.size(); + if( sz > 0 ) + { + auto fi = m_fiDequeue.data(); + auto end = fi + sz; + while( fi != end ) + { + const auto w = fi->w; + const auto h = fi->h; + const auto csz = size_t( w * h / 2 ); + auto etc1buf = (char*)tracy_malloc( csz ); + CompressImageDxt1( (const char*)fi->image, etc1buf, w, h ); + tracy_free( fi->image ); + + TracyLfqPrepare( QueueType::FrameImage ); + MemWrite( &item->frameImageFat.image, (uint64_t)etc1buf ); + MemWrite( &item->frameImageFat.frame, fi->frame ); + MemWrite( &item->frameImageFat.w, w ); + MemWrite( &item->frameImageFat.h, h ); + uint8_t flip = fi->flip; + MemWrite( &item->frameImageFat.flip, flip ); + TracyLfqCommit; + + fi++; + } + m_fiDequeue.clear(); + } + else + { + std::this_thread::sleep_for( std::chrono::milliseconds( 20 ) ); + } + + if( shouldExit ) + { + return; + } + } +} +#endif + static void FreeAssociatedMemory( const QueueItem& item ) { if( item.hdr.idx >= (int)QueueType::Terminate ) return; @@ -1796,6 +2135,7 @@ static void FreeAssociatedMemory( const QueueItem& item ) tracy_free( (void*)ptr ); break; case QueueType::CallstackSample: + case QueueType::CallstackSampleContextSwitch: ptr = MemRead( &item.callstackSampleFat.ptr ); 
tracy_free( (void*)ptr ); break; @@ -1803,6 +2143,36 @@ static void FreeAssociatedMemory( const QueueItem& item ) ptr = MemRead( &item.frameImageFat.image ); tracy_free( (void*)ptr ); break; +#ifdef TRACY_HAS_CALLSTACK + case QueueType::CallstackFrameSize: + { + InitRpmalloc(); + auto size = MemRead( &item.callstackFrameSizeFat.size ); + auto data = (const CallstackEntry*)MemRead( &item.callstackFrameSizeFat.data ); + for( uint8_t i=0; i( &item.symbolInformationFat.needFree ); + if( needFree ) + { + ptr = MemRead( &item.symbolInformationFat.fileString ); + tracy_free( (void*)ptr ); + } + break; + } + case QueueType::SymbolCodeMetadata: + ptr = MemRead( &item.symbolCodeMetadata.ptr ); + tracy_free( (void*)ptr ); + break; +#endif #ifndef TRACY_ON_DEMAND case QueueType::LockName: ptr = MemRead( &item.lockNameFat.name ); @@ -1819,6 +2189,18 @@ static void FreeAssociatedMemory( const QueueItem& item ) // Don't free memory associated with deferred messages. break; #endif +#ifdef TRACY_HAS_SYSTEM_TRACING + case QueueType::ExternalNameMetadata: + ptr = MemRead( &item.externalNameMetadata.name ); + tracy_free( (void*)ptr ); + ptr = MemRead( &item.externalNameMetadata.threadName ); + tracy_free_fast( (void*)ptr ); + break; +#endif + case QueueType::SourceCodeMetadata: + ptr = MemRead( &item.sourceCodeMetadata.ptr ); + tracy_free( (void*)ptr ); + break; default: break; } @@ -1861,21 +2243,14 @@ Profiler::DequeueStatus Profiler::Dequeue( moodycamel::ConsumerToken& token ) { bool connectionLost = false; const auto sz = GetQueue().try_dequeue_bulk_single( token, - [this, &connectionLost] ( const uint64_t& threadId ) + [this, &connectionLost] ( const uint32_t& threadId ) { - if( threadId != m_threadCtx ) - { - QueueItem item; - MemWrite( &item.hdr.type, QueueType::ThreadContext ); - MemWrite( &item.threadCtx.thread, threadId ); - if( !AppendData( &item, QueueDataSize[(int)QueueType::ThreadContext] ) ) connectionLost = true; - m_threadCtx = threadId; - m_refTimeThread = 0; - } + if( ThreadCtxCheck( threadId ) == ThreadCtxStatus::ConnectionLost ) connectionLost = true; }, [this, &connectionLost] ( QueueItem* item, size_t sz ) { if( connectionLost ) return; + InitRpmalloc(); assert( sz > 0 ); int64_t refThread = m_refTimeThread; int64_t refCtx = m_refTimeCtx; @@ -1894,28 +2269,28 @@ Profiler::DequeueStatus Profiler::Dequeue( moodycamel::ConsumerToken& token ) ptr = MemRead( &item->zoneTextFat.text ); size = MemRead( &item->zoneTextFat.size ); SendSingleString( (const char*)ptr, size ); - tracy_free( (void*)ptr ); + tracy_free_fast( (void*)ptr ); break; case QueueType::Message: case QueueType::MessageCallstack: ptr = MemRead( &item->messageFat.text ); size = MemRead( &item->messageFat.size ); SendSingleString( (const char*)ptr, size ); - tracy_free( (void*)ptr ); + tracy_free_fast( (void*)ptr ); break; case QueueType::MessageColor: case QueueType::MessageColorCallstack: ptr = MemRead( &item->messageColorFat.text ); size = MemRead( &item->messageColorFat.size ); SendSingleString( (const char*)ptr, size ); - tracy_free( (void*)ptr ); + tracy_free_fast( (void*)ptr ); break; case QueueType::MessageAppInfo: ptr = MemRead( &item->messageFat.text ); size = MemRead( &item->messageFat.size ); SendSingleString( (const char*)ptr, size ); #ifndef TRACY_ON_DEMAND - tracy_free( (void*)ptr ); + tracy_free_fast( (void*)ptr ); #endif break; case QueueType::ZoneBeginAllocSrcLoc: @@ -1927,13 +2302,13 @@ Profiler::DequeueStatus Profiler::Dequeue( moodycamel::ConsumerToken& token ) MemWrite( &item->zoneBegin.time, dt ); ptr = 
MemRead( &item->zoneBegin.srcloc ); SendSourceLocationPayload( ptr ); - tracy_free( (void*)ptr ); + tracy_free_fast( (void*)ptr ); break; } case QueueType::Callstack: ptr = MemRead( &item->callstackFat.ptr ); SendCallstackPayload( ptr ); - tracy_free( (void*)ptr ); + tracy_free_fast( (void*)ptr ); break; case QueueType::CallstackAlloc: ptr = MemRead( &item->callstackAllocFat.nativePtr ); @@ -1941,17 +2316,18 @@ Profiler::DequeueStatus Profiler::Dequeue( moodycamel::ConsumerToken& token ) { CutCallstack( (void*)ptr, "lua_pcall" ); SendCallstackPayload( ptr ); - tracy_free( (void*)ptr ); + tracy_free_fast( (void*)ptr ); } ptr = MemRead( &item->callstackAllocFat.ptr ); SendCallstackAlloc( ptr ); - tracy_free( (void*)ptr ); + tracy_free_fast( (void*)ptr ); break; case QueueType::CallstackSample: + case QueueType::CallstackSampleContextSwitch: { ptr = MemRead( &item->callstackSampleFat.ptr ); SendCallstackPayload64( ptr ); - tracy_free( (void*)ptr ); + tracy_free_fast( (void*)ptr ); int64_t t = MemRead( &item->callstackSampleFat.time ); int64_t dt = t - refCtx; refCtx = t; @@ -1965,7 +2341,7 @@ Profiler::DequeueStatus Profiler::Dequeue( moodycamel::ConsumerToken& token ) const auto h = MemRead( &item->frameImageFat.h ); const auto csz = size_t( w * h / 2 ); SendLongString( ptr, (const char*)ptr, csz, QueueType::FrameImageData ); - tracy_free( (void*)ptr ); + tracy_free_fast( (void*)ptr ); break; } case QueueType::ZoneBegin: @@ -2003,7 +2379,7 @@ Profiler::DequeueStatus Profiler::Dequeue( moodycamel::ConsumerToken& token ) MemWrite( &item->gpuZoneBegin.cpuTime, dt ); ptr = MemRead( &item->gpuZoneBegin.srcloc ); SendSourceLocationPayload( ptr ); - tracy_free( (void*)ptr ); + tracy_free_fast( (void*)ptr ); break; } case QueueType::GpuZoneEnd: @@ -2019,15 +2395,17 @@ Profiler::DequeueStatus Profiler::Dequeue( moodycamel::ConsumerToken& token ) size = MemRead( &item->gpuContextNameFat.size ); SendSingleString( (const char*)ptr, size ); #ifndef TRACY_ON_DEMAND - tracy_free( (void*)ptr ); + tracy_free_fast( (void*)ptr ); #endif break; - case QueueType::PlotData: + case QueueType::PlotDataInt: + case QueueType::PlotDataFloat: + case QueueType::PlotDataDouble: { - int64_t t = MemRead( &item->plotData.time ); + int64_t t = MemRead( &item->plotDataInt.time ); int64_t dt = t - refThread; refThread = t; - MemWrite( &item->plotData.time, dt ); + MemWrite( &item->plotDataInt.time, dt ); break; } case QueueType::ContextSwitch: @@ -2054,6 +2432,79 @@ Profiler::DequeueStatus Profiler::Dequeue( moodycamel::ConsumerToken& token ) MemWrite( &item->gpuTime.gpuTime, dt ); break; } +#ifdef TRACY_HAS_CALLSTACK + case QueueType::CallstackFrameSize: + { + auto data = (const CallstackEntry*)MemRead( &item->callstackFrameSizeFat.data ); + auto datasz = MemRead( &item->callstackFrameSizeFat.size ); + auto imageName = (const char*)MemRead( &item->callstackFrameSizeFat.imageName ); + SendSingleString( imageName ); + AppendData( item++, QueueDataSize[idx] ); + + for( uint8_t i=0; i( &item->symbolInformationFat.fileString ); + auto needFree = MemRead( &item->symbolInformationFat.needFree ); + SendSingleString( fileString ); + if( needFree ) tracy_free_fast( (void*)fileString ); + break; + } + case QueueType::SymbolCodeMetadata: + { + auto symbol = MemRead( &item->symbolCodeMetadata.symbol ); + auto ptr = (const char*)MemRead( &item->symbolCodeMetadata.ptr ); + auto size = MemRead( &item->symbolCodeMetadata.size ); + SendLongString( symbol, ptr, size, QueueType::SymbolCode ); + tracy_free_fast( (void*)ptr ); + ++item; + 
continue; + } +#endif +#ifdef TRACY_HAS_SYSTEM_TRACING + case QueueType::ExternalNameMetadata: + { + auto thread = MemRead( &item->externalNameMetadata.thread ); + auto name = (const char*)MemRead( &item->externalNameMetadata.name ); + auto threadName = (const char*)MemRead( &item->externalNameMetadata.threadName ); + SendString( thread, threadName, QueueType::ExternalThreadName ); + SendString( thread, name, QueueType::ExternalName ); + tracy_free_fast( (void*)threadName ); + tracy_free_fast( (void*)name ); + ++item; + continue; + } +#endif + case QueueType::SourceCodeMetadata: + { + auto ptr = (const char*)MemRead( &item->sourceCodeMetadata.ptr ); + auto size = MemRead( &item->sourceCodeMetadata.size ); + auto id = MemRead( &item->sourceCodeMetadata.id ); + SendLongString( (uint64_t)id, ptr, size, QueueType::SourceCode ); + tracy_free_fast( (void*)ptr ); + ++item; + continue; + } default: assert( false ); break; @@ -2137,6 +2588,16 @@ Profiler::DequeueStatus Profiler::DequeueContextSwitches( tracy::moodycamel::Con return ( timeStop == -1 || sz > 0 ) ? DequeueStatus::DataDequeued : DequeueStatus::QueueEmpty; } +#define ThreadCtxCheckSerial( _name ) \ + uint32_t thread = MemRead( &item->_name.thread ); \ + switch( ThreadCtxCheck( thread ) ) \ + { \ + case ThreadCtxStatus::Same: break; \ + case ThreadCtxStatus::Changed: assert( m_refTimeThread == 0 ); refThread = 0; break; \ + case ThreadCtxStatus::ConnectionLost: return DequeueStatus::ConnectionLost; \ + default: assert( false ); break; \ + } + Profiler::DequeueStatus Profiler::DequeueSerial() { { @@ -2159,8 +2620,12 @@ Profiler::DequeueStatus Profiler::DequeueSerial() const auto sz = m_serialDequeue.size(); if( sz > 0 ) { + InitRpmalloc(); int64_t refSerial = m_refTimeSerial; int64_t refGpu = m_refTimeGpu; +#ifdef TRACY_FIBERS + int64_t refThread = m_refTimeThread; +#endif auto item = m_serialDequeue.data(); auto end = item + sz; while( item != end ) @@ -2174,7 +2639,7 @@ Profiler::DequeueStatus Profiler::DequeueSerial() case QueueType::CallstackSerial: ptr = MemRead( &item->callstackFat.ptr ); SendCallstackPayload( ptr ); - tracy_free( (void*)ptr ); + tracy_free_fast( (void*)ptr ); break; case QueueType::LockWait: case QueueType::LockSharedWait: @@ -2209,7 +2674,7 @@ Profiler::DequeueStatus Profiler::DequeueSerial() uint16_t size = MemRead( &item->lockNameFat.size ); SendSingleString( (const char*)ptr, size ); #ifndef TRACY_ON_DEMAND - tracy_free( (void*)ptr ); + tracy_free_fast( (void*)ptr ); #endif break; } @@ -2253,7 +2718,7 @@ Profiler::DequeueStatus Profiler::DequeueSerial() MemWrite( &item->gpuZoneBegin.cpuTime, dt ); ptr = MemRead( &item->gpuZoneBegin.srcloc ); SendSourceLocationPayload( ptr ); - tracy_free( (void*)ptr ); + tracy_free_fast( (void*)ptr ); break; } case QueueType::GpuZoneEndSerial: @@ -2278,20 +2743,170 @@ Profiler::DequeueStatus Profiler::DequeueSerial() uint16_t size = MemRead( &item->gpuContextNameFat.size ); SendSingleString( (const char*)ptr, size ); #ifndef TRACY_ON_DEMAND - tracy_free( (void*)ptr ); + tracy_free_fast( (void*)ptr ); #endif break; } +#ifdef TRACY_FIBERS + case QueueType::ZoneBegin: + case QueueType::ZoneBeginCallstack: + { + ThreadCtxCheckSerial( zoneBeginThread ); + int64_t t = MemRead( &item->zoneBegin.time ); + int64_t dt = t - refThread; + refThread = t; + MemWrite( &item->zoneBegin.time, dt ); + break; + } + case QueueType::ZoneBeginAllocSrcLoc: + case QueueType::ZoneBeginAllocSrcLocCallstack: + { + ThreadCtxCheckSerial( zoneBeginThread ); + int64_t t = MemRead( &item->zoneBegin.time ); + 
int64_t dt = t - refThread; + refThread = t; + MemWrite( &item->zoneBegin.time, dt ); + ptr = MemRead( &item->zoneBegin.srcloc ); + SendSourceLocationPayload( ptr ); + tracy_free_fast( (void*)ptr ); + break; + } + case QueueType::ZoneEnd: + { + ThreadCtxCheckSerial( zoneEndThread ); + int64_t t = MemRead( &item->zoneEnd.time ); + int64_t dt = t - refThread; + refThread = t; + MemWrite( &item->zoneEnd.time, dt ); + break; + } + case QueueType::ZoneText: + case QueueType::ZoneName: + { + ThreadCtxCheckSerial( zoneTextFatThread ); + ptr = MemRead( &item->zoneTextFat.text ); + uint16_t size = MemRead( &item->zoneTextFat.size ); + SendSingleString( (const char*)ptr, size ); + tracy_free_fast( (void*)ptr ); + break; + } + case QueueType::Message: + case QueueType::MessageCallstack: + { + ThreadCtxCheckSerial( messageFatThread ); + ptr = MemRead( &item->messageFat.text ); + uint16_t size = MemRead( &item->messageFat.size ); + SendSingleString( (const char*)ptr, size ); + tracy_free_fast( (void*)ptr ); + break; + } + case QueueType::MessageColor: + case QueueType::MessageColorCallstack: + { + ThreadCtxCheckSerial( messageColorFatThread ); + ptr = MemRead( &item->messageColorFat.text ); + uint16_t size = MemRead( &item->messageColorFat.size ); + SendSingleString( (const char*)ptr, size ); + tracy_free_fast( (void*)ptr ); + break; + } + case QueueType::Callstack: + { + ThreadCtxCheckSerial( callstackFatThread ); + ptr = MemRead( &item->callstackFat.ptr ); + SendCallstackPayload( ptr ); + tracy_free_fast( (void*)ptr ); + break; + } + case QueueType::CallstackAlloc: + { + ThreadCtxCheckSerial( callstackAllocFatThread ); + ptr = MemRead( &item->callstackAllocFat.nativePtr ); + if( ptr != 0 ) + { + CutCallstack( (void*)ptr, "lua_pcall" ); + SendCallstackPayload( ptr ); + tracy_free_fast( (void*)ptr ); + } + ptr = MemRead( &item->callstackAllocFat.ptr ); + SendCallstackAlloc( ptr ); + tracy_free_fast( (void*)ptr ); + break; + } + case QueueType::FiberEnter: + { + ThreadCtxCheckSerial( fiberEnter ); + int64_t t = MemRead( &item->fiberEnter.time ); + int64_t dt = t - refThread; + refThread = t; + MemWrite( &item->fiberEnter.time, dt ); + break; + } + case QueueType::FiberLeave: + { + ThreadCtxCheckSerial( fiberLeave ); + int64_t t = MemRead( &item->fiberLeave.time ); + int64_t dt = t - refThread; + refThread = t; + MemWrite( &item->fiberLeave.time, dt ); + break; + } +#endif default: assert( false ); break; } } +#ifdef TRACY_FIBERS + else + { + switch( (QueueType)idx ) + { + case QueueType::ZoneColor: + { + ThreadCtxCheckSerial( zoneColorThread ); + break; + } + case QueueType::ZoneValue: + { + ThreadCtxCheckSerial( zoneValueThread ); + break; + } + case QueueType::ZoneValidation: + { + ThreadCtxCheckSerial( zoneValidationThread ); + break; + } + case QueueType::MessageLiteral: + case QueueType::MessageLiteralCallstack: + { + ThreadCtxCheckSerial( messageLiteralThread ); + break; + } + case QueueType::MessageLiteralColor: + case QueueType::MessageLiteralColorCallstack: + { + ThreadCtxCheckSerial( messageColorLiteralThread ); + break; + } + case QueueType::CrashReport: + { + ThreadCtxCheckSerial( crashReportThread ); + break; + } + default: + break; + } + } +#endif if( !AppendData( item, QueueDataSize[idx] ) ) return DequeueStatus::ConnectionLost; item++; } m_refTimeSerial = refSerial; m_refTimeGpu = refGpu; +#ifdef TRACY_FIBERS + m_refTimeThread = refThread; +#endif m_serialDequeue.clear(); } else @@ -2301,6 +2916,18 @@ Profiler::DequeueStatus Profiler::DequeueSerial() return DequeueStatus::DataDequeued; } 
+Profiler::ThreadCtxStatus Profiler::ThreadCtxCheck( uint32_t threadId ) +{ + if( m_threadCtx == threadId ) return ThreadCtxStatus::Same; + QueueItem item; + MemWrite( &item.hdr.type, QueueType::ThreadContext ); + MemWrite( &item.threadCtx.thread, threadId ); + if( !AppendData( &item, QueueDataSize[(int)QueueType::ThreadContext] ) ) return ThreadCtxStatus::ConnectionLost; + m_threadCtx = threadId; + m_refTimeThread = 0; + return ThreadCtxStatus::Changed; +} + bool Profiler::CommitData() { bool ret = SendData( m_buffer + m_bufferStart, m_bufferOffset - m_bufferStart ); @@ -2323,7 +2950,8 @@ void Profiler::SendString( uint64_t str, const char* ptr, size_t len, QueueType type == QueueType::PlotName || type == QueueType::FrameName || type == QueueType::ExternalName || - type == QueueType::ExternalThreadName ); + type == QueueType::ExternalThreadName || + type == QueueType::FiberName ); QueueItem item; MemWrite( &item.hdr.type, type ); @@ -2495,43 +3123,207 @@ void Profiler::SendCallstackAlloc( uint64_t _ptr ) AppendDataUnsafe( ptr, len ); } -void Profiler::SendCallstackFrame( uint64_t ptr ) +void Profiler::QueueCallstackFrame( uint64_t ptr ) { #ifdef TRACY_HAS_CALLSTACK - const auto frameData = DecodeCallstackPtr( ptr ); - - { - SendSingleString( frameData.imageName ); - - QueueItem item; - MemWrite( &item.hdr.type, QueueType::CallstackFrameSize ); - MemWrite( &item.callstackFrameSize.ptr, ptr ); - MemWrite( &item.callstackFrameSize.size, frameData.size ); - - AppendData( &item, QueueDataSize[(int)QueueType::CallstackFrameSize] ); - } - - for( uint8_t i=0; i> 63 != 0 ) + { + SendSingleString( "" ); + QueueItem item; + MemWrite( &item.hdr.type, QueueType::SymbolInformation ); + MemWrite( &item.symbolInformation.line, 0 ); + MemWrite( &item.symbolInformation.symAddr, symbol ); + AppendData( &item, QueueDataSize[(int)QueueType::SymbolInformation] ); + } + else + { + m_symbolQueue.emplace( SymbolQueueItem { SymbolQueueItemType::SymbolQuery, symbol } ); + } +#else + AckServerQuery(); +#endif +} + +void Profiler::QueueExternalName( uint64_t ptr ) +{ +#ifdef TRACY_HAS_SYSTEM_TRACING + m_symbolQueue.emplace( SymbolQueueItem { SymbolQueueItemType::ExternalName, ptr } ); +#endif +} + +void Profiler::QueueKernelCode( uint64_t symbol, uint32_t size ) +{ + assert( symbol >> 63 != 0 ); +#ifdef TRACY_HAS_CALLSTACK + m_symbolQueue.emplace( SymbolQueueItem { SymbolQueueItemType::KernelCode, symbol, size } ); +#else + AckSymbolCodeNotAvailable(); +#endif +} + +void Profiler::QueueSourceCodeQuery( uint32_t id ) +{ + assert( m_exectime != 0 ); + assert( m_queryData ); + m_symbolQueue.emplace( SymbolQueueItem { SymbolQueueItemType::SourceCode, uint64_t( m_queryData ), uint64_t( m_queryImage ), id } ); + m_queryData = nullptr; + m_queryImage = nullptr; +} + +#ifdef TRACY_HAS_CALLSTACK +void Profiler::HandleSymbolQueueItem( const SymbolQueueItem& si ) +{ + switch( si.type ) + { + case SymbolQueueItemType::CallstackFrame: + { + const auto frameData = DecodeCallstackPtr( si.ptr ); + auto data = tracy_malloc_fast( sizeof( CallstackEntry ) * frameData.size ); + memcpy( data, frameData.data, sizeof( CallstackEntry ) * frameData.size ); + TracyLfqPrepare( QueueType::CallstackFrameSize ); + MemWrite( &item->callstackFrameSizeFat.ptr, si.ptr ); + MemWrite( &item->callstackFrameSizeFat.size, frameData.size ); + MemWrite( &item->callstackFrameSizeFat.data, (uint64_t)data ); + MemWrite( &item->callstackFrameSizeFat.imageName, (uint64_t)frameData.imageName ); + TracyLfqCommit; + break; + } + case 
SymbolQueueItemType::SymbolQuery: + { +#ifdef __ANDROID__ + // On Android it's common for code to be in mappings that are only executable + // but not readable. + if( !EnsureReadable( si.ptr ) ) + { + TracyLfqPrepare( QueueType::AckServerQueryNoop ); + TracyLfqCommit; + break; + } +#endif + const auto sym = DecodeSymbolAddress( si.ptr ); + TracyLfqPrepare( QueueType::SymbolInformation ); + MemWrite( &item->symbolInformationFat.line, sym.line ); + MemWrite( &item->symbolInformationFat.symAddr, si.ptr ); + MemWrite( &item->symbolInformationFat.fileString, (uint64_t)sym.file ); + MemWrite( &item->symbolInformationFat.needFree, (uint8_t)sym.needFree ); + TracyLfqCommit; + break; + } +#ifdef TRACY_HAS_SYSTEM_TRACING + case SymbolQueueItemType::ExternalName: + { + const char* threadName; + const char* name; + SysTraceGetExternalName( si.ptr, threadName, name ); + TracyLfqPrepare( QueueType::ExternalNameMetadata ); + MemWrite( &item->externalNameMetadata.thread, si.ptr ); + MemWrite( &item->externalNameMetadata.name, (uint64_t)name ); + MemWrite( &item->externalNameMetadata.threadName, (uint64_t)threadName ); + TracyLfqCommit; + break; + } +#endif + case SymbolQueueItemType::KernelCode: + { +#ifdef _WIN32 + auto mod = GetKernelModulePath( si.ptr ); + if( mod ) + { + auto fn = DecodeCallstackPtrFast( si.ptr ); + if( *fn ) + { + auto hnd = LoadLibraryExA( mod, nullptr, DONT_RESOLVE_DLL_REFERENCES ); + if( hnd ) + { + auto ptr = (const void*)GetProcAddress( hnd, fn ); + if( ptr ) + { + auto buf = (char*)tracy_malloc( si.extra ); + memcpy( buf, ptr, si.extra ); + FreeLibrary( hnd ); + TracyLfqPrepare( QueueType::SymbolCodeMetadata ); + MemWrite( &item->symbolCodeMetadata.symbol, si.ptr ); + MemWrite( &item->symbolCodeMetadata.ptr, (uint64_t)buf ); + MemWrite( &item->symbolCodeMetadata.size, (uint32_t)si.extra ); + TracyLfqCommit; + break; + } + FreeLibrary( hnd ); + } + } + } +#endif + TracyLfqPrepare( QueueType::AckSymbolCodeNotAvailable ); + TracyLfqCommit; + break; + } + case SymbolQueueItemType::SourceCode: + HandleSourceCodeQuery( (char*)si.ptr, (char*)si.extra, si.id ); + break; + default: + assert( false ); + break; + } +} + +void Profiler::SymbolWorker() +{ +#if defined __linux__ && !defined TRACY_NO_CRASH_HANDLER + s_symbolTid = syscall( SYS_gettid ); +#endif + + ThreadExitHandler threadExitHandler; + SetThreadName( "Tracy Symbol Worker" ); +#ifdef TRACY_USE_RPMALLOC + InitRpmalloc(); +#endif + InitCallstack(); + while( m_timeBegin.load( std::memory_order_relaxed ) == 0 ) std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + + for(;;) + { + const auto shouldExit = ShouldExit(); +#ifdef TRACY_ON_DEMAND + if( !IsConnected() ) + { + if( shouldExit ) + { + s_symbolThreadGone.store( true, std::memory_order_release ); + return; + } + while( m_symbolQueue.front() ) m_symbolQueue.pop(); + std::this_thread::sleep_for( std::chrono::milliseconds( 20 ) ); + continue; + } +#endif + auto si = m_symbolQueue.front(); + if( si ) + { + HandleSymbolQueueItem( *si ); + m_symbolQueue.pop(); + } + else + { + if( shouldExit ) + { + s_symbolThreadGone.store( true, std::memory_order_release ); + return; + } + std::this_thread::sleep_for( std::chrono::milliseconds( 20 ) ); + } + } +} +#endif bool Profiler::HandleServerQuery() { @@ -2569,7 +3361,7 @@ bool Profiler::HandleServerQuery() case ServerQueryTerminate: return false; case ServerQueryCallstackFrame: - SendCallstackFrame( ptr ); + QueueCallstackFrame( ptr ); break; case ServerQueryFrameName: SendString( ptr, (const char*)ptr, QueueType::FrameName ); 
@@ -2579,28 +3371,29 @@ bool Profiler::HandleServerQuery() return false; #ifdef TRACY_HAS_SYSTEM_TRACING case ServerQueryExternalName: - SysTraceSendExternalName( ptr ); + QueueExternalName( ptr ); break; #endif case ServerQueryParameter: HandleParameter( ptr ); break; case ServerQuerySymbol: - HandleSymbolQuery( ptr ); + QueueSymbolQuery( ptr ); break; #ifndef TRACY_NO_CODE_TRANSFER case ServerQuerySymbolCode: HandleSymbolCodeQuery( ptr, extra ); break; #endif - case ServerQueryCodeLocation: - SendCodeLocation( ptr ); - break; case ServerQuerySourceCode: - HandleSourceCodeQuery(); + QueueSourceCodeQuery( uint32_t( ptr ) ); break; case ServerQueryDataTransfer: - assert( !m_queryData ); + if( m_queryData ) + { + assert( !m_queryImage ); + m_queryImage = m_queryData; + } m_queryDataPtr = m_queryData = (char*)tracy_malloc( ptr + 11 ); AckServerQuery(); break; @@ -2610,6 +3403,11 @@ bool Profiler::HandleServerQuery() m_queryDataPtr += 12; AckServerQuery(); break; +#ifdef TRACY_FIBERS + case ServerQueryFiberName: + SendString( ptr, (const char*)ptr, QueueType::FiberName ); + break; +#endif default: assert( false ); break; @@ -2702,23 +3500,32 @@ void Profiler::HandleDisconnect() void Profiler::CalibrateTimer() { -#ifdef TRACY_HW_TIMER - std::atomic_signal_fence( std::memory_order_acq_rel ); - const auto t0 = std::chrono::high_resolution_clock::now(); - const auto r0 = GetTime(); - std::atomic_signal_fence( std::memory_order_acq_rel ); - std::this_thread::sleep_for( std::chrono::milliseconds( 200 ) ); - std::atomic_signal_fence( std::memory_order_acq_rel ); - const auto t1 = std::chrono::high_resolution_clock::now(); - const auto r1 = GetTime(); - std::atomic_signal_fence( std::memory_order_acq_rel ); - - const auto dt = std::chrono::duration_cast( t1 - t0 ).count(); - const auto dr = r1 - r0; - - m_timerMul = double( dt ) / double( dr ); -#else m_timerMul = 1.; + +#ifdef TRACY_HW_TIMER + +# if !defined TRACY_TIMER_QPC && defined TRACY_TIMER_FALLBACK + const bool needCalibration = HardwareSupportsInvariantTSC(); +# else + const bool needCalibration = true; +# endif + if( needCalibration ) + { + std::atomic_signal_fence( std::memory_order_acq_rel ); + const auto t0 = std::chrono::high_resolution_clock::now(); + const auto r0 = GetTime(); + std::atomic_signal_fence( std::memory_order_acq_rel ); + std::this_thread::sleep_for( std::chrono::milliseconds( 200 ) ); + std::atomic_signal_fence( std::memory_order_acq_rel ); + const auto t1 = std::chrono::high_resolution_clock::now(); + const auto r1 = GetTime(); + std::atomic_signal_fence( std::memory_order_acq_rel ); + + const auto dt = std::chrono::duration_cast( t1 - t0 ).count(); + const auto dr = r1 - r0; + + m_timerMul = double( dt ) / double( dr ); + } #endif } @@ -2784,8 +3591,12 @@ void Profiler::ReportTopology() uint32_t thread; }; -#if defined _WIN32 || defined __CYGWIN__ +#if defined _WIN32 +# ifdef TRACY_UWP + t_GetLogicalProcessorInformationEx _GetLogicalProcessorInformationEx = &::GetLogicalProcessorInformationEx; +# else t_GetLogicalProcessorInformationEx _GetLogicalProcessorInformationEx = (t_GetLogicalProcessorInformationEx)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "GetLogicalProcessorInformationEx" ); +# endif if( !_GetLogicalProcessorInformationEx ) return; DWORD psz = 0; @@ -2919,10 +3730,11 @@ void Profiler::SendFrameMark( const char* name ) #ifdef TRACY_ON_DEMAND if( !GetProfiler().IsConnected() ) return; #endif - TracyLfqPrepare( QueueType::FrameMarkMsg ); + auto item = QueueSerial(); + MemWrite( &item->hdr.type, 
QueueType::FrameMarkMsg ); MemWrite( &item->frameMark.time, GetTime() ); MemWrite( &item->frameMark.name, uint64_t( name ) ); - TracyLfqCommit; + QueueSerialFinish(); } void Profiler::SendFrameMark( const char* name, QueueType type ) @@ -2938,16 +3750,39 @@ void Profiler::SendFrameMark( const char* name, QueueType type ) QueueSerialFinish(); } +void Profiler::SendFrameImage( const void* image, uint16_t w, uint16_t h, uint8_t offset, bool flip ) +{ +#ifndef TRACY_NO_FRAME_IMAGE + auto& profiler = GetProfiler(); + assert( profiler.m_frameCount.load( std::memory_order_relaxed ) < std::numeric_limits::max() ); +# ifdef TRACY_ON_DEMAND + if( !profiler.IsConnected() ) return; +# endif + const auto sz = size_t( w ) * size_t( h ) * 4; + auto ptr = (char*)tracy_malloc( sz ); + memcpy( ptr, image, sz ); + + profiler.m_fiLock.lock(); + auto fi = profiler.m_fiQueue.prepare_next(); + fi->image = ptr; + fi->frame = uint32_t( profiler.m_frameCount.load( std::memory_order_relaxed ) - offset ); + fi->w = w; + fi->h = h; + fi->flip = flip; + profiler.m_fiQueue.commit_next(); + profiler.m_fiLock.unlock(); +#endif +} + void Profiler::PlotData( const char* name, int64_t val ) { #ifdef TRACY_ON_DEMAND if( !GetProfiler().IsConnected() ) return; #endif - TracyLfqPrepare( QueueType::PlotData ); - MemWrite( &item->plotData.name, (uint64_t)name ); - MemWrite( &item->plotData.time, GetTime() ); - MemWrite( &item->plotData.type, PlotDataType::Int ); - MemWrite( &item->plotData.data.i, val ); + TracyLfqPrepare( QueueType::PlotDataInt ); + MemWrite( &item->plotDataInt.name, (uint64_t)name ); + MemWrite( &item->plotDataInt.time, GetTime() ); + MemWrite( &item->plotDataInt.val, val ); TracyLfqCommit; } @@ -2956,11 +3791,10 @@ void Profiler::PlotData( const char* name, float val ) #ifdef TRACY_ON_DEMAND if( !GetProfiler().IsConnected() ) return; #endif - TracyLfqPrepare( QueueType::PlotData ); - MemWrite( &item->plotData.name, (uint64_t)name ); - MemWrite( &item->plotData.time, GetTime() ); - MemWrite( &item->plotData.type, PlotDataType::Float ); - MemWrite( &item->plotData.data.f, val ); + TracyLfqPrepare( QueueType::PlotDataFloat ); + MemWrite( &item->plotDataFloat.name, (uint64_t)name ); + MemWrite( &item->plotDataFloat.time, GetTime() ); + MemWrite( &item->plotDataFloat.val, val ); TracyLfqCommit; } @@ -2969,19 +3803,21 @@ void Profiler::PlotData( const char* name, double val ) #ifdef TRACY_ON_DEMAND if( !GetProfiler().IsConnected() ) return; #endif - TracyLfqPrepare( QueueType::PlotData ); - MemWrite( &item->plotData.name, (uint64_t)name ); - MemWrite( &item->plotData.time, GetTime() ); - MemWrite( &item->plotData.type, PlotDataType::Double ); - MemWrite( &item->plotData.data.d, val ); + TracyLfqPrepare( QueueType::PlotDataDouble ); + MemWrite( &item->plotDataDouble.name, (uint64_t)name ); + MemWrite( &item->plotDataDouble.time, GetTime() ); + MemWrite( &item->plotDataDouble.val, val ); TracyLfqCommit; } -void Profiler::ConfigurePlot( const char* name, PlotFormatType type ) +void Profiler::ConfigurePlot( const char* name, PlotFormatType type, bool step, bool fill, uint32_t color ) { TracyLfqPrepare( QueueType::PlotConfig ); MemWrite( &item->plotConfig.name, (uint64_t)name ); MemWrite( &item->plotConfig.type, (uint8_t)type ); + MemWrite( &item->plotConfig.step, (uint8_t)step ); + MemWrite( &item->plotConfig.fill, (uint8_t)fill ); + MemWrite( &item->plotConfig.color, color ); #ifdef TRACY_ON_DEMAND GetProfiler().DeferItem( *item ); @@ -2990,7 +3826,7 @@ void Profiler::ConfigurePlot( const char* name, PlotFormatType type 
) TracyLfqCommit; } - void Profiler::Message( const char* txt, size_t size, int callstack ) +void Profiler::Message( const char* txt, size_t size, int callstack ) { assert( size < std::numeric_limits::max() ); #ifdef TRACY_ON_DEMAND @@ -2998,17 +3834,17 @@ void Profiler::ConfigurePlot( const char* name, PlotFormatType type ) #endif if( callstack != 0 ) { - InitRPMallocThread(); tracy::GetProfiler().SendCallstack( callstack ); } - TracyLfqPrepare( callstack == 0 ? QueueType::Message : QueueType::MessageCallstack ); auto ptr = (char*)tracy_malloc( size ); memcpy( ptr, txt, size ); + + TracyQueuePrepare( callstack == 0 ? QueueType::Message : QueueType::MessageCallstack ); MemWrite( &item->messageFat.time, GetTime() ); MemWrite( &item->messageFat.text, (uint64_t)ptr ); MemWrite( &item->messageFat.size, (uint16_t)size ); - TracyLfqCommit; + TracyQueueCommit( messageFatThread ); } void Profiler::Message( const char* txt, int callstack ) @@ -3018,14 +3854,13 @@ void Profiler::Message( const char* txt, int callstack ) #endif if( callstack != 0 ) { - InitRPMallocThread(); tracy::GetProfiler().SendCallstack( callstack ); } - TracyLfqPrepare( callstack == 0 ? QueueType::MessageLiteral : QueueType::MessageLiteralCallstack ); + TracyQueuePrepare( callstack == 0 ? QueueType::MessageLiteral : QueueType::MessageLiteralCallstack ); MemWrite( &item->messageLiteral.time, GetTime() ); MemWrite( &item->messageLiteral.text, (uint64_t)txt ); - TracyLfqCommit; + TracyQueueCommit( messageLiteralThread ); } void Profiler::MessageColor( const char* txt, size_t size, uint32_t color, int callstack ) @@ -3036,20 +3871,20 @@ void Profiler::MessageColor( const char* txt, size_t size, uint32_t color, int c #endif if( callstack != 0 ) { - InitRPMallocThread(); tracy::GetProfiler().SendCallstack( callstack ); } - TracyLfqPrepare( callstack == 0 ? QueueType::MessageColor : QueueType::MessageColorCallstack ); auto ptr = (char*)tracy_malloc( size ); memcpy( ptr, txt, size ); + + TracyQueuePrepare( callstack == 0 ? QueueType::MessageColor : QueueType::MessageColorCallstack ); MemWrite( &item->messageColorFat.time, GetTime() ); MemWrite( &item->messageColorFat.text, (uint64_t)ptr ); MemWrite( &item->messageColorFat.r, uint8_t( ( color ) & 0xFF ) ); MemWrite( &item->messageColorFat.g, uint8_t( ( color >> 8 ) & 0xFF ) ); MemWrite( &item->messageColorFat.b, uint8_t( ( color >> 16 ) & 0xFF ) ); MemWrite( &item->messageColorFat.size, (uint16_t)size ); - TracyLfqCommit; + TracyQueueCommit( messageColorFatThread ); } void Profiler::MessageColor( const char* txt, uint32_t color, int callstack ) @@ -3059,23 +3894,21 @@ void Profiler::MessageColor( const char* txt, uint32_t color, int callstack ) #endif if( callstack != 0 ) { - InitRPMallocThread(); tracy::GetProfiler().SendCallstack( callstack ); } - TracyLfqPrepare( callstack == 0 ? QueueType::MessageLiteralColor : QueueType::MessageLiteralColorCallstack ); + TracyQueuePrepare( callstack == 0 ? 
QueueType::MessageLiteralColor : QueueType::MessageLiteralColorCallstack ); MemWrite( &item->messageColorLiteral.time, GetTime() ); MemWrite( &item->messageColorLiteral.text, (uint64_t)txt ); MemWrite( &item->messageColorLiteral.r, uint8_t( ( color ) & 0xFF ) ); MemWrite( &item->messageColorLiteral.g, uint8_t( ( color >> 8 ) & 0xFF ) ); MemWrite( &item->messageColorLiteral.b, uint8_t( ( color >> 16 ) & 0xFF ) ); - TracyLfqCommit; + TracyQueueCommit( messageColorLiteralThread ); } void Profiler::MessageAppInfo( const char* txt, size_t size ) { assert( size < std::numeric_limits::max() ); - InitRPMallocThread(); auto ptr = (char*)tracy_malloc( size ); memcpy( ptr, txt, size ); TracyLfqPrepare( QueueType::MessageAppInfo ); @@ -3126,7 +3959,6 @@ void Profiler::MemAllocCallstack( const void* ptr, size_t size, int depth, bool # endif const auto thread = GetThreadHandle(); - InitRPMallocThread(); auto callstack = Callstack( depth ); profiler.m_serialLock.lock(); @@ -3134,6 +3966,7 @@ void Profiler::MemAllocCallstack( const void* ptr, size_t size, int depth, bool SendMemAlloc( QueueType::MemAllocCallstack, thread, ptr, size ); profiler.m_serialLock.unlock(); #else + static_cast(depth); // unused MemAlloc( ptr, size, secure ); #endif } @@ -3141,6 +3974,11 @@ void Profiler::MemAllocCallstack( const void* ptr, size_t size, int depth, bool void Profiler::MemFreeCallstack( const void* ptr, int depth, bool secure ) { if( secure && !ProfilerAvailable() ) return; + if( !ProfilerAllocatorAvailable() ) + { + MemFree( ptr, secure ); + return; + } #ifdef TRACY_HAS_CALLSTACK auto& profiler = GetProfiler(); # ifdef TRACY_ON_DEMAND @@ -3148,7 +3986,6 @@ void Profiler::MemFreeCallstack( const void* ptr, int depth, bool secure ) # endif const auto thread = GetThreadHandle(); - InitRPMallocThread(); auto callstack = Callstack( depth ); profiler.m_serialLock.lock(); @@ -3156,6 +3993,7 @@ void Profiler::MemFreeCallstack( const void* ptr, int depth, bool secure ) SendMemFree( QueueType::MemFreeCallstack, thread, ptr ); profiler.m_serialLock.unlock(); #else + static_cast(depth); // unused MemFree( ptr, secure ); #endif } @@ -3198,7 +4036,6 @@ void Profiler::MemAllocCallstackNamed( const void* ptr, size_t size, int depth, # endif const auto thread = GetThreadHandle(); - InitRPMallocThread(); auto callstack = Callstack( depth ); profiler.m_serialLock.lock(); @@ -3207,6 +4044,8 @@ void Profiler::MemAllocCallstackNamed( const void* ptr, size_t size, int depth, SendMemAlloc( QueueType::MemAllocCallstackNamed, thread, ptr, size ); profiler.m_serialLock.unlock(); #else + static_cast(depth); // unused + static_cast(name); // unused MemAlloc( ptr, size, secure ); #endif } @@ -3221,7 +4060,6 @@ void Profiler::MemFreeCallstackNamed( const void* ptr, int depth, bool secure, c # endif const auto thread = GetThreadHandle(); - InitRPMallocThread(); auto callstack = Callstack( depth ); profiler.m_serialLock.lock(); @@ -3230,6 +4068,8 @@ void Profiler::MemFreeCallstackNamed( const void* ptr, int depth, bool secure, c SendMemFree( QueueType::MemFreeCallstackNamed, thread, ptr ); profiler.m_serialLock.unlock(); #else + static_cast(depth); // unused + static_cast(name); // unused MemFree( ptr, secure ); #endif } @@ -3238,13 +4078,22 @@ void Profiler::SendCallstack( int depth ) { #ifdef TRACY_HAS_CALLSTACK auto ptr = Callstack( depth ); - TracyLfqPrepare( QueueType::Callstack ); + TracyQueuePrepare( QueueType::Callstack ); MemWrite( &item->callstackFat.ptr, (uint64_t)ptr ); - TracyLfqCommit; + TracyQueueCommit( callstackFatThread ); +#else 
+ static_cast(depth); // unused #endif } void Profiler::ParameterRegister( ParameterCallback cb ) { GetProfiler().m_paramCallback = cb; } +void Profiler::ParameterRegister( ParameterCallback cb, void* data ) +{ + auto& profiler = GetProfiler(); + profiler.m_paramCallback = cb; + profiler.m_paramCallbackData = data; +} + void Profiler::ParameterSetup( uint32_t idx, const char* name, bool isBool, int32_t val ) { TracyLfqPrepare( QueueType::ParamSetup ); @@ -3263,11 +4112,12 @@ void Profiler::ParameterSetup( uint32_t idx, const char* name, bool isBool, int3 void Profiler::SendCallstack( int depth, const char* skipBefore ) { #ifdef TRACY_HAS_CALLSTACK - TracyLfqPrepare( QueueType::Callstack ); auto ptr = Callstack( depth ); CutCallstack( ptr, skipBefore ); + + TracyQueuePrepare( QueueType::Callstack ); MemWrite( &item->callstackFat.ptr, (uint64_t)ptr ); - TracyLfqCommit; + TracyQueueCommit( callstackFatThread ); #endif } @@ -3322,246 +4172,126 @@ void Profiler::HandleParameter( uint64_t payload ) assert( m_paramCallback ); const auto idx = uint32_t( payload >> 32 ); const auto val = int32_t( payload & 0xFFFFFFFF ); - m_paramCallback( idx, val ); + m_paramCallback( m_paramCallbackData, idx, val ); AckServerQuery(); } -#ifdef __ANDROID__ -// Implementation helpers of EnsureReadable(address). -// This is so far only needed on Android, where it is common for libraries to be mapped -// with only executable, not readable, permissions. Typical example (line from /proc/self/maps): -/* -746b63b000-746b6dc000 --xp 00042000 07:48 35 /apex/com.android.runtime/lib64/bionic/libc.so -*/ -// See https://github.com/wolfpld/tracy/issues/125 . -// To work around this, we parse /proc/self/maps and we use mprotect to set read permissions -// on any mappings that contain symbols addresses hit by HandleSymbolCodeQuery. - -namespace { -// Holds some information about a single memory mapping. -struct MappingInfo { - // Start of address range. Inclusive. - uintptr_t start_address; - // End of address range. Exclusive, so the mapping is the half-open interval - // [start, end) and its length in bytes is `end - start`. As in /proc/self/maps. - uintptr_t end_address; - // Read/Write/Executable permissions. - bool perm_r, perm_w, perm_x; -}; -} // anonymous namespace - -// Internal implementation helper for LookUpMapping(address). -// -// Parses /proc/self/maps returning a vector. -// /proc/self/maps is assumed to be sorted by ascending address, so the resulting -// vector is sorted by ascending address too. -static std::vector ParseMappings() -{ - std::vector result; - FILE* file = fopen( "/proc/self/maps", "r" ); - if( !file ) return result; - char line[1024]; - while( fgets( line, sizeof( line ), file ) ) - { - uintptr_t start_addr; - uintptr_t end_addr; - if( sscanf( line, "%lx-%lx", &start_addr, &end_addr ) != 2 ) continue; - char* first_space = strchr( line, ' ' ); - if( !first_space ) continue; - char* perm = first_space + 1; - char* second_space = strchr( perm, ' ' ); - if( !second_space || second_space - perm != 4 ) continue; - result.emplace_back(); - auto& mapping = result.back(); - mapping.start_address = start_addr; - mapping.end_address = end_addr; - mapping.perm_r = perm[0] == 'r'; - mapping.perm_w = perm[1] == 'w'; - mapping.perm_x = perm[2] == 'x'; - } - fclose( file ); - return result; -} - -// Internal implementation helper for LookUpMapping(address). -// -// Takes as input an `address` and a known vector `mappings`, assumed to be -// sorted by increasing addresses, as /proc/self/maps seems to be. 
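For reference, a standalone sketch of the technique the comments above describe: parse /proc/self/maps, find the mapping that contains a given address, and use mprotect() to add read permission to an executable-only mapping. This is illustrative only and not part of the patch; the names Mapping, ParseSelfMaps and MakeReadable are hypothetical, and unlike the real helpers it re-parses the maps file on every call instead of caching the result.

#include <cstdio>
#include <cstdint>
#include <vector>
#include <sys/mman.h>

struct Mapping { uintptr_t start, end; bool r, w, x; };

// Parse /proc/self/maps into half-open [start, end) ranges with their permissions.
static std::vector<Mapping> ParseSelfMaps()
{
    std::vector<Mapping> out;
    FILE* f = std::fopen( "/proc/self/maps", "r" );
    if( !f ) return out;
    char line[1024];
    while( std::fgets( line, sizeof( line ), f ) )
    {
        unsigned long start, end;
        char perm[5] = {};
        if( std::sscanf( line, "%lx-%lx %4s", &start, &end, perm ) != 3 ) continue;
        out.push_back( { (uintptr_t)start, (uintptr_t)end, perm[0] == 'r', perm[1] == 'w', perm[2] == 'x' } );
    }
    std::fclose( f );
    return out;
}

// Add PROT_READ to the mapping containing addr, preserving its other permissions.
static bool MakeReadable( uintptr_t addr )
{
    for( const auto& m : ParseSelfMaps() )
    {
        if( addr < m.start || addr >= m.end ) continue;
        if( m.r ) return true;
        const int prot = PROT_READ | ( m.w ? PROT_WRITE : 0 ) | ( m.x ? PROT_EXEC : 0 );
        return mprotect( (void*)m.start, m.end - m.start, prot ) == 0;
    }
    return false;
}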
-// Returns a pointer to the MappingInfo describing the mapping that this -// address belongs to, or nullptr if the address isn't in `mappings`. -static MappingInfo* LookUpMapping(std::vector& mappings, uintptr_t address) -{ - // Comparison function for std::lower_bound. Returns true if all addresses in `m1` - // are lower than `addr`. - auto Compare = []( const MappingInfo& m1, uintptr_t addr ) { - // '<=' because the address ranges are half-open intervals, [start, end). - return m1.end_address <= addr; - }; - auto iter = std::lower_bound( mappings.begin(), mappings.end(), address, Compare ); - if( iter == mappings.end() || iter->start_address > address) { - return nullptr; - } - return &*iter; -} - -// Internal implementation helper for EnsureReadable(address). -// -// Takes as input an `address` and returns a pointer to a MappingInfo -// describing the mapping that this address belongs to, or nullptr if -// the address isn't in any known mapping. -// -// This function is stateful and not reentrant (assumes to be called from -// only one thread). It holds a vector of mappings parsed from /proc/self/maps. -// -// Attempts to react to mappings changes by re-parsing /proc/self/maps. -static MappingInfo* LookUpMapping(uintptr_t address) -{ - // Static state managed by this function. Not constant, we mutate that state as - // we turn some mappings readable. Initially parsed once here, updated as needed below. - static std::vector s_mappings = ParseMappings(); - MappingInfo* mapping = LookUpMapping( s_mappings, address ); - if( mapping ) return mapping; - - // This address isn't in any known mapping. Try parsing again, maybe - // mappings changed. - s_mappings = ParseMappings(); - return LookUpMapping( s_mappings, address ); -} - -// Internal implementation helper for EnsureReadable(address). -// -// Attempts to make the specified `mapping` readable if it isn't already. -// Returns true if and only if the mapping is readable. -static bool EnsureReadable( MappingInfo& mapping ) -{ - if( mapping.perm_r ) - { - // The mapping is already readable. - return true; - } - int prot = PROT_READ; - if( mapping.perm_w ) prot |= PROT_WRITE; - if( mapping.perm_x ) prot |= PROT_EXEC; - if( mprotect( reinterpret_cast( mapping.start_address ), - mapping.end_address - mapping.start_address, prot ) == -1 ) - { - // Failed to make the mapping readable. Shouldn't happen, hasn't - // been observed yet. If it happened in practice, we should consider - // adding a bool to MappingInfo to track this to avoid retrying mprotect - // everytime on such mappings. - return false; - } - // The mapping is now readable. Update `mapping` so the next call will be fast. - mapping.perm_r = true; - return true; -} - -// Attempts to set the read permission on the entire mapping containing the -// specified address. Returns true if and only if the mapping is now readable. -static bool EnsureReadable( uintptr_t address ) -{ - MappingInfo* mapping = LookUpMapping(address); - return mapping && EnsureReadable( *mapping ); -} - -#endif // defined __ANDROID__ - -void Profiler::HandleSymbolQuery( uint64_t symbol ) -{ -#ifdef TRACY_HAS_CALLSTACK -#ifdef __ANDROID__ - // On Android it's common for code to be in mappings that are only executable - // but not readable. 
- if( !EnsureReadable( symbol ) ) - { - return; - } -#endif - const auto sym = DecodeSymbolAddress( symbol ); - - SendSingleString( sym.file ); - - QueueItem item; - MemWrite( &item.hdr.type, QueueType::SymbolInformation ); - MemWrite( &item.symbolInformation.line, sym.line ); - MemWrite( &item.symbolInformation.symAddr, symbol ); - - AppendData( &item, QueueDataSize[(int)QueueType::SymbolInformation] ); - - if( sym.needFree ) tracy_free( (void*)sym.file ); -#endif -} - void Profiler::HandleSymbolCodeQuery( uint64_t symbol, uint32_t size ) { -#ifdef __ANDROID__ - // On Android it's common for code to be in mappings that are only executable - // but not readable. - if( !EnsureReadable( symbol ) ) + if( symbol >> 63 != 0 ) { - return; + QueueKernelCode( symbol, size ); } + else + { +#ifdef __ANDROID__ + // On Android it's common for code to be in mappings that are only executable + // but not readable. + if( !EnsureReadable( symbol ) ) + { + AckSymbolCodeNotAvailable(); + return; + } #endif - SendLongString( symbol, (const char*)symbol, size, QueueType::SymbolCode ); + SendLongString( symbol, (const char*)symbol, size, QueueType::SymbolCode ); + } } -void Profiler::HandleSourceCodeQuery() +void Profiler::HandleSourceCodeQuery( char* data, char* image, uint32_t id ) { - assert( m_exectime != 0 ); - assert( m_queryData ); - + bool ok = false; struct stat st; - if( stat( m_queryData, &st ) == 0 && (uint64_t)st.st_mtime < m_exectime && st.st_size < ( TargetFrameSize - 16 ) ) + if( stat( data, &st ) == 0 && (uint64_t)st.st_mtime < m_exectime ) { - FILE* f = fopen( m_queryData, "rb" ); - tracy_free( m_queryData ); - if( f ) + if( st.st_size < ( TargetFrameSize - 16 ) ) { - auto ptr = (char*)tracy_malloc( st.st_size ); - auto rd = fread( ptr, 1, st.st_size, f ); - fclose( f ); - if( rd == (size_t)st.st_size ) + FILE* f = fopen( data, "rb" ); + if( f ) { - SendLongString( (uint64_t)ptr, ptr, rd, QueueType::SourceCode ); + auto ptr = (char*)tracy_malloc_fast( st.st_size ); + auto rd = fread( ptr, 1, st.st_size, f ); + fclose( f ); + if( rd == (size_t)st.st_size ) + { + TracyLfqPrepare( QueueType::SourceCodeMetadata ); + MemWrite( &item->sourceCodeMetadata.ptr, (uint64_t)ptr ); + MemWrite( &item->sourceCodeMetadata.size, (uint32_t)rd ); + MemWrite( &item->sourceCodeMetadata.id, id ); + TracyLfqCommit; + ok = true; + } } - else - { - AckSourceCodeNotAvailable(); - } - tracy_free( ptr ); } - else + } + +#ifdef TRACY_DEBUGINFOD + else if( image && data[0] == '/' ) + { + size_t size; + auto buildid = GetBuildIdForImage( image, size ); + if( buildid ) { - AckSourceCodeNotAvailable(); + auto d = debuginfod_find_source( GetDebuginfodClient(), buildid, size, data, nullptr ); + TracyDebug( "DebugInfo source query: %s, fn: %s, image: %s\n", d >= 0 ? 
" ok " : "fail", data, image ); + if( d >= 0 ) + { + struct stat st; + fstat( d, &st ); + if( st.st_size < ( TargetFrameSize - 16 ) ) + { + lseek( d, 0, SEEK_SET ); + auto ptr = (char*)tracy_malloc_fast( st.st_size ); + auto rd = read( d, ptr, st.st_size ); + if( rd == (size_t)st.st_size ) + { + TracyLfqPrepare( QueueType::SourceCodeMetadata ); + MemWrite( &item->sourceCodeMetadata.ptr, (uint64_t)ptr ); + MemWrite( &item->sourceCodeMetadata.size, (uint32_t)rd ); + MemWrite( &item->sourceCodeMetadata.id, id ); + TracyLfqCommit; + ok = true; + } + } + close( d ); + } } } else { - tracy_free( m_queryData ); - AckSourceCodeNotAvailable(); + TracyDebug( "DebugInfo invalid query fn: %s, image: %s\n", data, image ); } - m_queryData = nullptr; -} - -void Profiler::SendCodeLocation( uint64_t ptr ) -{ -#ifdef TRACY_HAS_CALLSTACK - const auto sym = DecodeCodeAddress( ptr ); - - SendSingleString( sym.file ); - - QueueItem item; - MemWrite( &item.hdr.type, QueueType::CodeInformation ); - MemWrite( &item.codeInformation.ptr, ptr ); - MemWrite( &item.codeInformation.line, sym.line ); - - AppendData( &item, QueueDataSize[(int)QueueType::CodeInformation] ); - - if( sym.needFree ) tracy_free( (void*)sym.file ); #endif + + if( !ok && m_sourceCallback ) + { + size_t sz; + char* ptr = m_sourceCallback( m_sourceCallbackData, data, sz ); + if( ptr ) + { + if( sz < ( TargetFrameSize - 16 ) ) + { + TracyLfqPrepare( QueueType::SourceCodeMetadata ); + MemWrite( &item->sourceCodeMetadata.ptr, (uint64_t)ptr ); + MemWrite( &item->sourceCodeMetadata.size, (uint32_t)sz ); + MemWrite( &item->sourceCodeMetadata.id, id ); + TracyLfqCommit; + ok = true; + } + } + } + + if( !ok ) + { + TracyLfqPrepare( QueueType::AckSourceCodeNotAvailable ); + MemWrite( &item->sourceCodeNotAvailable, id ); + TracyLfqCommit; + } + + tracy_free_fast( data ); + tracy_free_fast( image ); } -#if ( defined _WIN32 || defined __CYGWIN__ ) && defined TRACY_TIMER_QPC +#if defined _WIN32 && defined TRACY_TIMER_QPC int64_t Profiler::GetTimeQpc() { LARGE_INTEGER t; @@ -3572,4 +4302,486 @@ int64_t Profiler::GetTimeQpc() } +#if 0 +#ifdef __cplusplus +extern "C" { #endif + +TRACY_API TracyCZoneCtx ___tracy_emit_zone_begin( const struct ___tracy_source_location_data* srcloc, int active ) +{ + ___tracy_c_zone_context ctx; +#ifdef TRACY_ON_DEMAND + ctx.active = active && tracy::GetProfiler().IsConnected(); +#else + ctx.active = active; +#endif + if( !ctx.active ) return ctx; + const auto id = tracy::GetProfiler().GetNextZoneId(); + ctx.id = id; + +#ifndef TRACY_NO_VERIFY + { + TracyQueuePrepareC( tracy::QueueType::ZoneValidation ); + tracy::MemWrite( &item->zoneValidation.id, id ); + TracyQueueCommitC( zoneValidationThread ); + } +#endif + { + TracyQueuePrepareC( tracy::QueueType::ZoneBegin ); + tracy::MemWrite( &item->zoneBegin.time, tracy::Profiler::GetTime() ); + tracy::MemWrite( &item->zoneBegin.srcloc, (uint64_t)srcloc ); + TracyQueueCommitC( zoneBeginThread ); + } + return ctx; +} + +TRACY_API TracyCZoneCtx ___tracy_emit_zone_begin_callstack( const struct ___tracy_source_location_data* srcloc, int depth, int active ) +{ + ___tracy_c_zone_context ctx; +#ifdef TRACY_ON_DEMAND + ctx.active = active && tracy::GetProfiler().IsConnected(); +#else + ctx.active = active; +#endif + if( !ctx.active ) return ctx; + const auto id = tracy::GetProfiler().GetNextZoneId(); + ctx.id = id; + +#ifndef TRACY_NO_VERIFY + { + TracyQueuePrepareC( tracy::QueueType::ZoneValidation ); + tracy::MemWrite( &item->zoneValidation.id, id ); + TracyQueueCommitC( zoneValidationThread ); + 
} +#endif + tracy::GetProfiler().SendCallstack( depth ); + { + TracyQueuePrepareC( tracy::QueueType::ZoneBeginCallstack ); + tracy::MemWrite( &item->zoneBegin.time, tracy::Profiler::GetTime() ); + tracy::MemWrite( &item->zoneBegin.srcloc, (uint64_t)srcloc ); + TracyQueueCommitC( zoneBeginThread ); + } + return ctx; +} + +TRACY_API TracyCZoneCtx ___tracy_emit_zone_begin_alloc( uint64_t srcloc, int active ) +{ + ___tracy_c_zone_context ctx; +#ifdef TRACY_ON_DEMAND + ctx.active = active && tracy::GetProfiler().IsConnected(); +#else + ctx.active = active; +#endif + if( !ctx.active ) + { + tracy::tracy_free( (void*)srcloc ); + return ctx; + } + const auto id = tracy::GetProfiler().GetNextZoneId(); + ctx.id = id; + +#ifndef TRACY_NO_VERIFY + { + TracyQueuePrepareC( tracy::QueueType::ZoneValidation ); + tracy::MemWrite( &item->zoneValidation.id, id ); + TracyQueueCommitC( zoneValidationThread ); + } +#endif + { + TracyQueuePrepareC( tracy::QueueType::ZoneBeginAllocSrcLoc ); + tracy::MemWrite( &item->zoneBegin.time, tracy::Profiler::GetTime() ); + tracy::MemWrite( &item->zoneBegin.srcloc, srcloc ); + TracyQueueCommitC( zoneBeginThread ); + } + return ctx; +} + +TRACY_API TracyCZoneCtx ___tracy_emit_zone_begin_alloc_callstack( uint64_t srcloc, int depth, int active ) +{ + ___tracy_c_zone_context ctx; +#ifdef TRACY_ON_DEMAND + ctx.active = active && tracy::GetProfiler().IsConnected(); +#else + ctx.active = active; +#endif + if( !ctx.active ) + { + tracy::tracy_free( (void*)srcloc ); + return ctx; + } + const auto id = tracy::GetProfiler().GetNextZoneId(); + ctx.id = id; + +#ifndef TRACY_NO_VERIFY + { + TracyQueuePrepareC( tracy::QueueType::ZoneValidation ); + tracy::MemWrite( &item->zoneValidation.id, id ); + TracyQueueCommitC( zoneValidationThread ); + } +#endif + tracy::GetProfiler().SendCallstack( depth ); + { + TracyQueuePrepareC( tracy::QueueType::ZoneBeginAllocSrcLocCallstack ); + tracy::MemWrite( &item->zoneBegin.time, tracy::Profiler::GetTime() ); + tracy::MemWrite( &item->zoneBegin.srcloc, srcloc ); + TracyQueueCommitC( zoneBeginThread ); + } + return ctx; +} + +TRACY_API void ___tracy_emit_zone_end( TracyCZoneCtx ctx ) +{ + if( !ctx.active ) return; +#ifndef TRACY_NO_VERIFY + { + TracyQueuePrepareC( tracy::QueueType::ZoneValidation ); + tracy::MemWrite( &item->zoneValidation.id, ctx.id ); + TracyQueueCommitC( zoneValidationThread ); + } +#endif + { + TracyQueuePrepareC( tracy::QueueType::ZoneEnd ); + tracy::MemWrite( &item->zoneEnd.time, tracy::Profiler::GetTime() ); + TracyQueueCommitC( zoneEndThread ); + } +} + +TRACY_API void ___tracy_emit_zone_text( TracyCZoneCtx ctx, const char* txt, size_t size ) +{ + assert( size < std::numeric_limits::max() ); + if( !ctx.active ) return; + auto ptr = (char*)tracy::tracy_malloc( size ); + memcpy( ptr, txt, size ); +#ifndef TRACY_NO_VERIFY + { + TracyQueuePrepareC( tracy::QueueType::ZoneValidation ); + tracy::MemWrite( &item->zoneValidation.id, ctx.id ); + TracyQueueCommitC( zoneValidationThread ); + } +#endif + { + TracyQueuePrepareC( tracy::QueueType::ZoneText ); + tracy::MemWrite( &item->zoneTextFat.text, (uint64_t)ptr ); + tracy::MemWrite( &item->zoneTextFat.size, (uint16_t)size ); + TracyQueueCommitC( zoneTextFatThread ); + } +} + +TRACY_API void ___tracy_emit_zone_name( TracyCZoneCtx ctx, const char* txt, size_t size ) +{ + assert( size < std::numeric_limits::max() ); + if( !ctx.active ) return; + auto ptr = (char*)tracy::tracy_malloc( size ); + memcpy( ptr, txt, size ); +#ifndef TRACY_NO_VERIFY + { + TracyQueuePrepareC( 
tracy::QueueType::ZoneValidation ); + tracy::MemWrite( &item->zoneValidation.id, ctx.id ); + TracyQueueCommitC( zoneValidationThread ); + } +#endif + { + TracyQueuePrepareC( tracy::QueueType::ZoneName ); + tracy::MemWrite( &item->zoneTextFat.text, (uint64_t)ptr ); + tracy::MemWrite( &item->zoneTextFat.size, (uint16_t)size ); + TracyQueueCommitC( zoneTextFatThread ); + } +} + +TRACY_API void ___tracy_emit_zone_color( TracyCZoneCtx ctx, uint32_t color ) { + if( !ctx.active ) return; +#ifndef TRACY_NO_VERIFY + { + TracyQueuePrepareC( tracy::QueueType::ZoneValidation ); + tracy::MemWrite( &item->zoneValidation.id, ctx.id ); + TracyQueueCommitC( zoneValidationThread ); + } +#endif + { + TracyQueuePrepareC( tracy::QueueType::ZoneColor ); + tracy::MemWrite( &item->zoneColor.r, uint8_t( ( color ) & 0xFF ) ); + tracy::MemWrite( &item->zoneColor.g, uint8_t( ( color >> 8 ) & 0xFF ) ); + tracy::MemWrite( &item->zoneColor.b, uint8_t( ( color >> 16 ) & 0xFF ) ); + TracyQueueCommitC( zoneColorThread ); + } +} + +TRACY_API void ___tracy_emit_zone_value( TracyCZoneCtx ctx, uint64_t value ) +{ + if( !ctx.active ) return; +#ifndef TRACY_NO_VERIFY + { + TracyQueuePrepareC( tracy::QueueType::ZoneValidation ); + tracy::MemWrite( &item->zoneValidation.id, ctx.id ); + TracyQueueCommitC( zoneValidationThread ); + } +#endif + { + TracyQueuePrepareC( tracy::QueueType::ZoneValue ); + tracy::MemWrite( &item->zoneValue.value, value ); + TracyQueueCommitC( zoneValueThread ); + } +} + +TRACY_API void ___tracy_emit_memory_alloc( const void* ptr, size_t size, int secure ) { tracy::Profiler::MemAlloc( ptr, size, secure != 0 ); } +TRACY_API void ___tracy_emit_memory_alloc_callstack( const void* ptr, size_t size, int depth, int secure ) { tracy::Profiler::MemAllocCallstack( ptr, size, depth, secure != 0 ); } +TRACY_API void ___tracy_emit_memory_free( const void* ptr, int secure ) { tracy::Profiler::MemFree( ptr, secure != 0 ); } +TRACY_API void ___tracy_emit_memory_free_callstack( const void* ptr, int depth, int secure ) { tracy::Profiler::MemFreeCallstack( ptr, depth, secure != 0 ); } +TRACY_API void ___tracy_emit_memory_alloc_named( const void* ptr, size_t size, int secure, const char* name ) { tracy::Profiler::MemAllocNamed( ptr, size, secure != 0, name ); } +TRACY_API void ___tracy_emit_memory_alloc_callstack_named( const void* ptr, size_t size, int depth, int secure, const char* name ) { tracy::Profiler::MemAllocCallstackNamed( ptr, size, depth, secure != 0, name ); } +TRACY_API void ___tracy_emit_memory_free_named( const void* ptr, int secure, const char* name ) { tracy::Profiler::MemFreeNamed( ptr, secure != 0, name ); } +TRACY_API void ___tracy_emit_memory_free_callstack_named( const void* ptr, int depth, int secure, const char* name ) { tracy::Profiler::MemFreeCallstackNamed( ptr, depth, secure != 0, name ); } +TRACY_API void ___tracy_emit_frame_mark( const char* name ) { tracy::Profiler::SendFrameMark( name ); } +TRACY_API void ___tracy_emit_frame_mark_start( const char* name ) { tracy::Profiler::SendFrameMark( name, tracy::QueueType::FrameMarkMsgStart ); } +TRACY_API void ___tracy_emit_frame_mark_end( const char* name ) { tracy::Profiler::SendFrameMark( name, tracy::QueueType::FrameMarkMsgEnd ); } +TRACY_API void ___tracy_emit_frame_image( const void* image, uint16_t w, uint16_t h, uint8_t offset, int flip ) { tracy::Profiler::SendFrameImage( image, w, h, offset, flip ); } +TRACY_API void ___tracy_emit_plot( const char* name, double val ) { tracy::Profiler::PlotData( name, val ); } +TRACY_API void 
___tracy_emit_message( const char* txt, size_t size, int callstack ) { tracy::Profiler::Message( txt, size, callstack ); } +TRACY_API void ___tracy_emit_messageL( const char* txt, int callstack ) { tracy::Profiler::Message( txt, callstack ); } +TRACY_API void ___tracy_emit_messageC( const char* txt, size_t size, uint32_t color, int callstack ) { tracy::Profiler::MessageColor( txt, size, color, callstack ); } +TRACY_API void ___tracy_emit_messageLC( const char* txt, uint32_t color, int callstack ) { tracy::Profiler::MessageColor( txt, color, callstack ); } +TRACY_API void ___tracy_emit_message_appinfo( const char* txt, size_t size ) { tracy::Profiler::MessageAppInfo( txt, size ); } + +TRACY_API uint64_t ___tracy_alloc_srcloc( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz ) { + return tracy::Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz ); +} + +TRACY_API uint64_t ___tracy_alloc_srcloc_name( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz ) { + return tracy::Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz ); +} + +TRACY_API void ___tracy_emit_gpu_zone_begin( const struct ___tracy_gpu_zone_begin_data data ) +{ + TracyLfqPrepareC( tracy::QueueType::GpuZoneBegin ); + tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() ); + tracy::MemWrite( &item->gpuNewContext.thread, tracy::GetThreadHandle() ); + tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc ); + tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId ); + tracy::MemWrite( &item->gpuZoneBegin.context, data.context ); + TracyLfqCommitC; +} + +TRACY_API void ___tracy_emit_gpu_zone_begin_callstack( const struct ___tracy_gpu_zone_begin_callstack_data data ) +{ + tracy::GetProfiler().SendCallstack( data.depth ); + TracyLfqPrepareC( tracy::QueueType::GpuZoneBeginCallstack ); + tracy::MemWrite( &item->gpuZoneBegin.thread, tracy::GetThreadHandle() ); + tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() ); + tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId ); + tracy::MemWrite( &item->gpuZoneBegin.context, data.context ); + tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc ); + TracyLfqCommitC; +} + +TRACY_API void ___tracy_emit_gpu_zone_begin_alloc( const struct ___tracy_gpu_zone_begin_data data ) +{ + TracyLfqPrepareC( tracy::QueueType::GpuZoneBeginAllocSrcLoc ); + tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() ); + tracy::MemWrite( &item->gpuNewContext.thread, tracy::GetThreadHandle() ); + tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc ); + tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId ); + tracy::MemWrite( &item->gpuZoneBegin.context, data.context ); + TracyLfqCommitC; +} + +TRACY_API void ___tracy_emit_gpu_zone_begin_alloc_callstack( const struct ___tracy_gpu_zone_begin_callstack_data data ) +{ + tracy::GetProfiler().SendCallstack( data.depth ); + TracyLfqPrepareC( tracy::QueueType::GpuZoneBeginAllocSrcLocCallstack ); + tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() ); + tracy::MemWrite( &item->gpuNewContext.thread, tracy::GetThreadHandle() ); + tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc ); + tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId ); + tracy::MemWrite( &item->gpuZoneBegin.context, data.context ); + TracyLfqCommitC; +} + +TRACY_API void ___tracy_emit_gpu_time( const struct 
___tracy_gpu_time_data data ) +{ + TracyLfqPrepareC( tracy::QueueType::GpuTime ); + tracy::MemWrite( &item->gpuTime.gpuTime, data.gpuTime ); + tracy::MemWrite( &item->gpuTime.queryId, data.queryId ); + tracy::MemWrite( &item->gpuTime.context, data.context ); + TracyLfqCommitC; +} + +TRACY_API void ___tracy_emit_gpu_zone_end( const struct ___tracy_gpu_zone_end_data data ) +{ + TracyLfqPrepareC( tracy::QueueType::GpuZoneEnd ); + tracy::MemWrite( &item->gpuZoneEnd.cpuTime, tracy::Profiler::GetTime() ); + memset( &item->gpuZoneEnd.thread, 0, sizeof( item->gpuZoneEnd.thread ) ); + tracy::MemWrite( &item->gpuZoneEnd.queryId, data.queryId ); + tracy::MemWrite( &item->gpuZoneEnd.context, data.context ); + TracyLfqCommitC; +} + +TRACY_API void ___tracy_emit_gpu_new_context( ___tracy_gpu_new_context_data data ) +{ + TracyLfqPrepareC( tracy::QueueType::GpuNewContext ); + tracy::MemWrite( &item->gpuNewContext.cpuTime, tracy::Profiler::GetTime() ); + tracy::MemWrite( &item->gpuNewContext.thread, tracy::GetThreadHandle() ); + tracy::MemWrite( &item->gpuNewContext.gpuTime, data.gpuTime ); + tracy::MemWrite( &item->gpuNewContext.period, data.period ); + tracy::MemWrite( &item->gpuNewContext.context, data.context ); + tracy::MemWrite( &item->gpuNewContext.flags, data.flags ); + tracy::MemWrite( &item->gpuNewContext.type, data.type ); + TracyLfqCommitC; +} + +TRACY_API void ___tracy_emit_gpu_context_name( const struct ___tracy_gpu_context_name_data data ) +{ + auto ptr = (char*)tracy::tracy_malloc( data.len ); + memcpy( ptr, data.name, data.len ); + + TracyLfqPrepareC( tracy::QueueType::GpuContextName ); + tracy::MemWrite( &item->gpuContextNameFat.context, data.context ); + tracy::MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)ptr ); + tracy::MemWrite( &item->gpuContextNameFat.size, data.len ); + TracyLfqCommitC; +} + +TRACY_API void ___tracy_emit_gpu_calibration( const struct ___tracy_gpu_calibration_data data ) +{ + TracyLfqPrepareC( tracy::QueueType::GpuCalibration ); + tracy::MemWrite( &item->gpuCalibration.cpuTime, tracy::Profiler::GetTime() ); + tracy::MemWrite( &item->gpuCalibration.gpuTime, data.gpuTime ); + tracy::MemWrite( &item->gpuCalibration.cpuDelta, data.cpuDelta ); + tracy::MemWrite( &item->gpuCalibration.context, data.context ); + TracyLfqCommitC; +} + +TRACY_API void ___tracy_emit_gpu_zone_begin_serial( const struct ___tracy_gpu_zone_begin_data data ) +{ + auto item = tracy::Profiler::QueueSerial(); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuZoneBeginSerial ); + tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() ); + tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc ); + tracy::MemWrite( &item->gpuZoneBegin.thread, tracy::GetThreadHandle() ); + tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId ); + tracy::MemWrite( &item->gpuZoneBegin.context, data.context ); + tracy::Profiler::QueueSerialFinish(); +} + +TRACY_API void ___tracy_emit_gpu_zone_begin_callstack_serial( const struct ___tracy_gpu_zone_begin_callstack_data data ) +{ + auto item = tracy::Profiler::QueueSerialCallstack( tracy::Callstack( data.depth ) ); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuZoneBeginCallstackSerial ); + tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() ); + tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc ); + tracy::MemWrite( &item->gpuZoneBegin.thread, tracy::GetThreadHandle() ); + tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId ); + tracy::MemWrite( &item->gpuZoneBegin.context, data.context ); + 
tracy::Profiler::QueueSerialFinish(); +} + +TRACY_API void ___tracy_emit_gpu_zone_begin_alloc_serial( const struct ___tracy_gpu_zone_begin_data data ) +{ + auto item = tracy::Profiler::QueueSerial(); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuZoneBeginAllocSrcLocSerial ); + tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() ); + tracy::MemWrite( &item->gpuNewContext.thread, tracy::GetThreadHandle() ); + tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc ); + tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId ); + tracy::MemWrite( &item->gpuZoneBegin.context, data.context ); + tracy::Profiler::QueueSerialFinish(); +} + +TRACY_API void ___tracy_emit_gpu_zone_begin_alloc_callstack_serial( const struct ___tracy_gpu_zone_begin_callstack_data data ) +{ + auto item = tracy::Profiler::QueueSerialCallstack( tracy::Callstack( data.depth ) ); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuZoneBeginAllocSrcLocCallstackSerial ); + tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() ); + tracy::MemWrite( &item->gpuNewContext.thread, tracy::GetThreadHandle() ); + tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc ); + tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId ); + tracy::MemWrite( &item->gpuZoneBegin.context, data.context ); + tracy::Profiler::QueueSerialFinish(); +} + +TRACY_API void ___tracy_emit_gpu_time_serial( const struct ___tracy_gpu_time_data data ) +{ + auto item = tracy::Profiler::QueueSerial(); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuTime ); + tracy::MemWrite( &item->gpuTime.gpuTime, data.gpuTime ); + tracy::MemWrite( &item->gpuTime.queryId, data.queryId ); + tracy::MemWrite( &item->gpuTime.context, data.context ); + tracy::Profiler::QueueSerialFinish(); +} + +TRACY_API void ___tracy_emit_gpu_zone_end_serial( const struct ___tracy_gpu_zone_end_data data ) +{ + auto item = tracy::Profiler::QueueSerial(); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuZoneEndSerial ); + tracy::MemWrite( &item->gpuZoneEnd.cpuTime, tracy::Profiler::GetTime() ); + memset( &item->gpuZoneEnd.thread, 0, sizeof( item->gpuZoneEnd.thread ) ); + tracy::MemWrite( &item->gpuZoneEnd.queryId, data.queryId ); + tracy::MemWrite( &item->gpuZoneEnd.context, data.context ); + tracy::Profiler::QueueSerialFinish(); +} + +TRACY_API void ___tracy_emit_gpu_new_context_serial( ___tracy_gpu_new_context_data data ) +{ + auto item = tracy::Profiler::QueueSerial(); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuNewContext ); + tracy::MemWrite( &item->gpuNewContext.cpuTime, tracy::Profiler::GetTime() ); + tracy::MemWrite( &item->gpuNewContext.thread, tracy::GetThreadHandle() ); + tracy::MemWrite( &item->gpuNewContext.gpuTime, data.gpuTime ); + tracy::MemWrite( &item->gpuNewContext.period, data.period ); + tracy::MemWrite( &item->gpuNewContext.context, data.context ); + tracy::MemWrite( &item->gpuNewContext.flags, data.flags ); + tracy::MemWrite( &item->gpuNewContext.type, data.type ); + tracy::Profiler::QueueSerialFinish(); +} + +TRACY_API void ___tracy_emit_gpu_context_name_serial( const struct ___tracy_gpu_context_name_data data ) +{ + auto ptr = (char*)tracy::tracy_malloc( data.len ); + memcpy( ptr, data.name, data.len ); + + auto item = tracy::Profiler::QueueSerial(); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuContextName ); + tracy::MemWrite( &item->gpuContextNameFat.context, data.context ); + tracy::MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)ptr ); + tracy::MemWrite( 
&item->gpuContextNameFat.size, data.len ); + tracy::Profiler::QueueSerialFinish(); +} + +TRACY_API void ___tracy_emit_gpu_calibration_serial( const struct ___tracy_gpu_calibration_data data ) +{ + auto item = tracy::Profiler::QueueSerial(); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuCalibration ); + tracy::MemWrite( &item->gpuCalibration.cpuTime, tracy::Profiler::GetTime() ); + tracy::MemWrite( &item->gpuCalibration.gpuTime, data.gpuTime ); + tracy::MemWrite( &item->gpuCalibration.cpuDelta, data.cpuDelta ); + tracy::MemWrite( &item->gpuCalibration.context, data.context ); + tracy::Profiler::QueueSerialFinish(); +} + +TRACY_API int ___tracy_connected( void ) +{ + return tracy::GetProfiler().IsConnected(); +} + +#ifdef TRACY_FIBERS +TRACY_API void ___tracy_fiber_enter( const char* fiber ){ tracy::Profiler::EnterFiber( fiber ); } +TRACY_API void ___tracy_fiber_leave( void ){ tracy::Profiler::LeaveFiber(); } +#endif + +# ifdef TRACY_MANUAL_LIFETIME +TRACY_API void ___tracy_startup_profiler( void ) +{ + tracy::StartupProfiler(); +} + +TRACY_API void ___tracy_shutdown_profiler( void ) +{ + tracy::ShutdownProfiler(); +} +# endif + +#ifdef __cplusplus +} +#endif +#endif + + diff --git a/Source/ThirdParty/tracy/client/TracyProfiler.hpp b/Source/ThirdParty/tracy/client/TracyProfiler.hpp index cdd6154b6..99ae63e4f 100644 --- a/Source/ThirdParty/tracy/client/TracyProfiler.hpp +++ b/Source/ThirdParty/tracy/client/TracyProfiler.hpp @@ -8,6 +8,7 @@ #include #include "tracy_concurrentqueue.h" +#include "tracy_SPSCQueue.h" #include "TracyCallstack.hpp" #include "TracySysTime.hpp" #include "TracyFastVector.hpp" @@ -17,7 +18,7 @@ #include "../common/TracyMutex.hpp" #include "../common/TracyProtocol.hpp" -#if defined _WIN32 || defined __CYGWIN__ +#if defined _WIN32 # include #endif #ifdef __APPLE__ @@ -25,11 +26,15 @@ # include #endif -#if !defined TRACY_TIMER_FALLBACK && ( defined _WIN32 || defined __CYGWIN__ || ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) || ( defined TARGET_OS_IOS && TARGET_OS_IOS == 1 ) ) +#if ( defined _WIN32 || ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) || ( defined TARGET_OS_IOS && TARGET_OS_IOS == 1 ) ) # define TRACY_HW_TIMER #endif -#if !defined TRACY_HW_TIMER +#ifdef __linux__ +# include +#endif + +#if defined TRACY_TIMER_FALLBACK || !defined TRACY_HW_TIMER # include #endif @@ -55,11 +60,27 @@ TRACY_API Profiler& GetProfiler(); TRACY_API std::atomic& GetLockCounter(); TRACY_API std::atomic& GetGpuCtxCounter(); TRACY_API GpuCtxWrapper& GetGpuCtx(); -TRACY_API uint64_t GetThreadHandle(); -TRACY_API void InitRPMallocThread(); +TRACY_API uint32_t GetThreadHandle(); TRACY_API bool ProfilerAvailable(); +TRACY_API bool ProfilerAllocatorAvailable(); TRACY_API int64_t GetFrequencyQpc(); +#if defined TRACY_TIMER_FALLBACK && defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) +TRACY_API bool HardwareSupportsInvariantTSC(); // check, if we need fallback scenario +#else +# if defined TRACY_HW_TIMER +tracy_force_inline bool HardwareSupportsInvariantTSC() +{ + return true; // this is checked at startup +} +# else +tracy_force_inline bool HardwareSupportsInvariantTSC() +{ + return false; +} +# endif +#endif + #ifdef TRACY_ON_DEMAND struct LuaZoneState { @@ -90,6 +111,29 @@ struct LuaZoneState __tail.store( __magic + 1, std::memory_order_release ); +#ifdef TRACY_FIBERS +# define TracyQueuePrepare( _type ) \ + auto item = Profiler::QueueSerial(); \ + MemWrite( 
&item->hdr.type, _type ); +# define TracyQueueCommit( _name ) \ + MemWrite( &item->_name.thread, GetThreadHandle() ); \ + Profiler::QueueSerialFinish(); +# define TracyQueuePrepareC( _type ) \ + auto item = tracy::Profiler::QueueSerial(); \ + tracy::MemWrite( &item->hdr.type, _type ); +# define TracyQueueCommitC( _name ) \ + tracy::MemWrite( &item->_name.thread, tracy::GetThreadHandle() ); \ + tracy::Profiler::QueueSerialFinish(); +#else +# define TracyQueuePrepare( _type ) TracyLfqPrepare( _type ) +# define TracyQueueCommit( _name ) TracyLfqCommit +# define TracyQueuePrepareC( _type ) TracyLfqPrepareC( _type ) +# define TracyQueueCommitC( _name ) TracyLfqCommitC +#endif + + +typedef char*(*SourceContentsCallback)( void* data, const char* filename, size_t& size ); + class TRACY_API Profiler { struct FrameImageQueueItem @@ -98,10 +142,26 @@ class TRACY_API Profiler uint32_t frame; uint16_t w; uint16_t h; - uint8_t offset; bool flip; }; + enum class SymbolQueueItemType + { + CallstackFrame, + SymbolQuery, + ExternalName, + KernelCode, + SourceCode + }; + + struct SymbolQueueItem + { + SymbolQueueItemType type; + uint64_t ptr; + uint64_t extra; + uint32_t id; + }; + public: Profiler(); ~Profiler(); @@ -112,25 +172,33 @@ public: { #ifdef TRACY_HW_TIMER # if defined TARGET_OS_IOS && TARGET_OS_IOS == 1 - return mach_absolute_time(); -# elif defined _WIN32 || defined __CYGWIN__ + if( HardwareSupportsInvariantTSC() ) return mach_absolute_time(); +# elif defined _WIN32 # ifdef TRACY_TIMER_QPC return GetTimeQpc(); # else - return int64_t( __rdtsc() ); + if( HardwareSupportsInvariantTSC() ) return int64_t( __rdtsc() ); # endif # elif defined __i386 || defined _M_IX86 - uint32_t eax, edx; - asm volatile ( "rdtsc" : "=a" (eax), "=d" (edx) ); - return ( uint64_t( edx ) << 32 ) + uint64_t( eax ); + if( HardwareSupportsInvariantTSC() ) + { + uint32_t eax, edx; + asm volatile ( "rdtsc" : "=a" (eax), "=d" (edx) ); + return ( uint64_t( edx ) << 32 ) + uint64_t( eax ); + } # elif defined __x86_64__ || defined _M_X64 - uint64_t rax, rdx; - asm volatile ( "rdtsc" : "=a" (rax), "=d" (rdx) ); - return (int64_t)(( rdx << 32 ) + rax); + if( HardwareSupportsInvariantTSC() ) + { + uint64_t rax, rdx; + asm volatile ( "rdtsc" : "=a" (rax), "=d" (rdx) ); + return (int64_t)(( rdx << 32 ) + rax); + } # else # error "TRACY_HW_TIMER detection logic needs fixing" # endif -#else +#endif + +#if !defined TRACY_HW_TIMER || defined TRACY_TIMER_FALLBACK # if defined __linux__ && defined CLOCK_MONOTONIC_RAW struct timespec ts; clock_gettime( CLOCK_MONOTONIC_RAW, &ts ); @@ -138,6 +206,10 @@ public: # else return std::chrono::duration_cast( std::chrono::high_resolution_clock::now().time_since_epoch() ).count(); # endif +#endif + +#if !defined TRACY_TIMER_FALLBACK + return 0; // unreachable branch #endif } @@ -168,12 +240,37 @@ public: p.m_serialLock.unlock(); } + static tracy_force_inline void SourceCallbackRegister( SourceContentsCallback cb, void* data ) + { + auto& profiler = GetProfiler(); + profiler.m_sourceCallback = cb; + profiler.m_sourceCallbackData = data; + } + +#ifdef TRACY_FIBERS + static tracy_force_inline void EnterFiber( const char* fiber ) + { + TracyQueuePrepare( QueueType::FiberEnter ); + MemWrite( &item->fiberEnter.time, GetTime() ); + MemWrite( &item->fiberEnter.fiber, (uint64_t)fiber ); + TracyQueueCommit( fiberEnter ); + } + + static tracy_force_inline void LeaveFiber() + { + TracyQueuePrepare( QueueType::FiberLeave ); + MemWrite( &item->fiberLeave.time, GetTime() ); + TracyQueueCommit( fiberLeave ); + } 
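The new SourceCallbackRegister / SourceContentsCallback hook introduced in this header lets an application hand source file contents to the profiler when they are not available on disk (for example, sources packed into an archive). A minimal wiring sketch, not part of the patch: the callback signature comes from the typedef above, GetEmbeddedSource is a hypothetical application-side lookup, and the assumption that the returned buffer should come from tracy_malloc (so the profiler can free it after sending) follows the tracy_free_fast handling in HandleSourceCodeQuery.

#include <cstring>
#include "tracy/client/TracyProfiler.hpp" // adjust to however Tracy is vendored

// Hypothetical application-side lookup; returns nullptr when the file is unknown.
static const char* GetEmbeddedSource( const char* filename, size_t& size )
{
    (void)filename;
    size = 0;
    return nullptr;
}

// Matches SourceContentsCallback: char* ( void* data, const char* filename, size_t& size ).
static char* SourceProvider( void* /*userData*/, const char* filename, size_t& size )
{
    size_t srcSize = 0;
    const char* src = GetEmbeddedSource( filename, srcSize );
    if( !src ) { size = 0; return nullptr; }            // profiler reports "source not available"
    auto buf = (char*)tracy::tracy_malloc( srcSize );   // assumed allocator, see lead-in
    memcpy( buf, src, srcSize );
    size = srcSize;
    return buf;
}

void RegisterTracySourceProvider()
{
    tracy::Profiler::SourceCallbackRegister( SourceProvider, nullptr );
}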
+#endif + static void SendFrameMark( const char* name ); static void SendFrameMark( const char* name, QueueType type ); + static void SendFrameImage( const void* image, uint16_t w, uint16_t h, uint8_t offset, bool flip ); static void PlotData( const char* name, int64_t val ); static void PlotData( const char* name, float val ); static void PlotData( const char* name, double val ); - static void ConfigurePlot( const char* name, PlotFormatType type ); + static void ConfigurePlot( const char* name, PlotFormatType type, bool step, bool fill, uint32_t color ); static void Message( const char* txt, size_t size, int callstack ); static void Message( const char* txt, int callstack ); static void MessageColor( const char* txt, size_t size, uint32_t color, int callstack ); @@ -189,6 +286,7 @@ public: static void MemFreeCallstackNamed( const void* ptr, int depth, bool secure, const char* name ); static void SendCallstack( int depth ); static void ParameterRegister( ParameterCallback cb ); + static void ParameterRegister( ParameterCallback cb, void* data ); static void ParameterSetup( uint32_t idx, const char* name, bool isBool, int32_t val ); void SendCallstack( int depth, const char* skipBefore ); @@ -296,15 +394,28 @@ public: private: enum class DequeueStatus { DataDequeued, ConnectionLost, QueueEmpty }; + enum class ThreadCtxStatus { Same, Changed, ConnectionLost }; static void LaunchWorker( void* ptr ) { ((Profiler*)ptr)->Worker(); } void Worker(); +#ifndef TRACY_NO_FRAME_IMAGE + static void LaunchCompressWorker( void* ptr ) { ((Profiler*)ptr)->CompressWorker(); } + void CompressWorker(); +#endif + +#ifdef TRACY_HAS_CALLSTACK + static void LaunchSymbolWorker( void* ptr ) { ((Profiler*)ptr)->SymbolWorker(); } + void SymbolWorker(); + void HandleSymbolQueueItem( const SymbolQueueItem& si ); +#endif + void ClearQueues( tracy::moodycamel::ConsumerToken& token ); void ClearSerial(); DequeueStatus Dequeue( tracy::moodycamel::ConsumerToken& token ); DequeueStatus DequeueContextSwitches( tracy::moodycamel::ConsumerToken& token, int64_t& timeStop ); DequeueStatus DequeueSerial(); + ThreadCtxStatus ThreadCtxCheck( uint32_t threadId ); bool CommitData(); tracy_force_inline bool AppendData( const void* data, size_t len ) @@ -338,18 +449,21 @@ private: void SendCallstackPayload( uint64_t ptr ); void SendCallstackPayload64( uint64_t ptr ); void SendCallstackAlloc( uint64_t ptr ); - void SendCallstackFrame( uint64_t ptr ); - void SendCodeLocation( uint64_t ptr ); + + void QueueCallstackFrame( uint64_t ptr ); + void QueueSymbolQuery( uint64_t symbol ); + void QueueExternalName( uint64_t ptr ); + void QueueKernelCode( uint64_t symbol, uint32_t size ); + void QueueSourceCodeQuery( uint32_t id ); bool HandleServerQuery(); void HandleDisconnect(); void HandleParameter( uint64_t payload ); - void HandleSymbolQuery( uint64_t symbol ); void HandleSymbolCodeQuery( uint64_t symbol, uint32_t size ); - void HandleSourceCodeQuery(); + void HandleSourceCodeQuery( char* data, char* image, uint32_t id ); void AckServerQuery(); - void AckSourceCodeNotAvailable(); + void AckSymbolCodeNotAvailable(); void CalibrateTimer(); void CalibrateDelay(); @@ -362,10 +476,12 @@ private: MemWrite( &item->hdr.type, QueueType::CallstackSerial ); MemWrite( &item->callstackFat.ptr, (uint64_t)ptr ); GetProfiler().m_serialQueue.commit_next(); +#else + static_cast(ptr); // unused #endif } - static tracy_force_inline void SendMemAlloc( QueueType type, const uint64_t thread, const void* ptr, size_t size ) + static tracy_force_inline void 
SendMemAlloc( QueueType type, const uint32_t thread, const void* ptr, size_t size ) { assert( type == QueueType::MemAlloc || type == QueueType::MemAllocCallstack || type == QueueType::MemAllocNamed || type == QueueType::MemAllocCallstackNamed ); @@ -388,7 +504,7 @@ private: GetProfiler().m_serialQueue.commit_next(); } - static tracy_force_inline void SendMemFree( QueueType type, const uint64_t thread, const void* ptr ) + static tracy_force_inline void SendMemFree( QueueType type, const uint32_t thread, const void* ptr ) { assert( type == QueueType::MemFree || type == QueueType::MemFreeCallstack || type == QueueType::MemFreeNamed || type == QueueType::MemFreeCallstackNamed ); @@ -409,7 +525,7 @@ private: GetProfiler().m_serialQueue.commit_next(); } -#if ( defined _WIN32 || defined __CYGWIN__ ) && defined TRACY_TIMER_QPC +#if defined _WIN32 && defined TRACY_TIMER_QPC static int64_t GetTimeQpc(); #endif @@ -417,7 +533,7 @@ private: uint64_t m_resolution; uint64_t m_delay; std::atomic m_timeBegin; - uint64_t m_mainThread; + uint32_t m_mainThread; uint64_t m_epoch, m_exectime; std::atomic m_shutdown; std::atomic m_shutdownManual; @@ -429,7 +545,7 @@ private: std::atomic m_zoneId; int64_t m_samplingPeriod; - uint64_t m_threadCtx; + uint32_t m_threadCtx; int64_t m_refTimeThread; int64_t m_refTimeSerial; int64_t m_refTimeCtx; @@ -445,6 +561,13 @@ private: FastVector m_serialQueue, m_serialDequeue; TracyMutex m_serialLock; +#ifndef TRACY_NO_FRAME_IMAGE + FastVector m_fiQueue, m_fiDequeue; + TracyMutex m_fiLock; +#endif + + SPSCQueue m_symbolQueue; + std::atomic m_frameCount; std::atomic m_isConnected; #ifdef TRACY_ON_DEMAND @@ -464,9 +587,23 @@ private: #endif ParameterCallback m_paramCallback; + void* m_paramCallbackData; + SourceContentsCallback m_sourceCallback; + void* m_sourceCallbackData; + char* m_queryImage; char* m_queryData; char* m_queryDataPtr; + +#if defined _WIN32 + void* m_exceptionHandler; +#endif +#ifdef __linux__ + struct { + struct sigaction pwr, ill, fpe, segv, pipe, bus, abrt; + } m_prevSignal; +#endif + bool m_crashHandlerInstalled; }; } diff --git a/Source/ThirdParty/tracy/client/TracyRingBuffer.hpp b/Source/ThirdParty/tracy/client/TracyRingBuffer.hpp index 29d935596..e9100e2d8 100644 --- a/Source/ThirdParty/tracy/client/TracyRingBuffer.hpp +++ b/Source/ThirdParty/tracy/client/TracyRingBuffer.hpp @@ -1,27 +1,44 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "TracyDebug.hpp" + namespace tracy { -template class RingBuffer { public: - RingBuffer( int fd ) - : m_fd( fd ) + RingBuffer( unsigned int size, int fd, int id, int cpu = -1 ) + : m_size( size ) + , m_id( id ) + , m_cpu( cpu ) + , m_fd( fd ) { const auto pageSize = uint32_t( getpagesize() ); - assert( Size >= pageSize ); - assert( __builtin_popcount( Size ) == 1 ); - m_mapSize = Size + pageSize; + assert( size >= pageSize ); + assert( __builtin_popcount( size ) == 1 ); + m_mapSize = size + pageSize; auto mapAddr = mmap( nullptr, m_mapSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0 ); - if( !mapAddr ) + if( mapAddr == MAP_FAILED ) { + TracyDebug( "mmap failed: errno %i (%s)\n", errno, strerror( errno ) ); m_fd = 0; + m_metadata = nullptr; close( fd ); return; } m_metadata = (perf_event_mmap_page*)mapAddr; assert( m_metadata->data_offset == pageSize ); m_buffer = ((char*)mapAddr) + pageSize; + m_tail = m_metadata->data_tail; } ~RingBuffer() @@ -49,36 +66,35 @@ public: } bool IsValid() const { return m_metadata != nullptr; } + int GetId() const { return m_id; } + int 
GetCpu() const { return m_cpu; } void Enable() { ioctl( m_fd, PERF_EVENT_IOC_ENABLE, 0 ); } - bool HasData() const - { - const auto head = LoadHead(); - return head > m_metadata->data_tail; - } - void Read( void* dst, uint64_t offset, uint64_t cnt ) { - auto src = ( m_metadata->data_tail + offset ) % Size; - if( src + cnt <= Size ) + const auto size = m_size; + auto src = ( m_tail + offset ) % size; + if( src + cnt <= size ) { memcpy( dst, m_buffer + src, cnt ); } else { - const auto s0 = Size - src; - memcpy( dst, m_buffer + src, s0 ); - memcpy( (char*)dst + s0, m_buffer, cnt - s0 ); + const auto s0 = size - src; + const auto buf = m_buffer; + memcpy( dst, buf + src, s0 ); + memcpy( (char*)dst + s0, buf, cnt - s0 ); } } void Advance( uint64_t cnt ) { - StoreTail( m_metadata->data_tail + cnt ); + m_tail += cnt; + StoreTail(); } bool CheckTscCaps() const @@ -88,26 +104,35 @@ public: int64_t ConvertTimeToTsc( int64_t timestamp ) const { - assert( m_metadata->cap_user_time_zero ); + if( !m_metadata->cap_user_time_zero ) return 0; const auto time = timestamp - m_metadata->time_zero; const auto quot = time / m_metadata->time_mult; const auto rem = time % m_metadata->time_mult; return ( quot << m_metadata->time_shift ) + ( rem << m_metadata->time_shift ) / m_metadata->time_mult; } -private: uint64_t LoadHead() const { return std::atomic_load_explicit( (const volatile std::atomic*)&m_metadata->data_head, std::memory_order_acquire ); } - void StoreTail( uint64_t tail ) + uint64_t GetTail() const { - std::atomic_store_explicit( (volatile std::atomic*)&m_metadata->data_tail, tail, std::memory_order_release ); + return m_tail; } - perf_event_mmap_page* m_metadata; +private: + void StoreTail() + { + std::atomic_store_explicit( (volatile std::atomic*)&m_metadata->data_tail, m_tail, std::memory_order_release ); + } + + unsigned int m_size; + uint64_t m_tail; char* m_buffer; + int m_id; + int m_cpu; + perf_event_mmap_page* m_metadata; size_t m_mapSize; int m_fd; diff --git a/Source/ThirdParty/tracy/client/TracyScoped.hpp b/Source/ThirdParty/tracy/client/TracyScoped.hpp index b18f02388..1e5b6c809 100644 --- a/Source/ThirdParty/tracy/client/TracyScoped.hpp +++ b/Source/ThirdParty/tracy/client/TracyScoped.hpp @@ -8,7 +8,7 @@ #include "../common/TracySystem.hpp" #include "../common/TracyAlign.hpp" #include "../common/TracyAlloc.hpp" -#include "TracyProfiler.hpp" +#include "../client/TracyLock.hpp" namespace tracy { @@ -56,10 +56,10 @@ ScopedZone::ScopedZone( const SourceLocationData* srcloc, bool is_active ) #ifdef TRACY_ON_DEMAND m_connectionId = GetProfiler().ConnectionId(); #endif - TracyLfqPrepare( QueueType::ZoneBegin ); + TracyQueuePrepare( QueueType::ZoneBegin ); MemWrite( &item->zoneBegin.time, Profiler::GetTime() ); MemWrite( &item->zoneBegin.srcloc, (uint64_t)srcloc ); - TracyLfqCommit; + TracyQueueCommit( zoneBeginThread ); } ScopedZone::ScopedZone( const SourceLocationData* srcloc, int depth, bool is_active ) @@ -75,10 +75,10 @@ ScopedZone::ScopedZone( const SourceLocationData* srcloc, int depth, bool is_act #endif GetProfiler().SendCallstack( depth ); - TracyLfqPrepare( QueueType::ZoneBeginCallstack ); + TracyQueuePrepare( QueueType::ZoneBeginCallstack ); MemWrite( &item->zoneBegin.time, Profiler::GetTime() ); MemWrite( &item->zoneBegin.srcloc, (uint64_t)srcloc ); - TracyLfqCommit; + TracyQueueCommit( zoneBeginThread ); } ScopedZone::ScopedZone( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, bool is_active ) @@ -92,11 
+92,11 @@ ScopedZone::ScopedZone( uint32_t line, const char* source, size_t sourceSz, cons #ifdef TRACY_ON_DEMAND m_connectionId = GetProfiler().ConnectionId(); #endif - TracyLfqPrepare( QueueType::ZoneBeginAllocSrcLoc ); + TracyQueuePrepare( QueueType::ZoneBeginAllocSrcLoc ); const auto srcloc = Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz ); MemWrite( &item->zoneBegin.time, Profiler::GetTime() ); MemWrite( &item->zoneBegin.srcloc, srcloc ); - TracyLfqCommit; + TracyQueueCommit( zoneBeginThread ); } ScopedZone::ScopedZone( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, int depth, bool is_active ) @@ -112,11 +112,11 @@ ScopedZone::ScopedZone( uint32_t line, const char* source, size_t sourceSz, cons #endif GetProfiler().SendCallstack( depth ); - TracyLfqPrepare( QueueType::ZoneBeginAllocSrcLocCallstack ); + TracyQueuePrepare( QueueType::ZoneBeginAllocSrcLocCallstack ); const auto srcloc = Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz ); MemWrite( &item->zoneBegin.time, Profiler::GetTime() ); MemWrite( &item->zoneBegin.srcloc, srcloc ); - TracyLfqCommit; + TracyQueueCommit( zoneBeginThread ); } ScopedZone::~ScopedZone() @@ -125,9 +125,9 @@ ScopedZone::~ScopedZone() #ifdef TRACY_ON_DEMAND if( GetProfiler().ConnectionId() != m_connectionId ) return; #endif - TracyLfqPrepare( QueueType::ZoneEnd ); + TracyQueuePrepare( QueueType::ZoneEnd ); MemWrite( &item->zoneEnd.time, Profiler::GetTime() ); - TracyLfqCommit; + TracyQueueCommit( zoneEndThread ); } void ScopedZone::Text( const char* txt, size_t size ) @@ -139,13 +139,13 @@ void ScopedZone::Text( const char* txt, size_t size ) #endif auto ptr = (char*)tracy_malloc( size ); memcpy( ptr, txt, size ); - TracyLfqPrepare( QueueType::ZoneText ); + TracyQueuePrepare( QueueType::ZoneText ); MemWrite( &item->zoneTextFat.text, (uint64_t)ptr ); MemWrite( &item->zoneTextFat.size, (uint16_t)size ); - TracyLfqCommit; + TracyQueueCommit( zoneTextFatThread ); } -void ScopedZone::Text(const Char* txt, size_t size) +void ScopedZone::Text( const Char* txt, size_t size ) { assert( size < std::numeric_limits::max() ); if( !m_active ) return; @@ -155,10 +155,10 @@ void ScopedZone::Text(const Char* txt, size_t size) auto ptr = (char*)tracy_malloc( size ); for( int i = 0; i < size; i++) ptr[i] = (char)txt[i]; - TracyLfqPrepare( QueueType::ZoneText ); + TracyQueuePrepare( QueueType::ZoneText ); MemWrite( &item->zoneTextFat.text, (uint64_t)ptr ); MemWrite( &item->zoneTextFat.size, (uint16_t)size ); - TracyLfqCommit; + TracyQueueCommit( zoneTextFatThread ); } void ScopedZone::Name( const char* txt, size_t size ) @@ -170,10 +170,10 @@ void ScopedZone::Name( const char* txt, size_t size ) #endif auto ptr = (char*)tracy_malloc( size ); memcpy( ptr, txt, size ); - TracyLfqPrepare( QueueType::ZoneName ); + TracyQueuePrepare( QueueType::ZoneName ); MemWrite( &item->zoneTextFat.text, (uint64_t)ptr ); MemWrite( &item->zoneTextFat.size, (uint16_t)size ); - TracyLfqCommit; + TracyQueueCommit( zoneTextFatThread ); } void ScopedZone::Name( const Char* txt, size_t size ) @@ -186,10 +186,10 @@ void ScopedZone::Name( const Char* txt, size_t size ) auto ptr = (char*)tracy_malloc( size ); for( int i = 0; i < size; i++) ptr[i] = (char)txt[i]; - TracyLfqPrepare( QueueType::ZoneName ); + TracyQueuePrepare( QueueType::ZoneName ); MemWrite( &item->zoneTextFat.text, (uint64_t)ptr ); MemWrite( &item->zoneTextFat.size, 
(uint16_t)size ); - TracyLfqCommit; + TracyQueueCommit( zoneTextFatThread ); } void ScopedZone::Color( uint32_t color ) @@ -198,11 +198,11 @@ void ScopedZone::Color( uint32_t color ) #ifdef TRACY_ON_DEMAND if( GetProfiler().ConnectionId() != m_connectionId ) return; #endif - TracyLfqPrepare( QueueType::ZoneColor ); + TracyQueuePrepare( QueueType::ZoneColor ); MemWrite( &item->zoneColor.r, uint8_t( ( color ) & 0xFF ) ); MemWrite( &item->zoneColor.g, uint8_t( ( color >> 8 ) & 0xFF ) ); MemWrite( &item->zoneColor.b, uint8_t( ( color >> 16 ) & 0xFF ) ); - TracyLfqCommit; + TracyQueueCommit( zoneColorThread ); } void ScopedZone::Value( uint64_t value ) @@ -211,13 +211,10 @@ void ScopedZone::Value( uint64_t value ) #ifdef TRACY_ON_DEMAND if( GetProfiler().ConnectionId() != m_connectionId ) return; #endif - TracyLfqPrepare( QueueType::ZoneValue ); + TracyQueuePrepare( QueueType::ZoneValue ); MemWrite( &item->zoneValue.value, value ); - TracyLfqCommit; + TracyQueueCommit( zoneValueThread ); } - -bool ScopedZone::IsActive() const { return m_active; } - } #endif diff --git a/Source/ThirdParty/tracy/client/TracyStringHelpers.hpp b/Source/ThirdParty/tracy/client/TracyStringHelpers.hpp new file mode 100644 index 000000000..1f8e4a659 --- /dev/null +++ b/Source/ThirdParty/tracy/client/TracyStringHelpers.hpp @@ -0,0 +1,40 @@ +#ifndef __TRACYSTRINGHELPERS_HPP__ +#define __TRACYSTRINGHELPERS_HPP__ + +#include +#include + +#include "../common/TracyAlloc.hpp" + +namespace tracy +{ + +static tracy_force_inline char* CopyString( const char* src, size_t sz ) +{ + auto dst = (char*)tracy_malloc( sz + 1 ); + memcpy( dst, src, sz ); + dst[sz] = '\0'; + return dst; +} + +static tracy_force_inline char* CopyString( const char* src ) +{ + return CopyString( src, strlen( src ) ); +} + +static tracy_force_inline char* CopyStringFast( const char* src, size_t sz ) +{ + auto dst = (char*)tracy_malloc_fast( sz + 1 ); + memcpy( dst, src, sz ); + dst[sz] = '\0'; + return dst; +} + +static tracy_force_inline char* CopyStringFast( const char* src ) +{ + return CopyStringFast( src, strlen( src ) ); +} + +} + +#endif diff --git a/Source/ThirdParty/tracy/client/TracySysTime.cpp b/Source/ThirdParty/tracy/client/TracySysTime.cpp index e5903467d..b690a9114 100644 --- a/Source/ThirdParty/tracy/client/TracySysTime.cpp +++ b/Source/ThirdParty/tracy/client/TracySysTime.cpp @@ -2,7 +2,7 @@ #ifdef TRACY_HAS_SYSTIME -# if defined _WIN32 || defined __CYGWIN__ +# if defined _WIN32 # include # elif defined __linux__ # include @@ -18,7 +18,7 @@ namespace tracy { -# if defined _WIN32 || defined __CYGWIN__ +# if defined _WIN32 static inline uint64_t ConvertTime( const FILETIME& t ) { @@ -62,7 +62,7 @@ void SysTime::ReadTimes() { host_cpu_load_info_data_t info; mach_msg_type_number_t cnt = HOST_CPU_LOAD_INFO_COUNT; - host_statistics( mach_host_self(), HOST_CPU_LOAD_INFO, reinterpret_cast( &info ), &cnt ); + host_statistics( mach_host_self(), HOST_CPU_LOAD_INFO, reinterpret_cast( &info ), &cnt ); used = info.cpu_ticks[CPU_STATE_USER] + info.cpu_ticks[CPU_STATE_NICE] + info.cpu_ticks[CPU_STATE_SYSTEM]; idle = info.cpu_ticks[CPU_STATE_IDLE]; } @@ -95,7 +95,7 @@ float SysTime::Get() const auto diffIdle = idle - oldIdle; const auto diffUsed = used - oldUsed; -#if defined _WIN32 || defined __CYGWIN__ +#if defined _WIN32 return diffUsed == 0 ? 
-1 : ( diffUsed - diffIdle ) * 100.f / diffUsed; #elif defined __linux__ || defined __APPLE__ || defined BSD const auto total = diffUsed + diffIdle; diff --git a/Source/ThirdParty/tracy/client/TracySysTime.hpp b/Source/ThirdParty/tracy/client/TracySysTime.hpp index fc6ba321a..cb5ebe736 100644 --- a/Source/ThirdParty/tracy/client/TracySysTime.hpp +++ b/Source/ThirdParty/tracy/client/TracySysTime.hpp @@ -1,7 +1,7 @@ #ifndef __TRACYSYSTIME_HPP__ #define __TRACYSYSTIME_HPP__ -#if defined _WIN32 || defined __CYGWIN__ || defined __linux__ || defined __APPLE__ +#if defined _WIN32 || defined __linux__ || defined __APPLE__ # define TRACY_HAS_SYSTIME #else # include diff --git a/Source/ThirdParty/tracy/client/TracySysTrace.cpp b/Source/ThirdParty/tracy/client/TracySysTrace.cpp index 972779770..23b1020a5 100644 --- a/Source/ThirdParty/tracy/client/TracySysTrace.cpp +++ b/Source/ThirdParty/tracy/client/TracySysTrace.cpp @@ -1,8 +1,38 @@ +#include "TracyDebug.hpp" +#include "TracyStringHelpers.hpp" #include "TracySysTrace.hpp" +#include "../common/TracySystem.hpp" #ifdef TRACY_HAS_SYSTEM_TRACING -# if defined _WIN32 || defined __CYGWIN__ +#ifndef TRACY_SAMPLING_HZ +# if defined _WIN32 +# define TRACY_SAMPLING_HZ 8000 +# elif defined __linux__ +# define TRACY_SAMPLING_HZ 10000 +# endif +#endif + +namespace tracy +{ + +static constexpr int GetSamplingFrequency() +{ +#if defined _WIN32 + return TRACY_SAMPLING_HZ > 8000 ? 8000 : ( TRACY_SAMPLING_HZ < 1 ? 1 : TRACY_SAMPLING_HZ ); +#else + return TRACY_SAMPLING_HZ > 1000000 ? 1000000 : ( TRACY_SAMPLING_HZ < 1 ? 1 : TRACY_SAMPLING_HZ ); +#endif +} + +static constexpr int GetSamplingPeriod() +{ + return 1000000000 / GetSamplingFrequency(); +} + +} + +# if defined _WIN32 # ifndef NOMINMAX # define NOMINMAX @@ -28,6 +58,7 @@ namespace tracy static const GUID PerfInfoGuid = { 0xce1dbfb4, 0x137e, 0x4da6, { 0x87, 0xb0, 0x3f, 0x59, 0xaa, 0x10, 0x2c, 0xbc } }; static const GUID DxgKrnlGuid = { 0x802ec45a, 0x1e99, 0x4b83, { 0x99, 0x20, 0x87, 0xc9, 0x82, 0x77, 0xba, 0x9d } }; +static const GUID ThreadV2Guid = { 0x3d6fa8d1, 0xfe05, 0x11d0, { 0x9d, 0xda, 0x00, 0xc0, 0x4f, 0xd7, 0xba, 0x7c } }; static TRACEHANDLE s_traceHandle; @@ -100,14 +131,6 @@ struct VSyncInfo uint64_t flipFenceId; }; -#ifdef __CYGWIN__ -extern "C" typedef DWORD (WINAPI *t_GetProcessIdOfThread)( HANDLE ); -extern "C" typedef DWORD (WINAPI *t_GetProcessImageFileNameA)( HANDLE, LPSTR, DWORD ); -extern "C" ULONG WMIAPI TraceSetInformation(TRACEHANDLE SessionHandle, TRACE_INFO_CLASS InformationClass, PVOID TraceInformation, ULONG InformationLength); -t_GetProcessIdOfThread GetProcessIdOfThread = (t_GetProcessIdOfThread)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "GetProcessIdOfThread" ); -t_GetProcessImageFileNameA GetProcessImageFileNameA = (t_GetProcessImageFileNameA)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "K32GetProcessImageFileNameA" ); -#endif - extern "C" typedef NTSTATUS (WINAPI *t_NtQueryInformationThread)( HANDLE, THREADINFOCLASS, PVOID, ULONG, PULONG ); extern "C" typedef BOOL (WINAPI *t_EnumProcessModules)( HANDLE, HMODULE*, DWORD, LPDWORD ); extern "C" typedef BOOL (WINAPI *t_GetModuleInformation)( HANDLE, HMODULE, LPMODULEINFO, DWORD ); @@ -138,10 +161,8 @@ void WINAPI EventRecordCallback( PEVENT_RECORD record ) TracyLfqPrepare( QueueType::ContextSwitch ); MemWrite( &item->contextSwitch.time, hdr.TimeStamp.QuadPart ); - memcpy( &item->contextSwitch.oldThread, &cswitch->oldThreadId, sizeof( cswitch->oldThreadId ) ); - memcpy( &item->contextSwitch.newThread, 
&cswitch->newThreadId, sizeof( cswitch->newThreadId ) ); - memset( ((char*)&item->contextSwitch.oldThread)+4, 0, 4 ); - memset( ((char*)&item->contextSwitch.newThread)+4, 0, 4 ); + MemWrite( &item->contextSwitch.oldThread, cswitch->oldThreadId ); + MemWrite( &item->contextSwitch.newThread, cswitch->newThreadId ); MemWrite( &item->contextSwitch.cpu, record->BufferContext.ProcessorNumber ); MemWrite( &item->contextSwitch.reason, cswitch->oldThreadWaitReason ); MemWrite( &item->contextSwitch.state, cswitch->oldThreadState ); @@ -153,8 +174,7 @@ void WINAPI EventRecordCallback( PEVENT_RECORD record ) TracyLfqPrepare( QueueType::ThreadWakeup ); MemWrite( &item->threadWakeup.time, hdr.TimeStamp.QuadPart ); - memcpy( &item->threadWakeup.thread, &rt->threadId, sizeof( rt->threadId ) ); - memset( ((char*)&item->threadWakeup.thread)+4, 0, 4 ); + MemWrite( &item->threadWakeup.thread, rt->threadId ); TracyLfqCommit; } else if( hdr.EventDescriptor.Opcode == 1 || hdr.EventDescriptor.Opcode == 3 ) @@ -174,7 +194,7 @@ void WINAPI EventRecordCallback( PEVENT_RECORD record ) if( hdr.EventDescriptor.Opcode == 32 ) { const auto sw = (const StackWalkEvent*)record->UserData; - if( sw->stackProcess == s_pid && ( sw->stack[0] & 0x8000000000000000 ) == 0 ) + if( sw->stackProcess == s_pid ) { const uint64_t sz = ( record->UserDataLength - 16 ) / 8; if( sz > 0 ) @@ -184,7 +204,7 @@ void WINAPI EventRecordCallback( PEVENT_RECORD record ) memcpy( trace+1, sw->stack, sizeof( uint64_t ) * sz ); TracyLfqPrepare( QueueType::CallstackSample ); MemWrite( &item->callstackSampleFat.time, sw->eventTimeStamp ); - MemWrite( &item->callstackSampleFat.thread, (uint64_t)sw->stackThread ); + MemWrite( &item->callstackSampleFat.thread, sw->stackThread ); MemWrite( &item->callstackSampleFat.ptr, (uint64_t)trace ); TracyLfqCommit; } @@ -196,20 +216,6 @@ void WINAPI EventRecordCallback( PEVENT_RECORD record ) } } -static constexpr const char* VsyncName[] = { - "[0] Vsync", - "[1] Vsync", - "[2] Vsync", - "[3] Vsync", - "[4] Vsync", - "[5] Vsync", - "[6] Vsync", - "[7] Vsync", - "Vsync" -}; - -static uint32_t VsyncTarget[8] = {}; - void WINAPI EventRecordCallbackVsync( PEVENT_RECORD record ) { #ifdef TRACY_ON_DEMAND @@ -222,30 +228,15 @@ void WINAPI EventRecordCallbackVsync( PEVENT_RECORD record ) const auto vs = (const VSyncInfo*)record->UserData; - int idx = 0; - do - { - if( VsyncTarget[idx] == 0 ) - { - VsyncTarget[idx] = vs->vidPnTargetId; - break; - } - else if( VsyncTarget[idx] == vs->vidPnTargetId ) - { - break; - } - } - while( ++idx < 8 ); - - TracyLfqPrepare( QueueType::FrameMarkMsg ); - MemWrite( &item->frameMark.time, hdr.TimeStamp.QuadPart ); - MemWrite( &item->frameMark.name, uint64_t( VsyncName[idx] ) ); + TracyLfqPrepare( QueueType::FrameVsync ); + MemWrite( &item->frameVsync.time, hdr.TimeStamp.QuadPart ); + MemWrite( &item->frameVsync.id, vs->vidPnTargetId ); TracyLfqCommit; } static void SetupVsync() { -#if _WIN32_WINNT >= _WIN32_WINNT_WINBLUE +#if _WIN32_WINNT >= _WIN32_WINNT_WINBLUE && !defined(__MINGW32__) const auto psz = sizeof( EVENT_TRACE_PROPERTIES ) + MAX_PATH; s_propVsync = (EVENT_TRACE_PROPERTIES*)tracy_malloc( psz ); memset( s_propVsync, 0, sizeof( EVENT_TRACE_PROPERTIES ) ); @@ -330,6 +321,11 @@ static void SetupVsync() #endif } +static constexpr int GetSamplingInterval() +{ + return GetSamplingPeriod() / 100; +} + bool SysTraceStart( int64_t& samplingPeriod ) { if( !_GetThreadDescription ) _GetThreadDescription = (t_GetThreadDescription)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), 
"GetThreadDescription" ); @@ -360,10 +356,10 @@ bool SysTraceStart( int64_t& samplingPeriod ) if( isOs64Bit ) { TRACE_PROFILE_INTERVAL interval = {}; - interval.Interval = 1250; // 8 kHz + interval.Interval = GetSamplingInterval(); const auto intervalStatus = TraceSetInformation( 0, TraceSampledProfileIntervalInfo, &interval, sizeof( interval ) ); if( intervalStatus != ERROR_SUCCESS ) return false; - samplingPeriod = 125*1000; + samplingPeriod = GetSamplingPeriod(); } const auto psz = sizeof( EVENT_TRACE_PROPERTIES ) + sizeof( KERNEL_LOGGER_NAME ); @@ -415,9 +411,11 @@ bool SysTraceStart( int64_t& samplingPeriod ) if( isOs64Bit ) { - CLASSIC_EVENT_ID stackId; - stackId.EventGuid = PerfInfoGuid; - stackId.Type = 46; + CLASSIC_EVENT_ID stackId[2] = {}; + stackId[0].EventGuid = PerfInfoGuid; + stackId[0].Type = 46; + stackId[1].EventGuid = ThreadV2Guid; + stackId[1].Type = 36; const auto stackStatus = TraceSetInformation( s_traceHandle, TraceStackTracingInfo, &stackId, sizeof( stackId ) ); if( stackStatus != ERROR_SUCCESS ) { @@ -476,7 +474,7 @@ void SysTraceWorker( void* ptr ) tracy_free( s_prop ); } -void SysTraceSendExternalName( uint64_t thread ) +void SysTraceGetExternalName( uint64_t thread, const char*& threadName, const char*& name ) { bool threadSent = false; auto hnd = OpenThread( THREAD_QUERY_INFORMATION, FALSE, DWORD( thread ) ); @@ -486,16 +484,19 @@ void SysTraceSendExternalName( uint64_t thread ) } if( hnd != 0 ) { - PWSTR tmp; - _GetThreadDescription( hnd, &tmp ); - char buf[256]; - if( tmp ) + if( _GetThreadDescription ) { - auto ret = wcstombs( buf, tmp, 256 ); - if( ret != 0 ) + PWSTR tmp; + _GetThreadDescription( hnd, &tmp ); + char buf[256]; + if( tmp ) { - GetProfiler().SendString( thread, buf, ret, QueueType::ExternalThreadName ); - threadSent = true; + auto ret = wcstombs( buf, tmp, 256 ); + if( ret != 0 ) + { + threadName = CopyString( buf, ret ); + threadSent = true; + } } } const auto pid = GetProcessIdOfThread( hnd ); @@ -525,7 +526,7 @@ void SysTraceSendExternalName( uint64_t thread ) const auto modlen = _GetModuleBaseNameA( phnd, modules[i], buf2, 1024 ); if( modlen != 0 ) { - GetProfiler().SendString( thread, buf2, modlen, QueueType::ExternalThreadName ); + threadName = CopyString( buf2, modlen ); threadSent = true; } } @@ -539,7 +540,7 @@ void SysTraceSendExternalName( uint64_t thread ) CloseHandle( hnd ); if( !threadSent ) { - GetProfiler().SendString( thread, "???", 3, QueueType::ExternalThreadName ); + threadName = CopyString( "???", 3 ); threadSent = true; } if( pid != 0 ) @@ -553,7 +554,7 @@ void SysTraceSendExternalName( uint64_t thread ) } if( pid == 4 ) { - GetProfiler().SendString( thread, "System", 6, QueueType::ExternalName ); + name = CopyStringFast( "System", 6 ); return; } else @@ -569,7 +570,7 @@ void SysTraceSendExternalName( uint64_t thread ) auto ptr = buf2 + sz - 1; while( ptr > buf2 && *ptr != '\\' ) ptr--; if( *ptr == '\\' ) ptr++; - GetProfiler().SendString( thread, ptr, QueueType::ExternalName ); + name = CopyStringFast( ptr ); return; } } @@ -579,9 +580,9 @@ void SysTraceSendExternalName( uint64_t thread ) if( !threadSent ) { - GetProfiler().SendString( thread, "???", 3, QueueType::ExternalThreadName ); + threadName = CopyString( "???", 3 ); } - GetProfiler().SendString( thread, "???", 3, QueueType::ExternalName ); + name = CopyStringFast( "???", 3 ); } } @@ -607,281 +608,139 @@ void SysTraceSendExternalName( uint64_t thread ) # include # include +# if defined __i386 || defined __x86_64__ +# include "TracyCpuid.hpp" +# endif + # include 
"TracyProfiler.hpp" # include "TracyRingBuffer.hpp" # include "TracyThread.hpp" -# ifdef __ANDROID__ -# include "TracySysTracePayload.hpp" -# endif - namespace tracy { -static const char BasePath[] = "/sys/kernel/debug/tracing/"; -static const char TracingOn[] = "tracing_on"; -static const char CurrentTracer[] = "current_tracer"; -static const char TraceOptions[] = "trace_options"; -static const char TraceClock[] = "trace_clock"; -static const char SchedSwitch[] = "events/sched/sched_switch/enable"; -static const char SchedWakeup[] = "events/sched/sched_wakeup/enable"; -static const char BufferSizeKb[] = "buffer_size_kb"; -static const char TracePipe[] = "trace_pipe"; - static std::atomic traceActive { false }; -static Thread* s_threadSampling = nullptr; static int s_numCpus = 0; +static int s_numBuffers = 0; +static int s_ctxBufferIdx = 0; -static constexpr size_t RingBufSize = 64*1024; -static RingBuffer* s_ring = nullptr; +static RingBuffer* s_ring = nullptr; + +static const int ThreadHashSize = 4 * 1024; +static uint32_t s_threadHash[ThreadHashSize] = {}; + +static bool CurrentProcOwnsThread( uint32_t tid ) +{ + const auto hash = tid & ( ThreadHashSize-1 ); + const auto hv = s_threadHash[hash]; + if( hv == tid ) return true; + if( hv == -tid ) return false; + + char path[256]; + sprintf( path, "/proc/self/task/%d", tid ); + struct stat st; + if( stat( path, &st ) == 0 ) + { + s_threadHash[hash] = tid; + return true; + } + else + { + s_threadHash[hash] = -tid; + return false; + } +} static int perf_event_open( struct perf_event_attr* hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags ) { return syscall( __NR_perf_event_open, hw_event, pid, cpu, group_fd, flags ); } -static void SetupSampling( int64_t& samplingPeriod ) +enum TraceEventId { -#ifndef CLOCK_MONOTONIC_RAW - return; -#endif + EventCallstack, + EventCpuCycles, + EventInstructionsRetired, + EventCacheReference, + EventCacheMiss, + EventBranchRetired, + EventBranchMiss, + EventVsync, + EventContextSwitch, + EventWakeup, +}; - samplingPeriod = 100*1000; - - s_numCpus = (int)std::thread::hardware_concurrency(); - s_ring = (RingBuffer*)tracy_malloc( sizeof( RingBuffer ) * s_numCpus ); - - perf_event_attr pe = {}; - - pe.type = PERF_TYPE_SOFTWARE; - pe.size = sizeof( perf_event_attr ); - pe.config = PERF_COUNT_SW_CPU_CLOCK; - - pe.sample_freq = 10000; - pe.sample_type = PERF_SAMPLE_TID | PERF_SAMPLE_TIME | PERF_SAMPLE_CALLCHAIN; -#if LINUX_VERSION_CODE >= KERNEL_VERSION( 4, 8, 0 ) - pe.sample_max_stack = 127; -#endif - pe.exclude_callchain_kernel = 1; - - pe.disabled = 1; - pe.freq = 1; -#if !defined TRACY_HW_TIMER || !( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) - pe.use_clockid = 1; - pe.clockid = CLOCK_MONOTONIC_RAW; -#endif - - for( int i=0; i(); - tracy_free( s_ring ); - return; - } - new( s_ring+i ) RingBuffer( fd ); - } - - s_threadSampling = (Thread*)tracy_malloc( sizeof( Thread ) ); - new(s_threadSampling) Thread( [] (void*) { - ThreadExitHandler threadExitHandler; - SetThreadName( "Tracy Sampling" ); - sched_param sp = { 5 }; - pthread_setschedparam( pthread_self(), SCHED_FIFO, &sp ); - uint32_t currentPid = (uint32_t)getpid(); -#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) - for( int i=0; i(); - tracy_free( s_ring ); - const char* err = "Tracy Profiler: sampling is disabled due to non-native scheduler clock. 
Are you running under a VM?"; - Profiler::MessageAppInfo( err, strlen( err ) ); - return; - } - } -#endif - for( int i=0; i> 63; - const auto m2 = test >> 47; - if( m1 == m2 ) break; - } - while( --cnt > 0 ); - for( uint64_t j=1; j> 63; - const auto m2 = test >> 47; - if( m1 != m2 ) trace[j] = 0; - } - - // skip kernel frames - uint64_t j; - for( j=0; j= 0 ) break; - } - if( j == cnt ) - { - tracy_free( trace ); - } - else - { - if( j > 0 ) - { - cnt -= j; - memmove( trace+1, trace+1+j, sizeof( uint64_t ) * cnt ); - } - memcpy( trace, &cnt, sizeof( uint64_t ) ); - -#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) - t0 = s_ring[i].ConvertTimeToTsc( t0 ); -#endif - - TracyLfqPrepare( QueueType::CallstackSample ); - MemWrite( &item->callstackSampleFat.time, t0 ); - MemWrite( &item->callstackSampleFat.thread, (uint64_t)tid ); - MemWrite( &item->callstackSampleFat.ptr, (uint64_t)trace ); - TracyLfqCommit; - } - } - } - s_ring[i].Advance( hdr.size ); - } - if( !traceActive.load( std::memory_order_relaxed) ) break; - if( !hadData ) - { - std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); - } - } - - for( int i=0; i(); - tracy_free( s_ring ); - }, nullptr ); -} - -#ifdef __ANDROID__ -static bool TraceWrite( const char* path, size_t psz, const char* val, size_t vsz ) -{ - // Explanation for "su root sh -c": there are 2 flavors of "su" in circulation - // on Android. The default Android su has the following syntax to run a command - // as root: - // su root 'command' - // and 'command' is exec'd not passed to a shell, so if shell interpretation is - // wanted, one needs to do: - // su root sh -c 'command' - // Besides that default Android 'su' command, some Android devices use a different - // su with a command-line interface closer to the familiar util-linux su found - // on Linux distributions. Fortunately, both the util-linux su and the one - // in https://github.com/topjohnwu/Magisk seem to be happy with the above - // `su root sh -c 'command'` command line syntax. 
- char tmp[256]; - sprintf( tmp, "su root sh -c 'echo \"%s\" > %s%s'", val, BasePath, path ); - return system( tmp ) == 0; -} -#else -static bool TraceWrite( const char* path, size_t psz, const char* val, size_t vsz ) -{ - char tmp[256]; - memcpy( tmp, BasePath, sizeof( BasePath ) - 1 ); - memcpy( tmp + sizeof( BasePath ) - 1, path, psz ); - - int fd = open( tmp, O_WRONLY ); - if( fd < 0 ) return false; - - for(;;) - { - ssize_t cnt = write( fd, val, vsz ); - if( cnt == (ssize_t)vsz ) + const int fd = perf_event_open( &pe, pid, 0, -1, PERF_FLAG_FD_CLOEXEC ); + if( fd != -1 ) { close( fd ); - return true; + break; } - if( cnt < 0 ) + pe.precise_ip--; + } + pe.config = config0; + while( pe.precise_ip != 0 ) + { + const int fd = perf_event_open( &pe, pid, 0, -1, PERF_FLAG_FD_CLOEXEC ); + if( fd != -1 ) { close( fd ); - return false; + break; } - vsz -= cnt; - val += cnt; + pe.precise_ip--; } + TracyDebug( " Probed precise_ip: %i\n", pe.precise_ip ); } -#endif -#ifdef __ANDROID__ -void SysTraceInjectPayload() +static void ProbePreciseIp( perf_event_attr& pe, pid_t pid ) { - int pipefd[2]; - if( pipe( pipefd ) == 0 ) + pe.precise_ip = 3; + while( pe.precise_ip != 0 ) { - const auto pid = fork(); - if( pid == 0 ) + const int fd = perf_event_open( &pe, pid, 0, -1, PERF_FLAG_FD_CLOEXEC ); + if( fd != -1 ) { - // child - close( pipefd[1] ); - if( dup2( pipefd[0], STDIN_FILENO ) >= 0 ) - { - close( pipefd[0] ); - execlp( "su", "su", "root", "sh", "-c", "cat > /data/tracy_systrace", (char*)nullptr ); - exit( 1 ); - } - } - else if( pid > 0 ) - { - // parent - close( pipefd[0] ); - -#ifdef __aarch64__ - write( pipefd[1], tracy_systrace_aarch64_data, tracy_systrace_aarch64_size ); -#else - write( pipefd[1], tracy_systrace_armv7_data, tracy_systrace_armv7_size ); -#endif - close( pipefd[1] ); - waitpid( pid, nullptr, 0 ); - - system( "su root sh -c 'chmod 700 /data/tracy_systrace'" ); + close( fd ); + break; } + pe.precise_ip--; } + TracyDebug( " Probed precise_ip: %i\n", pe.precise_ip ); } + +static bool IsGenuineIntel() +{ +#if defined __i386 || defined __x86_64__ + uint32_t regs[4] = {}; + __get_cpuid( 0, regs, regs+1, regs+2, regs+3 ); + char manufacturer[12]; + memcpy( manufacturer, regs+1, 4 ); + memcpy( manufacturer+4, regs+3, 4 ); + memcpy( manufacturer+8, regs+2, 4 ); + return memcmp( manufacturer, "GenuineIntel", 12 ) == 0; +#else + return false; #endif +} + +static const char* ReadFile( const char* path ) +{ + int fd = open( path, O_RDONLY ); + if( fd < 0 ) return nullptr; + + static char tmp[64]; + const auto cnt = read( fd, tmp, 63 ); + close( fd ); + if( cnt < 0 ) return nullptr; + tmp[cnt] = '\0'; + return tmp; +} bool SysTraceStart( int64_t& samplingPeriod ) { @@ -889,374 +748,776 @@ bool SysTraceStart( int64_t& samplingPeriod ) return false; #endif - if( !TraceWrite( TracingOn, sizeof( TracingOn ), "0", 2 ) ) return false; - if( !TraceWrite( CurrentTracer, sizeof( CurrentTracer ), "nop", 4 ) ) return false; - TraceWrite( TraceOptions, sizeof( TraceOptions ), "norecord-cmd", 13 ); - TraceWrite( TraceOptions, sizeof( TraceOptions ), "norecord-tgid", 14 ); - TraceWrite( TraceOptions, sizeof( TraceOptions ), "noirq-info", 11 ); - TraceWrite( TraceOptions, sizeof( TraceOptions ), "noannotate", 11 ); -#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) - if( !TraceWrite( TraceClock, sizeof( TraceClock ), "x86-tsc", 8 ) ) return false; + const auto paranoidLevelStr = ReadFile( "/proc/sys/kernel/perf_event_paranoid" ); + if( 
!paranoidLevelStr ) return false; +#ifdef TRACY_VERBOSE + int paranoidLevel = 2; + paranoidLevel = atoi( paranoidLevelStr ); + TracyDebug( "perf_event_paranoid: %i\n", paranoidLevel ); +#endif + + int switchId = -1, wakeupId = -1, vsyncId = -1; + const auto switchIdStr = ReadFile( "/sys/kernel/debug/tracing/events/sched/sched_switch/id" ); + if( switchIdStr ) switchId = atoi( switchIdStr ); + const auto wakeupIdStr = ReadFile( "/sys/kernel/debug/tracing/events/sched/sched_wakeup/id" ); + if( wakeupIdStr ) wakeupId = atoi( wakeupIdStr ); + const auto vsyncIdStr = ReadFile( "/sys/kernel/debug/tracing/events/drm/drm_vblank_event/id" ); + if( vsyncIdStr ) vsyncId = atoi( vsyncIdStr ); + + TracyDebug( "sched_switch id: %i\n", switchId ); + TracyDebug( "sched_wakeup id: %i\n", wakeupId ); + TracyDebug( "drm_vblank_event id: %i\n", vsyncId ); + +#ifdef TRACY_NO_SAMPLE_RETIREMENT + const bool noRetirement = true; #else - if( !TraceWrite( TraceClock, sizeof( TraceClock ), "mono_raw", 9 ) ) return false; -#endif - if( !TraceWrite( SchedSwitch, sizeof( SchedSwitch ), "1", 2 ) ) return false; - if( !TraceWrite( SchedWakeup, sizeof( SchedWakeup ), "1", 2 ) ) return false; - if( !TraceWrite( BufferSizeKb, sizeof( BufferSizeKb ), "4096", 5 ) ) return false; - -#if defined __ANDROID__ && ( defined __aarch64__ || defined __ARM_ARCH ) - SysTraceInjectPayload(); + const char* noRetirementEnv = GetEnvVar( "TRACY_NO_SAMPLE_RETIREMENT" ); + const bool noRetirement = noRetirementEnv && noRetirementEnv[0] == '1'; #endif - if( !TraceWrite( TracingOn, sizeof( TracingOn ), "1", 2 ) ) return false; +#ifdef TRACY_NO_SAMPLE_CACHE + const bool noCache = true; +#else + const char* noCacheEnv = GetEnvVar( "TRACY_NO_SAMPLE_CACHE" ); + const bool noCache = noCacheEnv && noCacheEnv[0] == '1'; +#endif + +#ifdef TRACY_NO_SAMPLE_BRANCH + const bool noBranch = true; +#else + const char* noBranchEnv = GetEnvVar( "TRACY_NO_SAMPLE_BRANCH" ); + const bool noBranch = noBranchEnv && noBranchEnv[0] == '1'; +#endif + +#ifdef TRACY_NO_CONTEXT_SWITCH + const bool noCtxSwitch = true; +#else + const char* noCtxSwitchEnv = GetEnvVar( "TRACY_NO_CONTEXT_SWITCH" ); + const bool noCtxSwitch = noCtxSwitchEnv && noCtxSwitchEnv[0] == '1'; +#endif + +#ifdef TRACY_NO_VSYNC_CAPTURE + const bool noVsync = true; +#else + const char* noVsyncEnv = GetEnvVar( "TRACY_NO_VSYNC_CAPTURE" ); + const bool noVsync = noVsyncEnv && noVsyncEnv[0] == '1'; +#endif + + samplingPeriod = GetSamplingPeriod(); + uint32_t currentPid = (uint32_t)getpid(); + + s_numCpus = (int)std::thread::hardware_concurrency(); + + const auto maxNumBuffers = s_numCpus * ( + 1 + // software sampling + 2 + // CPU cycles + instructions retired + 2 + // cache reference + miss + 2 + // branch retired + miss + 2 + // context switches + wakeups + 1 // vsync + ); + s_ring = (RingBuffer*)tracy_malloc( sizeof( RingBuffer ) * maxNumBuffers ); + s_numBuffers = 0; + + // software sampling + perf_event_attr pe = {}; + pe.type = PERF_TYPE_SOFTWARE; + pe.size = sizeof( perf_event_attr ); + pe.config = PERF_COUNT_SW_CPU_CLOCK; + pe.sample_freq = GetSamplingFrequency(); + pe.sample_type = PERF_SAMPLE_TID | PERF_SAMPLE_TIME | PERF_SAMPLE_CALLCHAIN; +#if LINUX_VERSION_CODE >= KERNEL_VERSION( 4, 8, 0 ) + pe.sample_max_stack = 127; +#endif + pe.disabled = 1; + pe.freq = 1; + pe.inherit = 1; +#if !defined TRACY_HW_TIMER || !( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) + pe.use_clockid = 1; + pe.clockid = CLOCK_MONOTONIC_RAW; +#endif + + TracyDebug( "Setup software sampling\n" ); + 
ProbePreciseIp( pe, currentPid ); + for( int i=0; i= KERNEL_VERSION( 4, 8, 0 ) + pe.sample_max_stack = 127; +#endif + pe.disabled = 1; + pe.inherit = 1; + pe.config = switchId; +#if !defined TRACY_HW_TIMER || !( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) + pe.use_clockid = 1; + pe.clockid = CLOCK_MONOTONIC_RAW; +#endif + + TracyDebug( "Setup context switch capture\n" ); + for( int i=0; i~Thread(); - tracy_free( s_threadSampling ); + const auto test = (int64_t)trace[cnt]; + const auto m1 = test >> 63; + const auto m2 = test >> 47; + if( m1 == m2 ) break; } -} - -static uint64_t ReadNumber( const char*& data ) -{ - auto ptr = data; - assert( *ptr >= '0' && *ptr <= '9' ); - uint64_t val = *ptr++ - '0'; - for(;;) + while( --cnt > 0 ); + for( uint64_t j=1; j 9 ) break; - val = val * 10 + v; - ptr++; + const auto test = (int64_t)trace[j]; + const auto m1 = test >> 63; + const auto m2 = test >> 47; + if( m1 != m2 ) trace[j] = 0; } - data = ptr; - return val; -} - -static uint8_t ReadState( char state ) -{ - switch( state ) - { - case 'D': return 101; - case 'I': return 102; - case 'R': return 103; - case 'S': return 104; - case 'T': return 105; - case 't': return 106; - case 'W': return 107; - case 'X': return 108; - case 'Z': return 109; - default: return 100; - } -} - -#if defined __ANDROID__ && defined __ANDROID_API__ && __ANDROID_API__ < 18 -/*- - * Copyright (c) 2011 The NetBSD Foundation, Inc. - * All rights reserved. - * - * This code is derived from software contributed to The NetBSD Foundation - * by Christos Zoulas. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS - * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -ssize_t getdelim(char **buf, size_t *bufsiz, int delimiter, FILE *fp) -{ - char *ptr, *eptr; - - if (*buf == NULL || *bufsiz == 0) { - *bufsiz = BUFSIZ; - if ((*buf = (char*)malloc(*bufsiz)) == NULL) - return -1; - } - - for (ptr = *buf, eptr = *buf + *bufsiz;;) { - int c = fgetc(fp); - if (c == -1) { - if (feof(fp)) - return ptr == *buf ? 
-1 : ptr - *buf; - else - return -1; - } - *ptr++ = c; - if (c == delimiter) { - *ptr = '\0'; - return ptr - *buf; - } - if (ptr + 2 >= eptr) { - char *nbuf; - size_t nbufsiz = *bufsiz * 2; - ssize_t d = ptr - *buf; - if ((nbuf = (char*)realloc(*buf, nbufsiz)) == NULL) - return -1; - *buf = nbuf; - *bufsiz = nbufsiz; - eptr = nbuf + nbufsiz; - ptr = nbuf + d; - } - } -} - -ssize_t getline(char **buf, size_t *bufsiz, FILE *fp) -{ - return getdelim(buf, bufsiz, '\n', fp); -} #endif -static void HandleTraceLine( const char* line ) -{ - line += 23; - while( *line != '[' ) line++; - line++; - const auto cpu = (uint8_t)ReadNumber( line ); - line++; // ']' - while( *line == ' ' ) line++; - -#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) - const auto time = ReadNumber( line ); -#else - const auto ts = ReadNumber( line ); - line++; // '.' - const auto tus = ReadNumber( line ); - const auto time = ts * 1000000000ll + tus * 1000ll; -#endif - - line += 2; // ': ' - if( memcmp( line, "sched_switch", 12 ) == 0 ) + for( uint64_t j=1; j<=cnt; j++ ) { - line += 14; - - while( memcmp( line, "prev_pid", 8 ) != 0 ) line++; - line += 9; - - const auto oldPid = ReadNumber( line ); - line++; - - while( memcmp( line, "prev_state", 10 ) != 0 ) line++; - line += 11; - - const auto oldState = (uint8_t)ReadState( *line ); - line += 5; - - while( memcmp( line, "next_pid", 8 ) != 0 ) line++; - line += 9; - - const auto newPid = ReadNumber( line ); - - uint8_t reason = 100; - - TracyLfqPrepare( QueueType::ContextSwitch ); - MemWrite( &item->contextSwitch.time, time ); - MemWrite( &item->contextSwitch.oldThread, oldPid ); - MemWrite( &item->contextSwitch.newThread, newPid ); - MemWrite( &item->contextSwitch.cpu, cpu ); - MemWrite( &item->contextSwitch.reason, reason ); - MemWrite( &item->contextSwitch.state, oldState ); - TracyLfqCommit; + if( trace[j] >= (uint64_t)-4095 ) // PERF_CONTEXT_MAX + { + memmove( trace+j, trace+j+1, sizeof( uint64_t ) * ( cnt - j ) ); + cnt--; + } } - else if( memcmp( line, "sched_wakeup", 12 ) == 0 ) - { - line += 14; - while( memcmp( line, "pid=", 4 ) != 0 ) line++; - line += 4; - - const auto pid = ReadNumber( line ); - - TracyLfqPrepare( QueueType::ThreadWakeup ); - MemWrite( &item->threadWakeup.time, time ); - MemWrite( &item->threadWakeup.thread, pid ); - TracyLfqCommit; - } + memcpy( trace, &cnt, sizeof( uint64_t ) ); + return trace; } -#ifdef __ANDROID__ -static void ProcessTraceLines( int fd ) +void SysTraceWorker( void* ptr ) { - // Linux pipe buffer is 64KB, additional 1KB is for unfinished lines - char* buf = (char*)tracy_malloc( (64+1)*1024 ); - char* line = buf; - + ThreadExitHandler threadExitHandler; + SetThreadName( "Tracy Sampling" ); + InitRpmalloc(); + sched_param sp = { 99 }; + if( pthread_setschedparam( pthread_self(), SCHED_FIFO, &sp ) != 0 ) TracyDebug( "Failed to increase SysTraceWorker thread priority!\n" ); + auto ctxBufferIdx = s_ctxBufferIdx; + auto ringArray = s_ring; + auto numBuffers = s_numBuffers; + for( int i=0; i buf && *line != '\n' ) line--; - if( line > buf ) + auto& ring = ringArray[i]; + const auto head = ring.LoadHead(); + const auto tail = ring.GetTail(); + if( head != tail ) { - line++; - const auto lsz = end - line; - memmove( buf, line, lsz ); - line = buf + lsz; + const auto end = head - tail; + ring.Advance( end ); } } + if( !traceActive.load( std::memory_order_relaxed ) ) break; + std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); continue; } #endif - const auto end = line 
+ rd; - line = buf; - for(;;) - { - auto next = (char*)memchr( line, '\n', end - line ); - if( !next ) - { - const auto lsz = end - line; - memmove( buf, line, lsz ); - line = buf + lsz; - break; - } - HandleTraceLine( line ); - line = ++next; - } - if( rd < 64*1024 ) - { - std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); - } - } - - tracy_free( buf ); -} - -void SysTraceWorker( void* ptr ) -{ - ThreadExitHandler threadExitHandler; - SetThreadName( "Tracy SysTrace" ); - int pipefd[2]; - if( pipe( pipefd ) == 0 ) - { - const auto pid = fork(); - if( pid == 0 ) - { - // child - close( pipefd[0] ); - dup2( open( "/dev/null", O_WRONLY ), STDERR_FILENO ); - if( dup2( pipefd[1], STDOUT_FILENO ) >= 0 ) - { - close( pipefd[1] ); - sched_param sp = { 4 }; - pthread_setschedparam( pthread_self(), SCHED_FIFO, &sp ); -#if defined __ANDROID__ && ( defined __aarch64__ || defined __ARM_ARCH ) - execlp( "su", "su", "root", "sh", "-c", "/data/tracy_systrace", (char*)nullptr ); -#endif - execlp( "su", "su", "root", "sh", "-c", "cat /sys/kernel/debug/tracing/trace_pipe", (char*)nullptr ); - exit( 1 ); - } - } - else if( pid > 0 ) - { - // parent - close( pipefd[1] ); - sched_param sp = { 5 }; - pthread_setschedparam( pthread_self(), SCHED_FIFO, &sp ); - ProcessTraceLines( pipefd[0] ); - close( pipefd[0] ); - waitpid( pid, nullptr, 0 ); - } - } -} -#else -static void ProcessTraceLines( int fd ) -{ - char* buf = (char*)tracy_malloc( 64*1024 ); - - struct pollfd pfd; - pfd.fd = fd; - pfd.events = POLLIN | POLLERR; - - for(;;) - { - while( poll( &pfd, 1, 0 ) <= 0 ) + bool hadData = false; + for( int i=0; i tail ); + hadData = true; + + const auto id = ring.GetId(); + assert( id != EventContextSwitch ); + const auto end = head - tail; + uint64_t pos = 0; + if( id == EventCallstack ) + { + while( pos < end ) + { + perf_event_header hdr; + ring.Read( &hdr, pos, sizeof( perf_event_header ) ); + if( hdr.type == PERF_RECORD_SAMPLE ) + { + auto offset = pos + sizeof( perf_event_header ); + + // Layout: + // u32 pid, tid + // u64 time + // u64 cnt + // u64 ip[cnt] + + uint32_t tid; + uint64_t t0; + uint64_t cnt; + + offset += sizeof( uint32_t ); + ring.Read( &tid, offset, sizeof( uint32_t ) ); + offset += sizeof( uint32_t ); + ring.Read( &t0, offset, sizeof( uint64_t ) ); + offset += sizeof( uint64_t ); + ring.Read( &cnt, offset, sizeof( uint64_t ) ); + offset += sizeof( uint64_t ); + + if( cnt > 0 ) + { +#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) + t0 = ring.ConvertTimeToTsc( t0 ); +#endif + auto trace = GetCallstackBlock( cnt, ring, offset ); + + TracyLfqPrepare( QueueType::CallstackSample ); + MemWrite( &item->callstackSampleFat.time, t0 ); + MemWrite( &item->callstackSampleFat.thread, tid ); + MemWrite( &item->callstackSampleFat.ptr, (uint64_t)trace ); + TracyLfqCommit; + } + } + pos += hdr.size; + } + } + else + { + while( pos < end ) + { + perf_event_header hdr; + ring.Read( &hdr, pos, sizeof( perf_event_header ) ); + if( hdr.type == PERF_RECORD_SAMPLE ) + { + auto offset = pos + sizeof( perf_event_header ); + + // Layout: + // u64 ip + // u64 time + + uint64_t ip, t0; + ring.Read( &ip, offset, sizeof( uint64_t ) ); + offset += sizeof( uint64_t ); + ring.Read( &t0, offset, sizeof( uint64_t ) ); + +#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) + t0 = ring.ConvertTimeToTsc( t0 ); +#endif + QueueType type; + switch( id ) + { + case EventCpuCycles: + type = 
QueueType::HwSampleCpuCycle; + break; + case EventInstructionsRetired: + type = QueueType::HwSampleInstructionRetired; + break; + case EventCacheReference: + type = QueueType::HwSampleCacheReference; + break; + case EventCacheMiss: + type = QueueType::HwSampleCacheMiss; + break; + case EventBranchRetired: + type = QueueType::HwSampleBranchRetired; + break; + case EventBranchMiss: + type = QueueType::HwSampleBranchMiss; + break; + default: + assert( false ); + break; + } + + TracyLfqPrepare( type ); + MemWrite( &item->hwSample.ip, ip ); + MemWrite( &item->hwSample.time, t0 ); + TracyLfqCommit; + } + pos += hdr.size; + } + } + assert( pos == end ); + ring.Advance( end ); } + if( !traceActive.load( std::memory_order_relaxed ) ) break; - const auto rd = read( fd, buf, 64*1024 ); - if( rd <= 0 ) break; + if( ctxBufferIdx != numBuffers ) + { + const auto ctxBufNum = numBuffers - ctxBufferIdx; -#ifdef TRACY_ON_DEMAND - if( !GetProfiler().IsConnected() ) continue; + int activeNum = 0; + uint16_t active[512]; + uint32_t end[512]; + uint32_t pos[512]; + for( int i=0; i 0 ) + { + hadData = true; + while( activeNum > 0 ) + { + int sel = -1; + int selPos; + int64_t t0 = std::numeric_limits::max(); + for( int i=0; i= 0 ) + { + auto& ring = ringArray[ctxBufferIdx + sel]; + auto rbPos = pos[sel]; + auto offset = rbPos; + perf_event_header hdr; + ring.Read( &hdr, offset, sizeof( perf_event_header ) ); + +#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) + t0 = ring.ConvertTimeToTsc( t0 ); #endif - auto line = buf; - const auto end = buf + rd; - for(;;) + const auto rid = ring.GetId(); + if( rid == EventContextSwitch ) + { + // Layout: + // u64 time + // u64 cnt + // u64 ip[cnt] + // u32 size + // u8 data[size] + // Data (not ABI stable, but has not changed since it was added, in 2009): + // u8 hdr[8] + // u8 prev_comm[16] + // u32 prev_pid + // u32 prev_prio + // lng prev_state + // u8 next_comm[16] + // u32 next_pid + // u32 next_prio + + offset += sizeof( perf_event_header ) + sizeof( uint64_t ); + + uint64_t cnt; + ring.Read( &cnt, offset, sizeof( uint64_t ) ); + offset += sizeof( uint64_t ); + const auto traceOffset = offset; + offset += sizeof( uint64_t ) * cnt + sizeof( uint32_t ) + 8 + 16; + + uint32_t prev_pid, next_pid; + long prev_state; + + ring.Read( &prev_pid, offset, sizeof( uint32_t ) ); + offset += sizeof( uint32_t ) + sizeof( uint32_t ); + ring.Read( &prev_state, offset, sizeof( long ) ); + offset += sizeof( long ) + 16; + ring.Read( &next_pid, offset, sizeof( uint32_t ) ); + + uint8_t reason = 100; + uint8_t state; + + if( prev_state & 0x0001 ) state = 104; + else if( prev_state & 0x0002 ) state = 101; + else if( prev_state & 0x0004 ) state = 105; + else if( prev_state & 0x0008 ) state = 106; + else if( prev_state & 0x0010 ) state = 108; + else if( prev_state & 0x0020 ) state = 109; + else if( prev_state & 0x0040 ) state = 110; + else if( prev_state & 0x0080 ) state = 102; + else state = 103; + + TracyLfqPrepare( QueueType::ContextSwitch ); + MemWrite( &item->contextSwitch.time, t0 ); + MemWrite( &item->contextSwitch.oldThread, prev_pid ); + MemWrite( &item->contextSwitch.newThread, next_pid ); + MemWrite( &item->contextSwitch.cpu, uint8_t( ring.GetCpu() ) ); + MemWrite( &item->contextSwitch.reason, reason ); + MemWrite( &item->contextSwitch.state, state ); + TracyLfqCommit; + + if( cnt > 0 && prev_pid != 0 && CurrentProcOwnsThread( prev_pid ) ) + { + auto trace = GetCallstackBlock( cnt, ring, traceOffset ); + + TracyLfqPrepare( 
QueueType::CallstackSampleContextSwitch ); + MemWrite( &item->callstackSampleFat.time, t0 ); + MemWrite( &item->callstackSampleFat.thread, prev_pid ); + MemWrite( &item->callstackSampleFat.ptr, (uint64_t)trace ); + TracyLfqCommit; + } + } + else if( rid == EventWakeup ) + { + // Layout: + // u64 time + // u32 size + // u8 data[size] + // Data: + // u8 hdr[8] + // u8 comm[16] + // u32 pid + // u32 prio + // u64 target_cpu + + offset += sizeof( perf_event_header ) + sizeof( uint64_t ) + sizeof( uint32_t ) + 8 + 16; + + uint32_t pid; + ring.Read( &pid, offset, sizeof( uint32_t ) ); + + TracyLfqPrepare( QueueType::ThreadWakeup ); + MemWrite( &item->threadWakeup.time, t0 ); + MemWrite( &item->threadWakeup.thread, pid ); + TracyLfqCommit; + } + else + { + assert( rid == EventVsync ); + // Layout: + // u64 time + // u32 size + // u8 data[size] + // Data (not ABI stable): + // u8 hdr[8] + // i32 crtc + // u32 seq + // i64 ktime + // u8 high precision + + offset += sizeof( perf_event_header ) + sizeof( uint64_t ) + sizeof( uint32_t ) + 8; + + int32_t crtc; + ring.Read( &crtc, offset, sizeof( int32_t ) ); + + // Note: The timestamp value t0 might be off by a number of microseconds from the + // true hardware vblank event. The ktime value should be used instead, but it is + // measured in CLOCK_MONOTONIC time. Tracy only supports the timestamp counter + // register (TSC) or CLOCK_MONOTONIC_RAW clock. +#if 0 + offset += sizeof( uint32_t ) * 2; + int64_t ktime; + ring.Read( &ktime, offset, sizeof( int64_t ) ); +#endif + + TracyLfqPrepare( QueueType::FrameVsync ); + MemWrite( &item->frameVsync.id, crtc ); + MemWrite( &item->frameVsync.time, t0 ); + TracyLfqCommit; + } + + rbPos += hdr.size; + if( rbPos == end[sel] ) + { + memmove( active+selPos, active+selPos+1, sizeof(*active) * ( activeNum - selPos - 1 ) ); + activeNum--; + } + else + { + pos[sel] = rbPos; + } + } + } + for( int i=0; i 0 && buf[sz-1] == '\n' ) buf[sz-1] = '\0'; - GetProfiler().SendString( thread, buf, QueueType::ExternalThreadName ); + threadName = CopyString( buf ); fclose( f ); } else { - GetProfiler().SendString( thread, "???", 3, QueueType::ExternalThreadName ); + threadName = CopyString( "???", 3 ); } sprintf( fn, "/proc/%" PRIu64 "/status", thread ); f = fopen( fn, "rb" ); if( f ) { + char* tmp = (char*)tracy_malloc_fast( 8*1024 ); + const auto fsz = (ptrdiff_t)fread( tmp, 1, 8*1024, f ); + fclose( f ); + int pid = -1; - size_t lsz = 1024; - auto line = (char*)tracy_malloc( lsz ); + auto line = tmp; for(;;) { - auto rd = getline( &line, &lsz, f ); - if( rd <= 0 ) break; if( memcmp( "Tgid:\t", line, 6 ) == 0 ) { pid = atoi( line + 6 ); break; } + while( line - tmp < fsz && *line != '\n' ) line++; + if( *line != '\n' ) break; + line++; } - tracy_free( line ); - fclose( f ); + tracy_free_fast( tmp ); + if( pid >= 0 ) { { @@ -1310,13 +1575,13 @@ void SysTraceSendExternalName( uint64_t thread ) char buf[256]; const auto sz = fread( buf, 1, 256, f ); if( sz > 0 && buf[sz-1] == '\n' ) buf[sz-1] = '\0'; - GetProfiler().SendString( thread, buf, QueueType::ExternalName ); + name = CopyStringFast( buf ); fclose( f ); return; } } } - GetProfiler().SendString( thread, "???", 3, QueueType::ExternalName ); + name = CopyStringFast( "???", 3 ); } } diff --git a/Source/ThirdParty/tracy/client/TracySysTrace.hpp b/Source/ThirdParty/tracy/client/TracySysTrace.hpp index 688cbf2ae..8c663cd7a 100644 --- a/Source/ThirdParty/tracy/client/TracySysTrace.hpp +++ b/Source/ThirdParty/tracy/client/TracySysTrace.hpp @@ -1,8 +1,11 @@ #ifndef 
__TRACYSYSTRACE_HPP__ #define __TRACYSYSTRACE_HPP__ -#if !defined TRACY_NO_SYSTEM_TRACING && ( defined _WIN32 || defined __CYGWIN__ || defined __linux__ ) -# define TRACY_HAS_SYSTEM_TRACING +#if !defined TRACY_NO_SYSTEM_TRACING && ( defined _WIN32 || defined __linux__ ) +# include "../common/TracyUwp.hpp" +# ifndef TRACY_UWP +# define TRACY_HAS_SYSTEM_TRACING +# endif #endif #ifdef TRACY_HAS_SYSTEM_TRACING @@ -16,7 +19,7 @@ bool SysTraceStart( int64_t& samplingPeriod ); void SysTraceStop(); void SysTraceWorker( void* ptr ); -void SysTraceSendExternalName( uint64_t thread ); +void SysTraceGetExternalName( uint64_t thread, const char*& threadName, const char*& name ); } diff --git a/Source/ThirdParty/tracy/client/TracySysTracePayload.hpp b/Source/ThirdParty/tracy/client/TracySysTracePayload.hpp deleted file mode 100644 index 7c292f9d0..000000000 --- a/Source/ThirdParty/tracy/client/TracySysTracePayload.hpp +++ /dev/null @@ -1,78 +0,0 @@ -// File: 'extra/systrace/tracy_systrace.armv7' (1149 bytes) -// File: 'extra/systrace/tracy_systrace.aarch64' (1650 bytes) - -// Exported using binary_to_compressed_c.cpp - -namespace tracy -{ - -static const unsigned int tracy_systrace_armv7_size = 1149; -static const unsigned int tracy_systrace_armv7_data[1152/4] = -{ - 0x464c457f, 0x00010101, 0x00000000, 0x00000000, 0x00280003, 0x00000001, 0x000001f0, 0x00000034, 0x00000000, 0x05000200, 0x00200034, 0x00280007, - 0x00000000, 0x00000006, 0x00000034, 0x00000034, 0x00000034, 0x000000e0, 0x000000e0, 0x00000004, 0x00000004, 0x00000003, 0x00000114, 0x00000114, - 0x00000114, 0x00000013, 0x00000013, 0x00000004, 0x00000001, 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x000003fd, 0x000003fd, 0x00000005, - 0x00001000, 0x00000001, 0x000003fd, 0x000013fd, 0x000013fd, 0x00000080, 0x000000b3, 0x00000006, 0x00001000, 0x00000002, 0x00000400, 0x00001400, - 0x00001400, 0x0000007d, 0x000000b0, 0x00000006, 0x00000004, 0x6474e551, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000006, - 0x00000004, 0x70000001, 0x000003a4, 0x000003a4, 0x000003a4, 0x00000008, 0x00000008, 0x00000004, 0x00000004, 0x7379732f, 0x2f6d6574, 0x2f6e6962, - 0x6b6e696c, 0x00007265, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000001, 0x00000000, 0x00000000, 0x00000012, 0x00000016, 0x00000000, - 0x00000000, 0x00000012, 0x6f6c6400, 0x006e6570, 0x4342494c, 0x62696c00, 0x732e6c64, 0x6c64006f, 0x006d7973, 0x00000001, 0x00000003, 0x00000001, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00010001, 0x0000000d, 0x00000010, 0x00000000, 0x00050d63, 0x00020000, 0x00000008, - 0x00000000, 0x000014bc, 0x00000116, 0x000014c0, 0x00000216, 0xe52de004, 0xe59fe004, 0xe08fe00e, 0xe5bef008, 0x000012dc, 0xe28fc600, 0xe28cca01, - 0xe5bcf2dc, 0xe28fc600, 0xe28cca01, 0xe5bcf2d4, 0xe92d4ff0, 0xe28db01c, 0xe24dd024, 0xe24dd801, 0xe59f017c, 0xe3a01001, 0xe3a08001, 0xe08f0000, - 0xebfffff0, 0xe59f116c, 0xe1a04000, 0xe08f1001, 0xebffffef, 0xe59f1160, 0xe1a06000, 0xe1a00004, 0xe08f1001, 0xebffffea, 0xe59f1150, 0xe1a07000, - 0xe1a00004, 0xe08f1001, 0xebffffe5, 0xe59f1140, 0xe1a05000, 0xe1a00004, 0xe08f1001, 0xebffffe0, 0xe58d0004, 0xe1a00004, 0xe59f1128, 0xe08f1001, - 0xebffffdb, 0xe59f1120, 0xe1a0a000, 0xe1a00004, 0xe08f1001, 0xebffffd6, 0xe1a04000, 0xe59f010c, 0xe3a01000, 0xe3a09000, 0xe08f0000, 0xe12fff36, - 0xe1a06000, 0xe3700001, 0xca000001, 0xe3a00000, 0xe12fff37, 0xe3a00009, 0xe3a01001, 0xe1cd01bc, 0xe3a00008, 0xe1cd01b4, 0xe3090680, 0xe3400098, - 0xe3a02000, 0xe58d000c, 0xe28d0010, 0xe58d7000, 0xe58d6018, 0xe58d8010, 0xe58d9008, 
0xe12fff35, 0xe3500000, 0xca00001d, 0xe28d7018, 0xe28d8010, - 0xe28d9020, 0xe1a00007, 0xe3a01001, 0xe3a02000, 0xe12fff35, 0xe3500000, 0xda00000a, 0xe1a00006, 0xe1a01009, 0xe3a02801, 0xe12fff3a, 0xe3500001, - 0xba00000e, 0xe1a02000, 0xe3a00001, 0xe1a01009, 0xe12fff34, 0xea000003, 0xe59d2004, 0xe28d0008, 0xe3a01000, 0xe12fff32, 0xe1a00008, 0xe3a01001, - 0xe3a02000, 0xe12fff35, 0xe3500001, 0xbaffffe4, 0xe59d1000, 0xe3a00000, 0xe12fff31, 0xe24bd01c, 0xe8bd8ff0, 0x00000198, 0x00000190, 0x00000181, - 0x00000172, 0x00000163, 0x00000159, 0x0000014a, 0x00000138, 0x7ffffe4c, 0x00000001, 0x6362696c, 0x006f732e, 0x6e65706f, 0x69786500, 0x6f700074, - 0x6e006c6c, 0x736f6e61, 0x7065656c, 0x61657200, 0x72770064, 0x00657469, 0x7379732f, 0x72656b2f, 0x2f6c656e, 0x75626564, 0x72742f67, 0x6e696361, - 0x72742f67, 0x5f656361, 0x65706970, 0x00000000, 0x00000003, 0x000014b0, 0x00000002, 0x00000010, 0x00000017, 0x000001b4, 0x00000014, 0x00000011, - 0x00000015, 0x00000000, 0x00000006, 0x00000128, 0x0000000b, 0x00000010, 0x00000005, 0x00000158, 0x0000000a, 0x0000001c, 0x6ffffef5, 0x00000174, - 0x00000001, 0x0000000d, 0x0000001e, 0x00000008, 0x6ffffffb, 0x00000001, 0x6ffffff0, 0x0000018c, 0x6ffffffe, 0x00000194, 0x6fffffff, 0x00000001, -}; - -static const unsigned int tracy_systrace_aarch64_size = 1650; -static const unsigned int tracy_systrace_aarch64_data[1652/4] = -{ - 0x464c457f, 0x00010102, 0x00000000, 0x00000000, 0x00b70003, 0x00000001, 0x000002e0, 0x00000000, 0x00000040, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00380040, 0x00400006, 0x00000000, 0x00000006, 0x00000005, 0x00000040, 0x00000000, 0x00000040, 0x00000000, 0x00000040, 0x00000000, - 0x00000150, 0x00000000, 0x00000150, 0x00000000, 0x00000008, 0x00000000, 0x00000003, 0x00000004, 0x00000190, 0x00000000, 0x00000190, 0x00000000, - 0x00000190, 0x00000000, 0x00000015, 0x00000000, 0x00000015, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000005, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x000004e1, 0x00000000, 0x000004e1, 0x00000000, 0x00001000, 0x00000000, 0x00000001, 0x00000006, - 0x000004e8, 0x00000000, 0x000014e8, 0x00000000, 0x000014e8, 0x00000000, 0x0000018a, 0x00000000, 0x00000190, 0x00000000, 0x00001000, 0x00000000, - 0x00000002, 0x00000006, 0x000004e8, 0x00000000, 0x000014e8, 0x00000000, 0x000014e8, 0x00000000, 0x00000160, 0x00000000, 0x00000160, 0x00000000, - 0x00000008, 0x00000000, 0x6474e551, 0x00000006, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000008, 0x00000000, 0x7379732f, 0x2f6d6574, 0x2f6e6962, 0x6b6e696c, 0x34367265, 0x00000000, 0x00000001, 0x00000001, - 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00090003, 0x000002e0, 0x00000000, 0x00000000, 0x00000000, 0x00000010, 0x00000012, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x0000000a, 0x00000012, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x62696c00, 0x732e6c64, 0x6c64006f, 0x006d7973, 0x706f6c64, 0x4c006e65, - 0x00434249, 0x00000000, 0x00000000, 0x00000000, 0x00010001, 0x00000001, 0x00000010, 0x00000000, 0x00050d63, 0x00020000, 0x00000017, 0x00000000, - 0x00001668, 0x00000000, 0x00000402, 0x00000002, 0x00000000, 0x00000000, 0x00001670, 0x00000000, 0x00000402, 0x00000003, 0x00000000, 0x00000000, - 0xa9bf7bf0, 0xb0000010, 0xf9433211, 0x91198210, 0xd61f0220, 0xd503201f, 0xd503201f, 0xd503201f, 0xb0000010, 0xf9433611, 0x9119a210, 0xd61f0220, 
- 0xb0000010, 0xf9433a11, 0x9119c210, 0xd61f0220, 0xa9bb67fc, 0xa9015ff8, 0xa90257f6, 0xa9034ff4, 0xa9047bfd, 0x910103fd, 0xd14043ff, 0xd10083ff, - 0x90000000, 0x91124000, 0x52800021, 0x52800039, 0x97ffffec, 0x90000001, 0x91126021, 0xaa0003f7, 0x97ffffec, 0x90000001, 0xaa0003f8, 0x91127421, - 0xaa1703e0, 0x97ffffe7, 0x90000001, 0xaa0003f3, 0x91128821, 0xaa1703e0, 0x97ffffe2, 0x90000001, 0xaa0003f4, 0x91129c21, 0xaa1703e0, 0x97ffffdd, - 0x90000001, 0xaa0003f5, 0x9112c421, 0xaa1703e0, 0x97ffffd8, 0x90000001, 0xaa0003f6, 0x9112d821, 0xaa1703e0, 0x97ffffd3, 0xaa0003f7, 0x90000000, - 0x9112f000, 0x2a1f03e1, 0xd63f0300, 0x2a0003f8, 0x36f80060, 0x2a1f03e0, 0xd63f0260, 0x90000009, 0x3dc12120, 0x52800128, 0x79003be8, 0x52800108, - 0x910043e0, 0x52800021, 0x2a1f03e2, 0xb9001bf8, 0xb90013f9, 0x79002be8, 0x3d8003e0, 0xd63f0280, 0x7100001f, 0x5400036c, 0x910063e0, 0x52800021, - 0x2a1f03e2, 0xd63f0280, 0x7100001f, 0x5400018d, 0x910083e1, 0x52a00022, 0x2a1803e0, 0xd63f02c0, 0xf100041f, 0x540001eb, 0xaa0003e2, 0x910083e1, - 0x52800020, 0xd63f02e0, 0x14000004, 0x910003e0, 0xaa1f03e1, 0xd63f02a0, 0x910043e0, 0x52800021, 0x2a1f03e2, 0xd63f0280, 0x7100041f, 0x54fffceb, - 0x2a1f03e0, 0xd63f0260, 0x914043ff, 0x910083ff, 0xa9447bfd, 0xa9434ff4, 0xa94257f6, 0xa9415ff8, 0xa8c567fc, 0xd65f03c0, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00989680, 0x00000000, 0x6362696c, 0x006f732e, 0x6e65706f, 0x69786500, 0x6f700074, 0x6e006c6c, 0x736f6e61, 0x7065656c, - 0x61657200, 0x72770064, 0x00657469, 0x7379732f, 0x72656b2f, 0x2f6c656e, 0x75626564, 0x72742f67, 0x6e696361, 0x72742f67, 0x5f656361, 0x65706970, - 0x00000000, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x6ffffef5, 0x00000000, 0x000001a8, 0x00000000, 0x00000005, 0x00000000, - 0x00000228, 0x00000000, 0x00000006, 0x00000000, 0x000001c8, 0x00000000, 0x0000000a, 0x00000000, 0x0000001c, 0x00000000, 0x0000000b, 0x00000000, - 0x00000018, 0x00000000, 0x00000015, 0x00000000, 0x00000000, 0x00000000, 0x00000003, 0x00000000, 0x00001650, 0x00000000, 0x00000002, 0x00000000, - 0x00000030, 0x00000000, 0x00000014, 0x00000000, 0x00000007, 0x00000000, 0x00000017, 0x00000000, 0x00000270, 0x00000000, 0x0000001e, 0x00000000, - 0x00000008, 0x00000000, 0x6ffffffb, 0x00000000, 0x00000001, 0x00000000, 0x6ffffffe, 0x00000000, 0x00000250, 0x00000000, 0x6fffffff, 0x00000000, - 0x00000001, 0x00000000, 0x6ffffff0, 0x00000000, 0x00000244, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x000002a0, 0x00000000, 0x000002a0, -}; - -} diff --git a/Source/ThirdParty/tracy/client/TracyThread.hpp b/Source/ThirdParty/tracy/client/TracyThread.hpp index edd255e87..5638756ac 100644 --- a/Source/ThirdParty/tracy/client/TracyThread.hpp +++ b/Source/ThirdParty/tracy/client/TracyThread.hpp @@ -1,7 +1,7 @@ #ifndef __TRACYTHREAD_HPP__ #define __TRACYTHREAD_HPP__ -#if defined _WIN32 || defined __CYGWIN__ +#if defined _WIN32 # include #else # include @@ -14,18 +14,23 @@ namespace tracy { +#ifdef TRACY_MANUAL_LIFETIME +extern thread_local bool RpThreadInitDone; +#endif + class ThreadExitHandler { public: ~ThreadExitHandler() { #ifdef TRACY_MANUAL_LIFETIME - rpmalloc_thread_finalize(); + rpmalloc_thread_finalize( 1 ); + RpThreadInitDone = false; 
#endif } }; -#if defined _WIN32 || defined __CYGWIN__ +#if defined _WIN32 class Thread { diff --git a/Source/ThirdParty/tracy/client/tracy_SPSCQueue.h b/Source/ThirdParty/tracy/client/tracy_SPSCQueue.h new file mode 100644 index 000000000..7f1752b56 --- /dev/null +++ b/Source/ThirdParty/tracy/client/tracy_SPSCQueue.h @@ -0,0 +1,148 @@ +/* +Copyright (c) 2020 Erik Rigtorp + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + */ + +#pragma once + +#include +#include +#include +#include +#include // std::enable_if, std::is_*_constructible + +#include "../common/TracyAlloc.hpp" + +#if defined (_MSC_VER) +#pragma warning(push) +#pragma warning(disable:4324) +#endif + +namespace tracy { + +template class SPSCQueue { +public: + explicit SPSCQueue(const size_t capacity) + : capacity_(capacity) { + capacity_++; // Needs one slack element + slots_ = (T*)tracy_malloc(sizeof(T) * (capacity_ + 2 * kPadding)); + + static_assert(alignof(SPSCQueue) == kCacheLineSize, ""); + static_assert(sizeof(SPSCQueue) >= 3 * kCacheLineSize, ""); + assert(reinterpret_cast(&readIdx_) - + reinterpret_cast(&writeIdx_) >= + static_cast(kCacheLineSize)); + } + + ~SPSCQueue() { + while (front()) { + pop(); + } + tracy_free(slots_); + } + + // non-copyable and non-movable + SPSCQueue(const SPSCQueue &) = delete; + SPSCQueue &operator=(const SPSCQueue &) = delete; + + template + void emplace(Args &&...args) noexcept( + std::is_nothrow_constructible::value) { + static_assert(std::is_constructible::value, + "T must be constructible with Args&&..."); + auto const writeIdx = writeIdx_.load(std::memory_order_relaxed); + auto nextWriteIdx = writeIdx + 1; + if (nextWriteIdx == capacity_) { + nextWriteIdx = 0; + } + while (nextWriteIdx == readIdxCache_) { + readIdxCache_ = readIdx_.load(std::memory_order_acquire); + } + new (&slots_[writeIdx + kPadding]) T(std::forward(args)...); + writeIdx_.store(nextWriteIdx, std::memory_order_release); + } + + T *front() noexcept { + auto const readIdx = readIdx_.load(std::memory_order_relaxed); + if (readIdx == writeIdxCache_) { + writeIdxCache_ = writeIdx_.load(std::memory_order_acquire); + if (writeIdxCache_ == readIdx) { + return nullptr; + } + } + return &slots_[readIdx + kPadding]; + } + + void pop() noexcept { + static_assert(std::is_nothrow_destructible::value, + "T must be nothrow destructible"); + auto const readIdx = readIdx_.load(std::memory_order_relaxed); + assert(writeIdx_.load(std::memory_order_acquire) != readIdx); + slots_[readIdx + kPadding].~T(); + auto nextReadIdx = readIdx + 1; + if (nextReadIdx == 
capacity_) { + nextReadIdx = 0; + } + readIdx_.store(nextReadIdx, std::memory_order_release); + } + + size_t size() const noexcept { + std::ptrdiff_t diff = writeIdx_.load(std::memory_order_acquire) - + readIdx_.load(std::memory_order_acquire); + if (diff < 0) { + diff += capacity_; + } + return static_cast(diff); + } + + bool empty() const noexcept { + return writeIdx_.load(std::memory_order_acquire) == + readIdx_.load(std::memory_order_acquire); + } + + size_t capacity() const noexcept { return capacity_ - 1; } + +private: + static constexpr size_t kCacheLineSize = 64; + + // Padding to avoid false sharing between slots_ and adjacent allocations + static constexpr size_t kPadding = (kCacheLineSize - 1) / sizeof(T) + 1; + +private: + size_t capacity_; + T *slots_; + + // Align to cache line size in order to avoid false sharing + // readIdxCache_ and writeIdxCache_ is used to reduce the amount of cache + // coherency traffic + alignas(kCacheLineSize) std::atomic writeIdx_ = {0}; + alignas(kCacheLineSize) size_t readIdxCache_ = 0; + alignas(kCacheLineSize) std::atomic readIdx_ = {0}; + alignas(kCacheLineSize) size_t writeIdxCache_ = 0; + + // Padding to avoid adjacent allocations to share cache line with + // writeIdxCache_ + char padding_[kCacheLineSize - sizeof(SPSCQueue::writeIdxCache_)]; +}; +} // namespace rigtorp + +#if defined (_MSC_VER) +#pragma warning(pop) +#endif diff --git a/Source/ThirdParty/tracy/client/tracy_concurrentqueue.h b/Source/ThirdParty/tracy/client/tracy_concurrentqueue.h index bf095bc36..3b88b20b6 100644 --- a/Source/ThirdParty/tracy/client/tracy_concurrentqueue.h +++ b/Source/ThirdParty/tracy/client/tracy_concurrentqueue.h @@ -201,7 +201,7 @@ namespace details ConcurrentQueueProducerTypelessBase* next; std::atomic inactive; ProducerToken* token; - uint64_t threadId; + uint32_t threadId; ConcurrentQueueProducerTypelessBase() : next(nullptr), inactive(false), token(nullptr), threadId(0) @@ -209,19 +209,19 @@ namespace details } }; - template - static inline bool circular_less_than(T a, T b) - { #ifdef _MSC_VER #pragma warning(push) #pragma warning(disable: 4554) #endif + template + static inline bool circular_less_than(T a, T b) + { static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "circular_less_than is intended to be used only with unsigned integer types"); - return static_cast(a - b) > static_cast(static_cast(1) << static_cast(sizeof(T) * CHAR_BIT - 1)); + return static_cast(a - b) > (static_cast(static_cast(1) << static_cast(sizeof(T) * CHAR_BIT - 1))); + } #ifdef _MSC_VER #pragma warning(pop) #endif - } template static inline char* align_for(char* ptr) diff --git a/Source/ThirdParty/tracy/client/tracy_rpmalloc.cpp b/Source/ThirdParty/tracy/client/tracy_rpmalloc.cpp index 8aae78e03..8efa626a9 100644 --- a/Source/ThirdParty/tracy/client/tracy_rpmalloc.cpp +++ b/Source/ThirdParty/tracy/client/tracy_rpmalloc.cpp @@ -1,6 +1,6 @@ #ifdef TRACY_ENABLE -/* rpmalloc.c - Memory allocator - Public Domain - 2016 Mattias Jansson +/* rpmalloc.c - Memory allocator - Public Domain - 2016-2020 Mattias Jansson * * This library provides a cross-platform lock free thread caching malloc implementation in C11. 
* The latest source code is always available at @@ -13,7 +13,26 @@ #include "tracy_rpmalloc.hpp" +#define BUILD_DYNAMIC_LINK 1 + +//////////// +/// /// Build time configurable limits +/// +////// + +#if defined(__clang__) +#pragma clang diagnostic ignored "-Wunused-macros" +#pragma clang diagnostic ignored "-Wunused-function" +#if __has_warning("-Wreserved-identifier") +#pragma clang diagnostic ignored "-Wreserved-identifier" +#endif +#elif defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wunused-macros" +#pragma GCC diagnostic ignored "-Wunused-function" +#pragma GCC diagnostic ignored "-Warray-bounds" +#endif + #ifndef HEAP_ARRAY_SIZE //! Size of heap hashmap #define HEAP_ARRAY_SIZE 47 @@ -47,59 +66,46 @@ #define ENABLE_PRELOAD 0 #endif #ifndef DISABLE_UNMAP -//! Disable unmapping memory pages +//! Disable unmapping memory pages (also enables unlimited cache) #define DISABLE_UNMAP 0 #endif +#ifndef ENABLE_UNLIMITED_CACHE +//! Enable unlimited global cache (no unmapping until finalization) +#define ENABLE_UNLIMITED_CACHE 0 +#endif +#ifndef ENABLE_ADAPTIVE_THREAD_CACHE +//! Enable adaptive thread cache size based on use heuristics +#define ENABLE_ADAPTIVE_THREAD_CACHE 0 +#endif #ifndef DEFAULT_SPAN_MAP_COUNT //! Default number of spans to map in call to map more virtual memory (default values yield 4MiB here) #define DEFAULT_SPAN_MAP_COUNT 64 #endif - -#if ENABLE_THREAD_CACHE -#ifndef ENABLE_UNLIMITED_CACHE -//! Unlimited thread and global cache -#define ENABLE_UNLIMITED_CACHE 0 -#endif -#ifndef ENABLE_UNLIMITED_THREAD_CACHE -//! Unlimited cache disables any thread cache limitations -#define ENABLE_UNLIMITED_THREAD_CACHE ENABLE_UNLIMITED_CACHE -#endif -#if !ENABLE_UNLIMITED_THREAD_CACHE -#ifndef THREAD_CACHE_MULTIPLIER -//! Multiplier for thread cache (cache limit will be span release count multiplied by this value) -#define THREAD_CACHE_MULTIPLIER 16 -#endif -#ifndef ENABLE_ADAPTIVE_THREAD_CACHE -//! Enable adaptive size of per-thread cache (still bounded by THREAD_CACHE_MULTIPLIER hard limit) -#define ENABLE_ADAPTIVE_THREAD_CACHE 0 -#endif -#endif -#endif - -#if ENABLE_GLOBAL_CACHE && ENABLE_THREAD_CACHE -#ifndef ENABLE_UNLIMITED_GLOBAL_CACHE -//! Unlimited cache disables any global cache limitations -#define ENABLE_UNLIMITED_GLOBAL_CACHE ENABLE_UNLIMITED_CACHE -#endif -#if !ENABLE_UNLIMITED_GLOBAL_CACHE -//! Multiplier for global cache (cache limit will be span release count multiplied by this value) -#define GLOBAL_CACHE_MULTIPLIER (THREAD_CACHE_MULTIPLIER * 6) -#endif -#else -# undef ENABLE_GLOBAL_CACHE -# define ENABLE_GLOBAL_CACHE 0 -#endif - -#if !ENABLE_THREAD_CACHE || ENABLE_UNLIMITED_THREAD_CACHE -# undef ENABLE_ADAPTIVE_THREAD_CACHE -# define ENABLE_ADAPTIVE_THREAD_CACHE 0 +#ifndef GLOBAL_CACHE_MULTIPLIER +//! 
Multiplier for global cache +#define GLOBAL_CACHE_MULTIPLIER 8 #endif #if DISABLE_UNMAP && !ENABLE_GLOBAL_CACHE -# error Must use global cache if unmap is disabled +#error Must use global cache if unmap is disabled #endif -#if defined( _WIN32 ) || defined( __WIN32__ ) || defined( _WIN64 ) +#if DISABLE_UNMAP +#undef ENABLE_UNLIMITED_CACHE +#define ENABLE_UNLIMITED_CACHE 1 +#endif + +#if !ENABLE_GLOBAL_CACHE +#undef ENABLE_UNLIMITED_CACHE +#define ENABLE_UNLIMITED_CACHE 0 +#endif + +#if !ENABLE_THREAD_CACHE +#undef ENABLE_ADAPTIVE_THREAD_CACHE +#define ENABLE_ADAPTIVE_THREAD_CACHE 0 +#endif + +#if defined(_WIN32) || defined(__WIN32__) || defined(_WIN64) # define PLATFORM_WINDOWS 1 # define PLATFORM_POSIX 0 #else @@ -107,13 +113,14 @@ # define PLATFORM_POSIX 1 #endif -#define _Static_assert static_assert - /// Platform and arch specifics -#ifndef FORCEINLINE -# if defined(_MSC_VER) && !defined(__clang__) +#if defined(_MSC_VER) && !defined(__clang__) +# pragma warning (disable: 5105) +# ifndef FORCEINLINE # define FORCEINLINE inline __forceinline -# else +# endif +#else +# ifndef FORCEINLINE # define FORCEINLINE inline __attribute__((__always_inline__)) # endif #endif @@ -123,25 +130,62 @@ # endif # include # if ENABLE_VALIDATE_ARGS -# include +# include # endif #else # include # include # include +# include +# if defined(__linux__) || defined(__ANDROID__) +# include +# if !defined(PR_SET_VMA) +# define PR_SET_VMA 0x53564d41 +# define PR_SET_VMA_ANON_NAME 0 +# endif +# endif # if defined(__APPLE__) +# include +# if !TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR # include # include +# endif # include # endif -# if defined(__HAIKU__) -# include +# if defined(__HAIKU__) || defined(__TINYC__) # include # endif #endif #include #include +#include + +#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) +#include +static DWORD fls_key; +#endif + +#if PLATFORM_POSIX +# include +# include +# ifdef __FreeBSD__ +# include +# define MAP_HUGETLB MAP_ALIGNED_SUPER +# ifndef PROT_MAX +# define PROT_MAX(f) 0 +# endif +# else +# define PROT_MAX(f) 0 +# endif +# ifdef __sun +extern int madvise(caddr_t, size_t, int); +# endif +# ifndef MAP_UNINITIALIZED +# define MAP_UNINITIALIZED 0 +# endif +#endif +#include #if ENABLE_ASSERTS # undef NDEBUG @@ -149,47 +193,105 @@ # define _DEBUG # endif # include +#define RPMALLOC_TOSTRING_M(x) #x +#define RPMALLOC_TOSTRING(x) RPMALLOC_TOSTRING_M(x) +#define rpmalloc_assert(truth, message) \ + do { \ + if (!(truth)) { \ + if (_memory_config.error_callback) { \ + _memory_config.error_callback( \ + message " (" RPMALLOC_TOSTRING(truth) ") at " __FILE__ ":" RPMALLOC_TOSTRING(__LINE__)); \ + } else { \ + assert((truth) && message); \ + } \ + } \ + } while (0) #else -# undef assert -# define assert(x) do {} while(0) +# define rpmalloc_assert(truth, message) do {} while(0) #endif #if ENABLE_STATISTICS # include #endif -#include +////// +/// +/// Atomic access abstraction (since MSVC does not do C11 yet) +/// +////// -namespace tracy -{ +#include typedef std::atomic atomic32_t; typedef std::atomic atomic64_t; typedef std::atomic atomicptr_t; -#define atomic_thread_fence_acquire() std::atomic_thread_fence(std::memory_order_acquire) -#define atomic_thread_fence_release() std::atomic_thread_fence(std::memory_order_release) - static FORCEINLINE int32_t atomic_load32(atomic32_t* src) { return std::atomic_load_explicit(src, std::memory_order_relaxed); } static FORCEINLINE void atomic_store32(atomic32_t* dst, int32_t val) { std::atomic_store_explicit(dst, val, 
std::memory_order_relaxed); } static FORCEINLINE int32_t atomic_incr32(atomic32_t* val) { return std::atomic_fetch_add_explicit(val, 1, std::memory_order_relaxed) + 1; } -#if ENABLE_STATISTICS || ENABLE_ADAPTIVE_THREAD_CACHE -static FORCEINLINE int32_t atomic_decr32(atomic32_t* val) { return atomic_fetch_add_explicit(val, -1, memory_order_relaxed) - 1; } -#endif +static FORCEINLINE int32_t atomic_decr32(atomic32_t* val) { return std::atomic_fetch_add_explicit(val, -1, std::memory_order_relaxed) - 1; } static FORCEINLINE int32_t atomic_add32(atomic32_t* val, int32_t add) { return std::atomic_fetch_add_explicit(val, add, std::memory_order_relaxed) + add; } +static FORCEINLINE int atomic_cas32_acquire(atomic32_t* dst, int32_t val, int32_t ref) { return std::atomic_compare_exchange_weak_explicit(dst, &ref, val, std::memory_order_acquire, std::memory_order_relaxed); } +static FORCEINLINE void atomic_store32_release(atomic32_t* dst, int32_t val) { std::atomic_store_explicit(dst, val, std::memory_order_release); } +static FORCEINLINE int64_t atomic_load64(atomic64_t* val) { return std::atomic_load_explicit(val, std::memory_order_relaxed); } +static FORCEINLINE int64_t atomic_add64(atomic64_t* val, int64_t add) { return std::atomic_fetch_add_explicit(val, add, std::memory_order_relaxed) + add; } static FORCEINLINE void* atomic_load_ptr(atomicptr_t* src) { return std::atomic_load_explicit(src, std::memory_order_relaxed); } static FORCEINLINE void atomic_store_ptr(atomicptr_t* dst, void* val) { std::atomic_store_explicit(dst, val, std::memory_order_relaxed); } -static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { return std::atomic_compare_exchange_weak_explicit(dst, &ref, val, std::memory_order_release, std::memory_order_acquire); } +static FORCEINLINE void atomic_store_ptr_release(atomicptr_t* dst, void* val) { std::atomic_store_explicit(dst, val, std::memory_order_release); } +static FORCEINLINE void* atomic_exchange_ptr_acquire(atomicptr_t* dst, void* val) { return std::atomic_exchange_explicit(dst, val, std::memory_order_acquire); } +static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { return std::atomic_compare_exchange_weak_explicit(dst, &ref, val, std::memory_order_relaxed, std::memory_order_relaxed); } #if defined(_MSC_VER) && !defined(__clang__) -# define EXPECTED(x) (x) -# define UNEXPECTED(x) (x) + +#define EXPECTED(x) (x) +#define UNEXPECTED(x) (x) + #else -# define EXPECTED(x) __builtin_expect((x), 1) -# define UNEXPECTED(x) __builtin_expect((x), 0) + +#define EXPECTED(x) __builtin_expect((x), 1) +#define UNEXPECTED(x) __builtin_expect((x), 0) + #endif +//////////// +/// +/// Statistics related functions (evaluate to nothing when statistics not enabled) +/// +////// + +#if ENABLE_STATISTICS +# define _rpmalloc_stat_inc(counter) atomic_incr32(counter) +# define _rpmalloc_stat_dec(counter) atomic_decr32(counter) +# define _rpmalloc_stat_add(counter, value) atomic_add32(counter, (int32_t)(value)) +# define _rpmalloc_stat_add64(counter, value) atomic_add64(counter, (int64_t)(value)) +# define _rpmalloc_stat_add_peak(counter, value, peak) do { int32_t _cur_count = atomic_add32(counter, (int32_t)(value)); if (_cur_count > (peak)) peak = _cur_count; } while (0) +# define _rpmalloc_stat_sub(counter, value) atomic_add32(counter, -(int32_t)(value)) +# define _rpmalloc_stat_inc_alloc(heap, class_idx) do { \ + int32_t alloc_current = atomic_incr32(&heap->size_class_use[class_idx].alloc_current); \ + if (alloc_current > 
heap->size_class_use[class_idx].alloc_peak) \ + heap->size_class_use[class_idx].alloc_peak = alloc_current; \ + atomic_incr32(&heap->size_class_use[class_idx].alloc_total); \ +} while(0) +# define _rpmalloc_stat_inc_free(heap, class_idx) do { \ + atomic_decr32(&heap->size_class_use[class_idx].alloc_current); \ + atomic_incr32(&heap->size_class_use[class_idx].free_total); \ +} while(0) +#else +# define _rpmalloc_stat_inc(counter) do {} while(0) +# define _rpmalloc_stat_dec(counter) do {} while(0) +# define _rpmalloc_stat_add(counter, value) do {} while(0) +# define _rpmalloc_stat_add64(counter, value) do {} while(0) +# define _rpmalloc_stat_add_peak(counter, value, peak) do {} while (0) +# define _rpmalloc_stat_sub(counter, value) do {} while(0) +# define _rpmalloc_stat_inc_alloc(heap, class_idx) do {} while(0) +# define _rpmalloc_stat_inc_free(heap, class_idx) do {} while(0) +#endif + + +/// /// Preconfigured limits and sizes -//! Granularity of a small allocation block +/// + +//! Granularity of a small allocation block (must be power of two) #define SMALL_GRANULARITY 16 //! Small granularity shift count #define SMALL_GRANULARITY_SHIFT 4 @@ -206,13 +308,24 @@ static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref //! Total number of small + medium size classes #define SIZE_CLASS_COUNT (SMALL_CLASS_COUNT + MEDIUM_CLASS_COUNT) //! Number of large block size classes -#define LARGE_CLASS_COUNT 32 +#define LARGE_CLASS_COUNT 63 //! Maximum size of a medium block #define MEDIUM_SIZE_LIMIT (SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY * MEDIUM_CLASS_COUNT)) //! Maximum size of a large block #define LARGE_SIZE_LIMIT ((LARGE_CLASS_COUNT * _memory_span_size) - SPAN_HEADER_SIZE) -//! Size of a span header (must be a multiple of SMALL_GRANULARITY) -#define SPAN_HEADER_SIZE 96 +//! Size of a span header (must be a multiple of SMALL_GRANULARITY and a power of two) +#define SPAN_HEADER_SIZE 128 +//! Number of spans in thread cache +#define MAX_THREAD_SPAN_CACHE 400 +//! Number of spans to transfer between thread and global cache +#define THREAD_SPAN_CACHE_TRANSFER 64 +//! Number of spans in thread cache for large spans (must be greater than LARGE_CLASS_COUNT / 2) +#define MAX_THREAD_SPAN_LARGE_CACHE 100 +//! Number of spans to transfer between thread and global cache for large spans +#define THREAD_SPAN_LARGE_CACHE_TRANSFER 6 + +static_assert((SMALL_GRANULARITY & (SMALL_GRANULARITY - 1)) == 0, "Small granularity must be power of two"); +static_assert((SPAN_HEADER_SIZE & (SPAN_HEADER_SIZE - 1)) == 0, "Span header size must be power of two"); #if ENABLE_VALIDATE_ARGS //! Maximum allocation size to avoid integer overflow @@ -225,11 +338,20 @@ static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref #define INVALID_POINTER ((void*)((uintptr_t)-1)) +#define SIZE_CLASS_LARGE SIZE_CLASS_COUNT +#define SIZE_CLASS_HUGE ((uint32_t)-1) + +//////////// +/// /// Data types +/// +////// + +namespace tracy +{ + //! A memory heap, per thread typedef struct heap_t heap_t; -//! Heap spans per size class -typedef struct heap_class_t heap_class_t; //! Span of memory pages typedef struct span_t span_t; //! Span list @@ -247,28 +369,32 @@ typedef struct global_cache_t global_cache_t; #define SPAN_FLAG_SUBSPAN 2U //! Flag indicating span has blocks with increased alignment #define SPAN_FLAG_ALIGNED_BLOCKS 4U +//! Flag indicating an unmapped master span +#define SPAN_FLAG_UNMAPPED_MASTER 8U #if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS struct span_use_t { //! 
Current number of spans used (actually used, not in cache) atomic32_t current; //! High water mark of spans used - uint32_t high; + atomic32_t high; #if ENABLE_STATISTICS + //! Number of spans in deferred list + atomic32_t spans_deferred; //! Number of spans transitioned to global cache - uint32_t spans_to_global; + atomic32_t spans_to_global; //! Number of spans transitioned from global cache - uint32_t spans_from_global; + atomic32_t spans_from_global; //! Number of spans transitioned to thread cache - uint32_t spans_to_cache; + atomic32_t spans_to_cache; //! Number of spans transitioned from thread cache - uint32_t spans_from_cache; + atomic32_t spans_from_cache; //! Number of spans transitioned to reserved state - uint32_t spans_to_reserved; + atomic32_t spans_to_reserved; //! Number of spans transitioned from reserved state - uint32_t spans_from_reserved; + atomic32_t spans_from_reserved; //! Number of raw memory map calls - uint32_t spans_map_calls; + atomic32_t spans_map_calls; #endif }; typedef struct span_use_t span_use_t; @@ -281,64 +407,59 @@ struct size_class_use_t { //! Peak number of allocations int32_t alloc_peak; //! Total number of allocations - int32_t alloc_total; + atomic32_t alloc_total; //! Total number of frees atomic32_t free_total; //! Number of spans in use - uint32_t spans_current; + atomic32_t spans_current; //! Number of spans transitioned to cache - uint32_t spans_peak; + int32_t spans_peak; //! Number of spans transitioned to cache - uint32_t spans_to_cache; + atomic32_t spans_to_cache; //! Number of spans transitioned from cache - uint32_t spans_from_cache; + atomic32_t spans_from_cache; //! Number of spans transitioned from reserved state - uint32_t spans_from_reserved; + atomic32_t spans_from_reserved; //! Number of spans mapped - uint32_t spans_map_calls; + atomic32_t spans_map_calls; + int32_t unused; }; typedef struct size_class_use_t size_class_use_t; #endif -typedef enum span_state_t { - SPAN_STATE_ACTIVE = 0, - SPAN_STATE_PARTIAL, - SPAN_STATE_FULL -} span_state_t; - -//A span can either represent a single span of memory pages with size declared by span_map_count configuration variable, -//or a set of spans in a continuous region, a super span. Any reference to the term "span" usually refers to both a single -//span or a super span. A super span can further be divided into multiple spans (or this, super spans), where the first -//(super)span is the master and subsequent (super)spans are subspans. The master span keeps track of how many subspans -//that are still alive and mapped in virtual memory, and once all subspans and master have been unmapped the entire -//superspan region is released and unmapped (on Windows for example, the entire superspan range has to be released -//in the same call to release the virtual memory range, but individual subranges can be decommitted individually -//to reduce physical memory use). +// A span can either represent a single span of memory pages with size declared by span_map_count configuration variable, +// or a set of spans in a continuous region, a super span. Any reference to the term "span" usually refers to both a single +// span or a super span. A super span can further be divided into multiple spans (or this, super spans), where the first +// (super)span is the master and subsequent (super)spans are subspans. 
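Editor's sketch of the master/subspan bookkeeping described in the comment above (illustrative only; assumes the renamed fields offset_from_master / total_spans and the remaining_spans counter that this patch introduces):

    // A subspan stores its distance to the master in whole spans, so the master
    // header can always be recovered from any subspan.
    static span_t* span_master(span_t* span) {
        if (span->flags & SPAN_FLAG_MASTER)
            return span;
        return (span_t*)pointer_offset(span, -(intptr_t)((uintptr_t)span->offset_from_master * _memory_span_size));
    }

    // Unmapping decrements the master's remaining_spans by the unmapped span count;
    // only when that reaches zero is the whole super span returned to the OS
    // (see _rpmalloc_span_unmap further down in this patch).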
The master span keeps track of how many subspans +// that are still alive and mapped in virtual memory, and once all subspans and master have been unmapped the entire +// superspan region is released and unmapped (on Windows for example, the entire superspan range has to be released +// in the same call to release the virtual memory range, but individual subranges can be decommitted individually +// to reduce physical memory use). struct span_t { //! Free list void* free_list; - //! State - uint32_t state; - //! Used count when not active (not including deferred free list) - uint32_t used_count; - //! Block count + //! Total block count of size class uint32_t block_count; //! Size class uint32_t size_class; //! Index of last block initialized in free list uint32_t free_list_limit; - //! Span list size when part of a cache list, or size of deferred free list when partial/full - uint32_t list_size; + //! Number of used blocks remaining when in partial state + uint32_t used_count; //! Deferred free list atomicptr_t free_list_deferred; + //! Size of deferred free list, or list of spans when part of a cache list + uint32_t list_size; //! Size of a block uint32_t block_size; //! Flags and counters uint32_t flags; //! Number of spans uint32_t span_count; - //! Total span counter for master spans, distance for subspans - uint32_t total_spans_or_distance; + //! Total span counter for master spans + uint32_t total_spans; + //! Offset from master span for subspans + uint32_t offset_from_master; //! Remaining span counter, for master spans atomic32_t remaining_spans; //! Alignment offset @@ -350,53 +471,89 @@ struct span_t { //! Previous span span_t* prev; }; -_Static_assert(sizeof(span_t) <= SPAN_HEADER_SIZE, "span size mismatch"); +static_assert(sizeof(span_t) <= SPAN_HEADER_SIZE, "span size mismatch"); -struct heap_class_t { +struct span_cache_t { + size_t count; + span_t* span[MAX_THREAD_SPAN_CACHE]; +}; +typedef struct span_cache_t span_cache_t; + +struct span_large_cache_t { + size_t count; + span_t* span[MAX_THREAD_SPAN_LARGE_CACHE]; +}; +typedef struct span_large_cache_t span_large_cache_t; + +struct heap_size_class_t { //! Free list of active span void* free_list; - //! Double linked list of partially used spans with free blocks for each size class. - // Current active span is at head of list. Previous span pointer in head points to tail span of list. + //! Double linked list of partially used spans with free blocks. + // Previous span pointer in head points to tail span of list. span_t* partial_span; + //! Early level cache of fully free spans + span_t* cache; }; +typedef struct heap_size_class_t heap_size_class_t; +// Control structure for a heap, either a thread heap or a first class heap if enabled struct heap_t { - //! Active and semi-used span data per size class - heap_class_t span_class[SIZE_CLASS_COUNT]; + //! Owning thread ID + uintptr_t owner_thread; + //! Free lists for each size class + heap_size_class_t size_class[SIZE_CLASS_COUNT]; #if ENABLE_THREAD_CACHE - //! List of free spans (single linked list) - span_t* span_cache[LARGE_CLASS_COUNT]; - //! List of deferred free spans of class 0 (single linked list) - atomicptr_t span_cache_deferred; -#endif -#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS - //! Current and high water mark of spans used per span count - span_use_t span_use[LARGE_CLASS_COUNT]; + //! Arrays of fully freed spans, single span + span_cache_t span_cache; #endif + //! List of deferred free spans (single linked list) + atomicptr_t span_free_deferred; + //! 
Number of full spans + size_t full_span_count; //! Mapped but unused spans span_t* span_reserve; //! Master span for mapped but unused spans span_t* span_reserve_master; //! Number of mapped but unused spans - size_t spans_reserved; + uint32_t spans_reserved; + //! Child count + atomic32_t child_count; //! Next heap in id list heap_t* next_heap; //! Next heap in orphan list heap_t* next_orphan; - //! Memory pages alignment offset - size_t align_offset; //! Heap ID int32_t id; + //! Finalization state flag + int finalize; + //! Master heap owning the memory pages + heap_t* master_heap; +#if ENABLE_THREAD_CACHE + //! Arrays of fully freed spans, large spans with > 1 span count + span_large_cache_t span_large_cache[LARGE_CLASS_COUNT - 1]; +#endif +#if RPMALLOC_FIRST_CLASS_HEAPS + //! Double linked list of fully utilized spans with free blocks for each size class. + // Previous span pointer in head points to tail span of list. + span_t* full_span[SIZE_CLASS_COUNT]; + //! Double linked list of large and huge spans allocated by this heap + span_t* large_huge_span; +#endif +#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS + //! Current and high water mark of spans used per span count + span_use_t span_use[LARGE_CLASS_COUNT]; +#endif #if ENABLE_STATISTICS - //! Number of bytes transitioned thread -> global - size_t thread_to_global; - //! Number of bytes transitioned global -> thread - size_t global_to_thread; //! Allocation stats per size class size_class_use_t size_class_use[SIZE_CLASS_COUNT + 1]; + //! Number of bytes transitioned thread -> global + atomic64_t thread_to_global; + //! Number of bytes transitioned global -> thread + atomic64_t global_to_thread; #endif }; +// Size class for defining a block size bucket struct size_class_t { //! Size of blocks in this class uint32_t block_size; @@ -405,20 +562,40 @@ struct size_class_t { //! Class index this class is merged with uint16_t class_idx; }; -_Static_assert(sizeof(size_class_t) == 8, "Size class size mismatch"); +static_assert(sizeof(size_class_t) == 8, "Size class size mismatch"); struct global_cache_t { - //! Cache list pointer - atomicptr_t cache; - //! Cache size - atomic32_t size; - //! ABA counter - atomic32_t counter; + //! Cache lock + atomic32_t lock; + //! Cache count + uint32_t count; +#if ENABLE_STATISTICS + //! Insert count + size_t insert_count; + //! Extract count + size_t extract_count; +#endif + //! Cached spans + span_t* span[GLOBAL_CACHE_MULTIPLIER * MAX_THREAD_SPAN_CACHE]; + //! Unlimited cache overflow + span_t* overflow; }; +//////////// +/// /// Global data +/// +////// + +//! Default span size (64KiB) +#define _memory_default_span_size (64 * 1024) +#define _memory_default_span_size_shift 16 +#define _memory_default_span_mask (~((uintptr_t)(_memory_span_size - 1))) + //! Initialized flag static int _rpmalloc_initialized; +//! Main thread ID +static uintptr_t _rpmalloc_main_thread_id; //! Configuration static rpmalloc_config_t _memory_config; //! Memory page size @@ -435,17 +612,15 @@ static size_t _memory_span_size_shift; //! Mask to get to start of a memory span static uintptr_t _memory_span_mask; #else -//! Hardwired span size (64KiB) -#define _memory_span_size (64 * 1024) -#define _memory_span_size_shift 16 -#define _memory_span_mask (~((uintptr_t)(_memory_span_size - 1))) +//! Hardwired span size +#define _memory_span_size _memory_default_span_size +#define _memory_span_size_shift _memory_default_span_size_shift +#define _memory_span_mask _memory_default_span_mask #endif //! 
Number of spans to map in each map call static size_t _memory_span_map_count; -//! Number of spans to release from thread cache to global cache (single spans) -static size_t _memory_span_release_count; -//! Number of spans to release from thread cache to global cache (large multiple spans) -static size_t _memory_span_release_count_large; +//! Number of spans to keep reserved in each heap +static size_t _memory_heap_reserve_count; //! Global size classes static size_class_t _memory_size_class[SIZE_CLASS_COUNT]; //! Run-time size limit of medium blocks @@ -458,21 +633,37 @@ static int _memory_huge_pages; //! Global span cache static global_cache_t _memory_span_cache[LARGE_CLASS_COUNT]; #endif +//! Global reserved spans +static span_t* _memory_global_reserve; +//! Global reserved count +static size_t _memory_global_reserve_count; +//! Global reserved master +static span_t* _memory_global_reserve_master; //! All heaps -static atomicptr_t _memory_heaps[HEAP_ARRAY_SIZE]; +static heap_t* _memory_heaps[HEAP_ARRAY_SIZE]; +//! Used to restrict access to mapping memory for huge pages +static atomic32_t _memory_global_lock; //! Orphaned heaps -static atomicptr_t _memory_orphan_heaps; -//! Running orphan counter to avoid ABA issues in linked list -static atomic32_t _memory_orphan_counter; +static heap_t* _memory_orphan_heaps; +#if RPMALLOC_FIRST_CLASS_HEAPS +//! Orphaned heaps (first class heaps) +static heap_t* _memory_first_class_orphan_heaps; +#endif #if ENABLE_STATISTICS +//! Allocations counter +static atomic64_t _allocation_counter; +//! Deallocations counter +static atomic64_t _deallocation_counter; //! Active heap count static atomic32_t _memory_active_heaps; //! Number of currently mapped memory pages static atomic32_t _mapped_pages; //! Peak number of concurrently mapped memory pages static int32_t _mapped_pages_peak; -//! Number of currently unused spans -static atomic32_t _reserved_spans; +//! Number of mapped master spans +static atomic32_t _master_spans; +//! Number of unmapped dangling master spans +static atomic32_t _unmapped_master_spans; //! Running counter of total number of mapped memory pages since start static atomic32_t _mapped_total; //! Running counter of total number of unmapped memory pages since start @@ -485,15 +676,25 @@ static atomic32_t _huge_pages_current; static int32_t _huge_pages_peak; #endif +//////////// +/// +/// Thread local heap and ID +/// +////// + //! Current thread heap -#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD +#if ((defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD) || defined(__TINYC__) static pthread_key_t _memory_thread_heap; #else # ifdef _MSC_VER # define _Thread_local __declspec(thread) # define TLS_MODEL # else -# define TLS_MODEL __attribute__((tls_model("initial-exec"))) +# ifndef __HAIKU__ +# define TLS_MODEL __attribute__((tls_model("initial-exec"))) +# else +# define TLS_MODEL +# endif # if !defined(__clang__) && defined(__GNUC__) # define _Thread_local __thread # endif @@ -524,93 +725,355 @@ get_thread_heap(void) { #endif } +//! 
Fast thread ID +static inline uintptr_t +get_thread_id(void) { +#if defined(_WIN32) + return (uintptr_t)((void*)NtCurrentTeb()); +#elif (defined(__GNUC__) || defined(__clang__)) && !defined(__CYGWIN__) + uintptr_t tid; +# if defined(__i386__) + __asm__("movl %%gs:0, %0" : "=r" (tid) : : ); +# elif defined(__x86_64__) +# if defined(__MACH__) + __asm__("movq %%gs:0, %0" : "=r" (tid) : : ); +# else + __asm__("movq %%fs:0, %0" : "=r" (tid) : : ); +# endif +# elif defined(__arm__) + __asm__ volatile ("mrc p15, 0, %0, c13, c0, 3" : "=r" (tid)); +# elif defined(__aarch64__) +# if defined(__MACH__) + // tpidr_el0 likely unused, always return 0 on iOS + __asm__ volatile ("mrs %0, tpidrro_el0" : "=r" (tid)); +# else + __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tid)); +# endif +# else + tid = (uintptr_t)((void*)get_thread_heap_raw()); +# endif + return tid; +#else + return (uintptr_t)((void*)get_thread_heap_raw()); +#endif +} + //! Set the current thread heap static void set_thread_heap(heap_t* heap) { -#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD +#if ((defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD) || defined(__TINYC__) pthread_setspecific(_memory_thread_heap, heap); #else _memory_thread_heap = heap; #endif + if (heap) + heap->owner_thread = get_thread_id(); } -//! Default implementation to map more virtual memory -static void* -_memory_map_os(size_t size, size_t* offset); +//! Set main thread ID +extern void +rpmalloc_set_main_thread(void); + +void +rpmalloc_set_main_thread(void) { + _rpmalloc_main_thread_id = get_thread_id(); +} -//! Default implementation to unmap virtual memory static void -_memory_unmap_os(void* address, size_t size, size_t offset, size_t release); - -//! Lookup a memory heap from heap ID -static heap_t* -_memory_heap_lookup(int32_t id) { - uint32_t list_idx = id % HEAP_ARRAY_SIZE; - heap_t* heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); - while (heap && (heap->id != id)) - heap = heap->next_heap; - return heap; +_rpmalloc_spin(void) { +#if defined(_MSC_VER) + _mm_pause(); +#elif defined(__x86_64__) || defined(__i386__) + __asm__ volatile("pause" ::: "memory"); +#elif defined(__aarch64__) || (defined(__arm__) && __ARM_ARCH >= 7) + __asm__ volatile("yield" ::: "memory"); +#elif defined(__powerpc__) || defined(__powerpc64__) + // No idea if ever been compiled in such archs but ... 
as precaution + __asm__ volatile("or 27,27,27"); +#elif defined(__sparc__) + __asm__ volatile("rd %ccr, %g0 \n\trd %ccr, %g0 \n\trd %ccr, %g0"); +#else + struct timespec ts = {0}; + nanosleep(&ts, 0); +#endif } -#if ENABLE_STATISTICS -# define _memory_statistics_inc(counter, value) counter += value -# define _memory_statistics_dec(counter, value) counter -= value -# define _memory_statistics_add(atomic_counter, value) atomic_add32(atomic_counter, (int32_t)(value)) -# define _memory_statistics_add_peak(atomic_counter, value, peak) do { int32_t _cur_count = atomic_add32(atomic_counter, (int32_t)(value)); if (_cur_count > (peak)) peak = _cur_count; } while (0) -# define _memory_statistics_sub(atomic_counter, value) atomic_add32(atomic_counter, -(int32_t)(value)) -# define _memory_statistics_inc_alloc(heap, class_idx) do { \ - int32_t alloc_current = atomic_incr32(&heap->size_class_use[class_idx].alloc_current); \ - if (alloc_current > heap->size_class_use[class_idx].alloc_peak) \ - heap->size_class_use[class_idx].alloc_peak = alloc_current; \ - heap->size_class_use[class_idx].alloc_total++; \ -} while(0) -# define _memory_statistics_inc_free(heap, class_idx) do { \ - atomic_decr32(&heap->size_class_use[class_idx].alloc_current); \ - atomic_incr32(&heap->size_class_use[class_idx].free_total); \ -} while(0) -#else -# define _memory_statistics_inc(counter, value) do {} while(0) -# define _memory_statistics_dec(counter, value) do {} while(0) -# define _memory_statistics_add(atomic_counter, value) do {} while(0) -# define _memory_statistics_add_peak(atomic_counter, value, peak) do {} while (0) -# define _memory_statistics_sub(atomic_counter, value) do {} while(0) -# define _memory_statistics_inc_alloc(heap, class_idx) do {} while(0) -# define _memory_statistics_inc_free(heap, class_idx) do {} while(0) +#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) +static void NTAPI +_rpmalloc_thread_destructor(void* value) { +#if ENABLE_OVERRIDE + // If this is called on main thread it means rpmalloc_finalize + // has not been called and shutdown is forced (through _exit) or unclean + if (get_thread_id() == _rpmalloc_main_thread_id) + return; +#endif + if (value) + rpmalloc_thread_finalize(1); +} #endif + +//////////// +/// +/// Low level memory map/unmap +/// +////// + static void -_memory_heap_cache_insert(heap_t* heap, span_t* span); +_rpmalloc_set_name(void* address, size_t size) { +#if defined(__linux__) || defined(__ANDROID__) + const char *name = _memory_huge_pages ? _memory_config.huge_page_name : _memory_config.page_name; + if (address == MAP_FAILED || !name) + return; + // If the kernel does not support CONFIG_ANON_VMA_NAME or if the call fails + // (e.g. invalid name) it is a no-op basically. + (void)prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, (uintptr_t)address, size, (uintptr_t)name); +#else + (void)sizeof(size); + (void)sizeof(address); +#endif +} + //! 
Map more virtual memory +// size is number of bytes to map +// offset receives the offset in bytes from start of mapped region +// returns address to start of mapped region to use static void* -_memory_map(size_t size, size_t* offset) { - assert(!(size % _memory_page_size)); - assert(size >= _memory_page_size); - _memory_statistics_add_peak(&_mapped_pages, (size >> _memory_page_size_shift), _mapped_pages_peak); - _memory_statistics_add(&_mapped_total, (size >> _memory_page_size_shift)); - return _memory_config.memory_map(size, offset); +_rpmalloc_mmap(size_t size, size_t* offset) { + rpmalloc_assert(!(size % _memory_page_size), "Invalid mmap size"); + rpmalloc_assert(size >= _memory_page_size, "Invalid mmap size"); + void* address = _memory_config.memory_map(size, offset); + if (EXPECTED(address != 0)) { + _rpmalloc_stat_add_peak(&_mapped_pages, (size >> _memory_page_size_shift), _mapped_pages_peak); + _rpmalloc_stat_add(&_mapped_total, (size >> _memory_page_size_shift)); + } + return address; } //! Unmap virtual memory +// address is the memory address to unmap, as returned from _memory_map +// size is the number of bytes to unmap, which might be less than full region for a partial unmap +// offset is the offset in bytes to the actual mapped region, as set by _memory_map +// release is set to 0 for partial unmap, or size of entire range for a full unmap static void -_memory_unmap(void* address, size_t size, size_t offset, size_t release) { - assert(!release || (release >= size)); - assert(!release || (release >= _memory_page_size)); +_rpmalloc_unmap(void* address, size_t size, size_t offset, size_t release) { + rpmalloc_assert(!release || (release >= size), "Invalid unmap size"); + rpmalloc_assert(!release || (release >= _memory_page_size), "Invalid unmap size"); if (release) { - assert(!(release % _memory_page_size)); - _memory_statistics_sub(&_mapped_pages, (release >> _memory_page_size_shift)); - _memory_statistics_add(&_unmapped_total, (release >> _memory_page_size_shift)); + rpmalloc_assert(!(release % _memory_page_size), "Invalid unmap size"); + _rpmalloc_stat_sub(&_mapped_pages, (release >> _memory_page_size_shift)); + _rpmalloc_stat_add(&_unmapped_total, (release >> _memory_page_size_shift)); } _memory_config.memory_unmap(address, size, offset, release); } +//! Default implementation to map new pages to virtual memory +static void* +_rpmalloc_mmap_os(size_t size, size_t* offset) { + //Either size is a heap (a single page) or a (multiple) span - we only need to align spans, and only if larger than map granularity + size_t padding = ((size >= _memory_span_size) && (_memory_span_size > _memory_map_granularity)) ? _memory_span_size : 0; + rpmalloc_assert(size >= _memory_page_size, "Invalid mmap size"); +#if PLATFORM_WINDOWS + //Ok to MEM_COMMIT - according to MSDN, "actual physical pages are not allocated unless/until the virtual addresses are actually accessed" + void* ptr = VirtualAlloc(0, size + padding, (_memory_huge_pages ? 
MEM_LARGE_PAGES : 0) | MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); + if (!ptr) { + if (_memory_config.map_fail_callback) { + if (_memory_config.map_fail_callback(size + padding)) + return _rpmalloc_mmap_os(size, offset); + } else { + rpmalloc_assert(ptr, "Failed to map virtual memory block"); + } + return 0; + } +#else + int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED; +# if defined(__APPLE__) && !TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR + int fd = (int)VM_MAKE_TAG(240U); + if (_memory_huge_pages) + fd |= VM_FLAGS_SUPERPAGE_SIZE_2MB; + void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, fd, 0); +# elif defined(MAP_HUGETLB) + void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE | PROT_MAX(PROT_READ | PROT_WRITE), (_memory_huge_pages ? MAP_HUGETLB : 0) | flags, -1, 0); +# if defined(MADV_HUGEPAGE) + // In some configurations, huge pages allocations might fail thus + // we fallback to normal allocations and promote the region as transparent huge page + if ((ptr == MAP_FAILED || !ptr) && _memory_huge_pages) { + ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, -1, 0); + if (ptr && ptr != MAP_FAILED) { + int prm = madvise(ptr, size + padding, MADV_HUGEPAGE); + (void)prm; + rpmalloc_assert((prm == 0), "Failed to promote the page to THP"); + } + } +# endif + _rpmalloc_set_name(ptr, size + padding); +# elif defined(MAP_ALIGNED) + const size_t align = (sizeof(size_t) * 8) - (size_t)(__builtin_clzl(size - 1)); + void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, (_memory_huge_pages ? MAP_ALIGNED(align) : 0) | flags, -1, 0); +# elif defined(MAP_ALIGN) + caddr_t base = (_memory_huge_pages ? (caddr_t)(4 << 20) : 0); + void* ptr = mmap(base, size + padding, PROT_READ | PROT_WRITE, (_memory_huge_pages ? MAP_ALIGN : 0) | flags, -1, 0); +# else + void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, -1, 0); +# endif + if ((ptr == MAP_FAILED) || !ptr) { + if (_memory_config.map_fail_callback) { + if (_memory_config.map_fail_callback(size + padding)) + return _rpmalloc_mmap_os(size, offset); + } else if (errno != ENOMEM) { + rpmalloc_assert((ptr != MAP_FAILED) && ptr, "Failed to map virtual memory block"); + } + return 0; + } +#endif + _rpmalloc_stat_add(&_mapped_pages_os, (int32_t)((size + padding) >> _memory_page_size_shift)); + if (padding) { + size_t final_padding = padding - ((uintptr_t)ptr & ~_memory_span_mask); + rpmalloc_assert(final_padding <= _memory_span_size, "Internal failure in padding"); + rpmalloc_assert(final_padding <= padding, "Internal failure in padding"); + rpmalloc_assert(!(final_padding % 8), "Internal failure in padding"); + ptr = pointer_offset(ptr, final_padding); + *offset = final_padding >> 3; + } + rpmalloc_assert((size < _memory_span_size) || !((uintptr_t)ptr & ~_memory_span_mask), "Internal failure in padding"); + return ptr; +} + +//! 
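Editor's note on the padding logic in _rpmalloc_mmap_os above: when a span is larger than the OS mapping granularity, one extra span of padding is requested so the returned pointer can be rounded up to a span boundary, and the number of skipped bytes is reported back through *offset in 8-byte units so _rpmalloc_unmap_os can undo it later. A simplified sketch of the rounding (illustrative only; the real code always advances into the padded region, even when the raw pointer is already aligned):

    // span_size is a power of two, as asserted earlier in this file.
    static void* round_up_to_span(void* raw, size_t span_size, size_t* offset_units) {
        uintptr_t p = (uintptr_t)raw;
        uintptr_t aligned = (p + (span_size - 1)) & ~(uintptr_t)(span_size - 1);
        *offset_units = (aligned - p) >> 3;  // what ends up in span->align_offset
        return (void*)aligned;
    }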
Default implementation to unmap pages from virtual memory +static void +_rpmalloc_unmap_os(void* address, size_t size, size_t offset, size_t release) { + rpmalloc_assert(release || (offset == 0), "Invalid unmap size"); + rpmalloc_assert(!release || (release >= _memory_page_size), "Invalid unmap size"); + rpmalloc_assert(size >= _memory_page_size, "Invalid unmap size"); + if (release && offset) { + offset <<= 3; + address = pointer_offset(address, -(int32_t)offset); + if ((release >= _memory_span_size) && (_memory_span_size > _memory_map_granularity)) { + //Padding is always one span size + release += _memory_span_size; + } + } +#if !DISABLE_UNMAP +#if PLATFORM_WINDOWS + if (!VirtualFree(address, release ? 0 : size, release ? MEM_RELEASE : MEM_DECOMMIT)) { + rpmalloc_assert(0, "Failed to unmap virtual memory block"); + } +#else + if (release) { + if (munmap(address, release)) { + rpmalloc_assert(0, "Failed to unmap virtual memory block"); + } + } else { +#if defined(MADV_FREE_REUSABLE) + int ret; + while ((ret = madvise(address, size, MADV_FREE_REUSABLE)) == -1 && (errno == EAGAIN)) + errno = 0; + if ((ret == -1) && (errno != 0)) { +#elif defined(MADV_DONTNEED) + if (madvise(address, size, MADV_DONTNEED)) { +#elif defined(MADV_PAGEOUT) + if (madvise(address, size, MADV_PAGEOUT)) { +#elif defined(MADV_FREE) + if (madvise(address, size, MADV_FREE)) { +#else + if (posix_madvise(address, size, POSIX_MADV_DONTNEED)) { +#endif + rpmalloc_assert(0, "Failed to madvise virtual memory block as free"); + } + } +#endif +#endif + if (release) + _rpmalloc_stat_sub(&_mapped_pages_os, release >> _memory_page_size_shift); +} + +static void +_rpmalloc_span_mark_as_subspan_unless_master(span_t* master, span_t* subspan, size_t span_count); + +//! Use global reserved spans to fulfill a memory map request (reserve size must be checked by caller) +static span_t* +_rpmalloc_global_get_reserved_spans(size_t span_count) { + span_t* span = _memory_global_reserve; + _rpmalloc_span_mark_as_subspan_unless_master(_memory_global_reserve_master, span, span_count); + _memory_global_reserve_count -= span_count; + if (_memory_global_reserve_count) + _memory_global_reserve = (span_t*)pointer_offset(span, span_count << _memory_span_size_shift); + else + _memory_global_reserve = 0; + return span; +} + +//! Store the given spans as global reserve (must only be called from within new heap allocation, not thread safe) +static void +_rpmalloc_global_set_reserved_spans(span_t* master, span_t* reserve, size_t reserve_span_count) { + _memory_global_reserve_master = master; + _memory_global_reserve_count = reserve_span_count; + _memory_global_reserve = reserve; +} + + +//////////// +/// +/// Span linked list management +/// +////// + +//! Add a span to double linked list at the head +static void +_rpmalloc_span_double_link_list_add(span_t** head, span_t* span) { + if (*head) + (*head)->prev = span; + span->next = *head; + *head = span; +} + +//! Pop head span from double linked list +static void +_rpmalloc_span_double_link_list_pop_head(span_t** head, span_t* span) { + rpmalloc_assert(*head == span, "Linked list corrupted"); + span = *head; + *head = span->next; +} + +//! 
Remove a span from double linked list +static void +_rpmalloc_span_double_link_list_remove(span_t** head, span_t* span) { + rpmalloc_assert(*head, "Linked list corrupted"); + if (*head == span) { + *head = span->next; + } else { + span_t* next_span = span->next; + span_t* prev_span = span->prev; + prev_span->next = next_span; + if (EXPECTED(next_span != 0)) + next_span->prev = prev_span; + } +} + + +//////////// +/// +/// Span control +/// +////// + +static void +_rpmalloc_heap_cache_insert(heap_t* heap, span_t* span); + +static void +_rpmalloc_heap_finalize(heap_t* heap); + +static void +_rpmalloc_heap_set_reserved_spans(heap_t* heap, span_t* master, span_t* reserve, size_t reserve_span_count); + //! Declare the span to be a subspan and store distance from master span and span count static void -_memory_span_mark_as_subspan_unless_master(span_t* master, span_t* subspan, size_t span_count) { - assert((subspan != master) || (subspan->flags & SPAN_FLAG_MASTER)); +_rpmalloc_span_mark_as_subspan_unless_master(span_t* master, span_t* subspan, size_t span_count) { + rpmalloc_assert((subspan != master) || (subspan->flags & SPAN_FLAG_MASTER), "Span master pointer and/or flag mismatch"); if (subspan != master) { subspan->flags = SPAN_FLAG_SUBSPAN; - subspan->total_spans_or_distance = (uint32_t)((uintptr_t)pointer_diff(subspan, master) >> _memory_span_size_shift); + subspan->offset_from_master = (uint32_t)((uintptr_t)pointer_diff(subspan, master) >> _memory_span_size_shift); subspan->align_offset = 0; } subspan->span_count = (uint32_t)span_count; @@ -618,496 +1081,178 @@ _memory_span_mark_as_subspan_unless_master(span_t* master, span_t* subspan, size //! Use reserved spans to fulfill a memory map request (reserve size must be checked by caller) static span_t* -_memory_map_from_reserve(heap_t* heap, size_t span_count) { +_rpmalloc_span_map_from_reserve(heap_t* heap, size_t span_count) { //Update the heap span reserve span_t* span = heap->span_reserve; heap->span_reserve = (span_t*)pointer_offset(span, span_count * _memory_span_size); - heap->spans_reserved -= span_count; + heap->spans_reserved -= (uint32_t)span_count; - _memory_span_mark_as_subspan_unless_master(heap->span_reserve_master, span, span_count); + _rpmalloc_span_mark_as_subspan_unless_master(heap->span_reserve_master, span, span_count); if (span_count <= LARGE_CLASS_COUNT) - _memory_statistics_inc(heap->span_use[span_count - 1].spans_from_reserved, 1); + _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_from_reserved); return span; } //! Get the aligned number of spans to map in based on wanted count, configured mapping granularity and the page size static size_t -_memory_map_align_span_count(size_t span_count) { +_rpmalloc_span_align_count(size_t span_count) { size_t request_count = (span_count > _memory_span_map_count) ? span_count : _memory_span_map_count; if ((_memory_page_size > _memory_span_size) && ((request_count * _memory_span_size) % _memory_page_size)) - request_count += _memory_span_map_count - (request_count % _memory_span_map_count); + request_count += _memory_span_map_count - (request_count % _memory_span_map_count); return request_count; } -//! Store the given spans as reserve in the given heap -static void -_memory_heap_set_reserved_spans(heap_t* heap, span_t* master, span_t* reserve, size_t reserve_span_count) { - heap->span_reserve_master = master; - heap->span_reserve = reserve; - heap->spans_reserved = reserve_span_count; -} - //! 
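Editor's sketch of how the span list helpers above are typically used (hypothetical call sites; heap, span and idx are placeholders, not code from this patch):

    // Publish a partially used span for its size class, and later take it out again.
    _rpmalloc_span_double_link_list_add(&heap->size_class[idx].partial_span, span);
    // ...
    _rpmalloc_span_double_link_list_remove(&heap->size_class[idx].partial_span, span);
    // _rpmalloc_span_double_link_list_pop_head() additionally asserts that the span
    // being popped really is the current head, catching list corruption early.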
Setup a newly mapped span static void -_memory_span_initialize(span_t* span, size_t total_span_count, size_t span_count, size_t align_offset) { - span->total_spans_or_distance = (uint32_t)total_span_count; +_rpmalloc_span_initialize(span_t* span, size_t total_span_count, size_t span_count, size_t align_offset) { + span->total_spans = (uint32_t)total_span_count; span->span_count = (uint32_t)span_count; span->align_offset = (uint32_t)align_offset; span->flags = SPAN_FLAG_MASTER; - atomic_store32(&span->remaining_spans, (int32_t)total_span_count); + atomic_store32(&span->remaining_spans, (int32_t)total_span_count); } -//! Map a akigned set of spans, taking configured mapping granularity and the page size into account +static void +_rpmalloc_span_unmap(span_t* span); + +//! Map an aligned set of spans, taking configured mapping granularity and the page size into account static span_t* -_memory_map_aligned_span_count(heap_t* heap, size_t span_count) { +_rpmalloc_span_map_aligned_count(heap_t* heap, size_t span_count) { //If we already have some, but not enough, reserved spans, release those to heap cache and map a new //full set of spans. Otherwise we would waste memory if page size > span size (huge pages) - size_t aligned_span_count = _memory_map_align_span_count(span_count); + size_t aligned_span_count = _rpmalloc_span_align_count(span_count); size_t align_offset = 0; - span_t* span = (span_t*)_memory_map(aligned_span_count * _memory_span_size, &align_offset); + span_t* span = (span_t*)_rpmalloc_mmap(aligned_span_count * _memory_span_size, &align_offset); if (!span) return 0; - _memory_span_initialize(span, aligned_span_count, span_count, align_offset); - _memory_statistics_add(&_reserved_spans, aligned_span_count); + _rpmalloc_span_initialize(span, aligned_span_count, span_count, align_offset); + _rpmalloc_stat_inc(&_master_spans); if (span_count <= LARGE_CLASS_COUNT) - _memory_statistics_inc(heap->span_use[span_count - 1].spans_map_calls, 1); + _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_map_calls); if (aligned_span_count > span_count) { + span_t* reserved_spans = (span_t*)pointer_offset(span, span_count * _memory_span_size); + size_t reserved_count = aligned_span_count - span_count; if (heap->spans_reserved) { - _memory_span_mark_as_subspan_unless_master(heap->span_reserve_master, heap->span_reserve, heap->spans_reserved); - _memory_heap_cache_insert(heap, heap->span_reserve); + _rpmalloc_span_mark_as_subspan_unless_master(heap->span_reserve_master, heap->span_reserve, heap->spans_reserved); + _rpmalloc_heap_cache_insert(heap, heap->span_reserve); } - _memory_heap_set_reserved_spans(heap, span, (span_t*)pointer_offset(span, span_count * _memory_span_size), aligned_span_count - span_count); + if (reserved_count > _memory_heap_reserve_count) { + // If huge pages or eager spam map count, the global reserve spin lock is held by caller, _rpmalloc_span_map + rpmalloc_assert(atomic_load32(&_memory_global_lock) == 1, "Global spin lock not held as expected"); + size_t remain_count = reserved_count - _memory_heap_reserve_count; + reserved_count = _memory_heap_reserve_count; + span_t* remain_span = (span_t*)pointer_offset(reserved_spans, reserved_count * _memory_span_size); + if (_memory_global_reserve) { + _rpmalloc_span_mark_as_subspan_unless_master(_memory_global_reserve_master, _memory_global_reserve, _memory_global_reserve_count); + _rpmalloc_span_unmap(_memory_global_reserve); + } + _rpmalloc_global_set_reserved_spans(span, remain_span, remain_count); + } + 
_rpmalloc_heap_set_reserved_spans(heap, span, reserved_spans, reserved_count); } return span; } //! Map in memory pages for the given number of spans (or use previously reserved pages) static span_t* -_memory_map_spans(heap_t* heap, size_t span_count) { +_rpmalloc_span_map(heap_t* heap, size_t span_count) { if (span_count <= heap->spans_reserved) - return _memory_map_from_reserve(heap, span_count); - return _memory_map_aligned_span_count(heap, span_count); + return _rpmalloc_span_map_from_reserve(heap, span_count); + span_t* span = 0; + int use_global_reserve = (_memory_page_size > _memory_span_size) || (_memory_span_map_count > _memory_heap_reserve_count); + if (use_global_reserve) { + // If huge pages, make sure only one thread maps more memory to avoid bloat + while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0)) + _rpmalloc_spin(); + if (_memory_global_reserve_count >= span_count) { + size_t reserve_count = (!heap->spans_reserved ? _memory_heap_reserve_count : span_count); + if (_memory_global_reserve_count < reserve_count) + reserve_count = _memory_global_reserve_count; + span = _rpmalloc_global_get_reserved_spans(reserve_count); + if (span) { + if (reserve_count > span_count) { + span_t* reserved_span = (span_t*)pointer_offset(span, span_count << _memory_span_size_shift); + _rpmalloc_heap_set_reserved_spans(heap, _memory_global_reserve_master, reserved_span, reserve_count - span_count); + } + // Already marked as subspan in _rpmalloc_global_get_reserved_spans + span->span_count = (uint32_t)span_count; + } + } + } + if (!span) + span = _rpmalloc_span_map_aligned_count(heap, span_count); + if (use_global_reserve) + atomic_store32_release(&_memory_global_lock, 0); + return span; } //! Unmap memory pages for the given number of spans (or mark as unused if no partial unmappings) static void -_memory_unmap_span(span_t* span) { - assert((span->flags & SPAN_FLAG_MASTER) || (span->flags & SPAN_FLAG_SUBSPAN)); - assert(!(span->flags & SPAN_FLAG_MASTER) || !(span->flags & SPAN_FLAG_SUBSPAN)); +_rpmalloc_span_unmap(span_t* span) { + rpmalloc_assert((span->flags & SPAN_FLAG_MASTER) || (span->flags & SPAN_FLAG_SUBSPAN), "Span flag corrupted"); + rpmalloc_assert(!(span->flags & SPAN_FLAG_MASTER) || !(span->flags & SPAN_FLAG_SUBSPAN), "Span flag corrupted"); int is_master = !!(span->flags & SPAN_FLAG_MASTER); - span_t* master = is_master ? span : (span_t*)(pointer_offset(span, -(int32_t)(span->total_spans_or_distance * _memory_span_size))); - assert(is_master || (span->flags & SPAN_FLAG_SUBSPAN)); - assert(master->flags & SPAN_FLAG_MASTER); + span_t* master = is_master ? 
span : ((span_t*)pointer_offset(span, -(intptr_t)((uintptr_t)span->offset_from_master * _memory_span_size))); + rpmalloc_assert(is_master || (span->flags & SPAN_FLAG_SUBSPAN), "Span flag corrupted"); + rpmalloc_assert(master->flags & SPAN_FLAG_MASTER, "Span flag corrupted"); size_t span_count = span->span_count; if (!is_master) { //Directly unmap subspans (unless huge pages, in which case we defer and unmap entire page range with master) - assert(span->align_offset == 0); - if (_memory_span_size >= _memory_page_size) { - _memory_unmap(span, span_count * _memory_span_size, 0, 0); - _memory_statistics_sub(&_reserved_spans, span_count); - } + rpmalloc_assert(span->align_offset == 0, "Span align offset corrupted"); + if (_memory_span_size >= _memory_page_size) + _rpmalloc_unmap(span, span_count * _memory_span_size, 0, 0); } else { //Special double flag to denote an unmapped master //It must be kept in memory since span header must be used - span->flags |= SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN; + span->flags |= SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN | SPAN_FLAG_UNMAPPED_MASTER; + _rpmalloc_stat_add(&_unmapped_master_spans, 1); } if (atomic_add32(&master->remaining_spans, -(int32_t)span_count) <= 0) { //Everything unmapped, unmap the master span with release flag to unmap the entire range of the super span - assert(!!(master->flags & SPAN_FLAG_MASTER) && !!(master->flags & SPAN_FLAG_SUBSPAN)); + rpmalloc_assert(!!(master->flags & SPAN_FLAG_MASTER) && !!(master->flags & SPAN_FLAG_SUBSPAN), "Span flag corrupted"); size_t unmap_count = master->span_count; if (_memory_span_size < _memory_page_size) - unmap_count = master->total_spans_or_distance; - _memory_statistics_sub(&_reserved_spans, unmap_count); - _memory_unmap(master, unmap_count * _memory_span_size, master->align_offset, master->total_spans_or_distance * _memory_span_size); + unmap_count = master->total_spans; + _rpmalloc_stat_sub(&_master_spans, 1); + _rpmalloc_stat_sub(&_unmapped_master_spans, 1); + _rpmalloc_unmap(master, unmap_count * _memory_span_size, master->align_offset, (size_t)master->total_spans * _memory_span_size); } } -#if ENABLE_THREAD_CACHE - -//! Unmap a single linked list of spans -static void -_memory_unmap_span_list(span_t* span) { - size_t list_size = span->list_size; - for (size_t ispan = 0; ispan < list_size; ++ispan) { - span_t* next_span = span->next; - _memory_unmap_span(span); - span = next_span; - } - assert(!span); -} - -//! Add span to head of single linked span list -static size_t -_memory_span_list_push(span_t** head, span_t* span) { - span->next = *head; - if (*head) - span->list_size = (*head)->list_size + 1; - else - span->list_size = 1; - *head = span; - return span->list_size; -} - -//! Remove span from head of single linked span list, returns the new list head -static span_t* -_memory_span_list_pop(span_t** head) { - span_t* span = *head; - span_t* next_span = 0; - if (span->list_size > 1) { - assert(span->next); - next_span = span->next; - assert(next_span); - next_span->list_size = span->list_size - 1; - } - *head = next_span; - return span; -} - -//! 
Split a single linked span list -static span_t* -_memory_span_list_split(span_t* span, size_t limit) { - span_t* next = 0; - if (limit < 2) - limit = 2; - if (span->list_size > limit) { - uint32_t list_size = 1; - span_t* last = span; - next = span->next; - while (list_size < limit) { - last = next; - next = next->next; - ++list_size; - } - last->next = 0; - assert(next); - next->list_size = span->list_size - list_size; - span->list_size = list_size; - span->prev = 0; - } - return next; -} - -#endif - -//! Add a span to partial span double linked list at the head -static void -_memory_span_partial_list_add(span_t** head, span_t* span) { - if (*head) { - span->next = *head; - //Maintain pointer to tail span - span->prev = (*head)->prev; - (*head)->prev = span; - } else { - span->next = 0; - span->prev = span; - } - *head = span; -} - -//! Add a span to partial span double linked list at the tail -static void -_memory_span_partial_list_add_tail(span_t** head, span_t* span) { - span->next = 0; - if (*head) { - span_t* tail = (*head)->prev; - tail->next = span; - span->prev = tail; - //Maintain pointer to tail span - (*head)->prev = span; - } else { - span->prev = span; - *head = span; - } -} - -//! Pop head span from partial span double linked list -static void -_memory_span_partial_list_pop_head(span_t** head) { - span_t* span = *head; - *head = span->next; - if (*head) { - //Maintain pointer to tail span - (*head)->prev = span->prev; - } -} - -//! Remove a span from partial span double linked list -static void -_memory_span_partial_list_remove(span_t** head, span_t* span) { - if (UNEXPECTED(*head == span)) { - _memory_span_partial_list_pop_head(head); - } else { - span_t* next_span = span->next; - span_t* prev_span = span->prev; - prev_span->next = next_span; - if (EXPECTED(next_span != 0)) { - next_span->prev = prev_span; - } else { - //Update pointer to tail span - (*head)->prev = prev_span; - } - } -} - -#if ENABLE_GLOBAL_CACHE - -//! Insert the given list of memory page spans in the global cache -static void -_memory_cache_insert(global_cache_t* cache, span_t* span, size_t cache_limit) { - assert((span->list_size == 1) || (span->next != 0)); - int32_t list_size = (int32_t)span->list_size; - //Unmap if cache has reached the limit - if (atomic_add32(&cache->size, list_size) > (int32_t)cache_limit) { -#if !ENABLE_UNLIMITED_GLOBAL_CACHE - _memory_unmap_span_list(span); - atomic_add32(&cache->size, -list_size); - return; -#endif - } - void* current_cache, *new_cache; - do { - current_cache = atomic_load_ptr(&cache->cache); - span->prev = (span_t*)((uintptr_t)current_cache & _memory_span_mask); - new_cache = (void*)((uintptr_t)span | ((uintptr_t)atomic_incr32(&cache->counter) & ~_memory_span_mask)); - } while (!atomic_cas_ptr(&cache->cache, new_cache, current_cache)); -} - -//! 
Extract a number of memory page spans from the global cache -static span_t* -_memory_cache_extract(global_cache_t* cache) { - uintptr_t span_ptr; - do { - void* global_span = atomic_load_ptr(&cache->cache); - span_ptr = (uintptr_t)global_span & _memory_span_mask; - if (span_ptr) { - span_t* span = (span_t*)span_ptr; - //By accessing the span ptr before it is swapped out of list we assume that a contending thread - //does not manage to traverse the span to being unmapped before we access it - void* new_cache = (void*)((uintptr_t)span->prev | ((uintptr_t)atomic_incr32(&cache->counter) & ~_memory_span_mask)); - if (atomic_cas_ptr(&cache->cache, new_cache, global_span)) { - atomic_add32(&cache->size, -(int32_t)span->list_size); - return span; - } - } - } while (span_ptr); - return 0; -} - -//! Finalize a global cache, only valid from allocator finalization (not thread safe) -static void -_memory_cache_finalize(global_cache_t* cache) { - void* current_cache = atomic_load_ptr(&cache->cache); - span_t* span = (span_t*)((uintptr_t)current_cache & _memory_span_mask); - while (span) { - span_t* skip_span = (span_t*)((uintptr_t)span->prev & _memory_span_mask); - atomic_add32(&cache->size, -(int32_t)span->list_size); - _memory_unmap_span_list(span); - span = skip_span; - } - assert(!atomic_load32(&cache->size)); - atomic_store_ptr(&cache->cache, 0); - atomic_store32(&cache->size, 0); -} - -//! Insert the given list of memory page spans in the global cache -static void -_memory_global_cache_insert(span_t* span) { - size_t span_count = span->span_count; -#if ENABLE_UNLIMITED_GLOBAL_CACHE - _memory_cache_insert(&_memory_span_cache[span_count - 1], span, 0); -#else - const size_t cache_limit = (GLOBAL_CACHE_MULTIPLIER * ((span_count == 1) ? _memory_span_release_count : _memory_span_release_count_large)); - _memory_cache_insert(&_memory_span_cache[span_count - 1], span, cache_limit); -#endif -} - -//! Extract a number of memory page spans from the global cache for large blocks -static span_t* -_memory_global_cache_extract(size_t span_count) { - span_t* span = _memory_cache_extract(&_memory_span_cache[span_count - 1]); - assert(!span || (span->span_count == span_count)); - return span; -} - -#endif - -#if ENABLE_THREAD_CACHE -//! Adopt the deferred span cache list -static void -_memory_heap_cache_adopt_deferred(heap_t* heap) { - atomic_thread_fence_acquire(); - span_t* span = (span_t*)atomic_load_ptr(&heap->span_cache_deferred); - if (!span) - return; - do { - span = (span_t*)atomic_load_ptr(&heap->span_cache_deferred); - } while (!atomic_cas_ptr(&heap->span_cache_deferred, 0, span)); - while (span) { - span_t* next_span = span->next; - _memory_span_list_push(&heap->span_cache[0], span); -#if ENABLE_STATISTICS - atomic_decr32(&heap->span_use[span->span_count - 1].current); - ++heap->size_class_use[span->size_class].spans_to_cache; - --heap->size_class_use[span->size_class].spans_current; -#endif - span = next_span; - } -} -#endif - -//! Insert a single span into thread heap cache, releasing to global cache if overflow -static void -_memory_heap_cache_insert(heap_t* heap, span_t* span) { -#if ENABLE_THREAD_CACHE - size_t span_count = span->span_count; - size_t idx = span_count - 1; - _memory_statistics_inc(heap->span_use[idx].spans_to_cache, 1); - if (!idx) - _memory_heap_cache_adopt_deferred(heap); -#if ENABLE_UNLIMITED_THREAD_CACHE - _memory_span_list_push(&heap->span_cache[idx], span); -#else - const size_t release_count = (!idx ? 
_memory_span_release_count : _memory_span_release_count_large); - size_t current_cache_size = _memory_span_list_push(&heap->span_cache[idx], span); - if (current_cache_size <= release_count) - return; - const size_t hard_limit = release_count * THREAD_CACHE_MULTIPLIER; - if (current_cache_size <= hard_limit) { -#if ENABLE_ADAPTIVE_THREAD_CACHE - //Require 25% of high water mark to remain in cache (and at least 1, if use is 0) - const size_t high_mark = heap->span_use[idx].high; - const size_t min_limit = (high_mark >> 2) + release_count + 1; - if (current_cache_size < min_limit) - return; -#else - return; -#endif - } - heap->span_cache[idx] = _memory_span_list_split(span, release_count); - assert(span->list_size == release_count); -#if ENABLE_STATISTICS - heap->thread_to_global += (size_t)span->list_size * span_count * _memory_span_size; - heap->span_use[idx].spans_to_global += span->list_size; -#endif -#if ENABLE_GLOBAL_CACHE - _memory_global_cache_insert(span); -#else - _memory_unmap_span_list(span); -#endif -#endif -#else - (void)sizeof(heap); - _memory_unmap_span(span); -#endif -} - -//! Extract the given number of spans from the different cache levels -static span_t* -_memory_heap_thread_cache_extract(heap_t* heap, size_t span_count) { -#if ENABLE_THREAD_CACHE - size_t idx = span_count - 1; - if (!idx) - _memory_heap_cache_adopt_deferred(heap); - if (heap->span_cache[idx]) { -#if ENABLE_STATISTICS - heap->span_use[idx].spans_from_cache++; -#endif - return _memory_span_list_pop(&heap->span_cache[idx]); - } -#endif - return 0; -} - -static span_t* -_memory_heap_reserved_extract(heap_t* heap, size_t span_count) { - if (heap->spans_reserved >= span_count) - return _memory_map_spans(heap, span_count); - return 0; -} - -//! Extract a span from the global cache -static span_t* -_memory_heap_global_cache_extract(heap_t* heap, size_t span_count) { -#if ENABLE_GLOBAL_CACHE - size_t idx = span_count - 1; - heap->span_cache[idx] = _memory_global_cache_extract(span_count); - if (heap->span_cache[idx]) { -#if ENABLE_STATISTICS - heap->global_to_thread += (size_t)heap->span_cache[idx]->list_size * span_count * _memory_span_size; - heap->span_use[idx].spans_from_global += heap->span_cache[idx]->list_size; -#endif - return _memory_span_list_pop(&heap->span_cache[idx]); - } -#endif - return 0; -} - -//! 
Get a span from one of the cache levels (thread cache, reserved, global cache) or fallback to mapping more memory -static span_t* -_memory_heap_extract_new_span(heap_t* heap, size_t span_count, uint32_t class_idx) { - (void)sizeof(class_idx); -#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS - uint32_t idx = (uint32_t)span_count - 1; - uint32_t current_count = (uint32_t)atomic_incr32(&heap->span_use[idx].current); - if (current_count > heap->span_use[idx].high) - heap->span_use[idx].high = current_count; -#if ENABLE_STATISTICS - uint32_t spans_current = ++heap->size_class_use[class_idx].spans_current; - if (spans_current > heap->size_class_use[class_idx].spans_peak) - heap->size_class_use[class_idx].spans_peak = spans_current; -#endif -#endif - span_t* span = _memory_heap_thread_cache_extract(heap, span_count); - if (EXPECTED(span != 0)) { - _memory_statistics_inc(heap->size_class_use[class_idx].spans_from_cache, 1); - return span; - } - span = _memory_heap_reserved_extract(heap, span_count); - if (EXPECTED(span != 0)) { - _memory_statistics_inc(heap->size_class_use[class_idx].spans_from_reserved, 1); - return span; - } - span = _memory_heap_global_cache_extract(heap, span_count); - if (EXPECTED(span != 0)) { - _memory_statistics_inc(heap->size_class_use[class_idx].spans_from_cache, 1); - return span; - } - //Final fallback, map in more virtual memory - span = _memory_map_spans(heap, span_count); - _memory_statistics_inc(heap->size_class_use[class_idx].spans_map_calls, 1); - return span; -} - //! Move the span (used for small or medium allocations) to the heap thread cache static void -_memory_span_release_to_cache(heap_t* heap, span_t* span) { - heap_class_t* heap_class = heap->span_class + span->size_class; - assert(heap_class->partial_span != span); - if (span->state == SPAN_STATE_PARTIAL) - _memory_span_partial_list_remove(&heap_class->partial_span, span); +_rpmalloc_span_release_to_cache(heap_t* heap, span_t* span) { + rpmalloc_assert(heap == span->heap, "Span heap pointer corrupted"); + rpmalloc_assert(span->size_class < SIZE_CLASS_COUNT, "Invalid span size class"); + rpmalloc_assert(span->span_count == 1, "Invalid span count"); #if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS atomic_decr32(&heap->span_use[0].current); #endif - _memory_statistics_inc(heap->span_use[0].spans_to_cache, 1); - _memory_statistics_inc(heap->size_class_use[span->size_class].spans_to_cache, 1); - _memory_statistics_dec(heap->size_class_use[span->size_class].spans_current, 1); - _memory_heap_cache_insert(heap, span); + _rpmalloc_stat_dec(&heap->size_class_use[span->size_class].spans_current); + if (!heap->finalize) { + _rpmalloc_stat_inc(&heap->span_use[0].spans_to_cache); + _rpmalloc_stat_inc(&heap->size_class_use[span->size_class].spans_to_cache); + if (heap->size_class[span->size_class].cache) + _rpmalloc_heap_cache_insert(heap, heap->size_class[span->size_class].cache); + heap->size_class[span->size_class].cache = span; + } else { + _rpmalloc_span_unmap(span); + } } //! Initialize a (partial) free list up to next system memory page, while reserving the first block //! 
as allocated, returning number of blocks in list static uint32_t -free_list_partial_init(void** list, void** first_block, void* page_start, void* block_start, - uint32_t block_count, uint32_t block_size) { - assert(block_count); +free_list_partial_init(void** list, void** first_block, void* page_start, void* block_start, uint32_t block_count, uint32_t block_size) { + rpmalloc_assert(block_count, "Internal failure"); *first_block = block_start; if (block_count > 1) { void* free_block = pointer_offset(block_start, block_size); - void* block_end = pointer_offset(block_start, block_size * block_count); + void* block_end = pointer_offset(block_start, (size_t)block_size * block_count); //If block size is less than half a memory page, bound init to next memory page boundary if (block_size < (_memory_page_size >> 1)) { void* page_end = pointer_offset(page_start, _memory_page_size); @@ -1130,75 +1275,802 @@ free_list_partial_init(void** list, void** first_block, void* page_start, void* return block_count; } -//! Initialize an unused span (from cache or mapped) to be new active span +//! Initialize an unused span (from cache or mapped) to be new active span, putting the initial free list in heap class free list static void* -_memory_span_set_new_active(heap_t* heap, heap_class_t* heap_class, span_t* span, uint32_t class_idx) { - assert(span->span_count == 1); +_rpmalloc_span_initialize_new(heap_t* heap, heap_size_class_t* heap_size_class, span_t* span, uint32_t class_idx) { + rpmalloc_assert(span->span_count == 1, "Internal failure"); size_class_t* size_class = _memory_size_class + class_idx; span->size_class = class_idx; span->heap = heap; span->flags &= ~SPAN_FLAG_ALIGNED_BLOCKS; - span->block_count = size_class->block_count; span->block_size = size_class->block_size; - span->state = SPAN_STATE_ACTIVE; + span->block_count = size_class->block_count; span->free_list = 0; + span->list_size = 0; + atomic_store_ptr_release(&span->free_list_deferred, 0); //Setup free list. Only initialize one system page worth of free blocks in list void* block; - span->free_list_limit = free_list_partial_init(&heap_class->free_list, &block, + span->free_list_limit = free_list_partial_init(&heap_size_class->free_list, &block, span, pointer_offset(span, SPAN_HEADER_SIZE), size_class->block_count, size_class->block_size); - atomic_store_ptr(&span->free_list_deferred, 0); - span->list_size = 0; - atomic_thread_fence_release(); - - _memory_span_partial_list_add(&heap_class->partial_span, span); + //Link span as partial if there remains blocks to be initialized as free list, or full if fully initialized + if (span->free_list_limit < span->block_count) { + _rpmalloc_span_double_link_list_add(&heap_size_class->partial_span, span); + span->used_count = span->free_list_limit; + } else { +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_add(&heap->full_span[class_idx], span); +#endif + ++heap->full_span_count; + span->used_count = span->block_count; + } return block; } -//! Promote a partially used span (from heap used list) to be new active span static void -_memory_span_set_partial_active(heap_class_t* heap_class, span_t* span) { - assert(span->state == SPAN_STATE_PARTIAL); - assert(span->block_count == _memory_size_class[span->size_class].block_count); - //Move data to heap size class and set span as active - heap_class->free_list = span->free_list; - span->state = SPAN_STATE_ACTIVE; - span->free_list = 0; - assert(heap_class->free_list); -} - -//! 
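For context, `free_list_partial_init` threads an intrusive singly linked free list through the raw span memory but deliberately stops after one OS page worth of blocks, so that handing out a fresh span only touches a single page. A simplified sketch of that carving step, with made-up names and a plain block-count limit standing in for the real page-boundary check:

```c
/* Simplified sketch of carving an intrusive free list through raw span memory.
 * The "limit" parameter is a stand-in for the page-boundary bound used by the
 * real free_list_partial_init; assumes at least one block. */
#include <stdint.h>

static uint32_t carve_free_list(void** list, void* start,
                                uint32_t block_size, uint32_t block_count,
                                uint32_t limit) {
    uint32_t count = (block_count < limit) ? block_count : limit;
    char* block = (char*)start;
    *list = block;                              /* head of the new free list */
    for (uint32_t i = 1; i < count; ++i) {
        *(void**)block = block + block_size;    /* first word of a free block = next */
        block += block_size;
    }
    *(void**)block = 0;                         /* terminate; remaining blocks stay untouched */
    return count;                               /* caller records how many were initialized */
}
```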
Mark span as full (from active) -static void -_memory_span_set_active_full(heap_class_t* heap_class, span_t* span) { - assert(span->state == SPAN_STATE_ACTIVE); - assert(span == heap_class->partial_span); - _memory_span_partial_list_pop_head(&heap_class->partial_span); - span->used_count = span->block_count; - span->state = SPAN_STATE_FULL; - span->free_list = 0; -} - -//! Move span from full to partial state -static void -_memory_span_set_full_partial(heap_t* heap, span_t* span) { - assert(span->state == SPAN_STATE_FULL); - heap_class_t* heap_class = &heap->span_class[span->size_class]; - span->state = SPAN_STATE_PARTIAL; - _memory_span_partial_list_add_tail(&heap_class->partial_span, span); -} - -static void* -_memory_span_extract_deferred(span_t* span) { - void* free_list; +_rpmalloc_span_extract_free_list_deferred(span_t* span) { + // We need acquire semantics on the CAS operation since we are interested in the list size + // Refer to _rpmalloc_deallocate_defer_small_or_medium for further comments on this dependency do { - free_list = atomic_load_ptr(&span->free_list_deferred); - } while ((free_list == INVALID_POINTER) || !atomic_cas_ptr(&span->free_list_deferred, INVALID_POINTER, free_list)); + span->free_list = atomic_exchange_ptr_acquire(&span->free_list_deferred, INVALID_POINTER); + } while (span->free_list == INVALID_POINTER); + span->used_count -= span->list_size; span->list_size = 0; - atomic_store_ptr(&span->free_list_deferred, 0); - atomic_thread_fence_release(); - return free_list; + atomic_store_ptr_release(&span->free_list_deferred, 0); } +static int +_rpmalloc_span_is_fully_utilized(span_t* span) { + rpmalloc_assert(span->free_list_limit <= span->block_count, "Span free list corrupted"); + return !span->free_list && (span->free_list_limit >= span->block_count); +} + +static int +_rpmalloc_span_finalize(heap_t* heap, size_t iclass, span_t* span, span_t** list_head) { + void* free_list = heap->size_class[iclass].free_list; + span_t* class_span = (span_t*)((uintptr_t)free_list & _memory_span_mask); + if (span == class_span) { + // Adopt the heap class free list back into the span free list + void* block = span->free_list; + void* last_block = 0; + while (block) { + last_block = block; + block = *((void**)block); + } + uint32_t free_count = 0; + block = free_list; + while (block) { + ++free_count; + block = *((void**)block); + } + if (last_block) { + *((void**)last_block) = free_list; + } else { + span->free_list = free_list; + } + heap->size_class[iclass].free_list = 0; + span->used_count -= free_count; + } + //If this assert triggers you have memory leaks + rpmalloc_assert(span->list_size == span->used_count, "Memory leak detected"); + if (span->list_size == span->used_count) { + _rpmalloc_stat_dec(&heap->span_use[0].current); + _rpmalloc_stat_dec(&heap->size_class_use[iclass].spans_current); + // This function only used for spans in double linked lists + if (list_head) + _rpmalloc_span_double_link_list_remove(list_head, span); + _rpmalloc_span_unmap(span); + return 1; + } + return 0; +} + + +//////////// +/// +/// Global cache +/// +////// + +#if ENABLE_GLOBAL_CACHE + +//! 
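The rewritten global cache that follows replaces the old lock-free list with flat arrays guarded by a small spin lock built from `atomic_cas32_acquire` and `_rpmalloc_spin`. A minimal C11 sketch of that locking pattern (the type and function names here are illustrative, not from the patch):

```c
/* Sketch of the CAS spin lock pattern used by the new global cache code. */
#include <stdatomic.h>

typedef struct {
    atomic_int lock;      /* 0 = free, 1 = held */
} spinlock_t;

static void spin_acquire(spinlock_t* l) {
    int expected = 0;
    /* try to flip 0 -> 1 with acquire ordering; on failure reset and retry */
    while (!atomic_compare_exchange_weak_explicit(&l->lock, &expected, 1,
                                                  memory_order_acquire,
                                                  memory_order_relaxed)) {
        expected = 0;     /* compare_exchange wrote the observed value here */
        /* a real implementation yields or pauses here, like _rpmalloc_spin() */
    }
}

static void spin_release(spinlock_t* l) {
    atomic_store_explicit(&l->lock, 0, memory_order_release);
}
```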
Finalize a global cache +static void +_rpmalloc_global_cache_finalize(global_cache_t* cache) { + while (!atomic_cas32_acquire(&cache->lock, 1, 0)) + _rpmalloc_spin(); + + for (size_t ispan = 0; ispan < cache->count; ++ispan) + _rpmalloc_span_unmap(cache->span[ispan]); + cache->count = 0; + + while (cache->overflow) { + span_t* span = cache->overflow; + cache->overflow = span->next; + _rpmalloc_span_unmap(span); + } + + atomic_store32_release(&cache->lock, 0); +} + +static void +_rpmalloc_global_cache_insert_spans(span_t** span, size_t span_count, size_t count) { + const size_t cache_limit = (span_count == 1) ? + GLOBAL_CACHE_MULTIPLIER * MAX_THREAD_SPAN_CACHE : + GLOBAL_CACHE_MULTIPLIER * (MAX_THREAD_SPAN_LARGE_CACHE - (span_count >> 1)); + + global_cache_t* cache = &_memory_span_cache[span_count - 1]; + + size_t insert_count = count; + while (!atomic_cas32_acquire(&cache->lock, 1, 0)) + _rpmalloc_spin(); + +#if ENABLE_STATISTICS + cache->insert_count += count; +#endif + if ((cache->count + insert_count) > cache_limit) + insert_count = cache_limit - cache->count; + + memcpy(cache->span + cache->count, span, sizeof(span_t*) * insert_count); + cache->count += (uint32_t)insert_count; + +#if ENABLE_UNLIMITED_CACHE + while (insert_count < count) { +#else + // Enable unlimited cache if huge pages, or we will leak since it is unlikely that an entire huge page + // will be unmapped, and we're unable to partially decommit a huge page + while ((_memory_page_size > _memory_span_size) && (insert_count < count)) { +#endif + span_t* current_span = span[insert_count++]; + current_span->next = cache->overflow; + cache->overflow = current_span; + } + atomic_store32_release(&cache->lock, 0); + + span_t* keep = 0; + for (size_t ispan = insert_count; ispan < count; ++ispan) { + span_t* current_span = span[ispan]; + // Keep master spans that has remaining subspans to avoid dangling them + if ((current_span->flags & SPAN_FLAG_MASTER) && + (atomic_load32(¤t_span->remaining_spans) > (int32_t)current_span->span_count)) { + current_span->next = keep; + keep = current_span; + } else { + _rpmalloc_span_unmap(current_span); + } + } + + if (keep) { + while (!atomic_cas32_acquire(&cache->lock, 1, 0)) + _rpmalloc_spin(); + + size_t islot = 0; + while (keep) { + for (; islot < cache->count; ++islot) { + span_t* current_span = cache->span[islot]; + if (!(current_span->flags & SPAN_FLAG_MASTER) || ((current_span->flags & SPAN_FLAG_MASTER) && + (atomic_load32(¤t_span->remaining_spans) <= (int32_t)current_span->span_count))) { + _rpmalloc_span_unmap(current_span); + cache->span[islot] = keep; + break; + } + } + if (islot == cache->count) + break; + keep = keep->next; + } + + if (keep) { + span_t* tail = keep; + while (tail->next) + tail = tail->next; + tail->next = cache->overflow; + cache->overflow = keep; + } + + atomic_store32_release(&cache->lock, 0); + } +} + +static size_t +_rpmalloc_global_cache_extract_spans(span_t** span, size_t span_count, size_t count) { + global_cache_t* cache = &_memory_span_cache[span_count - 1]; + + size_t extract_count = 0; + while (!atomic_cas32_acquire(&cache->lock, 1, 0)) + _rpmalloc_spin(); + +#if ENABLE_STATISTICS + cache->extract_count += count; +#endif + size_t want = count - extract_count; + if (want > cache->count) + want = cache->count; + + memcpy(span + extract_count, cache->span + (cache->count - want), sizeof(span_t*) * want); + cache->count -= (uint32_t)want; + extract_count += want; + + while ((extract_count < count) && cache->overflow) { + span_t* current_span = 
cache->overflow; + span[extract_count++] = current_span; + cache->overflow = current_span->next; + } + +#if ENABLE_ASSERTS + for (size_t ispan = 0; ispan < extract_count; ++ispan) { + assert(span[ispan]->span_count == span_count); + } +#endif + + atomic_store32_release(&cache->lock, 0); + + return extract_count; +} + +#endif + +//////////// +/// +/// Heap control +/// +////// + +static void _rpmalloc_deallocate_huge(span_t*); + +//! Store the given spans as reserve in the given heap +static void +_rpmalloc_heap_set_reserved_spans(heap_t* heap, span_t* master, span_t* reserve, size_t reserve_span_count) { + heap->span_reserve_master = master; + heap->span_reserve = reserve; + heap->spans_reserved = (uint32_t)reserve_span_count; +} + +//! Adopt the deferred span cache list, optionally extracting the first single span for immediate re-use +static void +_rpmalloc_heap_cache_adopt_deferred(heap_t* heap, span_t** single_span) { + span_t* span = (span_t*)((void*)atomic_exchange_ptr_acquire(&heap->span_free_deferred, 0)); + while (span) { + span_t* next_span = (span_t*)span->free_list; + rpmalloc_assert(span->heap == heap, "Span heap pointer corrupted"); + if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) { + rpmalloc_assert(heap->full_span_count, "Heap span counter corrupted"); + --heap->full_span_count; + _rpmalloc_stat_dec(&heap->span_use[0].spans_deferred); +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_remove(&heap->full_span[span->size_class], span); +#endif + _rpmalloc_stat_dec(&heap->span_use[0].current); + _rpmalloc_stat_dec(&heap->size_class_use[span->size_class].spans_current); + if (single_span && !*single_span) + *single_span = span; + else + _rpmalloc_heap_cache_insert(heap, span); + } else { + if (span->size_class == SIZE_CLASS_HUGE) { + _rpmalloc_deallocate_huge(span); + } else { + rpmalloc_assert(span->size_class == SIZE_CLASS_LARGE, "Span size class invalid"); + rpmalloc_assert(heap->full_span_count, "Heap span counter corrupted"); + --heap->full_span_count; +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_remove(&heap->large_huge_span, span); +#endif + uint32_t idx = span->span_count - 1; + _rpmalloc_stat_dec(&heap->span_use[idx].spans_deferred); + _rpmalloc_stat_dec(&heap->span_use[idx].current); + if (!idx && single_span && !*single_span) + *single_span = span; + else + _rpmalloc_heap_cache_insert(heap, span); + } + } + span = next_span; + } +} + +static void +_rpmalloc_heap_unmap(heap_t* heap) { + if (!heap->master_heap) { + if ((heap->finalize > 1) && !atomic_load32(&heap->child_count)) { + span_t* span = (span_t*)((uintptr_t)heap & _memory_span_mask); + _rpmalloc_span_unmap(span); + } + } else { + if (atomic_decr32(&heap->master_heap->child_count) == 0) { + _rpmalloc_heap_unmap(heap->master_heap); + } + } +} + +static void +_rpmalloc_heap_global_finalize(heap_t* heap) { + if (heap->finalize++ > 1) { + --heap->finalize; + return; + } + + _rpmalloc_heap_finalize(heap); + +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + span_cache_t* span_cache; + if (!iclass) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t*)(heap->span_large_cache + (iclass - 1)); + for (size_t ispan = 0; ispan < span_cache->count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[ispan]); + span_cache->count = 0; + } +#endif + + if (heap->full_span_count) { + --heap->finalize; + return; + } + + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + if (heap->size_class[iclass].free_list || 
heap->size_class[iclass].partial_span) { + --heap->finalize; + return; + } + } + //Heap is now completely free, unmap and remove from heap list + size_t list_idx = (size_t)heap->id % HEAP_ARRAY_SIZE; + heap_t* list_heap = _memory_heaps[list_idx]; + if (list_heap == heap) { + _memory_heaps[list_idx] = heap->next_heap; + } else { + while (list_heap->next_heap != heap) + list_heap = list_heap->next_heap; + list_heap->next_heap = heap->next_heap; + } + + _rpmalloc_heap_unmap(heap); +} + +//! Insert a single span into thread heap cache, releasing to global cache if overflow +static void +_rpmalloc_heap_cache_insert(heap_t* heap, span_t* span) { + if (UNEXPECTED(heap->finalize != 0)) { + _rpmalloc_span_unmap(span); + _rpmalloc_heap_global_finalize(heap); + return; + } +#if ENABLE_THREAD_CACHE + size_t span_count = span->span_count; + _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_to_cache); + if (span_count == 1) { + span_cache_t* span_cache = &heap->span_cache; + span_cache->span[span_cache->count++] = span; + if (span_cache->count == MAX_THREAD_SPAN_CACHE) { + const size_t remain_count = MAX_THREAD_SPAN_CACHE - THREAD_SPAN_CACHE_TRANSFER; +#if ENABLE_GLOBAL_CACHE + _rpmalloc_stat_add64(&heap->thread_to_global, THREAD_SPAN_CACHE_TRANSFER * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_to_global, THREAD_SPAN_CACHE_TRANSFER); + _rpmalloc_global_cache_insert_spans(span_cache->span + remain_count, span_count, THREAD_SPAN_CACHE_TRANSFER); +#else + for (size_t ispan = 0; ispan < THREAD_SPAN_CACHE_TRANSFER; ++ispan) + _rpmalloc_span_unmap(span_cache->span[remain_count + ispan]); +#endif + span_cache->count = remain_count; + } + } else { + size_t cache_idx = span_count - 2; + span_large_cache_t* span_cache = heap->span_large_cache + cache_idx; + span_cache->span[span_cache->count++] = span; + const size_t cache_limit = (MAX_THREAD_SPAN_LARGE_CACHE - (span_count >> 1)); + if (span_cache->count == cache_limit) { + const size_t transfer_limit = 2 + (cache_limit >> 2); + const size_t transfer_count = (THREAD_SPAN_LARGE_CACHE_TRANSFER <= transfer_limit ? THREAD_SPAN_LARGE_CACHE_TRANSFER : transfer_limit); + const size_t remain_count = cache_limit - transfer_count; +#if ENABLE_GLOBAL_CACHE + _rpmalloc_stat_add64(&heap->thread_to_global, transfer_count * span_count * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_to_global, transfer_count); + _rpmalloc_global_cache_insert_spans(span_cache->span + remain_count, span_count, transfer_count); +#else + for (size_t ispan = 0; ispan < transfer_count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[remain_count + ispan]); +#endif + span_cache->count = remain_count; + } + } +#else + (void)sizeof(heap); + _rpmalloc_span_unmap(span); +#endif +} + +//! 
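The new `_rpmalloc_heap_cache_insert` keeps spans in a fixed-size per-thread array and, when that array fills, hands a batch of entries to the global cache in a single call instead of releasing them one at a time. A stripped-down sketch of that overflow policy; the capacity and transfer constants and the `flush_to_global` stub are placeholders, not the real values:

```c
/* Stripped-down sketch of the thread cache overflow policy: a fixed-size array
 * per thread, flushed in a batch when full. Constants are placeholders for
 * MAX_THREAD_SPAN_CACHE / THREAD_SPAN_CACHE_TRANSFER. */
#include <stddef.h>

#define CACHE_CAPACITY 128
#define CACHE_TRANSFER 32

typedef struct {
    void*  slot[CACHE_CAPACITY];
    size_t count;
} thread_cache_t;

/* stand-in for _rpmalloc_global_cache_insert_spans() */
static void flush_to_global(void** batch, size_t n) { (void)batch; (void)n; }

static void cache_push(thread_cache_t* c, void* span) {
    c->slot[c->count++] = span;
    if (c->count == CACHE_CAPACITY) {
        size_t keep = CACHE_CAPACITY - CACHE_TRANSFER;
        /* hand the newest TRANSFER entries to the shared cache in one call,
         * keeping the older ones hot in the thread-local array */
        flush_to_global(c->slot + keep, CACHE_TRANSFER);
        c->count = keep;
    }
}
```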
Extract the given number of spans from the different cache levels +static span_t* +_rpmalloc_heap_thread_cache_extract(heap_t* heap, size_t span_count) { + span_t* span = 0; +#if ENABLE_THREAD_CACHE + span_cache_t* span_cache; + if (span_count == 1) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t*)(heap->span_large_cache + (span_count - 2)); + if (span_cache->count) { + _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_from_cache); + return span_cache->span[--span_cache->count]; + } +#endif + return span; +} + +static span_t* +_rpmalloc_heap_thread_cache_deferred_extract(heap_t* heap, size_t span_count) { + span_t* span = 0; + if (span_count == 1) { + _rpmalloc_heap_cache_adopt_deferred(heap, &span); + } else { + _rpmalloc_heap_cache_adopt_deferred(heap, 0); + span = _rpmalloc_heap_thread_cache_extract(heap, span_count); + } + return span; +} + +static span_t* +_rpmalloc_heap_reserved_extract(heap_t* heap, size_t span_count) { + if (heap->spans_reserved >= span_count) + return _rpmalloc_span_map(heap, span_count); + return 0; +} + +//! Extract a span from the global cache +static span_t* +_rpmalloc_heap_global_cache_extract(heap_t* heap, size_t span_count) { +#if ENABLE_GLOBAL_CACHE +#if ENABLE_THREAD_CACHE + span_cache_t* span_cache; + size_t wanted_count; + if (span_count == 1) { + span_cache = &heap->span_cache; + wanted_count = THREAD_SPAN_CACHE_TRANSFER; + } else { + span_cache = (span_cache_t*)(heap->span_large_cache + (span_count - 2)); + wanted_count = THREAD_SPAN_LARGE_CACHE_TRANSFER; + } + span_cache->count = _rpmalloc_global_cache_extract_spans(span_cache->span, span_count, wanted_count); + if (span_cache->count) { + _rpmalloc_stat_add64(&heap->global_to_thread, span_count * span_cache->count * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_from_global, span_cache->count); + return span_cache->span[--span_cache->count]; + } +#else + span_t* span = 0; + size_t count = _rpmalloc_global_cache_extract_spans(&span, span_count, 1); + if (count) { + _rpmalloc_stat_add64(&heap->global_to_thread, span_count * count * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_from_global, count); + return span; + } +#endif +#endif + (void)sizeof(heap); + (void)sizeof(span_count); + return 0; +} + +static void +_rpmalloc_inc_span_statistics(heap_t* heap, size_t span_count, uint32_t class_idx) { + (void)sizeof(heap); + (void)sizeof(span_count); + (void)sizeof(class_idx); +#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS + uint32_t idx = (uint32_t)span_count - 1; + uint32_t current_count = (uint32_t)atomic_incr32(&heap->span_use[idx].current); + if (current_count > (uint32_t)atomic_load32(&heap->span_use[idx].high)) + atomic_store32(&heap->span_use[idx].high, (int32_t)current_count); + _rpmalloc_stat_add_peak(&heap->size_class_use[class_idx].spans_current, 1, heap->size_class_use[class_idx].spans_peak); +#endif +} + +//! Get a span from one of the cache levels (thread cache, reserved, global cache) or fallback to mapping more memory +static span_t* +_rpmalloc_heap_extract_new_span(heap_t* heap, heap_size_class_t* heap_size_class, size_t span_count, uint32_t class_idx) { + span_t* span; +#if ENABLE_THREAD_CACHE + if (heap_size_class && heap_size_class->cache) { + span = heap_size_class->cache; + heap_size_class->cache = (heap->span_cache.count ? 
heap->span_cache.span[--heap->span_cache.count] : 0); + _rpmalloc_inc_span_statistics(heap, span_count, class_idx); + return span; + } +#endif + (void)sizeof(class_idx); + // Allow 50% overhead to increase cache hits + size_t base_span_count = span_count; + size_t limit_span_count = (span_count > 2) ? (span_count + (span_count >> 1)) : span_count; + if (limit_span_count > LARGE_CLASS_COUNT) + limit_span_count = LARGE_CLASS_COUNT; + do { + span = _rpmalloc_heap_thread_cache_extract(heap, span_count); + if (EXPECTED(span != 0)) { + _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache); + _rpmalloc_inc_span_statistics(heap, span_count, class_idx); + return span; + } + span = _rpmalloc_heap_thread_cache_deferred_extract(heap, span_count); + if (EXPECTED(span != 0)) { + _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache); + _rpmalloc_inc_span_statistics(heap, span_count, class_idx); + return span; + } + span = _rpmalloc_heap_reserved_extract(heap, span_count); + if (EXPECTED(span != 0)) { + _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_reserved); + _rpmalloc_inc_span_statistics(heap, span_count, class_idx); + return span; + } + span = _rpmalloc_heap_global_cache_extract(heap, span_count); + if (EXPECTED(span != 0)) { + _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache); + _rpmalloc_inc_span_statistics(heap, span_count, class_idx); + return span; + } + ++span_count; + } while (span_count <= limit_span_count); + //Final fallback, map in more virtual memory + span = _rpmalloc_span_map(heap, base_span_count); + _rpmalloc_inc_span_statistics(heap, base_span_count, class_idx); + _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_map_calls); + return span; +} + +static void +_rpmalloc_heap_initialize(heap_t* heap) { + memset((void*)heap, 0, sizeof(heap_t)); + //Get a new heap ID + heap->id = 1 + atomic_incr32(&_memory_heap_id); + + //Link in heap in heap ID map + size_t list_idx = (size_t)heap->id % HEAP_ARRAY_SIZE; + heap->next_heap = _memory_heaps[list_idx]; + _memory_heaps[list_idx] = heap; +} + +static void +_rpmalloc_heap_orphan(heap_t* heap, int first_class) { + heap->owner_thread = (uintptr_t)-1; +#if RPMALLOC_FIRST_CLASS_HEAPS + heap_t** heap_list = (first_class ? &_memory_first_class_orphan_heaps : &_memory_orphan_heaps); +#else + (void)sizeof(first_class); + heap_t** heap_list = &_memory_orphan_heaps; +#endif + heap->next_orphan = *heap_list; + *heap_list = heap; +} + +//! Allocate a new heap from newly mapped memory pages +static heap_t* +_rpmalloc_heap_allocate_new(void) { + // Map in pages for a 16 heaps. If page size is greater than required size for this, map a page and + // use first part for heaps and remaining part for spans for allocations. 
Adds a lot of complexity, + // but saves a lot of memory on systems where page size > 64 spans (4MiB) + size_t heap_size = sizeof(heap_t); + size_t aligned_heap_size = 16 * ((heap_size + 15) / 16); + size_t request_heap_count = 16; + size_t heap_span_count = ((aligned_heap_size * request_heap_count) + sizeof(span_t) + _memory_span_size - 1) / _memory_span_size; + size_t block_size = _memory_span_size * heap_span_count; + size_t span_count = heap_span_count; + span_t* span = 0; + // If there are global reserved spans, use these first + if (_memory_global_reserve_count >= heap_span_count) { + span = _rpmalloc_global_get_reserved_spans(heap_span_count); + } + if (!span) { + if (_memory_page_size > block_size) { + span_count = _memory_page_size / _memory_span_size; + block_size = _memory_page_size; + // If using huge pages, make sure to grab enough heaps to avoid reallocating a huge page just to serve new heaps + size_t possible_heap_count = (block_size - sizeof(span_t)) / aligned_heap_size; + if (possible_heap_count >= (request_heap_count * 16)) + request_heap_count *= 16; + else if (possible_heap_count < request_heap_count) + request_heap_count = possible_heap_count; + heap_span_count = ((aligned_heap_size * request_heap_count) + sizeof(span_t) + _memory_span_size - 1) / _memory_span_size; + } + + size_t align_offset = 0; + span = (span_t*)_rpmalloc_mmap(block_size, &align_offset); + if (!span) + return 0; + + // Master span will contain the heaps + _rpmalloc_stat_inc(&_master_spans); + _rpmalloc_span_initialize(span, span_count, heap_span_count, align_offset); + } + + size_t remain_size = _memory_span_size - sizeof(span_t); + heap_t* heap = (heap_t*)pointer_offset(span, sizeof(span_t)); + _rpmalloc_heap_initialize(heap); + + // Put extra heaps as orphans + size_t num_heaps = remain_size / aligned_heap_size; + if (num_heaps < request_heap_count) + num_heaps = request_heap_count; + atomic_store32(&heap->child_count, (int32_t)num_heaps - 1); + heap_t* extra_heap = (heap_t*)pointer_offset(heap, aligned_heap_size); + while (num_heaps > 1) { + _rpmalloc_heap_initialize(extra_heap); + extra_heap->master_heap = heap; + _rpmalloc_heap_orphan(extra_heap, 1); + extra_heap = (heap_t*)pointer_offset(extra_heap, aligned_heap_size); + --num_heaps; + } + + if (span_count > heap_span_count) { + // Cap reserved spans + size_t remain_count = span_count - heap_span_count; + size_t reserve_count = (remain_count > _memory_heap_reserve_count ? _memory_heap_reserve_count : remain_count); + span_t* remain_span = (span_t*)pointer_offset(span, heap_span_count * _memory_span_size); + _rpmalloc_heap_set_reserved_spans(heap, span, remain_span, reserve_count); + + if (remain_count > reserve_count) { + // Set to global reserved spans + remain_span = (span_t*)pointer_offset(remain_span, reserve_count * _memory_span_size); + reserve_count = remain_count - reserve_count; + _rpmalloc_global_set_reserved_spans(span, remain_span, reserve_count); + } + } + + return heap; +} + +static heap_t* +_rpmalloc_heap_extract_orphan(heap_t** heap_list) { + heap_t* heap = *heap_list; + *heap_list = (heap ? heap->next_orphan : 0); + return heap; +} + +//! 
Allocate a new heap, potentially reusing a previously orphaned heap +static heap_t* +_rpmalloc_heap_allocate(int first_class) { + heap_t* heap = 0; + while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0)) + _rpmalloc_spin(); + if (first_class == 0) + heap = _rpmalloc_heap_extract_orphan(&_memory_orphan_heaps); +#if RPMALLOC_FIRST_CLASS_HEAPS + if (!heap) + heap = _rpmalloc_heap_extract_orphan(&_memory_first_class_orphan_heaps); +#endif + if (!heap) + heap = _rpmalloc_heap_allocate_new(); + atomic_store32_release(&_memory_global_lock, 0); + _rpmalloc_heap_cache_adopt_deferred(heap, 0); + return heap; +} + +extern thread_local bool RpThreadShutdown; + +static void +_rpmalloc_heap_release(void* heapptr, int first_class, int release_cache) { + heap_t* heap = (heap_t*)heapptr; + if (!heap) + return; + RpThreadShutdown = true; + //Release thread cache spans back to global cache + _rpmalloc_heap_cache_adopt_deferred(heap, 0); + if (release_cache || heap->finalize) { +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + span_cache_t* span_cache; + if (!iclass) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t*)(heap->span_large_cache + (iclass - 1)); + if (!span_cache->count) + continue; +#if ENABLE_GLOBAL_CACHE + if (heap->finalize) { + for (size_t ispan = 0; ispan < span_cache->count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[ispan]); + } else { + _rpmalloc_stat_add64(&heap->thread_to_global, span_cache->count * (iclass + 1) * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[iclass].spans_to_global, span_cache->count); + _rpmalloc_global_cache_insert_spans(span_cache->span, iclass + 1, span_cache->count); + } +#else + for (size_t ispan = 0; ispan < span_cache->count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[ispan]); +#endif + span_cache->count = 0; + } +#endif + } + + if (get_thread_heap_raw() == heap) + set_thread_heap(0); + +#if ENABLE_STATISTICS + atomic_decr32(&_memory_active_heaps); + rpmalloc_assert(atomic_load32(&_memory_active_heaps) >= 0, "Still active heaps during finalization"); +#endif + + // If we are forcibly terminating with _exit the state of the + // lock atomic is unknown and it's best to just go ahead and exit + if (get_thread_id() != _rpmalloc_main_thread_id) { + while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0)) + _rpmalloc_spin(); + } + _rpmalloc_heap_orphan(heap, first_class); + atomic_store32_release(&_memory_global_lock, 0); +} + +static void +_rpmalloc_heap_release_raw(void* heapptr, int release_cache) { + _rpmalloc_heap_release(heapptr, 0, release_cache); +} + +static void +_rpmalloc_heap_release_raw_fc(void* heapptr) { + _rpmalloc_heap_release_raw(heapptr, 1); +} + +static void +_rpmalloc_heap_finalize(heap_t* heap) { + if (heap->spans_reserved) { + span_t* span = _rpmalloc_span_map(heap, heap->spans_reserved); + _rpmalloc_span_unmap(span); + heap->spans_reserved = 0; + } + + _rpmalloc_heap_cache_adopt_deferred(heap, 0); + + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + if (heap->size_class[iclass].cache) + _rpmalloc_span_unmap(heap->size_class[iclass].cache); + heap->size_class[iclass].cache = 0; + span_t* span = heap->size_class[iclass].partial_span; + while (span) { + span_t* next = span->next; + _rpmalloc_span_finalize(heap, iclass, span, &heap->size_class[iclass].partial_span); + span = next; + } + // If class still has a free list it must be a full span + if (heap->size_class[iclass].free_list) { + span_t* class_span = 
(span_t*)((uintptr_t)heap->size_class[iclass].free_list & _memory_span_mask); + span_t** list = 0; +#if RPMALLOC_FIRST_CLASS_HEAPS + list = &heap->full_span[iclass]; +#endif + --heap->full_span_count; + if (!_rpmalloc_span_finalize(heap, iclass, class_span, list)) { + if (list) + _rpmalloc_span_double_link_list_remove(list, class_span); + _rpmalloc_span_double_link_list_add(&heap->size_class[iclass].partial_span, class_span); + } + } + } + +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + span_cache_t* span_cache; + if (!iclass) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t*)(heap->span_large_cache + (iclass - 1)); + for (size_t ispan = 0; ispan < span_cache->count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[ispan]); + span_cache->count = 0; + } +#endif + rpmalloc_assert(!atomic_load_ptr(&heap->span_free_deferred), "Heaps still active during finalization"); +} + + +//////////// +/// +/// Allocation entry points +/// +////// + //! Pop first block from a free list static void* free_list_pop(void** list) { @@ -1209,84 +2081,85 @@ free_list_pop(void** list) { //! Allocate a small/medium sized memory block from the given heap static void* -_memory_allocate_from_heap_fallback(heap_t* heap, uint32_t class_idx) { - heap_class_t* heap_class = &heap->span_class[class_idx]; - void* block; - - span_t* active_span = heap_class->partial_span; - if (EXPECTED(active_span != 0)) { - assert(active_span->state == SPAN_STATE_ACTIVE); - assert(active_span->block_count == _memory_size_class[active_span->size_class].block_count); - //Swap in free list if not empty - if (active_span->free_list) { - heap_class->free_list = active_span->free_list; - active_span->free_list = 0; - return free_list_pop(&heap_class->free_list); - } - //If the span did not fully initialize free list, link up another page worth of blocks - if (active_span->free_list_limit < active_span->block_count) { - void* block_start = pointer_offset(active_span, SPAN_HEADER_SIZE + (active_span->free_list_limit * active_span->block_size)); - active_span->free_list_limit += free_list_partial_init(&heap_class->free_list, &block, +_rpmalloc_allocate_from_heap_fallback(heap_t* heap, heap_size_class_t* heap_size_class, uint32_t class_idx) { + span_t* span = heap_size_class->partial_span; + if (EXPECTED(span != 0)) { + rpmalloc_assert(span->block_count == _memory_size_class[span->size_class].block_count, "Span block count corrupted"); + rpmalloc_assert(!_rpmalloc_span_is_fully_utilized(span), "Internal failure"); + void* block; + if (span->free_list) { + //Span local free list is not empty, swap to size class free list + block = free_list_pop(&span->free_list); + heap_size_class->free_list = span->free_list; + span->free_list = 0; + } else { + //If the span did not fully initialize free list, link up another page worth of blocks + void* block_start = pointer_offset(span, SPAN_HEADER_SIZE + ((size_t)span->free_list_limit * span->block_size)); + span->free_list_limit += free_list_partial_init(&heap_size_class->free_list, &block, (void*)((uintptr_t)block_start & ~(_memory_page_size - 1)), block_start, - active_span->block_count - active_span->free_list_limit, active_span->block_size); + span->block_count - span->free_list_limit, span->block_size); + } + rpmalloc_assert(span->free_list_limit <= span->block_count, "Span block count corrupted"); + span->used_count = span->free_list_limit; + + //Swap in deferred free list if present + if (atomic_load_ptr(&span->free_list_deferred)) + 
_rpmalloc_span_extract_free_list_deferred(span); + + //If span is still not fully utilized keep it in partial list and early return block + if (!_rpmalloc_span_is_fully_utilized(span)) return block; - } - //Swap in deferred free list - atomic_thread_fence_acquire(); - if (atomic_load_ptr(&active_span->free_list_deferred)) { - heap_class->free_list = _memory_span_extract_deferred(active_span); - return free_list_pop(&heap_class->free_list); - } - //If the active span is fully allocated, mark span as free floating (fully allocated and not part of any list) - assert(!heap_class->free_list); - assert(active_span->free_list_limit >= active_span->block_count); - _memory_span_set_active_full(heap_class, active_span); + //The span is fully utilized, unlink from partial list and add to fully utilized list + _rpmalloc_span_double_link_list_pop_head(&heap_size_class->partial_span, span); +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_add(&heap->full_span[class_idx], span); +#endif + ++heap->full_span_count; + return block; } - assert(!heap_class->free_list); - - //Try promoting a semi-used span to active - active_span = heap_class->partial_span; - if (EXPECTED(active_span != 0)) { - _memory_span_set_partial_active(heap_class, active_span); - return free_list_pop(&heap_class->free_list); - } - assert(!heap_class->free_list); - assert(!heap_class->partial_span); //Find a span in one of the cache levels - active_span = _memory_heap_extract_new_span(heap, 1, class_idx); + span = _rpmalloc_heap_extract_new_span(heap, heap_size_class, 1, class_idx); + if (EXPECTED(span != 0)) { + //Mark span as owned by this heap and set base data, return first block + return _rpmalloc_span_initialize_new(heap, heap_size_class, span, class_idx); + } - //Mark span as owned by this heap and set base data, return first block - return _memory_span_set_new_active(heap, heap_class, active_span, class_idx); + return 0; } //! Allocate a small sized memory block from the given heap static void* -_memory_allocate_small(heap_t* heap, size_t size) { +_rpmalloc_allocate_small(heap_t* heap, size_t size) { + rpmalloc_assert(heap, "No thread heap"); //Small sizes have unique size classes const uint32_t class_idx = (uint32_t)((size + (SMALL_GRANULARITY - 1)) >> SMALL_GRANULARITY_SHIFT); - _memory_statistics_inc_alloc(heap, class_idx); - if (EXPECTED(heap->span_class[class_idx].free_list != 0)) - return free_list_pop(&heap->span_class[class_idx].free_list); - return _memory_allocate_from_heap_fallback(heap, class_idx); + heap_size_class_t* heap_size_class = heap->size_class + class_idx; + _rpmalloc_stat_inc_alloc(heap, class_idx); + if (EXPECTED(heap_size_class->free_list != 0)) + return free_list_pop(&heap_size_class->free_list); + return _rpmalloc_allocate_from_heap_fallback(heap, heap_size_class, class_idx); } //! 
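For reference, the small-size path above maps a request size directly to a size-class index by rounding up to the small granularity. Assuming the default 16-byte small granularity (shift of 4), the arithmetic works out as in this worked example:

```c
/* Worked example of the small size-class lookup, assuming the default
 * 16-byte small granularity. */
#include <stdint.h>
#include <stdio.h>

#define SMALL_GRANULARITY        16
#define SMALL_GRANULARITY_SHIFT  4

static uint32_t small_class_index(size_t size) {
    /* round up to the next multiple of 16, then divide by 16 */
    return (uint32_t)((size + (SMALL_GRANULARITY - 1)) >> SMALL_GRANULARITY_SHIFT);
}

int main(void) {
    /* 1..16 bytes -> class 1 (16-byte blocks), 17..32 bytes -> class 2 (32-byte blocks) */
    printf("%u %u %u\n", small_class_index(8), small_class_index(17), small_class_index(32));
    return 0;   /* prints: 1 2 2 */
}
```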
Allocate a medium sized memory block from the given heap static void* -_memory_allocate_medium(heap_t* heap, size_t size) { +_rpmalloc_allocate_medium(heap_t* heap, size_t size) { + rpmalloc_assert(heap, "No thread heap"); //Calculate the size class index and do a dependent lookup of the final class index (in case of merged classes) const uint32_t base_idx = (uint32_t)(SMALL_CLASS_COUNT + ((size - (SMALL_SIZE_LIMIT + 1)) >> MEDIUM_GRANULARITY_SHIFT)); const uint32_t class_idx = _memory_size_class[base_idx].class_idx; - _memory_statistics_inc_alloc(heap, class_idx); - if (EXPECTED(heap->span_class[class_idx].free_list != 0)) - return free_list_pop(&heap->span_class[class_idx].free_list); - return _memory_allocate_from_heap_fallback(heap, class_idx); + heap_size_class_t* heap_size_class = heap->size_class + class_idx; + _rpmalloc_stat_inc_alloc(heap, class_idx); + if (EXPECTED(heap_size_class->free_list != 0)) + return free_list_pop(&heap_size_class->free_list); + return _rpmalloc_allocate_from_heap_fallback(heap, heap_size_class, class_idx); } //! Allocate a large sized memory block from the given heap static void* -_memory_allocate_large(heap_t* heap, size_t size) { +_rpmalloc_allocate_large(heap_t* heap, size_t size) { + rpmalloc_assert(heap, "No thread heap"); //Calculate number of needed max sized spans (including header) //Since this function is never called if size > LARGE_SIZE_LIMIT //the span_count is guaranteed to be <= LARGE_CLASS_COUNT @@ -1294,925 +2167,71 @@ _memory_allocate_large(heap_t* heap, size_t size) { size_t span_count = size >> _memory_span_size_shift; if (size & (_memory_span_size - 1)) ++span_count; - size_t idx = span_count - 1; //Find a span in one of the cache levels - span_t* span = _memory_heap_extract_new_span(heap, span_count, SIZE_CLASS_COUNT); + span_t* span = _rpmalloc_heap_extract_new_span(heap, 0, span_count, SIZE_CLASS_LARGE); + if (!span) + return span; //Mark span as owned by this heap and set base data - assert(span->span_count == span_count); - span->size_class = (uint32_t)(SIZE_CLASS_COUNT + idx); + rpmalloc_assert(span->span_count >= span_count, "Internal failure"); + span->size_class = SIZE_CLASS_LARGE; span->heap = heap; - atomic_thread_fence_release(); + +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_add(&heap->large_huge_span, span); +#endif + ++heap->full_span_count; return pointer_offset(span, SPAN_HEADER_SIZE); } //! 
Allocate a huge block by mapping memory pages directly static void* -_memory_allocate_huge(size_t size) { +_rpmalloc_allocate_huge(heap_t* heap, size_t size) { + rpmalloc_assert(heap, "No thread heap"); + _rpmalloc_heap_cache_adopt_deferred(heap, 0); size += SPAN_HEADER_SIZE; size_t num_pages = size >> _memory_page_size_shift; if (size & (_memory_page_size - 1)) ++num_pages; size_t align_offset = 0; - span_t* span = (span_t*)_memory_map(num_pages * _memory_page_size, &align_offset); + span_t* span = (span_t*)_rpmalloc_mmap(num_pages * _memory_page_size, &align_offset); if (!span) return span; + //Store page count in span_count - span->size_class = (uint32_t)-1; + span->size_class = SIZE_CLASS_HUGE; span->span_count = (uint32_t)num_pages; span->align_offset = (uint32_t)align_offset; - _memory_statistics_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak); + span->heap = heap; + _rpmalloc_stat_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak); + +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_add(&heap->large_huge_span, span); +#endif + ++heap->full_span_count; return pointer_offset(span, SPAN_HEADER_SIZE); } -//! Allocate a block larger than medium size -static void* -_memory_allocate_oversized(heap_t* heap, size_t size) { - if (size <= LARGE_SIZE_LIMIT) - return _memory_allocate_large(heap, size); - return _memory_allocate_huge(size); -} - //! Allocate a block of the given size static void* -_memory_allocate(heap_t* heap, size_t size) { +_rpmalloc_allocate(heap_t* heap, size_t size) { + _rpmalloc_stat_add64(&_allocation_counter, 1); if (EXPECTED(size <= SMALL_SIZE_LIMIT)) - return _memory_allocate_small(heap, size); + return _rpmalloc_allocate_small(heap, size); else if (size <= _memory_medium_size_limit) - return _memory_allocate_medium(heap, size); - return _memory_allocate_oversized(heap, size); + return _rpmalloc_allocate_medium(heap, size); + else if (size <= LARGE_SIZE_LIMIT) + return _rpmalloc_allocate_large(heap, size); + return _rpmalloc_allocate_huge(heap, size); } -//! Allocate a new heap -static heap_t* -_memory_allocate_heap(void) { - void* raw_heap; - void* next_raw_heap; - uintptr_t orphan_counter; - heap_t* heap; - heap_t* next_heap; - //Try getting an orphaned heap - atomic_thread_fence_acquire(); - do { - raw_heap = atomic_load_ptr(&_memory_orphan_heaps); - heap = (heap_t*)((uintptr_t)raw_heap & ~(uintptr_t)0x1FF); - if (!heap) - break; - next_heap = heap->next_orphan; - orphan_counter = (uintptr_t)atomic_incr32(&_memory_orphan_counter); - next_raw_heap = (void*)((uintptr_t)next_heap | (orphan_counter & (uintptr_t)0x1FF)); - } while (!atomic_cas_ptr(&_memory_orphan_heaps, next_raw_heap, raw_heap)); - - if (!heap) { - //Map in pages for a new heap - size_t align_offset = 0; - heap = (heap_t*)_memory_map((1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size, &align_offset); - if (!heap) - return heap; - memset((char*)heap, 0, sizeof(heap_t)); - heap->align_offset = align_offset; - - //Get a new heap ID - do { - heap->id = atomic_incr32(&_memory_heap_id); - if (_memory_heap_lookup(heap->id)) - heap->id = 0; - } while (!heap->id); - - //Link in heap in heap ID map - size_t list_idx = heap->id % HEAP_ARRAY_SIZE; - do { - next_heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); - heap->next_heap = next_heap; - } while (!atomic_cas_ptr(&_memory_heaps[list_idx], heap, next_heap)); - } - - return heap; -} - -//! 
Deallocate the given small/medium memory block in the current thread local heap -static void -_memory_deallocate_direct(span_t* span, void* block) { - assert(span->heap == get_thread_heap_raw()); - uint32_t state = span->state; - //Add block to free list - *((void**)block) = span->free_list; - span->free_list = block; - if (UNEXPECTED(state == SPAN_STATE_ACTIVE)) - return; - uint32_t used = --span->used_count; - uint32_t free = span->list_size; - if (UNEXPECTED(used == free)) - _memory_span_release_to_cache(span->heap, span); - else if (UNEXPECTED(state == SPAN_STATE_FULL)) - _memory_span_set_full_partial(span->heap, span); -} - -//! Put the block in the deferred free list of the owning span -static void -_memory_deallocate_defer(span_t* span, void* block) { - atomic_thread_fence_acquire(); - if (span->state == SPAN_STATE_FULL) { - if ((span->list_size + 1) == span->block_count) { - //Span will be completely freed by deferred deallocations, no other thread can - //currently touch it. Safe to move to owner heap deferred cache - span_t* last_head; - heap_t* heap = span->heap; - do { - last_head = (span_t*)atomic_load_ptr(&heap->span_cache_deferred); - span->next = last_head; - } while (!atomic_cas_ptr(&heap->span_cache_deferred, span, last_head)); - return; - } - } - - void* free_list; - do { - atomic_thread_fence_acquire(); - free_list = atomic_load_ptr(&span->free_list_deferred); - *((void**)block) = free_list; - } while ((free_list == INVALID_POINTER) || !atomic_cas_ptr(&span->free_list_deferred, INVALID_POINTER, free_list)); - ++span->list_size; - atomic_store_ptr(&span->free_list_deferred, block); -} - -static void -_memory_deallocate_small_or_medium(span_t* span, void* p) { - _memory_statistics_inc_free(span->heap, span->size_class); - if (span->flags & SPAN_FLAG_ALIGNED_BLOCKS) { - //Realign pointer to block start - void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); - uint32_t block_offset = (uint32_t)pointer_diff(p, blocks_start); - p = pointer_offset(p, -(int32_t)(block_offset % span->block_size)); - } - //Check if block belongs to this heap or if deallocation should be deferred - if (span->heap == get_thread_heap_raw()) - _memory_deallocate_direct(span, p); - else - _memory_deallocate_defer(span, p); -} - -//! 
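When a span carries `SPAN_FLAG_ALIGNED_BLOCKS`, the deallocation path above first snaps the incoming pointer back to the start of its block using a modulo on the offset from the first block. A small worked example; the header size and block size here are illustrative numbers only:

```c
/* Worked example of realigning an offset pointer back to its block start,
 * as done for spans flagged with SPAN_FLAG_ALIGNED_BLOCKS. Numbers are
 * illustrative, not the real span header or class sizes. */
#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint32_t block_size   = 48;          /* hypothetical size class */
    uint32_t span_header  = 128;         /* blocks start after the span header */
    uint32_t p_offset     = span_header + 2 * block_size + 16;  /* pointer inside block 2 */
    uint32_t block_offset = p_offset - span_header;             /* offset from first block */
    uint32_t block_start  = p_offset - (block_offset % block_size);
    printf("%u\n", block_start);         /* prints: 224 = 128 + 2*48, the block's real start */
    return 0;
}
```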
Deallocate the given large memory block to the current heap -static void -_memory_deallocate_large(span_t* span) { - //Decrease counter - assert(span->span_count == ((size_t)span->size_class - SIZE_CLASS_COUNT + 1)); - assert(span->size_class >= SIZE_CLASS_COUNT); - assert(span->size_class - SIZE_CLASS_COUNT < LARGE_CLASS_COUNT); - assert(!(span->flags & SPAN_FLAG_MASTER) || !(span->flags & SPAN_FLAG_SUBSPAN)); - assert((span->flags & SPAN_FLAG_MASTER) || (span->flags & SPAN_FLAG_SUBSPAN)); - //Large blocks can always be deallocated and transferred between heaps - //Investigate if it is better to defer large spans as well through span_cache_deferred, - //possibly with some heuristics to pick either scheme at runtime per deallocation - heap_t* heap = get_thread_heap(); - if (!heap) return; -#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS - size_t idx = span->span_count - 1; - atomic_decr32(&span->heap->span_use[idx].current); -#endif - if ((span->span_count > 1) && !heap->spans_reserved) { - heap->span_reserve = span; - heap->spans_reserved = span->span_count; - if (span->flags & SPAN_FLAG_MASTER) { - heap->span_reserve_master = span; - } else { //SPAN_FLAG_SUBSPAN - uint32_t distance = span->total_spans_or_distance; - span_t* master = (span_t*)pointer_offset(span, -(int32_t)(distance * _memory_span_size)); - heap->span_reserve_master = master; - assert(master->flags & SPAN_FLAG_MASTER); - assert(atomic_load32(&master->remaining_spans) >= (int32_t)span->span_count); - } - _memory_statistics_inc(heap->span_use[idx].spans_to_reserved, 1); - } else { - //Insert into cache list - _memory_heap_cache_insert(heap, span); - } -} - -//! Deallocate the given huge span -static void -_memory_deallocate_huge(span_t* span) { - //Oversized allocation, page count is stored in span_count - size_t num_pages = span->span_count; - _memory_unmap(span, num_pages * _memory_page_size, span->align_offset, num_pages * _memory_page_size); - _memory_statistics_sub(&_huge_pages_current, num_pages); -} - -//! Deallocate the given block -static void -_memory_deallocate(void* p) { - //Grab the span (always at start of span, using span alignment) - span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask); - if (UNEXPECTED(!span)) - return; - if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) - _memory_deallocate_small_or_medium(span, p); - else if (span->size_class != (uint32_t)-1) - _memory_deallocate_large(span); - else - _memory_deallocate_huge(span); -} - -//! 
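The deallocation entry point above finds the owning span purely by pointer arithmetic: spans are mapped at span-size alignment, so masking off the low bits of any block pointer yields the span header. A tiny illustration, assuming the default 64 KiB span size:

```c
/* Tiny illustration of span lookup by alignment, assuming 64 KiB spans. */
#include <stdint.h>
#include <stdio.h>

#define SPAN_SIZE ((uintptr_t)0x10000)          /* 64 KiB */
#define SPAN_MASK (~(SPAN_SIZE - 1))

int main(void) {
    uintptr_t span_base = 0x40000000;           /* hypothetical span, 64 KiB aligned */
    uintptr_t block     = span_base + 0x1a40;   /* some block inside that span */
    uintptr_t recovered = block & SPAN_MASK;    /* clear the low 16 bits */
    printf("%d\n", recovered == span_base);     /* prints: 1 */
    return 0;
}
```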
Reallocate the given block to the given size static void* -_memory_reallocate(void* p, size_t size, size_t oldsize, unsigned int flags) { - if (p) { - //Grab the span using guaranteed span alignment - span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask); - if (span->heap) { - if (span->size_class < SIZE_CLASS_COUNT) { - //Small/medium sized block - assert(span->span_count == 1); - void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); - uint32_t block_offset = (uint32_t)pointer_diff(p, blocks_start); - uint32_t block_idx = block_offset / span->block_size; - void* block = pointer_offset(blocks_start, block_idx * span->block_size); - if (!oldsize) - oldsize = span->block_size - (uint32_t)pointer_diff(p, block); - if ((size_t)span->block_size >= size) { - //Still fits in block, never mind trying to save memory, but preserve data if alignment changed - if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE)) - memmove(block, p, oldsize); - return block; - } - } else { - //Large block - size_t total_size = size + SPAN_HEADER_SIZE; - size_t num_spans = total_size >> _memory_span_size_shift; - if (total_size & (_memory_span_mask - 1)) - ++num_spans; - size_t current_spans = span->span_count; - assert(current_spans == ((span->size_class - SIZE_CLASS_COUNT) + 1)); - void* block = pointer_offset(span, SPAN_HEADER_SIZE); - if (!oldsize) - oldsize = (current_spans * _memory_span_size) - (size_t)pointer_diff(p, block) - SPAN_HEADER_SIZE; - if ((current_spans >= num_spans) && (num_spans >= (current_spans / 2))) { - //Still fits in block, never mind trying to save memory, but preserve data if alignment changed - if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE)) - memmove(block, p, oldsize); - return block; - } - } - } else { - //Oversized block - size_t total_size = size + SPAN_HEADER_SIZE; - size_t num_pages = total_size >> _memory_page_size_shift; - if (total_size & (_memory_page_size - 1)) - ++num_pages; - //Page count is stored in span_count - size_t current_pages = span->span_count; - void* block = pointer_offset(span, SPAN_HEADER_SIZE); - if (!oldsize) - oldsize = (current_pages * _memory_page_size) - (size_t)pointer_diff(p, block) - SPAN_HEADER_SIZE; - if ((current_pages >= num_pages) && (num_pages >= (current_pages / 2))) { - //Still fits in block, never mind trying to save memory, but preserve data if alignment changed - if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE)) - memmove(block, p, oldsize); - return block; - } - } - } else { - oldsize = 0; - } - - //Size is greater than block size, need to allocate a new block and deallocate the old - heap_t* heap = get_thread_heap(); - //Avoid hysteresis by overallocating if increase is small (below 37%) - size_t lower_bound = oldsize + (oldsize >> 2) + (oldsize >> 3); - size_t new_size = (size > lower_bound) ? size : ((size > oldsize) ? lower_bound : size); - void* block = _memory_allocate(heap, new_size); - if (p && block) { - if (!(flags & RPMALLOC_NO_PRESERVE)) - memcpy(block, p, oldsize < new_size ? oldsize : new_size); - _memory_deallocate(p); - } - - return block; -} - -//! 
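When `_memory_reallocate` above does have to move the block, it avoids reallocation hysteresis by growing to at least `oldsize + oldsize/4 + oldsize/8`, i.e. a 37.5% growth floor (the in-code comment rounds this to "37%"). A worked example of that policy:

```c
/* Worked example of the 37.5% growth floor applied when a block must move. */
#include <stdio.h>
#include <stddef.h>

static size_t grow_size(size_t oldsize, size_t size) {
    size_t lower_bound = oldsize + (oldsize >> 2) + (oldsize >> 3);   /* oldsize * 1.375 */
    return (size > lower_bound) ? size : ((size > oldsize) ? lower_bound : size);
}

int main(void) {
    /* growing 1000 -> 1100 actually reserves 1375, so the next few small grows
     * (up to 1375 bytes) can be served without another move */
    printf("%zu %zu\n", grow_size(1000, 1100), grow_size(1000, 2000));
    return 0;   /* prints: 1375 2000 */
}
```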
Get the usable size of the given block -static size_t -_memory_usable_size(void* p) { - //Grab the span using guaranteed span alignment - span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask); - if (span->heap) { - //Small/medium block - if (span->size_class < SIZE_CLASS_COUNT) { - void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); - return span->block_size - ((size_t)pointer_diff(p, blocks_start) % span->block_size); - } - - //Large block - size_t current_spans = (span->size_class - SIZE_CLASS_COUNT) + 1; - return (current_spans * _memory_span_size) - (size_t)pointer_diff(p, span); - } - - //Oversized block, page count is stored in span_count - size_t current_pages = span->span_count; - return (current_pages * _memory_page_size) - (size_t)pointer_diff(p, span); -} - -//! Adjust and optimize the size class properties for the given class -static void -_memory_adjust_size_class(size_t iclass) { - size_t block_size = _memory_size_class[iclass].block_size; - size_t block_count = (_memory_span_size - SPAN_HEADER_SIZE) / block_size; - - _memory_size_class[iclass].block_count = (uint16_t)block_count; - _memory_size_class[iclass].class_idx = (uint16_t)iclass; - - //Check if previous size classes can be merged - size_t prevclass = iclass; - while (prevclass > 0) { - --prevclass; - //A class can be merged if number of pages and number of blocks are equal - if (_memory_size_class[prevclass].block_count == _memory_size_class[iclass].block_count) - memcpy(_memory_size_class + prevclass, _memory_size_class + iclass, sizeof(_memory_size_class[iclass])); - else - break; - } -} - -static void -_memory_heap_finalize(void* heapptr) { - heap_t* heap = (heap_t*)heapptr; - if (!heap) - return; - //Release thread cache spans back to global cache -#if ENABLE_THREAD_CACHE - _memory_heap_cache_adopt_deferred(heap); - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - span_t* span = heap->span_cache[iclass]; -#if ENABLE_GLOBAL_CACHE - while (span) { - assert(span->span_count == (iclass + 1)); - size_t release_count = (!iclass ? 
_memory_span_release_count : _memory_span_release_count_large); - span_t* next = _memory_span_list_split(span, (uint32_t)release_count); -#if ENABLE_STATISTICS - heap->thread_to_global += (size_t)span->list_size * span->span_count * _memory_span_size; - heap->span_use[iclass].spans_to_global += span->list_size; -#endif - _memory_global_cache_insert(span); - span = next; - } -#else - if (span) - _memory_unmap_span_list(span); -#endif - heap->span_cache[iclass] = 0; - } -#endif - - //Orphan the heap - void* raw_heap; - uintptr_t orphan_counter; - heap_t* last_heap; - do { - last_heap = (heap_t*)atomic_load_ptr(&_memory_orphan_heaps); - heap->next_orphan = (heap_t*)((uintptr_t)last_heap & ~(uintptr_t)0x1FF); - orphan_counter = (uintptr_t)atomic_incr32(&_memory_orphan_counter); - raw_heap = (void*)((uintptr_t)heap | (orphan_counter & (uintptr_t)0x1FF)); - } while (!atomic_cas_ptr(&_memory_orphan_heaps, raw_heap, last_heap)); - - set_thread_heap(0); - -#if ENABLE_STATISTICS - atomic_decr32(&_memory_active_heaps); - assert(atomic_load32(&_memory_active_heaps) >= 0); -#endif -} - -#if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) -#include -static DWORD fls_key; -static void NTAPI -rp_thread_destructor(void* value) { - if (value) - rpmalloc_thread_finalize(); -} -#endif - -#if PLATFORM_POSIX -# include -# include -# ifdef __FreeBSD__ -# include -# define MAP_HUGETLB MAP_ALIGNED_SUPER -# endif -# ifndef MAP_UNINITIALIZED -# define MAP_UNINITIALIZED 0 -# endif -#endif -#include - -//! Initialize the allocator and setup global data -TRACY_API int -rpmalloc_initialize(void) { - if (_rpmalloc_initialized) { - rpmalloc_thread_initialize(); - return 0; - } - memset(&_memory_config, 0, sizeof(rpmalloc_config_t)); - return rpmalloc_initialize_config(0); -} - -int -rpmalloc_initialize_config(const rpmalloc_config_t* config) { - if (_rpmalloc_initialized) { - rpmalloc_thread_initialize(); - return 0; - } - _rpmalloc_initialized = 1; - - if (config) - memcpy(&_memory_config, config, sizeof(rpmalloc_config_t)); - - if (!_memory_config.memory_map || !_memory_config.memory_unmap) { - _memory_config.memory_map = _memory_map_os; - _memory_config.memory_unmap = _memory_unmap_os; - } - -#if RPMALLOC_CONFIGURABLE - _memory_page_size = _memory_config.page_size; -#else - _memory_page_size = 0; -#endif - _memory_huge_pages = 0; - _memory_map_granularity = _memory_page_size; - if (!_memory_page_size) { -#if PLATFORM_WINDOWS - SYSTEM_INFO system_info; - memset(&system_info, 0, sizeof(system_info)); - GetSystemInfo(&system_info); - _memory_page_size = system_info.dwPageSize; - _memory_map_granularity = system_info.dwAllocationGranularity; - if (config && config->enable_huge_pages) { - HANDLE token = 0; - size_t large_page_minimum = GetLargePageMinimum(); - if (large_page_minimum) - OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &token); - if (token) { - LUID luid; - if (LookupPrivilegeValue(0, SE_LOCK_MEMORY_NAME, &luid)) { - TOKEN_PRIVILEGES token_privileges; - memset(&token_privileges, 0, sizeof(token_privileges)); - token_privileges.PrivilegeCount = 1; - token_privileges.Privileges[0].Luid = luid; - token_privileges.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; - if (AdjustTokenPrivileges(token, FALSE, &token_privileges, 0, 0, 0)) { - DWORD err = GetLastError(); - if (err == ERROR_SUCCESS) { - _memory_huge_pages = 1; - _memory_page_size = large_page_minimum; - _memory_map_granularity = large_page_minimum; - } - } - } - 
CloseHandle(token); - } - } -#else - _memory_page_size = (size_t)sysconf(_SC_PAGESIZE); - _memory_map_granularity = _memory_page_size; - if (config && config->enable_huge_pages) { -#if defined(__linux__) - size_t huge_page_size = 0; - FILE* meminfo = fopen("/proc/meminfo", "r"); - if (meminfo) { - char line[128]; - while (!huge_page_size && fgets(line, sizeof(line) - 1, meminfo)) { - line[sizeof(line) - 1] = 0; - if (strstr(line, "Hugepagesize:")) - huge_page_size = (size_t)strtol(line + 13, 0, 10) * 1024; - } - fclose(meminfo); - } - if (huge_page_size) { - _memory_huge_pages = 1; - _memory_page_size = huge_page_size; - _memory_map_granularity = huge_page_size; - } -#elif defined(__FreeBSD__) - int rc; - size_t sz = sizeof(rc); - - if (sysctlbyname("vm.pmap.pg_ps_enabled", &rc, &sz, NULL, 0) == 0 && rc == 1) { - _memory_huge_pages = 1; - _memory_page_size = 2 * 1024 * 1024; - _memory_map_granularity = _memory_page_size; - } -#elif defined(__APPLE__) - _memory_huge_pages = 1; - _memory_page_size = 2 * 1024 * 1024; - _memory_map_granularity = _memory_page_size; -#endif - } -#endif - } else { - if (config && config->enable_huge_pages) - _memory_huge_pages = 1; - } - - //The ABA counter in heap orphan list is tied to using 512 (bitmask 0x1FF) - if (_memory_page_size < 512) - _memory_page_size = 512; - if (_memory_page_size > (64 * 1024 * 1024)) - _memory_page_size = (64 * 1024 * 1024); - _memory_page_size_shift = 0; - size_t page_size_bit = _memory_page_size; - while (page_size_bit != 1) { - ++_memory_page_size_shift; - page_size_bit >>= 1; - } - _memory_page_size = ((size_t)1 << _memory_page_size_shift); - -#if RPMALLOC_CONFIGURABLE - size_t span_size = _memory_config.span_size; - if (!span_size) - span_size = (64 * 1024); - if (span_size > (256 * 1024)) - span_size = (256 * 1024); - _memory_span_size = 4096; - _memory_span_size_shift = 12; - while (_memory_span_size < span_size) { - _memory_span_size <<= 1; - ++_memory_span_size_shift; - } - _memory_span_mask = ~(uintptr_t)(_memory_span_size - 1); -#endif - - _memory_span_map_count = ( _memory_config.span_map_count ? _memory_config.span_map_count : DEFAULT_SPAN_MAP_COUNT); - if ((_memory_span_size * _memory_span_map_count) < _memory_page_size) - _memory_span_map_count = (_memory_page_size / _memory_span_size); - if ((_memory_page_size >= _memory_span_size) && ((_memory_span_map_count * _memory_span_size) % _memory_page_size)) - _memory_span_map_count = (_memory_page_size / _memory_span_size); - - _memory_config.page_size = _memory_page_size; - _memory_config.span_size = _memory_span_size; - _memory_config.span_map_count = _memory_span_map_count; - _memory_config.enable_huge_pages = _memory_huge_pages; - - _memory_span_release_count = (_memory_span_map_count > 4 ? ((_memory_span_map_count < 64) ? _memory_span_map_count : 64) : 4); - _memory_span_release_count_large = (_memory_span_release_count > 8 ? 
(_memory_span_release_count / 4) : 2); - -#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD - if (pthread_key_create(&_memory_thread_heap, _memory_heap_finalize)) - return -1; -#endif -#if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) - fls_key = FlsAlloc(&rp_thread_destructor); -#endif - - atomic_store32(&_memory_heap_id, 0); - atomic_store32(&_memory_orphan_counter, 0); -#if ENABLE_STATISTICS - atomic_store32(&_memory_active_heaps, 0); - atomic_store32(&_reserved_spans, 0); - atomic_store32(&_mapped_pages, 0); - _mapped_pages_peak = 0; - atomic_store32(&_mapped_total, 0); - atomic_store32(&_unmapped_total, 0); - atomic_store32(&_mapped_pages_os, 0); - atomic_store32(&_huge_pages_current, 0); - _huge_pages_peak = 0; -#endif - - //Setup all small and medium size classes - size_t iclass = 0; - _memory_size_class[iclass].block_size = SMALL_GRANULARITY; - _memory_adjust_size_class(iclass); - for (iclass = 1; iclass < SMALL_CLASS_COUNT; ++iclass) { - size_t size = iclass * SMALL_GRANULARITY; - _memory_size_class[iclass].block_size = (uint32_t)size; - _memory_adjust_size_class(iclass); - } - //At least two blocks per span, then fall back to large allocations - _memory_medium_size_limit = (_memory_span_size - SPAN_HEADER_SIZE) >> 1; - if (_memory_medium_size_limit > MEDIUM_SIZE_LIMIT) - _memory_medium_size_limit = MEDIUM_SIZE_LIMIT; - for (iclass = 0; iclass < MEDIUM_CLASS_COUNT; ++iclass) { - size_t size = SMALL_SIZE_LIMIT + ((iclass + 1) * MEDIUM_GRANULARITY); - if (size > _memory_medium_size_limit) - break; - _memory_size_class[SMALL_CLASS_COUNT + iclass].block_size = (uint32_t)size; - _memory_adjust_size_class(SMALL_CLASS_COUNT + iclass); - } - - for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) - atomic_store_ptr(&_memory_heaps[list_idx], 0); - - //Initialize this thread - rpmalloc_thread_initialize(); - return 0; -} - -//! 
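The page-size handling in the removed rpmalloc_initialize_config() above (and in its rewritten counterpart later in this diff) clamps the page size and then forces it to a power of two by counting right-shifts down to 1 and rebuilding the value as 1 << shift, which rounds any odd input down. A minimal sketch of that rounding, with hypothetical names:

    #include <stddef.h>
    #include <assert.h>

    /* Count how many times the page size can be shifted right until it
     * reaches 1, then rebuild it as 1 << shift. Any non-power-of-two input
     * is effectively rounded down to the previous power of two. */
    static size_t round_page_size(size_t page_size, size_t* shift_out) {
        size_t shift = 0;
        size_t bit = page_size;
        while (bit != 1) {
            ++shift;
            bit >>= 1;
        }
        *shift_out = shift;
        return (size_t)1 << shift;
    }

    int main(void) {
        size_t shift;
        assert(round_page_size(4096, &shift) == 4096 && shift == 12);
        assert(round_page_size(5000, &shift) == 4096);  /* rounded down */
        return 0;
    }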
Finalize the allocator -TRACY_API void -rpmalloc_finalize(void) { - atomic_thread_fence_acquire(); - - rpmalloc_thread_finalize(); - //rpmalloc_dump_statistics(stderr); - - //Free all thread caches - for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { - heap_t* heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); - while (heap) { - if (heap->spans_reserved) { - span_t* span = _memory_map_spans(heap, heap->spans_reserved); - _memory_unmap_span(span); - } - - for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { - heap_class_t* heap_class = heap->span_class + iclass; - span_t* span = heap_class->partial_span; - while (span) { - span_t* next = span->next; - if (span->state == SPAN_STATE_ACTIVE) { - uint32_t used_blocks = span->block_count; - if (span->free_list_limit < span->block_count) - used_blocks = span->free_list_limit; - uint32_t free_blocks = 0; - void* block = heap_class->free_list; - while (block) { - ++free_blocks; - block = *((void**)block); - } - block = span->free_list; - while (block) { - ++free_blocks; - block = *((void**)block); - } - if (used_blocks == (free_blocks + span->list_size)) - _memory_heap_cache_insert(heap, span); - } else { - if (span->used_count == span->list_size) - _memory_heap_cache_insert(heap, span); - } - span = next; - } - } - -#if ENABLE_THREAD_CACHE - //Free span caches (other thread might have deferred after the thread using this heap finalized) - _memory_heap_cache_adopt_deferred(heap); - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - if (heap->span_cache[iclass]) - _memory_unmap_span_list(heap->span_cache[iclass]); - } -#endif - heap_t* next_heap = heap->next_heap; - size_t heap_size = (1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size; - _memory_unmap(heap, heap_size, heap->align_offset, heap_size); - heap = next_heap; - } - } - -#if ENABLE_GLOBAL_CACHE - //Free global caches - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) - _memory_cache_finalize(&_memory_span_cache[iclass]); -#endif - - atomic_store_ptr(&_memory_orphan_heaps, 0); - atomic_thread_fence_release(); - -#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD - pthread_key_delete(_memory_thread_heap); -#endif -#if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) - FlsFree(fls_key); -#endif - -#if ENABLE_STATISTICS - //If you hit these asserts you probably have memory leaks or double frees in your code - assert(!atomic_load32(&_mapped_pages)); - assert(!atomic_load32(&_reserved_spans)); - assert(!atomic_load32(&_mapped_pages_os)); -#endif - - _rpmalloc_initialized = 0; -} - -//! Initialize thread, assign heap -TRACY_API void -rpmalloc_thread_initialize(void) { - if (!get_thread_heap_raw()) { - heap_t* heap = _memory_allocate_heap(); - if (heap) { - atomic_thread_fence_acquire(); -#if ENABLE_STATISTICS - atomic_incr32(&_memory_active_heaps); -#endif - set_thread_heap(heap); -#if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) - FlsSetValue(fls_key, heap); -#endif - } - } -} - -//! Finalize thread, orphan heap -TRACY_API void -rpmalloc_thread_finalize(void) { - heap_t* heap = get_thread_heap_raw(); - if (heap) - _memory_heap_finalize(heap); -} - -int -rpmalloc_is_thread_initialized(void) { - return (get_thread_heap_raw() != 0) ? 1 : 0; -} - -const rpmalloc_config_t* -rpmalloc_config(void) { - return &_memory_config; -} - -//! 
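One detail worth calling out from the removed _memory_heap_finalize() above: the orphan-heap list head is updated with a CAS on a tagged pointer whose low 9 bits (mask 0x1FF) carry a wrapping counter; that is why the removed code clamps the page size to at least 512 bytes, and it is what protects the lock-free orphan list against ABA. A simplified, hypothetical illustration of the tagging scheme (not the patch's code):

    #include <stdint.h>
    #include <stdio.h>

    #define TAG_MASK ((uintptr_t)0x1FF)  /* low 9 bits; needs >= 512-byte alignment */

    /* Pack a pointer and a wrapping counter into one word for a tagged CAS. */
    static void* tag_ptr(void* heap, uintptr_t counter) {
        return (void*)(((uintptr_t)heap & ~TAG_MASK) | (counter & TAG_MASK));
    }

    static void* untag_ptr(void* tagged) {
        return (void*)((uintptr_t)tagged & ~TAG_MASK);
    }

    int main(void) {
        /* A heap placed on a 4096-byte boundary leaves the low bits zero. */
        static _Alignas(4096) unsigned char heap_storage[4096];
        void* tagged = tag_ptr(heap_storage, 123);
        printf("tag=%lu same_ptr=%d\n",
               (unsigned long)((uintptr_t)tagged & TAG_MASK),
               untag_ptr(tagged) == (void*)heap_storage);
        return 0;
    }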
Map new pages to virtual memory -static void* -_memory_map_os(size_t size, size_t* offset) { - //Either size is a heap (a single page) or a (multiple) span - we only need to align spans, and only if larger than map granularity - size_t padding = ((size >= _memory_span_size) && (_memory_span_size > _memory_map_granularity)) ? _memory_span_size : 0; - assert(size >= _memory_page_size); -#if PLATFORM_WINDOWS - //Ok to MEM_COMMIT - according to MSDN, "actual physical pages are not allocated unless/until the virtual addresses are actually accessed" - void* ptr = VirtualAlloc(0, size + padding, (_memory_huge_pages ? MEM_LARGE_PAGES : 0) | MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); - if (!ptr) { - assert(!"Failed to map virtual memory block"); - return 0; - } -#else - int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED; -# if defined(__APPLE__) - int fd = (int)VM_MAKE_TAG(240U); - if (_memory_huge_pages) - fd |= VM_FLAGS_SUPERPAGE_SIZE_2MB; - void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, fd, 0); -# elif defined(MAP_HUGETLB) - void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, (_memory_huge_pages ? MAP_HUGETLB : 0) | flags, -1, 0); -# else - void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, -1, 0); -# endif - if ((ptr == MAP_FAILED) || !ptr) { - assert("Failed to map virtual memory block" == 0); - return 0; - } -#endif -#if ENABLE_STATISTICS - atomic_add32(&_mapped_pages_os, (int32_t)((size + padding) >> _memory_page_size_shift)); -#endif - if (padding) { - size_t final_padding = padding - ((uintptr_t)ptr & ~_memory_span_mask); - assert(final_padding <= _memory_span_size); - assert(final_padding <= padding); - assert(!(final_padding % 8)); - ptr = pointer_offset(ptr, final_padding); - *offset = final_padding >> 3; - } - assert((size < _memory_span_size) || !((uintptr_t)ptr & ~_memory_span_mask)); - return ptr; -} - -//! Unmap pages from virtual memory -static void -_memory_unmap_os(void* address, size_t size, size_t offset, size_t release) { - assert(release || (offset == 0)); - assert(!release || (release >= _memory_page_size)); - assert(size >= _memory_page_size); - if (release && offset) { - offset <<= 3; - address = pointer_offset(address, -(int32_t)offset); -#if PLATFORM_POSIX - //Padding is always one span size - release += _memory_span_size; -#endif - } -#if !DISABLE_UNMAP -#if PLATFORM_WINDOWS - if (!VirtualFree(address, release ? 0 : size, release ? 
MEM_RELEASE : MEM_DECOMMIT)) { - assert(!"Failed to unmap virtual memory block"); - } -#else - if (release) { - if (munmap(address, release)) { - assert("Failed to unmap virtual memory block" == 0); - } - } - else { -#if defined(POSIX_MADV_FREE) - if (posix_madvise(address, size, POSIX_MADV_FREE)) -#endif -#if defined(POSIX_MADV_DONTNEED) - if (posix_madvise(address, size, POSIX_MADV_DONTNEED)) { - assert("Failed to madvise virtual memory block as free" == 0); - } -#endif - } -#endif -#endif -#if ENABLE_STATISTICS - if (release) - atomic_add32(&_mapped_pages_os, -(int32_t)(release >> _memory_page_size_shift)); -#endif -} - -// Extern interface - -TRACY_API RPMALLOC_ALLOCATOR void* -rpmalloc(size_t size) { -#if ENABLE_VALIDATE_ARGS - if (size >= MAX_ALLOC_SIZE) { - errno = EINVAL; - return 0; - } -#endif - heap_t* heap = get_thread_heap(); - return _memory_allocate(heap, size); -} - -TRACY_API void -rpfree(void* ptr) { - _memory_deallocate(ptr); -} - -extern inline RPMALLOC_ALLOCATOR void* -rpcalloc(size_t num, size_t size) { - size_t total; -#if ENABLE_VALIDATE_ARGS -#if PLATFORM_WINDOWS - int err = SizeTMult(num, size, &total); - if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { - errno = EINVAL; - return 0; - } -#else - int err = __builtin_umull_overflow(num, size, &total); - if (err || (total >= MAX_ALLOC_SIZE)) { - errno = EINVAL; - return 0; - } -#endif -#else - total = num * size; -#endif - heap_t* heap = get_thread_heap(); - void* block = _memory_allocate(heap, total); - memset(block, 0, total); - return block; -} - -TRACY_API RPMALLOC_ALLOCATOR void* -rprealloc(void* ptr, size_t size) { -#if ENABLE_VALIDATE_ARGS - if (size >= MAX_ALLOC_SIZE) { - errno = EINVAL; - return ptr; - } -#endif - return _memory_reallocate(ptr, size, 0, 0); -} - -extern RPMALLOC_ALLOCATOR void* -rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize, - unsigned int flags) { -#if ENABLE_VALIDATE_ARGS - if ((size + alignment < size) || (alignment > _memory_page_size)) { - errno = EINVAL; - return 0; - } -#endif - void* block; - if (alignment > 32) { - size_t usablesize = _memory_usable_size(ptr); - if ((usablesize >= size) && (size >= (usablesize / 2)) && !((uintptr_t)ptr & (alignment - 1))) - return ptr; - - block = rpaligned_alloc(alignment, size); - if (ptr) { - if (!oldsize) - oldsize = usablesize; - if (!(flags & RPMALLOC_NO_PRESERVE)) - memcpy(block, ptr, oldsize < size ? oldsize : size); - rpfree(ptr); - } - //Mark as having aligned blocks - span_t* span = (span_t*)((uintptr_t)block & _memory_span_mask); - span->flags |= SPAN_FLAG_ALIGNED_BLOCKS; - } else { - block = _memory_reallocate(ptr, size, oldsize, flags); - } - return block; -} - -extern RPMALLOC_ALLOCATOR void* -rpaligned_alloc(size_t alignment, size_t size) { - if (alignment <= 16) - return rpmalloc(size); +_rpmalloc_aligned_allocate(heap_t* heap, size_t alignment, size_t size) { + if (alignment <= SMALL_GRANULARITY) + return _rpmalloc_allocate(heap, size); #if ENABLE_VALIDATE_ARGS if ((size + alignment) < size) { @@ -2225,15 +2244,26 @@ rpaligned_alloc(size_t alignment, size_t size) { } #endif + if ((alignment <= SPAN_HEADER_SIZE) && (size < _memory_medium_size_limit)) { + // If alignment is less or equal to span header size (which is power of two), + // and size aligned to span header size multiples is less than size + alignment, + // then use natural alignment of blocks to provide alignment + size_t multiple_size = size ? 
(size + (SPAN_HEADER_SIZE - 1)) & ~(uintptr_t)(SPAN_HEADER_SIZE - 1) : SPAN_HEADER_SIZE; + rpmalloc_assert(!(multiple_size % SPAN_HEADER_SIZE), "Failed alignment calculation"); + if (multiple_size <= (size + alignment)) + return _rpmalloc_allocate(heap, multiple_size); + } + void* ptr = 0; size_t align_mask = alignment - 1; - if (alignment < _memory_page_size) { - ptr = rpmalloc(size + alignment); - if ((uintptr_t)ptr & align_mask) + if (alignment <= _memory_page_size) { + ptr = _rpmalloc_allocate(heap, size + alignment); + if ((uintptr_t)ptr & align_mask) { ptr = (void*)(((uintptr_t)ptr & ~(uintptr_t)align_mask) + alignment); - //Mark as having aligned blocks - span_t* span = (span_t*)((uintptr_t)ptr & _memory_span_mask); - span->flags |= SPAN_FLAG_ALIGNED_BLOCKS; + //Mark as having aligned blocks + span_t* span = (span_t*)((uintptr_t)ptr & _memory_span_mask); + span->flags |= SPAN_FLAG_ALIGNED_BLOCKS; + } return ptr; } @@ -2277,7 +2307,7 @@ retry: align_offset = 0; mapped_size = num_pages * _memory_page_size; - span = (span_t*)_memory_map(mapped_size, &align_offset); + span = (span_t*)_rpmalloc_mmap(mapped_size, &align_offset); if (!span) { errno = ENOMEM; return 0; @@ -2290,7 +2320,7 @@ retry: if (((size_t)pointer_diff(ptr, span) >= _memory_span_size) || (pointer_offset(ptr, size) > pointer_offset(span, mapped_size)) || (((uintptr_t)ptr & _memory_span_mask) != (uintptr_t)span)) { - _memory_unmap(span, mapped_size, align_offset, mapped_size); + _rpmalloc_unmap(span, mapped_size, align_offset, mapped_size); ++num_pages; if (num_pages > limit_pages) { errno = EINVAL; @@ -2300,14 +2330,774 @@ retry: } //Store page count in span_count - span->size_class = (uint32_t)-1; + span->size_class = SIZE_CLASS_HUGE; span->span_count = (uint32_t)num_pages; span->align_offset = (uint32_t)align_offset; - _memory_statistics_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak); + span->heap = heap; + _rpmalloc_stat_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak); + +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_add(&heap->large_huge_span, span); +#endif + ++heap->full_span_count; + + _rpmalloc_stat_add64(&_allocation_counter, 1); return ptr; } + +//////////// +/// +/// Deallocation entry points +/// +////// + +//! 
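All of the deallocation entry points that follow share the same first step: mask the low bits off the user pointer to recover the owning span header (spans are always mapped on _memory_span_size boundaries) and then branch on span->size_class to choose the small/medium, large or huge path. A compact sketch of that dispatch; the constants and types are illustrative stand-ins, and the demo uses a 4 KiB span so a statically aligned buffer is enough (the allocator's default span is 64 KiB):

    #include <stdint.h>

    #define SPAN_SIZE        ((uintptr_t)4096)
    #define SPAN_MASK        (~(SPAN_SIZE - 1))
    #define SIZE_CLASS_COUNT 119u   /* small + medium classes (illustrative) */
    #define SIZE_CLASS_LARGE 120u   /* sentinel for large spans (illustrative) */

    typedef struct { uint32_t size_class; } span_hdr_t;
    typedef enum { FREE_SMALL_MEDIUM, FREE_LARGE, FREE_HUGE } free_path_t;

    /* Mask the pointer down to the span start and route on its size class,
     * mirroring the shape of _rpmalloc_deallocate(). */
    static free_path_t classify(void* p) {
        span_hdr_t* span = (span_hdr_t*)((uintptr_t)p & SPAN_MASK);
        if (span->size_class < SIZE_CLASS_COUNT)
            return FREE_SMALL_MEDIUM;
        if (span->size_class == SIZE_CLASS_LARGE)
            return FREE_LARGE;
        return FREE_HUGE;   /* everything else is an oversized, page-backed span */
    }

    int main(void) {
        static _Alignas(4096) unsigned char span[4096];
        ((span_hdr_t*)span)->size_class = 3;     /* pretend: a small-block span */
        void* user_ptr = span + 512;             /* some block inside the span */
        return classify(user_ptr) == FREE_SMALL_MEDIUM ? 0 : 1;
    }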
Deallocate the given small/medium memory block in the current thread local heap +static void +_rpmalloc_deallocate_direct_small_or_medium(span_t* span, void* block) { + heap_t* heap = span->heap; + rpmalloc_assert(heap->owner_thread == get_thread_id() || !heap->owner_thread || heap->finalize, "Internal failure"); + //Add block to free list + if (UNEXPECTED(_rpmalloc_span_is_fully_utilized(span))) { + span->used_count = span->block_count; +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_remove(&heap->full_span[span->size_class], span); +#endif + _rpmalloc_span_double_link_list_add(&heap->size_class[span->size_class].partial_span, span); + --heap->full_span_count; + } + *((void**)block) = span->free_list; + --span->used_count; + span->free_list = block; + if (UNEXPECTED(span->used_count == span->list_size)) { + // If there are no used blocks it is guaranteed that no other external thread is accessing the span + if (span->used_count) { + // Make sure we have synchronized the deferred list and list size by using acquire semantics + // and guarantee that no external thread is accessing span concurrently + void* free_list; + do { + free_list = atomic_exchange_ptr_acquire(&span->free_list_deferred, INVALID_POINTER); + } while (free_list == INVALID_POINTER); + atomic_store_ptr_release(&span->free_list_deferred, free_list); + } + _rpmalloc_span_double_link_list_remove(&heap->size_class[span->size_class].partial_span, span); + _rpmalloc_span_release_to_cache(heap, span); + } +} + +static void +_rpmalloc_deallocate_defer_free_span(heap_t* heap, span_t* span) { + if (span->size_class != SIZE_CLASS_HUGE) + _rpmalloc_stat_inc(&heap->span_use[span->span_count - 1].spans_deferred); + //This list does not need ABA protection, no mutable side state + do { + span->free_list = (void*)atomic_load_ptr(&heap->span_free_deferred); + } while (!atomic_cas_ptr(&heap->span_free_deferred, span, span->free_list)); +} + +//! Put the block in the deferred free list of the owning span +static void +_rpmalloc_deallocate_defer_small_or_medium(span_t* span, void* block) { + // The memory ordering here is a bit tricky, to avoid having to ABA protect + // the deferred free list to avoid desynchronization of list and list size + // we need to have acquire semantics on successful CAS of the pointer to + // guarantee the list_size variable validity + release semantics on pointer store + void* free_list; + do { + free_list = atomic_exchange_ptr_acquire(&span->free_list_deferred, INVALID_POINTER); + } while (free_list == INVALID_POINTER); + *((void**)block) = free_list; + uint32_t free_count = ++span->list_size; + int all_deferred_free = (free_count == span->block_count); + atomic_store_ptr_release(&span->free_list_deferred, block); + if (all_deferred_free) { + // Span was completely freed by this block. Due to the INVALID_POINTER spin lock + // no other thread can reach this state simultaneously on this span. 
+ // Safe to move to owner heap deferred cache + _rpmalloc_deallocate_defer_free_span(span->heap, span); + } +} + +static void +_rpmalloc_deallocate_small_or_medium(span_t* span, void* p) { + _rpmalloc_stat_inc_free(span->heap, span->size_class); + if (span->flags & SPAN_FLAG_ALIGNED_BLOCKS) { + //Realign pointer to block start + void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); + uint32_t block_offset = (uint32_t)pointer_diff(p, blocks_start); + p = pointer_offset(p, -(int32_t)(block_offset % span->block_size)); + } + //Check if block belongs to this heap or if deallocation should be deferred +#if RPMALLOC_FIRST_CLASS_HEAPS + int defer = (span->heap->owner_thread && (span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); +#else + int defer = ((span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); +#endif + if (!defer) + _rpmalloc_deallocate_direct_small_or_medium(span, p); + else + _rpmalloc_deallocate_defer_small_or_medium(span, p); +} + +//! Deallocate the given large memory block to the current heap +static void +_rpmalloc_deallocate_large(span_t* span) { + rpmalloc_assert(span->size_class == SIZE_CLASS_LARGE, "Bad span size class"); + rpmalloc_assert(!(span->flags & SPAN_FLAG_MASTER) || !(span->flags & SPAN_FLAG_SUBSPAN), "Span flag corrupted"); + rpmalloc_assert((span->flags & SPAN_FLAG_MASTER) || (span->flags & SPAN_FLAG_SUBSPAN), "Span flag corrupted"); + //We must always defer (unless finalizing) if from another heap since we cannot touch the list or counters of another heap +#if RPMALLOC_FIRST_CLASS_HEAPS + int defer = (span->heap->owner_thread && (span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); +#else + int defer = ((span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); +#endif + if (defer) { + _rpmalloc_deallocate_defer_free_span(span->heap, span); + return; + } + rpmalloc_assert(span->heap->full_span_count, "Heap span counter corrupted"); + --span->heap->full_span_count; +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_remove(&span->heap->large_huge_span, span); +#endif +#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS + //Decrease counter + size_t idx = span->span_count - 1; + atomic_decr32(&span->heap->span_use[idx].current); +#endif + heap_t* heap = span->heap; + rpmalloc_assert(heap, "No thread heap"); +#if ENABLE_THREAD_CACHE + const int set_as_reserved = ((span->span_count > 1) && (heap->span_cache.count == 0) && !heap->finalize && !heap->spans_reserved); +#else + const int set_as_reserved = ((span->span_count > 1) && !heap->finalize && !heap->spans_reserved); +#endif + if (set_as_reserved) { + heap->span_reserve = span; + heap->spans_reserved = span->span_count; + if (span->flags & SPAN_FLAG_MASTER) { + heap->span_reserve_master = span; + } else { //SPAN_FLAG_SUBSPAN + span_t* master = (span_t*)pointer_offset(span, -(intptr_t)((size_t)span->offset_from_master * _memory_span_size)); + heap->span_reserve_master = master; + rpmalloc_assert(master->flags & SPAN_FLAG_MASTER, "Span flag corrupted"); + rpmalloc_assert(atomic_load32(&master->remaining_spans) >= (int32_t)span->span_count, "Master span count corrupted"); + } + _rpmalloc_stat_inc(&heap->span_use[idx].spans_to_reserved); + } else { + //Insert into cache list + _rpmalloc_heap_cache_insert(heap, span); + } +} + +//! 
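The deferred-free path above leans on a small trick: atomically exchanging the sentinel INVALID_POINTER into span->free_list_deferred both fetches the current list head and locks the list, so the head pointer and list_size can be updated together and then published with a release store, with no ABA protection needed. A hedged C11 sketch of the same pattern, with hypothetical names and no yield/back-off:

    #include <stdatomic.h>
    #include <stddef.h>
    #include <stdint.h>

    #define LIST_LOCKED ((void*)((uintptr_t)-1))   /* stand-in for INVALID_POINTER */

    typedef struct block { struct block* next; } block_t;

    typedef struct {
        _Atomic(void*) head;     /* deferred free list head, or LIST_LOCKED */
        uint32_t       count;    /* only touched while the head is "locked" */
    } deferred_list_t;

    static void deferred_push(deferred_list_t* list, block_t* block) {
        void* head;
        /* Acquire: take the current head and leave the sentinel behind. */
        do {
            head = atomic_exchange_explicit(&list->head, LIST_LOCKED, memory_order_acquire);
        } while (head == LIST_LOCKED);
        block->next = (block_t*)head;
        ++list->count;
        /* Release: publish the new head, unlocking the list. */
        atomic_store_explicit(&list->head, (void*)block, memory_order_release);
    }

    int main(void) {
        deferred_list_t list = { NULL, 0 };
        block_t a, b;
        deferred_push(&list, &a);
        deferred_push(&list, &b);
        return list.count == 2 ? 0 : 1;
    }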
Deallocate the given huge span +static void +_rpmalloc_deallocate_huge(span_t* span) { + rpmalloc_assert(span->heap, "No span heap"); +#if RPMALLOC_FIRST_CLASS_HEAPS + int defer = (span->heap->owner_thread && (span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); +#else + int defer = ((span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); +#endif + if (defer) { + _rpmalloc_deallocate_defer_free_span(span->heap, span); + return; + } + rpmalloc_assert(span->heap->full_span_count, "Heap span counter corrupted"); + --span->heap->full_span_count; +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_remove(&span->heap->large_huge_span, span); +#endif + + //Oversized allocation, page count is stored in span_count + size_t num_pages = span->span_count; + _rpmalloc_unmap(span, num_pages * _memory_page_size, span->align_offset, num_pages * _memory_page_size); + _rpmalloc_stat_sub(&_huge_pages_current, num_pages); +} + +//! Deallocate the given block +static void +_rpmalloc_deallocate(void* p) { + _rpmalloc_stat_add64(&_deallocation_counter, 1); + //Grab the span (always at start of span, using span alignment) + span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask); + if (UNEXPECTED(!span)) + return; + if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) + _rpmalloc_deallocate_small_or_medium(span, p); + else if (span->size_class == SIZE_CLASS_LARGE) + _rpmalloc_deallocate_large(span); + else + _rpmalloc_deallocate_huge(span); +} + +//////////// +/// +/// Reallocation entry points +/// +////// + +static size_t +_rpmalloc_usable_size(void* p); + +//! Reallocate the given block to the given size +static void* +_rpmalloc_reallocate(heap_t* heap, void* p, size_t size, size_t oldsize, unsigned int flags) { + if (p) { + //Grab the span using guaranteed span alignment + span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask); + if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) { + //Small/medium sized block + rpmalloc_assert(span->span_count == 1, "Span counter corrupted"); + void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); + uint32_t block_offset = (uint32_t)pointer_diff(p, blocks_start); + uint32_t block_idx = block_offset / span->block_size; + void* block = pointer_offset(blocks_start, (size_t)block_idx * span->block_size); + if (!oldsize) + oldsize = (size_t)((ptrdiff_t)span->block_size - pointer_diff(p, block)); + if ((size_t)span->block_size >= size) { + //Still fits in block, never mind trying to save memory, but preserve data if alignment changed + if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE)) + memmove(block, p, oldsize); + return block; + } + } else if (span->size_class == SIZE_CLASS_LARGE) { + //Large block + size_t total_size = size + SPAN_HEADER_SIZE; + size_t num_spans = total_size >> _memory_span_size_shift; + if (total_size & (_memory_span_mask - 1)) + ++num_spans; + size_t current_spans = span->span_count; + void* block = pointer_offset(span, SPAN_HEADER_SIZE); + if (!oldsize) + oldsize = (current_spans * _memory_span_size) - (size_t)pointer_diff(p, block) - SPAN_HEADER_SIZE; + if ((current_spans >= num_spans) && (total_size >= (oldsize / 2))) { + //Still fits in block, never mind trying to save memory, but preserve data if alignment changed + if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE)) + memmove(block, p, oldsize); + return block; + } + } else { + //Oversized block + size_t total_size = size + SPAN_HEADER_SIZE; + size_t num_pages = total_size >> _memory_page_size_shift; + if (total_size & (_memory_page_size 
- 1)) + ++num_pages; + //Page count is stored in span_count + size_t current_pages = span->span_count; + void* block = pointer_offset(span, SPAN_HEADER_SIZE); + if (!oldsize) + oldsize = (current_pages * _memory_page_size) - (size_t)pointer_diff(p, block) - SPAN_HEADER_SIZE; + if ((current_pages >= num_pages) && (num_pages >= (current_pages / 2))) { + //Still fits in block, never mind trying to save memory, but preserve data if alignment changed + if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE)) + memmove(block, p, oldsize); + return block; + } + } + } else { + oldsize = 0; + } + + if (!!(flags & RPMALLOC_GROW_OR_FAIL)) + return 0; + + //Size is greater than block size, need to allocate a new block and deallocate the old + //Avoid hysteresis by overallocating if increase is small (below 37%) + size_t lower_bound = oldsize + (oldsize >> 2) + (oldsize >> 3); + size_t new_size = (size > lower_bound) ? size : ((size > oldsize) ? lower_bound : size); + void* block = _rpmalloc_allocate(heap, new_size); + if (p && block) { + if (!(flags & RPMALLOC_NO_PRESERVE)) + memcpy(block, p, oldsize < new_size ? oldsize : new_size); + _rpmalloc_deallocate(p); + } + + return block; +} + +static void* +_rpmalloc_aligned_reallocate(heap_t* heap, void* ptr, size_t alignment, size_t size, size_t oldsize, + unsigned int flags) { + if (alignment <= SMALL_GRANULARITY) + return _rpmalloc_reallocate(heap, ptr, size, oldsize, flags); + + int no_alloc = !!(flags & RPMALLOC_GROW_OR_FAIL); + size_t usablesize = (ptr ? _rpmalloc_usable_size(ptr) : 0); + if ((usablesize >= size) && !((uintptr_t)ptr & (alignment - 1))) { + if (no_alloc || (size >= (usablesize / 2))) + return ptr; + } + // Aligned alloc marks span as having aligned blocks + void* block = (!no_alloc ? _rpmalloc_aligned_allocate(heap, alignment, size) : 0); + if (EXPECTED(block != 0)) { + if (!(flags & RPMALLOC_NO_PRESERVE) && ptr) { + if (!oldsize) + oldsize = usablesize; + memcpy(block, ptr, oldsize < size ? oldsize : size); + } + _rpmalloc_deallocate(ptr); + } + return block; +} + + +//////////// +/// +/// Initialization, finalization and utility +/// +////// + +//! Get the usable size of the given block +static size_t +_rpmalloc_usable_size(void* p) { + //Grab the span using guaranteed span alignment + span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask); + if (span->size_class < SIZE_CLASS_COUNT) { + //Small/medium block + void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); + return span->block_size - ((size_t)pointer_diff(p, blocks_start) % span->block_size); + } + if (span->size_class == SIZE_CLASS_LARGE) { + //Large block + size_t current_spans = span->span_count; + return (current_spans * _memory_span_size) - (size_t)pointer_diff(p, span); + } + //Oversized block, page count is stored in span_count + size_t current_pages = span->span_count; + return (current_pages * _memory_page_size) - (size_t)pointer_diff(p, span); +} + +//! 
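The flags threaded through these reallocation paths match the public ones in upstream rpmalloc.h: RPMALLOC_NO_PRESERVE skips copying the old contents, and RPMALLOC_GROW_OR_FAIL (the `if (!!(flags & RPMALLOC_GROW_OR_FAIL)) return 0;` early-out above) makes the call fail instead of moving the block when it cannot grow in place. A usage sketch under that assumption (Tracy's bundled header is tracy_rpmalloc.hpp; the names below follow upstream):

    #include <string.h>
    #include "rpmalloc.h"   /* upstream header; Tracy bundles it as tracy_rpmalloc.hpp */

    /* Try to grow a buffer in place; fall back to an explicit alloc+copy when
     * the allocator would have to move it anyway. Sketch only - assumes the
     * upstream flag names RPMALLOC_GROW_OR_FAIL / RPMALLOC_NO_PRESERVE. */
    void* grow_in_place_or_copy(void* ptr, size_t old_size, size_t new_size) {
        void* grown = rpaligned_realloc(ptr, 0, new_size, old_size, RPMALLOC_GROW_OR_FAIL);
        if (grown)
            return grown;                       /* block still fits in its span(s) */
        void* fresh = rpmalloc(new_size);       /* would have moved: copy manually */
        if (!fresh)
            return 0;
        memcpy(fresh, ptr, old_size < new_size ? old_size : new_size);
        rpfree(ptr);
        return fresh;
    }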
Adjust and optimize the size class properties for the given class +static void +_rpmalloc_adjust_size_class(size_t iclass) { + size_t block_size = _memory_size_class[iclass].block_size; + size_t block_count = (_memory_span_size - SPAN_HEADER_SIZE) / block_size; + + _memory_size_class[iclass].block_count = (uint16_t)block_count; + _memory_size_class[iclass].class_idx = (uint16_t)iclass; + + //Check if previous size classes can be merged + if (iclass >= SMALL_CLASS_COUNT) { + size_t prevclass = iclass; + while (prevclass > 0) { + --prevclass; + //A class can be merged if number of pages and number of blocks are equal + if (_memory_size_class[prevclass].block_count == _memory_size_class[iclass].block_count) + memcpy(_memory_size_class + prevclass, _memory_size_class + iclass, sizeof(_memory_size_class[iclass])); + else + break; + } + } +} + +//! Initialize the allocator and setup global data +TRACY_API int +rpmalloc_initialize(void) { + if (_rpmalloc_initialized) { + rpmalloc_thread_initialize(); + return 0; + } + return rpmalloc_initialize_config(0); +} + +int +rpmalloc_initialize_config(const rpmalloc_config_t* config) { + if (_rpmalloc_initialized) { + rpmalloc_thread_initialize(); + return 0; + } + _rpmalloc_initialized = 1; + + if (config) + memcpy(&_memory_config, config, sizeof(rpmalloc_config_t)); + else + memset(&_memory_config, 0, sizeof(rpmalloc_config_t)); + + if (!_memory_config.memory_map || !_memory_config.memory_unmap) { + _memory_config.memory_map = _rpmalloc_mmap_os; + _memory_config.memory_unmap = _rpmalloc_unmap_os; + } + +#if PLATFORM_WINDOWS + SYSTEM_INFO system_info; + memset(&system_info, 0, sizeof(system_info)); + GetSystemInfo(&system_info); + _memory_map_granularity = system_info.dwAllocationGranularity; +#else + _memory_map_granularity = (size_t)sysconf(_SC_PAGESIZE); +#endif + +#if RPMALLOC_CONFIGURABLE + _memory_page_size = _memory_config.page_size; +#else + _memory_page_size = 0; +#endif + _memory_huge_pages = 0; + if (!_memory_page_size) { +#if PLATFORM_WINDOWS + _memory_page_size = system_info.dwPageSize; +#else + _memory_page_size = _memory_map_granularity; + if (_memory_config.enable_huge_pages) { +#if defined(__linux__) + size_t huge_page_size = 0; + FILE* meminfo = fopen("/proc/meminfo", "r"); + if (meminfo) { + char line[128]; + while (!huge_page_size && fgets(line, sizeof(line) - 1, meminfo)) { + line[sizeof(line) - 1] = 0; + if (strstr(line, "Hugepagesize:")) + huge_page_size = (size_t)strtol(line + 13, 0, 10) * 1024; + } + fclose(meminfo); + } + if (huge_page_size) { + _memory_huge_pages = 1; + _memory_page_size = huge_page_size; + _memory_map_granularity = huge_page_size; + } +#elif defined(__FreeBSD__) + int rc; + size_t sz = sizeof(rc); + + if (sysctlbyname("vm.pmap.pg_ps_enabled", &rc, &sz, NULL, 0) == 0 && rc == 1) { + _memory_huge_pages = 1; + _memory_page_size = 2 * 1024 * 1024; + _memory_map_granularity = _memory_page_size; + } +#elif defined(__APPLE__) || defined(__NetBSD__) + _memory_huge_pages = 1; + _memory_page_size = 2 * 1024 * 1024; + _memory_map_granularity = _memory_page_size; +#endif + } +#endif + } else { + if (_memory_config.enable_huge_pages) + _memory_huge_pages = 1; + } + +#if PLATFORM_WINDOWS + if (_memory_config.enable_huge_pages) { + HANDLE token = 0; + size_t large_page_minimum = GetLargePageMinimum(); + if (large_page_minimum) + OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &token); + if (token) { + LUID luid; + if (LookupPrivilegeValue(0, SE_LOCK_MEMORY_NAME, &luid)) { + TOKEN_PRIVILEGES 
token_privileges; + memset(&token_privileges, 0, sizeof(token_privileges)); + token_privileges.PrivilegeCount = 1; + token_privileges.Privileges[0].Luid = luid; + token_privileges.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + if (AdjustTokenPrivileges(token, FALSE, &token_privileges, 0, 0, 0)) { + if (GetLastError() == ERROR_SUCCESS) + _memory_huge_pages = 1; + } + } + CloseHandle(token); + } + if (_memory_huge_pages) { + if (large_page_minimum > _memory_page_size) + _memory_page_size = large_page_minimum; + if (large_page_minimum > _memory_map_granularity) + _memory_map_granularity = large_page_minimum; + } + } +#endif + + size_t min_span_size = 256; + size_t max_page_size; +#if UINTPTR_MAX > 0xFFFFFFFF + max_page_size = 4096ULL * 1024ULL * 1024ULL; +#else + max_page_size = 4 * 1024 * 1024; +#endif + if (_memory_page_size < min_span_size) + _memory_page_size = min_span_size; + if (_memory_page_size > max_page_size) + _memory_page_size = max_page_size; + _memory_page_size_shift = 0; + size_t page_size_bit = _memory_page_size; + while (page_size_bit != 1) { + ++_memory_page_size_shift; + page_size_bit >>= 1; + } + _memory_page_size = ((size_t)1 << _memory_page_size_shift); + +#if RPMALLOC_CONFIGURABLE + if (!_memory_config.span_size) { + _memory_span_size = _memory_default_span_size; + _memory_span_size_shift = _memory_default_span_size_shift; + _memory_span_mask = _memory_default_span_mask; + } else { + size_t span_size = _memory_config.span_size; + if (span_size > (256 * 1024)) + span_size = (256 * 1024); + _memory_span_size = 4096; + _memory_span_size_shift = 12; + while (_memory_span_size < span_size) { + _memory_span_size <<= 1; + ++_memory_span_size_shift; + } + _memory_span_mask = ~(uintptr_t)(_memory_span_size - 1); + } +#endif + + _memory_span_map_count = ( _memory_config.span_map_count ? _memory_config.span_map_count : DEFAULT_SPAN_MAP_COUNT); + if ((_memory_span_size * _memory_span_map_count) < _memory_page_size) + _memory_span_map_count = (_memory_page_size / _memory_span_size); + if ((_memory_page_size >= _memory_span_size) && ((_memory_span_map_count * _memory_span_size) % _memory_page_size)) + _memory_span_map_count = (_memory_page_size / _memory_span_size); + _memory_heap_reserve_count = (_memory_span_map_count > DEFAULT_SPAN_MAP_COUNT) ? 
DEFAULT_SPAN_MAP_COUNT : _memory_span_map_count; + + _memory_config.page_size = _memory_page_size; + _memory_config.span_size = _memory_span_size; + _memory_config.span_map_count = _memory_span_map_count; + _memory_config.enable_huge_pages = _memory_huge_pages; + +#if ((defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD) || defined(__TINYC__) + if (pthread_key_create(&_memory_thread_heap, _rpmalloc_heap_release_raw_fc)) + return -1; +#endif +#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) + fls_key = FlsAlloc(&_rpmalloc_thread_destructor); +#endif + + //Setup all small and medium size classes + size_t iclass = 0; + _memory_size_class[iclass].block_size = SMALL_GRANULARITY; + _rpmalloc_adjust_size_class(iclass); + for (iclass = 1; iclass < SMALL_CLASS_COUNT; ++iclass) { + size_t size = iclass * SMALL_GRANULARITY; + _memory_size_class[iclass].block_size = (uint32_t)size; + _rpmalloc_adjust_size_class(iclass); + } + //At least two blocks per span, then fall back to large allocations + _memory_medium_size_limit = (_memory_span_size - SPAN_HEADER_SIZE) >> 1; + if (_memory_medium_size_limit > MEDIUM_SIZE_LIMIT) + _memory_medium_size_limit = MEDIUM_SIZE_LIMIT; + for (iclass = 0; iclass < MEDIUM_CLASS_COUNT; ++iclass) { + size_t size = SMALL_SIZE_LIMIT + ((iclass + 1) * MEDIUM_GRANULARITY); + if (size > _memory_medium_size_limit) + break; + _memory_size_class[SMALL_CLASS_COUNT + iclass].block_size = (uint32_t)size; + _rpmalloc_adjust_size_class(SMALL_CLASS_COUNT + iclass); + } + + _memory_orphan_heaps = 0; +#if RPMALLOC_FIRST_CLASS_HEAPS + _memory_first_class_orphan_heaps = 0; +#endif +#if ENABLE_STATISTICS + atomic_store32(&_memory_active_heaps, 0); + atomic_store32(&_mapped_pages, 0); + _mapped_pages_peak = 0; + atomic_store32(&_master_spans, 0); + atomic_store32(&_mapped_total, 0); + atomic_store32(&_unmapped_total, 0); + atomic_store32(&_mapped_pages_os, 0); + atomic_store32(&_huge_pages_current, 0); + _huge_pages_peak = 0; +#endif + memset(_memory_heaps, 0, sizeof(_memory_heaps)); + atomic_store32_release(&_memory_global_lock, 0); + + //Initialize this thread + rpmalloc_thread_initialize(); + return 0; +} + +//! 
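The _rpmalloc_adjust_size_class() helper above merges neighbouring size classes: after computing a class's blocks-per-span, any earlier class that ends up with the same block count is overwritten with this one, so requests for the smaller size are simply served from the shared, larger block size. A self-contained model of that merge walk (constants are illustrative, and the real code only merges medium classes):

    #include <stdint.h>
    #include <stdio.h>

    #define SPAN_SIZE   65536u
    #define HEADER_SIZE 128u
    #define CLASS_COUNT 8u

    typedef struct { uint32_t block_size, block_count, class_idx; } size_class_t;

    static size_class_t classes[CLASS_COUNT];

    static void adjust(uint32_t iclass) {
        classes[iclass].block_count = (SPAN_SIZE - HEADER_SIZE) / classes[iclass].block_size;
        classes[iclass].class_idx = iclass;
        uint32_t prev = iclass;
        while (prev > 0) {
            --prev;
            if (classes[prev].block_count == classes[iclass].block_count)
                classes[prev] = classes[iclass];   /* merge: reuse the larger class */
            else
                break;
        }
    }

    int main(void) {
        for (uint32_t i = 0; i < CLASS_COUNT; ++i) {
            classes[i].block_size = (i + 1) * 4096;  /* 4 KiB granularity, for the demo */
            adjust(i);
        }
        for (uint32_t i = 0; i < CLASS_COUNT; ++i)
            printf("request class %u -> block_size %u (idx %u)\n",
                   i, classes[i].block_size, classes[i].class_idx);
        return 0;
    }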
Finalize the allocator +TRACY_API void +rpmalloc_finalize(void) { + rpmalloc_thread_finalize(1); + //rpmalloc_dump_statistics(stdout); + + if (_memory_global_reserve) { + atomic_add32(&_memory_global_reserve_master->remaining_spans, -(int32_t)_memory_global_reserve_count); + _memory_global_reserve_master = 0; + _memory_global_reserve_count = 0; + _memory_global_reserve = 0; + } + atomic_store32_release(&_memory_global_lock, 0); + + //Free all thread caches and fully free spans + for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { + heap_t* heap = _memory_heaps[list_idx]; + while (heap) { + heap_t* next_heap = heap->next_heap; + heap->finalize = 1; + _rpmalloc_heap_global_finalize(heap); + heap = next_heap; + } + } + +#if ENABLE_GLOBAL_CACHE + //Free global caches + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) + _rpmalloc_global_cache_finalize(&_memory_span_cache[iclass]); +#endif + +#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD + pthread_key_delete(_memory_thread_heap); +#endif +#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) + FlsFree(fls_key); + fls_key = 0; +#endif +#if ENABLE_STATISTICS + //If you hit these asserts you probably have memory leaks (perhaps global scope data doing dynamic allocations) or double frees in your code + rpmalloc_assert(atomic_load32(&_mapped_pages) == 0, "Memory leak detected"); + rpmalloc_assert(atomic_load32(&_mapped_pages_os) == 0, "Memory leak detected"); +#endif + + _rpmalloc_initialized = 0; +} + +//! Initialize thread, assign heap +TRACY_API void +rpmalloc_thread_initialize(void) { + if (!get_thread_heap_raw()) { + heap_t* heap = _rpmalloc_heap_allocate(0); + if (heap) { + _rpmalloc_stat_inc(&_memory_active_heaps); + set_thread_heap(heap); +#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) + FlsSetValue(fls_key, heap); +#endif + } + } +} + +//! Finalize thread, orphan heap +TRACY_API void +rpmalloc_thread_finalize(int release_caches) { + heap_t* heap = get_thread_heap_raw(); + if (heap) + _rpmalloc_heap_release_raw(heap, release_caches); + set_thread_heap(0); +#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) + FlsSetValue(fls_key, 0); +#endif +} + +int +rpmalloc_is_thread_initialized(void) { + return (get_thread_heap_raw() != 0) ? 
1 : 0; +} + +const rpmalloc_config_t* +rpmalloc_config(void) { + return &_memory_config; +} + +// Extern interface + +TRACY_API RPMALLOC_ALLOCATOR void* +rpmalloc(size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return 0; + } +#endif + heap_t* heap = get_thread_heap(); + return _rpmalloc_allocate(heap, size); +} + +TRACY_API void +rpfree(void* ptr) { + _rpmalloc_deallocate(ptr); +} + +extern inline RPMALLOC_ALLOCATOR void* +rpcalloc(size_t num, size_t size) { + size_t total; +#if ENABLE_VALIDATE_ARGS +#if PLATFORM_WINDOWS + int err = SizeTMult(num, size, &total); + if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#else + int err = __builtin_umull_overflow(num, size, &total); + if (err || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#endif +#else + total = num * size; +#endif + heap_t* heap = get_thread_heap(); + void* block = _rpmalloc_allocate(heap, total); + if (block) + memset(block, 0, total); + return block; +} + +TRACY_API RPMALLOC_ALLOCATOR void* +rprealloc(void* ptr, size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return ptr; + } +#endif + heap_t* heap = get_thread_heap(); + return _rpmalloc_reallocate(heap, ptr, size, 0, 0); +} + +extern RPMALLOC_ALLOCATOR void* +rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize, + unsigned int flags) { +#if ENABLE_VALIDATE_ARGS + if ((size + alignment < size) || (alignment > _memory_page_size)) { + errno = EINVAL; + return 0; + } +#endif + heap_t* heap = get_thread_heap(); + return _rpmalloc_aligned_reallocate(heap, ptr, alignment, size, oldsize, flags); +} + +extern RPMALLOC_ALLOCATOR void* +rpaligned_alloc(size_t alignment, size_t size) { + heap_t* heap = get_thread_heap(); + return _rpmalloc_aligned_allocate(heap, alignment, size); +} + +extern inline RPMALLOC_ALLOCATOR void* +rpaligned_calloc(size_t alignment, size_t num, size_t size) { + size_t total; +#if ENABLE_VALIDATE_ARGS +#if PLATFORM_WINDOWS + int err = SizeTMult(num, size, &total); + if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#else + int err = __builtin_umull_overflow(num, size, &total); + if (err || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#endif +#else + total = num * size; +#endif + void* block = rpaligned_alloc(alignment, total); + if (block) + memset(block, 0, total); + return block; +} + extern inline RPMALLOC_ALLOCATOR void* rpmemalign(size_t alignment, size_t size) { return rpaligned_alloc(alignment, size); @@ -2324,7 +3114,7 @@ rpposix_memalign(void **memptr, size_t alignment, size_t size) { extern inline size_t rpmalloc_usable_size(void* ptr) { - return (ptr ? _memory_usable_size(ptr) : 0); + return (ptr ? 
_rpmalloc_usable_size(ptr) : 0); } extern inline void @@ -2340,13 +3130,13 @@ rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats) { for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { size_class_t* size_class = _memory_size_class + iclass; - heap_class_t* heap_class = heap->span_class + iclass; - span_t* span = heap_class->partial_span; + span_t* span = heap->size_class[iclass].partial_span; while (span) { - atomic_thread_fence_acquire(); size_t free_count = span->list_size; - if (span->state == SPAN_STATE_PARTIAL) - free_count += (size_class->block_count - span->used_count); + size_t block_count = size_class->block_count; + if (span->free_list_limit < block_count) + block_count = span->free_list_limit; + free_count += (block_count - span->used_count); stats->sizecache = free_count * size_class->block_size; span = span->next; } @@ -2354,38 +3144,46 @@ rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats) { #if ENABLE_THREAD_CACHE for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - if (heap->span_cache[iclass]) - stats->spancache = (size_t)heap->span_cache[iclass]->list_size * (iclass + 1) * _memory_span_size; - span_t* deferred_list = !iclass ? (span_t*)atomic_load_ptr(&heap->span_cache_deferred) : 0; - //TODO: Incorrect, for deferred lists the size is NOT stored in list_size - if (deferred_list) - stats->spancache = (size_t)deferred_list->list_size * (iclass + 1) * _memory_span_size; + span_cache_t* span_cache; + if (!iclass) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t*)(heap->span_large_cache + (iclass - 1)); + stats->spancache = span_cache->count * (iclass + 1) * _memory_span_size; } #endif + + span_t* deferred = (span_t*)atomic_load_ptr(&heap->span_free_deferred); + while (deferred) { + if (deferred->size_class != SIZE_CLASS_HUGE) + stats->spancache = (size_t)deferred->span_count * _memory_span_size; + deferred = (span_t*)deferred->free_list; + } + #if ENABLE_STATISTICS - stats->thread_to_global = heap->thread_to_global; - stats->global_to_thread = heap->global_to_thread; + stats->thread_to_global = (size_t)atomic_load64(&heap->thread_to_global); + stats->global_to_thread = (size_t)atomic_load64(&heap->global_to_thread); for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { stats->span_use[iclass].current = (size_t)atomic_load32(&heap->span_use[iclass].current); - stats->span_use[iclass].peak = (size_t)heap->span_use[iclass].high; - stats->span_use[iclass].to_global = (size_t)heap->span_use[iclass].spans_to_global; - stats->span_use[iclass].from_global = (size_t)heap->span_use[iclass].spans_from_global; - stats->span_use[iclass].to_cache = (size_t)heap->span_use[iclass].spans_to_cache; - stats->span_use[iclass].from_cache = (size_t)heap->span_use[iclass].spans_from_cache; - stats->span_use[iclass].to_reserved = (size_t)heap->span_use[iclass].spans_to_reserved; - stats->span_use[iclass].from_reserved = (size_t)heap->span_use[iclass].spans_from_reserved; - stats->span_use[iclass].map_calls = (size_t)heap->span_use[iclass].spans_map_calls; + stats->span_use[iclass].peak = (size_t)atomic_load32(&heap->span_use[iclass].high); + stats->span_use[iclass].to_global = (size_t)atomic_load32(&heap->span_use[iclass].spans_to_global); + stats->span_use[iclass].from_global = (size_t)atomic_load32(&heap->span_use[iclass].spans_from_global); + stats->span_use[iclass].to_cache = (size_t)atomic_load32(&heap->span_use[iclass].spans_to_cache); + stats->span_use[iclass].from_cache = 
(size_t)atomic_load32(&heap->span_use[iclass].spans_from_cache); + stats->span_use[iclass].to_reserved = (size_t)atomic_load32(&heap->span_use[iclass].spans_to_reserved); + stats->span_use[iclass].from_reserved = (size_t)atomic_load32(&heap->span_use[iclass].spans_from_reserved); + stats->span_use[iclass].map_calls = (size_t)atomic_load32(&heap->span_use[iclass].spans_map_calls); } for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { stats->size_use[iclass].alloc_current = (size_t)atomic_load32(&heap->size_class_use[iclass].alloc_current); stats->size_use[iclass].alloc_peak = (size_t)heap->size_class_use[iclass].alloc_peak; - stats->size_use[iclass].alloc_total = (size_t)heap->size_class_use[iclass].alloc_total; + stats->size_use[iclass].alloc_total = (size_t)atomic_load32(&heap->size_class_use[iclass].alloc_total); stats->size_use[iclass].free_total = (size_t)atomic_load32(&heap->size_class_use[iclass].free_total); - stats->size_use[iclass].spans_to_cache = (size_t)heap->size_class_use[iclass].spans_to_cache; - stats->size_use[iclass].spans_from_cache = (size_t)heap->size_class_use[iclass].spans_from_cache; - stats->size_use[iclass].spans_from_reserved = (size_t)heap->size_class_use[iclass].spans_from_reserved; - stats->size_use[iclass].map_calls = (size_t)heap->size_class_use[iclass].spans_map_calls; + stats->size_use[iclass].spans_to_cache = (size_t)atomic_load32(&heap->size_class_use[iclass].spans_to_cache); + stats->size_use[iclass].spans_from_cache = (size_t)atomic_load32(&heap->size_class_use[iclass].spans_from_cache); + stats->size_use[iclass].spans_from_reserved = (size_t)atomic_load32(&heap->size_class_use[iclass].spans_from_reserved); + stats->size_use[iclass].map_calls = (size_t)atomic_load32(&heap->size_class_use[iclass].spans_map_calls); } #endif } @@ -2402,94 +3200,319 @@ rpmalloc_global_statistics(rpmalloc_global_statistics_t* stats) { stats->huge_alloc_peak = (size_t)_huge_pages_peak * _memory_page_size; #endif #if ENABLE_GLOBAL_CACHE - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - stats->cached += (size_t)atomic_load32(&_memory_span_cache[iclass].size) * (iclass + 1) * _memory_span_size; - } + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) + stats->cached += _memory_span_cache[iclass].count * (iclass + 1) * _memory_span_size; #endif } +#if ENABLE_STATISTICS + +static void +_memory_heap_dump_statistics(heap_t* heap, void* file) { + fprintf(file, "Heap %d stats:\n", heap->id); + fprintf(file, "Class CurAlloc PeakAlloc TotAlloc TotFree BlkSize BlkCount SpansCur SpansPeak PeakAllocMiB ToCacheMiB FromCacheMiB FromReserveMiB MmapCalls\n"); + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + if (!atomic_load32(&heap->size_class_use[iclass].alloc_total)) + continue; + fprintf(file, "%3u: %10u %10u %10u %10u %8u %8u %8d %9d %13zu %11zu %12zu %14zu %9u\n", (uint32_t)iclass, + atomic_load32(&heap->size_class_use[iclass].alloc_current), + heap->size_class_use[iclass].alloc_peak, + atomic_load32(&heap->size_class_use[iclass].alloc_total), + atomic_load32(&heap->size_class_use[iclass].free_total), + _memory_size_class[iclass].block_size, + _memory_size_class[iclass].block_count, + atomic_load32(&heap->size_class_use[iclass].spans_current), + heap->size_class_use[iclass].spans_peak, + ((size_t)heap->size_class_use[iclass].alloc_peak * (size_t)_memory_size_class[iclass].block_size) / (size_t)(1024 * 1024), + ((size_t)atomic_load32(&heap->size_class_use[iclass].spans_to_cache) * _memory_span_size) / (size_t)(1024 * 1024), + 
((size_t)atomic_load32(&heap->size_class_use[iclass].spans_from_cache) * _memory_span_size) / (size_t)(1024 * 1024), + ((size_t)atomic_load32(&heap->size_class_use[iclass].spans_from_reserved) * _memory_span_size) / (size_t)(1024 * 1024), + atomic_load32(&heap->size_class_use[iclass].spans_map_calls)); + } + fprintf(file, "Spans Current Peak Deferred PeakMiB Cached ToCacheMiB FromCacheMiB ToReserveMiB FromReserveMiB ToGlobalMiB FromGlobalMiB MmapCalls\n"); + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + if (!atomic_load32(&heap->span_use[iclass].high) && !atomic_load32(&heap->span_use[iclass].spans_map_calls)) + continue; + fprintf(file, "%4u: %8d %8u %8u %8zu %7u %11zu %12zu %12zu %14zu %11zu %13zu %10u\n", (uint32_t)(iclass + 1), + atomic_load32(&heap->span_use[iclass].current), + atomic_load32(&heap->span_use[iclass].high), + atomic_load32(&heap->span_use[iclass].spans_deferred), + ((size_t)atomic_load32(&heap->span_use[iclass].high) * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024), +#if ENABLE_THREAD_CACHE + (unsigned int)(!iclass ? heap->span_cache.count : heap->span_large_cache[iclass - 1].count), + ((size_t)atomic_load32(&heap->span_use[iclass].spans_to_cache) * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), + ((size_t)atomic_load32(&heap->span_use[iclass].spans_from_cache) * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), +#else + 0, (size_t)0, (size_t)0, +#endif + ((size_t)atomic_load32(&heap->span_use[iclass].spans_to_reserved) * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), + ((size_t)atomic_load32(&heap->span_use[iclass].spans_from_reserved) * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), + ((size_t)atomic_load32(&heap->span_use[iclass].spans_to_global) * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024), + ((size_t)atomic_load32(&heap->span_use[iclass].spans_from_global) * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024), + atomic_load32(&heap->span_use[iclass].spans_map_calls)); + } + fprintf(file, "Full spans: %zu\n", heap->full_span_count); + fprintf(file, "ThreadToGlobalMiB GlobalToThreadMiB\n"); + fprintf(file, "%17zu %17zu\n", (size_t)atomic_load64(&heap->thread_to_global) / (size_t)(1024 * 1024), (size_t)atomic_load64(&heap->global_to_thread) / (size_t)(1024 * 1024)); +} + +#endif + void rpmalloc_dump_statistics(void* file) { #if ENABLE_STATISTICS - //If you hit this assert, you still have active threads or forgot to finalize some thread(s) - assert(atomic_load32(&_memory_active_heaps) == 0); - for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { - heap_t* heap = atomic_load_ptr(&_memory_heaps[list_idx]); + heap_t* heap = _memory_heaps[list_idx]; while (heap) { - fprintf(file, "Heap %d stats:\n", heap->id); - fprintf(file, "Class CurAlloc PeakAlloc TotAlloc TotFree BlkSize BlkCount SpansCur SpansPeak PeakAllocMiB ToCacheMiB FromCacheMiB FromReserveMiB MmapCalls\n"); - for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { - if (!heap->size_class_use[iclass].alloc_total) { - assert(!atomic_load32(&heap->size_class_use[iclass].free_total)); - assert(!heap->size_class_use[iclass].spans_map_calls); + int need_dump = 0; + for (size_t iclass = 0; !need_dump && (iclass < SIZE_CLASS_COUNT); ++iclass) { + if (!atomic_load32(&heap->size_class_use[iclass].alloc_total)) { + rpmalloc_assert(!atomic_load32(&heap->size_class_use[iclass].free_total), "Heap statistics counter mismatch"); + 
rpmalloc_assert(!atomic_load32(&heap->size_class_use[iclass].spans_map_calls), "Heap statistics counter mismatch"); continue; } - fprintf(file, "%3u: %10u %10u %10u %10u %8u %8u %8d %9d %13zu %11zu %12zu %14zu %9u\n", (uint32_t)iclass, - atomic_load32(&heap->size_class_use[iclass].alloc_current), - heap->size_class_use[iclass].alloc_peak, - heap->size_class_use[iclass].alloc_total, - atomic_load32(&heap->size_class_use[iclass].free_total), - _memory_size_class[iclass].block_size, - _memory_size_class[iclass].block_count, - heap->size_class_use[iclass].spans_current, - heap->size_class_use[iclass].spans_peak, - ((size_t)heap->size_class_use[iclass].alloc_peak * (size_t)_memory_size_class[iclass].block_size) / (size_t)(1024 * 1024), - ((size_t)heap->size_class_use[iclass].spans_to_cache * _memory_span_size) / (size_t)(1024 * 1024), - ((size_t)heap->size_class_use[iclass].spans_from_cache * _memory_span_size) / (size_t)(1024 * 1024), - ((size_t)heap->size_class_use[iclass].spans_from_reserved * _memory_span_size) / (size_t)(1024 * 1024), - heap->size_class_use[iclass].spans_map_calls); + need_dump = 1; } - fprintf(file, "Spans Current Peak PeakMiB Cached ToCacheMiB FromCacheMiB ToReserveMiB FromReserveMiB ToGlobalMiB FromGlobalMiB MmapCalls\n"); - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - if (!heap->span_use[iclass].high && !heap->span_use[iclass].spans_map_calls) + for (size_t iclass = 0; !need_dump && (iclass < LARGE_CLASS_COUNT); ++iclass) { + if (!atomic_load32(&heap->span_use[iclass].high) && !atomic_load32(&heap->span_use[iclass].spans_map_calls)) continue; - fprintf(file, "%4u: %8d %8u %8zu %7u %11zu %12zu %12zu %14zu %11zu %13zu %10u\n", (uint32_t)(iclass + 1), - atomic_load32(&heap->span_use[iclass].current), - heap->span_use[iclass].high, - ((size_t)heap->span_use[iclass].high * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024), - heap->span_cache[iclass] ? 
heap->span_cache[iclass]->list_size : 0, - ((size_t)heap->span_use[iclass].spans_to_cache * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), - ((size_t)heap->span_use[iclass].spans_from_cache * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), - ((size_t)heap->span_use[iclass].spans_to_reserved * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), - ((size_t)heap->span_use[iclass].spans_from_reserved * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), - ((size_t)heap->span_use[iclass].spans_to_global * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024), - ((size_t)heap->span_use[iclass].spans_from_global * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024), - heap->span_use[iclass].spans_map_calls); + need_dump = 1; } - fprintf(file, "ThreadToGlobalMiB GlobalToThreadMiB\n"); - fprintf(file, "%17zu %17zu\n", (size_t)heap->thread_to_global / (size_t)(1024 * 1024), (size_t)heap->global_to_thread / (size_t)(1024 * 1024)); + if (need_dump) + _memory_heap_dump_statistics(heap, file); heap = heap->next_heap; } } - fprintf(file, "Global stats:\n"); size_t huge_current = (size_t)atomic_load32(&_huge_pages_current) * _memory_page_size; size_t huge_peak = (size_t)_huge_pages_peak * _memory_page_size; fprintf(file, "HugeCurrentMiB HugePeakMiB\n"); fprintf(file, "%14zu %11zu\n", huge_current / (size_t)(1024 * 1024), huge_peak / (size_t)(1024 * 1024)); + fprintf(file, "GlobalCacheMiB\n"); + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + global_cache_t* cache = _memory_span_cache + iclass; + size_t global_cache = (size_t)cache->count * iclass * _memory_span_size; + + size_t global_overflow_cache = 0; + span_t* span = cache->overflow; + while (span) { + global_overflow_cache += iclass * _memory_span_size; + span = span->next; + } + if (global_cache || global_overflow_cache || cache->insert_count || cache->extract_count) + fprintf(file, "%4zu: %8zuMiB (%8zuMiB overflow) %14zu insert %14zu extract\n", iclass + 1, global_cache / (size_t)(1024 * 1024), global_overflow_cache / (size_t)(1024 * 1024), cache->insert_count, cache->extract_count); + } + size_t mapped = (size_t)atomic_load32(&_mapped_pages) * _memory_page_size; size_t mapped_os = (size_t)atomic_load32(&_mapped_pages_os) * _memory_page_size; size_t mapped_peak = (size_t)_mapped_pages_peak * _memory_page_size; size_t mapped_total = (size_t)atomic_load32(&_mapped_total) * _memory_page_size; size_t unmapped_total = (size_t)atomic_load32(&_unmapped_total) * _memory_page_size; - size_t reserved_total = (size_t)atomic_load32(&_reserved_spans) * _memory_span_size; - fprintf(file, "MappedMiB MappedOSMiB MappedPeakMiB MappedTotalMiB UnmappedTotalMiB ReservedTotalMiB\n"); - fprintf(file, "%9zu %11zu %13zu %14zu %16zu %16zu\n", + fprintf(file, "MappedMiB MappedOSMiB MappedPeakMiB MappedTotalMiB UnmappedTotalMiB\n"); + fprintf(file, "%9zu %11zu %13zu %14zu %16zu\n", mapped / (size_t)(1024 * 1024), mapped_os / (size_t)(1024 * 1024), mapped_peak / (size_t)(1024 * 1024), mapped_total / (size_t)(1024 * 1024), - unmapped_total / (size_t)(1024 * 1024), - reserved_total / (size_t)(1024 * 1024)); + unmapped_total / (size_t)(1024 * 1024)); fprintf(file, "\n"); -#else +#if 0 + int64_t allocated = atomic_load64(&_allocation_counter); + int64_t deallocated = atomic_load64(&_deallocation_counter); + fprintf(file, "Allocation count: %lli\n", allocated); + fprintf(file, "Deallocation count: %lli\n", deallocated); + fprintf(file, "Current allocations: %lli\n", (allocated - deallocated)); + 
fprintf(file, "Master spans: %d\n", atomic_load32(&_master_spans)); + fprintf(file, "Dangling master spans: %d\n", atomic_load32(&_unmapped_master_spans)); +#endif +#endif (void)sizeof(file); +} + +#if RPMALLOC_FIRST_CLASS_HEAPS + +extern inline rpmalloc_heap_t* +rpmalloc_heap_acquire(void) { + // Must be a pristine heap from newly mapped memory pages, or else memory blocks + // could already be allocated from the heap which would (wrongly) be released when + // heap is cleared with rpmalloc_heap_free_all(). Also heaps guaranteed to be + // pristine from the dedicated orphan list can be used. + heap_t* heap = _rpmalloc_heap_allocate(1); + heap->owner_thread = 0; + _rpmalloc_stat_inc(&_memory_active_heaps); + return heap; +} + +extern inline void +rpmalloc_heap_release(rpmalloc_heap_t* heap) { + if (heap) + _rpmalloc_heap_release(heap, 1, 1); +} + +extern inline RPMALLOC_ALLOCATOR void* +rpmalloc_heap_alloc(rpmalloc_heap_t* heap, size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return 0; + } +#endif + return _rpmalloc_allocate(heap, size); +} + +extern inline RPMALLOC_ALLOCATOR void* +rpmalloc_heap_aligned_alloc(rpmalloc_heap_t* heap, size_t alignment, size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return 0; + } +#endif + return _rpmalloc_aligned_allocate(heap, alignment, size); +} + +extern inline RPMALLOC_ALLOCATOR void* +rpmalloc_heap_calloc(rpmalloc_heap_t* heap, size_t num, size_t size) { + return rpmalloc_heap_aligned_calloc(heap, 0, num, size); +} + +extern inline RPMALLOC_ALLOCATOR void* +rpmalloc_heap_aligned_calloc(rpmalloc_heap_t* heap, size_t alignment, size_t num, size_t size) { + size_t total; +#if ENABLE_VALIDATE_ARGS +#if PLATFORM_WINDOWS + int err = SizeTMult(num, size, &total); + if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#else + int err = __builtin_umull_overflow(num, size, &total); + if (err || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#endif +#else + total = num * size; +#endif + void* block = _rpmalloc_aligned_allocate(heap, alignment, total); + if (block) + memset(block, 0, total); + return block; +} + +extern inline RPMALLOC_ALLOCATOR void* +rpmalloc_heap_realloc(rpmalloc_heap_t* heap, void* ptr, size_t size, unsigned int flags) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return ptr; + } +#endif + return _rpmalloc_reallocate(heap, ptr, size, 0, flags); +} + +extern inline RPMALLOC_ALLOCATOR void* +rpmalloc_heap_aligned_realloc(rpmalloc_heap_t* heap, void* ptr, size_t alignment, size_t size, unsigned int flags) { +#if ENABLE_VALIDATE_ARGS + if ((size + alignment < size) || (alignment > _memory_page_size)) { + errno = EINVAL; + return 0; + } +#endif + return _rpmalloc_aligned_reallocate(heap, ptr, alignment, size, 0, flags); +} + +extern inline void +rpmalloc_heap_free(rpmalloc_heap_t* heap, void* ptr) { + (void)sizeof(heap); + _rpmalloc_deallocate(ptr); +} + +extern inline void +rpmalloc_heap_free_all(rpmalloc_heap_t* heap) { + span_t* span; + span_t* next_span; + + _rpmalloc_heap_cache_adopt_deferred(heap, 0); + + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + span = heap->size_class[iclass].partial_span; + while (span) { + next_span = span->next; + _rpmalloc_heap_cache_insert(heap, span); + span = next_span; + } + heap->size_class[iclass].partial_span = 0; + span = heap->full_span[iclass]; + while (span) { + next_span = span->next; + 
_rpmalloc_heap_cache_insert(heap, span); + span = next_span; + } + } + memset(heap->size_class, 0, sizeof(heap->size_class)); + memset(heap->full_span, 0, sizeof(heap->full_span)); + + span = heap->large_huge_span; + while (span) { + next_span = span->next; + if (UNEXPECTED(span->size_class == SIZE_CLASS_HUGE)) + _rpmalloc_deallocate_huge(span); + else + _rpmalloc_heap_cache_insert(heap, span); + span = next_span; + } + heap->large_huge_span = 0; + heap->full_span_count = 0; + +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + span_cache_t* span_cache; + if (!iclass) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t*)(heap->span_large_cache + (iclass - 1)); + if (!span_cache->count) + continue; +#if ENABLE_GLOBAL_CACHE + _rpmalloc_stat_add64(&heap->thread_to_global, span_cache->count * (iclass + 1) * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[iclass].spans_to_global, span_cache->count); + _rpmalloc_global_cache_insert_spans(span_cache->span, iclass + 1, span_cache->count); +#else + for (size_t ispan = 0; ispan < span_cache->count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[ispan]); +#endif + span_cache->count = 0; + } +#endif + +#if ENABLE_STATISTICS + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + atomic_store32(&heap->size_class_use[iclass].alloc_current, 0); + atomic_store32(&heap->size_class_use[iclass].spans_current, 0); + } + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + atomic_store32(&heap->span_use[iclass].current, 0); + } #endif } +extern inline void +rpmalloc_heap_thread_set_current(rpmalloc_heap_t* heap) { + heap_t* prev_heap = get_thread_heap_raw(); + if (prev_heap != heap) { + set_thread_heap(heap); + if (prev_heap) + rpmalloc_heap_release(prev_heap); + } +} + +#endif + } #endif diff --git a/Source/ThirdParty/tracy/client/tracy_rpmalloc.hpp b/Source/ThirdParty/tracy/client/tracy_rpmalloc.hpp index 3e8c4f1b5..2a743d709 100644 --- a/Source/ThirdParty/tracy/client/tracy_rpmalloc.hpp +++ b/Source/ThirdParty/tracy/client/tracy_rpmalloc.hpp @@ -20,11 +20,12 @@ namespace tracy #if defined(__clang__) || defined(__GNUC__) # define RPMALLOC_EXPORT __attribute__((visibility("default"))) # define RPMALLOC_ALLOCATOR -# define RPMALLOC_ATTRIB_MALLOC __attribute__((__malloc__)) -# if defined(__clang_major__) && (__clang_major__ < 4) +# if (defined(__clang_major__) && (__clang_major__ < 4)) || (defined(__GNUC__) && defined(ENABLE_PRELOAD) && ENABLE_PRELOAD) +# define RPMALLOC_ATTRIB_MALLOC # define RPMALLOC_ATTRIB_ALLOC_SIZE(size) # define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size) # else +# define RPMALLOC_ATTRIB_MALLOC __attribute__((__malloc__)) # define RPMALLOC_ATTRIB_ALLOC_SIZE(size) __attribute__((alloc_size(size))) # define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size) __attribute__((alloc_size(count, size))) # endif @@ -45,13 +46,24 @@ namespace tracy # define RPMALLOC_CDECL #endif -//! Define RPMALLOC_CONFIGURABLE to enable configuring sizes +//! Define RPMALLOC_CONFIGURABLE to enable configuring sizes. Will introduce +// a very small overhead due to some size calculations not being compile time constants #ifndef RPMALLOC_CONFIGURABLE #define RPMALLOC_CONFIGURABLE 0 #endif +//! Define RPMALLOC_FIRST_CLASS_HEAPS to enable heap based API (rpmalloc_heap_* functions). +// Will introduce a very small overhead to track fully allocated spans in heaps +#ifndef RPMALLOC_FIRST_CLASS_HEAPS +#define RPMALLOC_FIRST_CLASS_HEAPS 0 +#endif + //! 
Flag to rpaligned_realloc to not preserve content in reallocation #define RPMALLOC_NO_PRESERVE 1 +//! Flag to rpaligned_realloc to fail and return null pointer if grow cannot be done in-place, +// in which case the original pointer is still valid (just like a call to realloc which failes to allocate +// a new block). +#define RPMALLOC_GROW_OR_FAIL 2 typedef struct rpmalloc_global_statistics_t { //! Current amount of virtual memory mapped, all of which might not have been committed (only if ENABLE_STATISTICS=1) @@ -99,7 +111,7 @@ typedef struct rpmalloc_thread_statistics_t { size_t from_reserved; //! Number of raw memory map calls (not hitting the reserve spans but resulting in actual OS mmap calls) size_t map_calls; - } span_use[32]; + } span_use[64]; //! Per size class statistics (only if ENABLE_STATISTICS=1) struct { //! Current number of allocations @@ -131,7 +143,8 @@ typedef struct rpmalloc_config_t { // larger than 65535 (storable in an uint16_t), if it is you must use natural // alignment to shift it into 16 bits. If you set a memory_map function, you // must also set a memory_unmap function or else the default implementation will - // be used for both. + // be used for both. This function must be thread safe, it can be called by + // multiple threads simultaneously. void* (*memory_map)(size_t size, size_t* offset); //! Unmap the memory pages starting at address and spanning the given number of bytes. // If release is set to non-zero, the unmap is for an entire span range as returned by @@ -139,8 +152,18 @@ typedef struct rpmalloc_config_t { // release argument holds the size of the entire span range. If release is set to 0, // the unmap is a partial decommit of a subset of the mapped memory range. // If you set a memory_unmap function, you must also set a memory_map function or - // else the default implementation will be used for both. + // else the default implementation will be used for both. This function must be thread + // safe, it can be called by multiple threads simultaneously. void (*memory_unmap)(void* address, size_t size, size_t offset, size_t release); + //! Called when an assert fails, if asserts are enabled. Will use the standard assert() + // if this is not set. + void (*error_callback)(const char* message); + //! Called when a call to map memory pages fails (out of memory). If this callback is + // not set or returns zero the library will return a null pointer in the allocation + // call. If this callback returns non-zero the map call will be retried. The argument + // passed is the number of bytes that was requested in the map call. Only used if + // the default system memory map function is used (memory_map callback is not set). + int (*map_fail_callback)(size_t size); //! Size of memory pages. The page size MUST be a power of two. All memory mapping // requests to memory_map will be made with size set to a multiple of the page size. // Used if RPMALLOC_CONFIGURABLE is defined to 1, otherwise system page size is used. @@ -163,6 +186,10 @@ typedef struct rpmalloc_config_t { // For Windows, see https://docs.microsoft.com/en-us/windows/desktop/memory/large-page-support // For Linux, see https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt int enable_huge_pages; + //! Respectively allocated pages and huge allocated pages names for systems + // supporting it to be able to distinguish among anonymous regions. + const char *page_name; + const char *huge_page_name; } rpmalloc_config_t; //! 
Initialize allocator with default configuration @@ -187,7 +214,7 @@ rpmalloc_thread_initialize(void); //! Finalize allocator for calling thread TRACY_API void -rpmalloc_thread_finalize(void); +rpmalloc_thread_finalize(int release_caches); //! Perform deferred deallocations pending for the calling thread heap RPMALLOC_EXPORT void @@ -240,6 +267,13 @@ rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize, unsi RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* rpaligned_alloc(size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(2); +//! Allocate a memory block of at least the given size and alignment, and zero initialize it. +// Alignment must be a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. A caveat of rpmalloc +// internals is that this must also be strictly less than the span size (default 64KiB) +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* +rpaligned_calloc(size_t alignment, size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3); + //! Allocate a memory block of at least the given size and alignment. // Alignment must be a power of two and a multiple of sizeof(void*), // and should ideally be less than memory page size. A caveat of rpmalloc @@ -252,10 +286,78 @@ rpmemalign(size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB // and should ideally be less than memory page size. A caveat of rpmalloc // internals is that this must also be strictly less than the span size (default 64KiB) RPMALLOC_EXPORT int -rpposix_memalign(void **memptr, size_t alignment, size_t size); +rpposix_memalign(void** memptr, size_t alignment, size_t size); //! Query the usable size of the given memory block (from given pointer to the end of block) RPMALLOC_EXPORT size_t rpmalloc_usable_size(void* ptr); +#if RPMALLOC_FIRST_CLASS_HEAPS + +//! Heap type +typedef struct heap_t rpmalloc_heap_t; + +//! Acquire a new heap. Will reuse existing released heaps or allocate memory for a new heap +// if none available. Heap API is implemented with the strict assumption that only one single +// thread will call heap functions for a given heap at any given time, no functions are thread safe. +RPMALLOC_EXPORT rpmalloc_heap_t* +rpmalloc_heap_acquire(void); + +//! Release a heap (does NOT free the memory allocated by the heap, use rpmalloc_heap_free_all before destroying the heap). +// Releasing a heap will enable it to be reused by other threads. Safe to pass a null pointer. +RPMALLOC_EXPORT void +rpmalloc_heap_release(rpmalloc_heap_t* heap); + +//! Allocate a memory block of at least the given size using the given heap. +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* +rpmalloc_heap_alloc(rpmalloc_heap_t* heap, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(2); + +//! Allocate a memory block of at least the given size using the given heap. The returned +// block will have the requested alignment. Alignment must be a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. A caveat of rpmalloc +// internals is that this must also be strictly less than the span size (default 64KiB). +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* +rpmalloc_heap_aligned_alloc(rpmalloc_heap_t* heap, size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(3); + +//! Allocate a memory block of at least the given size using the given heap and zero initialize it. 
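(Illustrative usage sketch, not part of the patch itself.) The global, non-heap aligned entry points introduced above — rpaligned_calloc, plus the new RPMALLOC_GROW_OR_FAIL flag for rpaligned_realloc — share one set of constraints: the alignment must be a power of two, a multiple of sizeof(void*), and strictly less than the span size. A minimal caller could look like the following; it assumes the process-wide allocator has already been initialized and that these functions live in the tracy namespace of this vendored copy.

    #include "tracy_rpmalloc.hpp"

    void example_thread()
    {
        using namespace tracy;
        rpmalloc_thread_initialize();

        // 64-byte aligned, zero-initialized block of 16 elements of 32 bytes each.
        void* block = rpaligned_calloc( 64, 16, 32 );

        // Grow in place only; on failure null is returned and the original pointer stays valid.
        void* grown = rpaligned_realloc( block, 64, 32 * 32, 16 * 32, RPMALLOC_GROW_OR_FAIL );
        rpfree( grown ? grown : block );

        // New signature: a non-zero argument also releases the thread caches back to the global cache.
        rpmalloc_thread_finalize( 1 );
    }
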
+RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* +rpmalloc_heap_calloc(rpmalloc_heap_t* heap, size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3); + +//! Allocate a memory block of at least the given size using the given heap and zero initialize it. The returned +// block will have the requested alignment. Alignment must either be zero, or a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. A caveat of rpmalloc +// internals is that this must also be strictly less than the span size (default 64KiB). +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* +rpmalloc_heap_aligned_calloc(rpmalloc_heap_t* heap, size_t alignment, size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3); + +//! Reallocate the given block to at least the given size. The memory block MUST be allocated +// by the same heap given to this function. +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* +rpmalloc_heap_realloc(rpmalloc_heap_t* heap, void* ptr, size_t size, unsigned int flags) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(3); + +//! Reallocate the given block to at least the given size. The memory block MUST be allocated +// by the same heap given to this function. The returned block will have the requested alignment. +// Alignment must be either zero, or a power of two and a multiple of sizeof(void*), and should ideally be +// less than memory page size. A caveat of rpmalloc internals is that this must also be strictly less than +// the span size (default 64KiB). +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* +rpmalloc_heap_aligned_realloc(rpmalloc_heap_t* heap, void* ptr, size_t alignment, size_t size, unsigned int flags) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(4); + +//! Free the given memory block from the given heap. The memory block MUST be allocated +// by the same heap given to this function. +RPMALLOC_EXPORT void +rpmalloc_heap_free(rpmalloc_heap_t* heap, void* ptr); + +//! Free all memory allocated by the heap +RPMALLOC_EXPORT void +rpmalloc_heap_free_all(rpmalloc_heap_t* heap); + +//! Set the given heap as the current heap for the calling thread. A heap MUST only be current heap +// for a single thread, a heap can never be shared between multiple threads. The previous +// current heap for the calling thread is released to be reused by other threads. 
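(Illustrative usage sketch, not part of the patch itself.) Taken together, the heap functions above form an arena-style workflow: acquire a pristine heap, allocate from it on a single thread only, drop every block at once with rpmalloc_heap_free_all, then hand the heap back with rpmalloc_heap_release so another thread can reuse it. The sketch assumes RPMALLOC_FIRST_CLASS_HEAPS is compiled in and the allocator is initialized.

    #include "tracy_rpmalloc.hpp"

    void build_and_discard_scratch_data()
    {
        using namespace tracy;

        rpmalloc_heap_t* heap = rpmalloc_heap_acquire();            // pristine heap, used by this thread only

        void* a = rpmalloc_heap_alloc( heap, 4096 );
        void* b = rpmalloc_heap_aligned_calloc( heap, 64, 128, sizeof( double ) );

        a = rpmalloc_heap_realloc( heap, a, 8192, 0 );              // the block must originate from the same heap
        rpmalloc_heap_free( heap, b );                              // individual frees are allowed but optional here

        rpmalloc_heap_free_all( heap );                             // drops everything still owned by the heap
        rpmalloc_heap_release( heap );                              // heap may now be reused by another thread
    }
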
+RPMALLOC_EXPORT void +rpmalloc_heap_thread_set_current(rpmalloc_heap_t* heap); + +#endif + } diff --git a/Source/ThirdParty/tracy/common/TracyAlloc.hpp b/Source/ThirdParty/tracy/common/TracyAlloc.hpp index a3cbec057..4e49df84d 100644 --- a/Source/ThirdParty/tracy/common/TracyAlloc.hpp +++ b/Source/ThirdParty/tracy/common/TracyAlloc.hpp @@ -3,16 +3,33 @@ #include -#ifdef TRACY_ENABLE +#if defined TRACY_ENABLE && !defined __EMSCRIPTEN__ # include "../client/tracy_rpmalloc.hpp" +# define TRACY_USE_RPMALLOC #endif namespace tracy { +#ifdef TRACY_USE_RPMALLOC +TRACY_API void InitRpmalloc(); +#else +static inline void InitRpmalloc() {} +#endif + static inline void* tracy_malloc( size_t size ) { -#ifdef TRACY_ENABLE +#ifdef TRACY_USE_RPMALLOC + InitRpmalloc(); + return rpmalloc( size ); +#else + return malloc( size ); +#endif +} + +static inline void* tracy_malloc_fast( size_t size ) +{ +#ifdef TRACY_USE_RPMALLOC return rpmalloc( size ); #else return malloc( size ); @@ -21,7 +38,17 @@ static inline void* tracy_malloc( size_t size ) static inline void tracy_free( void* ptr ) { -#ifdef TRACY_ENABLE +#ifdef TRACY_USE_RPMALLOC + InitRpmalloc(); + rpfree( ptr ); +#else + free( ptr ); +#endif +} + +static inline void tracy_free_fast( void* ptr ) +{ +#ifdef TRACY_USE_RPMALLOC rpfree( ptr ); #else free( ptr ); @@ -30,7 +57,8 @@ static inline void tracy_free( void* ptr ) static inline void* tracy_realloc( void* ptr, size_t size ) { -#ifdef TRACY_ENABLE +#ifdef TRACY_USE_RPMALLOC + InitRpmalloc(); return rprealloc( ptr, size ); #else return realloc( ptr, size ); diff --git a/Source/ThirdParty/tracy/common/TracyProtocol.hpp b/Source/ThirdParty/tracy/common/TracyProtocol.hpp index 2326a7f32..dd30e5391 100644 --- a/Source/ThirdParty/tracy/common/TracyProtocol.hpp +++ b/Source/ThirdParty/tracy/common/TracyProtocol.hpp @@ -9,8 +9,8 @@ namespace tracy constexpr unsigned Lz4CompressBound( unsigned isize ) { return isize + ( isize / 255 ) + 16; } -enum : uint32_t { ProtocolVersion = 46 }; -enum : uint16_t { BroadcastVersion = 2 }; +enum : uint32_t { ProtocolVersion = 63 }; +enum : uint16_t { BroadcastVersion = 3 }; using lz4sz_t = uint32_t; @@ -34,7 +34,7 @@ enum HandshakeStatus : uint8_t enum { WelcomeMessageProgramNameSize = 64 }; enum { WelcomeMessageHostInfoSize = 1024 }; -#pragma pack( 1 ) +#pragma pack( push, 1 ) // Must increase left query space after handling! enum ServerQuery : uint8_t @@ -44,14 +44,15 @@ enum ServerQuery : uint8_t ServerQueryThreadString, ServerQuerySourceLocation, ServerQueryPlotName, - ServerQueryCallstackFrame, ServerQueryFrameName, - ServerQueryDisconnect, - ServerQueryExternalName, ServerQueryParameter, + ServerQueryFiberName, + // Items above are high priority. Split order must be preserved. See IsQueryPrio(). 
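+ // Sketch only, not part of this patch: IsQueryPrio() itself is not shown here, but given the
+ // ordering documented above it can presumably be implemented as a plain value comparison, e.g.
+ //   static inline bool IsQueryPrio( ServerQuery type ) { return type < ServerQueryDisconnect; }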
+ ServerQueryDisconnect, + ServerQueryCallstackFrame, + ServerQueryExternalName, ServerQuerySymbol, ServerQuerySymbolCode, - ServerQueryCodeLocation, ServerQuerySourceCode, ServerQueryDataTransfer, ServerQueryDataTransferPart @@ -77,6 +78,18 @@ enum CpuArchitecture : uint8_t }; +struct WelcomeFlag +{ + enum _t : uint8_t + { + OnDemand = 1 << 0, + IsApple = 1 << 1, + CodeTransfer = 1 << 2, + CombineSamples = 1 << 3, + IdentifySamples = 1 << 4, + }; +}; + struct WelcomeMessage { double timerMul; @@ -88,10 +101,8 @@ struct WelcomeMessage uint64_t exectime; uint64_t pid; int64_t samplingPeriod; - uint8_t onDemand; - uint8_t isApple; + uint8_t flags; uint8_t cpuArch; - uint8_t codeTransfer; char cpuManufacturer[12]; uint32_t cpuId; char programName[WelcomeMessageProgramNameSize]; @@ -115,13 +126,43 @@ struct BroadcastMessage uint16_t broadcastVersion; uint16_t listenPort; uint32_t protocolVersion; + uint64_t pid; int32_t activeTime; // in seconds char programName[WelcomeMessageProgramNameSize]; }; -enum { BroadcastMessageSize = sizeof( BroadcastMessage ) }; +struct BroadcastMessage_v2 +{ + uint16_t broadcastVersion; + uint16_t listenPort; + uint32_t protocolVersion; + int32_t activeTime; + char programName[WelcomeMessageProgramNameSize]; +}; -#pragma pack() +struct BroadcastMessage_v1 +{ + uint32_t broadcastVersion; + uint32_t protocolVersion; + uint32_t listenPort; + uint32_t activeTime; + char programName[WelcomeMessageProgramNameSize]; +}; + +struct BroadcastMessage_v0 +{ + uint32_t broadcastVersion; + uint32_t protocolVersion; + uint32_t activeTime; + char programName[WelcomeMessageProgramNameSize]; +}; + +enum { BroadcastMessageSize = sizeof( BroadcastMessage ) }; +enum { BroadcastMessageSize_v2 = sizeof( BroadcastMessage_v2 ) }; +enum { BroadcastMessageSize_v1 = sizeof( BroadcastMessage_v1 ) }; +enum { BroadcastMessageSize_v0 = sizeof( BroadcastMessage_v0 ) }; + +#pragma pack( pop ) } diff --git a/Source/ThirdParty/tracy/common/TracyQueue.hpp b/Source/ThirdParty/tracy/common/TracyQueue.hpp index d99945013..092d26969 100644 --- a/Source/ThirdParty/tracy/common/TracyQueue.hpp +++ b/Source/ThirdParty/tracy/common/TracyQueue.hpp @@ -21,6 +21,7 @@ enum class QueueType : uint8_t Callstack, CallstackAlloc, CallstackSample, + CallstackSampleContextSwitch, FrameImage, ZoneBegin, ZoneBeginCallstack, @@ -50,11 +51,20 @@ enum class QueueType : uint8_t GpuZoneBeginAllocSrcLocSerial, GpuZoneBeginAllocSrcLocCallstackSerial, GpuZoneEndSerial, - PlotData, + PlotDataInt, + PlotDataFloat, + PlotDataDouble, ContextSwitch, ThreadWakeup, GpuTime, GpuContextName, + CallstackFrameSize, + SymbolInformation, + ExternalNameMetadata, + SymbolCodeMetadata, + SourceCodeMetadata, + FiberEnter, + FiberLeave, Terminate, KeepAlive, ThreadContext, @@ -67,6 +77,7 @@ enum class QueueType : uint8_t FrameMarkMsg, FrameMarkMsgStart, FrameMarkMsgEnd, + FrameVsync, SourceLocation, LockAnnounce, LockTerminate, @@ -76,16 +87,20 @@ enum class QueueType : uint8_t MessageLiteralCallstack, MessageLiteralColorCallstack, GpuNewContext, - CallstackFrameSize, CallstackFrame, - SymbolInformation, - CodeInformation, SysTimeReport, TidToPid, + HwSampleCpuCycle, + HwSampleInstructionRetired, + HwSampleCacheReference, + HwSampleCacheMiss, + HwSampleBranchRetired, + HwSampleBranchMiss, PlotConfig, ParamSetup, AckServerQueryNoop, AckSourceCodeNotAvailable, + AckSymbolCodeNotAvailable, CpuTopology, SingleStringData, SecondStringData, @@ -102,14 +117,15 @@ enum class QueueType : uint8_t ExternalThreadName, SymbolCode, SourceCode, + FiberName, 
NUM_TYPES }; -#pragma pack( 1 ) +#pragma pack( push, 1 ) struct QueueThreadContext { - uint64_t thread; + uint32_t thread; }; struct QueueZoneBeginLean @@ -122,16 +138,31 @@ struct QueueZoneBegin : public QueueZoneBeginLean uint64_t srcloc; // ptr }; +struct QueueZoneBeginThread : public QueueZoneBegin +{ + uint32_t thread; +}; + struct QueueZoneEnd { int64_t time; }; +struct QueueZoneEndThread : public QueueZoneEnd +{ + uint32_t thread; +}; + struct QueueZoneValidation { uint32_t id; }; +struct QueueZoneValidationThread : public QueueZoneValidation +{ + uint32_t thread; +}; + struct QueueZoneColor { uint8_t r; @@ -139,11 +170,21 @@ struct QueueZoneColor uint8_t b; }; +struct QueueZoneColorThread : public QueueZoneColor +{ + uint32_t thread; +}; + struct QueueZoneValue { uint64_t value; }; +struct QueueZoneValueThread : public QueueZoneValue +{ + uint32_t thread; +}; + struct QueueStringTransfer { uint64_t ptr; @@ -155,6 +196,12 @@ struct QueueFrameMark uint64_t name; // ptr }; +struct QueueFrameVsync +{ + int64_t time; + uint32_t id; +}; + struct QueueFrameImage { uint32_t frame; @@ -185,6 +232,11 @@ struct QueueZoneTextFat uint16_t size; }; +struct QueueZoneTextFatThread : public QueueZoneTextFat +{ + uint32_t thread; +}; + enum class LockType : uint8_t { Lockable, @@ -199,6 +251,19 @@ struct QueueLockAnnounce LockType type; }; +struct QueueFiberEnter +{ + int64_t time; + uint64_t fiber; // ptr + uint32_t thread; +}; + +struct QueueFiberLeave +{ + int64_t time; + uint32_t thread; +}; + struct QueueLockTerminate { uint32_t id; @@ -207,28 +272,32 @@ struct QueueLockTerminate struct QueueLockWait { - uint64_t thread; + uint32_t thread; uint32_t id; int64_t time; }; struct QueueLockObtain { - uint64_t thread; + uint32_t thread; uint32_t id; int64_t time; }; struct QueueLockRelease { - uint64_t thread; uint32_t id; int64_t time; }; +struct QueueLockReleaseShared : public QueueLockRelease +{ + uint32_t thread; +}; + struct QueueLockMark { - uint64_t thread; + uint32_t thread; uint32_t id; uint64_t srcloc; // ptr }; @@ -244,24 +313,25 @@ struct QueueLockNameFat : public QueueLockName uint16_t size; }; -enum class PlotDataType : uint8_t -{ - Float, - Double, - Int -}; - -struct QueuePlotData +struct QueuePlotDataBase { uint64_t name; // ptr int64_t time; - PlotDataType type; - union - { - double d; - float f; - int64_t i; - } data; +}; + +struct QueuePlotDataInt : public QueuePlotDataBase +{ + int64_t val; +}; + +struct QueuePlotDataFloat : public QueuePlotDataBase +{ + float val; +}; + +struct QueuePlotDataDouble : public QueuePlotDataBase +{ + double val; }; struct QueueMessage @@ -281,23 +351,43 @@ struct QueueMessageLiteral : public QueueMessage uint64_t text; // ptr }; +struct QueueMessageLiteralThread : public QueueMessageLiteral +{ + uint32_t thread; +}; + struct QueueMessageColorLiteral : public QueueMessageColor { uint64_t text; // ptr }; +struct QueueMessageColorLiteralThread : public QueueMessageColorLiteral +{ + uint32_t thread; +}; + struct QueueMessageFat : public QueueMessage { uint64_t text; // ptr uint16_t size; }; +struct QueueMessageFatThread : public QueueMessageFat +{ + uint32_t thread; +}; + struct QueueMessageColorFat : public QueueMessageColor { uint64_t text; // ptr uint16_t size; }; +struct QueueMessageColorFatThread : public QueueMessageColorFat +{ + uint32_t thread; +}; + // Don't change order, only add new entries at the end, this is also used on trace dumps! 
enum class GpuContextType : uint8_t { @@ -305,7 +395,8 @@ enum class GpuContextType : uint8_t OpenGl, Vulkan, OpenCL, - Direct3D12 + Direct3D12, + Direct3D11 }; enum GpuContextFlags : uint8_t @@ -317,7 +408,7 @@ struct QueueGpuNewContext { int64_t cpuTime; int64_t gpuTime; - uint64_t thread; + uint32_t thread; float period; uint8_t context; GpuContextFlags flags; @@ -327,7 +418,7 @@ struct QueueGpuNewContext struct QueueGpuZoneBeginLean { int64_t cpuTime; - uint64_t thread; + uint32_t thread; uint16_t queryId; uint8_t context; }; @@ -340,7 +431,7 @@ struct QueueGpuZoneBegin : public QueueGpuZoneBeginLean struct QueueGpuZoneEnd { int64_t cpuTime; - uint64_t thread; + uint32_t thread; uint16_t queryId; uint8_t context; }; @@ -379,7 +470,7 @@ struct QueueMemNamePayload struct QueueMemAlloc { int64_t time; - uint64_t thread; + uint32_t thread; uint64_t ptr; char size[6]; }; @@ -387,7 +478,7 @@ struct QueueMemAlloc struct QueueMemFree { int64_t time; - uint64_t thread; + uint32_t thread; uint64_t ptr; }; @@ -396,16 +487,26 @@ struct QueueCallstackFat uint64_t ptr; }; +struct QueueCallstackFatThread : public QueueCallstackFat +{ + uint32_t thread; +}; + struct QueueCallstackAllocFat { uint64_t ptr; uint64_t nativePtr; }; +struct QueueCallstackAllocFatThread : public QueueCallstackAllocFat +{ + uint32_t thread; +}; + struct QueueCallstackSample { int64_t time; - uint64_t thread; + uint32_t thread; }; struct QueueCallstackSampleFat : public QueueCallstackSample @@ -419,6 +520,12 @@ struct QueueCallstackFrameSize uint8_t size; }; +struct QueueCallstackFrameSizeFat : public QueueCallstackFrameSize +{ + uint64_t data; + uint64_t imageName; +}; + struct QueueCallstackFrame { uint32_t line; @@ -432,10 +539,10 @@ struct QueueSymbolInformation uint64_t symAddr; }; -struct QueueCodeInformation +struct QueueSymbolInformationFat : public QueueSymbolInformation { - uint64_t ptr; - uint32_t line; + uint64_t fileString; + uint8_t needFree; }; struct QueueCrashReport @@ -444,6 +551,11 @@ struct QueueCrashReport uint64_t text; // ptr }; +struct QueueCrashReportThread +{ + uint32_t thread; +}; + struct QueueSysTime { int64_t time; @@ -453,8 +565,8 @@ struct QueueSysTime struct QueueContextSwitch { int64_t time; - uint64_t oldThread; - uint64_t newThread; + uint32_t oldThread; + uint32_t newThread; uint8_t cpu; uint8_t reason; uint8_t state; @@ -463,7 +575,7 @@ struct QueueContextSwitch struct QueueThreadWakeup { int64_t time; - uint64_t thread; + uint32_t thread; }; struct QueueTidToPid @@ -472,10 +584,19 @@ struct QueueTidToPid uint64_t pid; }; +struct QueueHwSample +{ + uint64_t ip; + int64_t time; +}; + struct QueuePlotConfig { uint64_t name; // ptr uint8_t type; + uint8_t step; + uint8_t fill; + uint32_t color; }; struct QueueParamSetup @@ -486,6 +607,11 @@ struct QueueParamSetup int32_t val; }; +struct QueueSourceCodeNotAvailable +{ + uint32_t id; +}; + struct QueueCpuTopology { uint32_t package; @@ -493,6 +619,27 @@ struct QueueCpuTopology uint32_t thread; }; +struct QueueExternalNameMetadata +{ + uint64_t thread; + uint64_t name; + uint64_t threadName; +}; + +struct QueueSymbolCodeMetadata +{ + uint64_t symbol; + uint64_t ptr; + uint32_t size; +}; + +struct QueueSourceCodeMetadata +{ + uint64_t ptr; + uint32_t size; + uint32_t id; +}; + struct QueueHeader { union @@ -510,31 +657,45 @@ struct QueueItem QueueThreadContext threadCtx; QueueZoneBegin zoneBegin; QueueZoneBeginLean zoneBeginLean; + QueueZoneBeginThread zoneBeginThread; QueueZoneEnd zoneEnd; + QueueZoneEndThread zoneEndThread; QueueZoneValidation 
zoneValidation; + QueueZoneValidationThread zoneValidationThread; QueueZoneColor zoneColor; + QueueZoneColorThread zoneColorThread; QueueZoneValue zoneValue; + QueueZoneValueThread zoneValueThread; QueueStringTransfer stringTransfer; QueueFrameMark frameMark; + QueueFrameVsync frameVsync; QueueFrameImage frameImage; QueueFrameImageFat frameImageFat; QueueSourceLocation srcloc; QueueZoneTextFat zoneTextFat; + QueueZoneTextFatThread zoneTextFatThread; QueueLockAnnounce lockAnnounce; QueueLockTerminate lockTerminate; QueueLockWait lockWait; QueueLockObtain lockObtain; QueueLockRelease lockRelease; + QueueLockReleaseShared lockReleaseShared; QueueLockMark lockMark; QueueLockName lockName; QueueLockNameFat lockNameFat; - QueuePlotData plotData; + QueuePlotDataInt plotDataInt; + QueuePlotDataFloat plotDataFloat; + QueuePlotDataDouble plotDataDouble; QueueMessage message; QueueMessageColor messageColor; QueueMessageLiteral messageLiteral; + QueueMessageLiteralThread messageLiteralThread; QueueMessageColorLiteral messageColorLiteral; + QueueMessageColorLiteralThread messageColorLiteralThread; QueueMessageFat messageFat; + QueueMessageFatThread messageFatThread; QueueMessageColorFat messageColorFat; + QueueMessageColorFatThread messageColorFatThread; QueueGpuNewContext gpuNewContext; QueueGpuZoneBegin gpuZoneBegin; QueueGpuZoneBeginLean gpuZoneBeginLean; @@ -547,24 +708,35 @@ struct QueueItem QueueMemFree memFree; QueueMemNamePayload memName; QueueCallstackFat callstackFat; + QueueCallstackFatThread callstackFatThread; QueueCallstackAllocFat callstackAllocFat; + QueueCallstackAllocFatThread callstackAllocFatThread; QueueCallstackSample callstackSample; QueueCallstackSampleFat callstackSampleFat; QueueCallstackFrameSize callstackFrameSize; + QueueCallstackFrameSizeFat callstackFrameSizeFat; QueueCallstackFrame callstackFrame; QueueSymbolInformation symbolInformation; - QueueCodeInformation codeInformation; + QueueSymbolInformationFat symbolInformationFat; QueueCrashReport crashReport; + QueueCrashReportThread crashReportThread; QueueSysTime sysTime; QueueContextSwitch contextSwitch; QueueThreadWakeup threadWakeup; QueueTidToPid tidToPid; + QueueHwSample hwSample; QueuePlotConfig plotConfig; QueueParamSetup paramSetup; QueueCpuTopology cpuTopology; + QueueExternalNameMetadata externalNameMetadata; + QueueSymbolCodeMetadata symbolCodeMetadata; + QueueSourceCodeMetadata sourceCodeMetadata; + QueueSourceCodeNotAvailable sourceCodeNotAvailable; + QueueFiberEnter fiberEnter; + QueueFiberLeave fiberLeave; }; }; -#pragma pack() +#pragma pack( pop ) enum { QueueItemSize = sizeof( QueueItem ) }; @@ -583,6 +755,7 @@ static constexpr size_t QueueDataSize[] = { sizeof( QueueHeader ), // callstack sizeof( QueueHeader ), // callstack alloc sizeof( QueueHeader ) + sizeof( QueueCallstackSample ), + sizeof( QueueHeader ) + sizeof( QueueCallstackSample ), // context switch sizeof( QueueHeader ) + sizeof( QueueFrameImage ), sizeof( QueueHeader ) + sizeof( QueueZoneBegin ), sizeof( QueueHeader ) + sizeof( QueueZoneBegin ), // callstack @@ -592,7 +765,7 @@ static constexpr size_t QueueDataSize[] = { sizeof( QueueHeader ) + sizeof( QueueLockRelease ), sizeof( QueueHeader ) + sizeof( QueueLockWait ), // shared sizeof( QueueHeader ) + sizeof( QueueLockObtain ), // shared - sizeof( QueueHeader ) + sizeof( QueueLockRelease ), // shared + sizeof( QueueHeader ) + sizeof( QueueLockReleaseShared ), sizeof( QueueHeader ) + sizeof( QueueLockName ), sizeof( QueueHeader ) + sizeof( QueueMemAlloc ), sizeof( QueueHeader ) + sizeof( 
QueueMemAlloc ), // named @@ -612,11 +785,20 @@ static constexpr size_t QueueDataSize[] = { sizeof( QueueHeader ) + sizeof( QueueGpuZoneBeginLean ),// serial, allocated source location sizeof( QueueHeader ) + sizeof( QueueGpuZoneBeginLean ),// serial, allocated source location, callstack sizeof( QueueHeader ) + sizeof( QueueGpuZoneEnd ), // serial - sizeof( QueueHeader ) + sizeof( QueuePlotData ), + sizeof( QueueHeader ) + sizeof( QueuePlotDataInt ), + sizeof( QueueHeader ) + sizeof( QueuePlotDataFloat ), + sizeof( QueueHeader ) + sizeof( QueuePlotDataDouble ), sizeof( QueueHeader ) + sizeof( QueueContextSwitch ), sizeof( QueueHeader ) + sizeof( QueueThreadWakeup ), sizeof( QueueHeader ) + sizeof( QueueGpuTime ), sizeof( QueueHeader ) + sizeof( QueueGpuContextName ), + sizeof( QueueHeader ) + sizeof( QueueCallstackFrameSize ), + sizeof( QueueHeader ) + sizeof( QueueSymbolInformation ), + sizeof( QueueHeader ), // ExternalNameMetadata - not for wire transfer + sizeof( QueueHeader ), // SymbolCodeMetadata - not for wire transfer + sizeof( QueueHeader ), // SourceCodeMetadata - not for wire transfer + sizeof( QueueHeader ) + sizeof( QueueFiberEnter ), + sizeof( QueueHeader ) + sizeof( QueueFiberLeave ), // above items must be first sizeof( QueueHeader ), // terminate sizeof( QueueHeader ), // keep alive @@ -630,6 +812,7 @@ static constexpr size_t QueueDataSize[] = { sizeof( QueueHeader ) + sizeof( QueueFrameMark ), // continuous frames sizeof( QueueHeader ) + sizeof( QueueFrameMark ), // start sizeof( QueueHeader ) + sizeof( QueueFrameMark ), // end + sizeof( QueueHeader ) + sizeof( QueueFrameVsync ), sizeof( QueueHeader ) + sizeof( QueueSourceLocation ), sizeof( QueueHeader ) + sizeof( QueueLockAnnounce ), sizeof( QueueHeader ) + sizeof( QueueLockTerminate ), @@ -639,16 +822,20 @@ static constexpr size_t QueueDataSize[] = { sizeof( QueueHeader ) + sizeof( QueueMessageLiteral ), // callstack sizeof( QueueHeader ) + sizeof( QueueMessageColorLiteral ), // callstack sizeof( QueueHeader ) + sizeof( QueueGpuNewContext ), - sizeof( QueueHeader ) + sizeof( QueueCallstackFrameSize ), sizeof( QueueHeader ) + sizeof( QueueCallstackFrame ), - sizeof( QueueHeader ) + sizeof( QueueSymbolInformation ), - sizeof( QueueHeader ) + sizeof( QueueCodeInformation ), sizeof( QueueHeader ) + sizeof( QueueSysTime ), sizeof( QueueHeader ) + sizeof( QueueTidToPid ), + sizeof( QueueHeader ) + sizeof( QueueHwSample ), // cpu cycle + sizeof( QueueHeader ) + sizeof( QueueHwSample ), // instruction retired + sizeof( QueueHeader ) + sizeof( QueueHwSample ), // cache reference + sizeof( QueueHeader ) + sizeof( QueueHwSample ), // cache miss + sizeof( QueueHeader ) + sizeof( QueueHwSample ), // branch retired + sizeof( QueueHeader ) + sizeof( QueueHwSample ), // branch miss sizeof( QueueHeader ) + sizeof( QueuePlotConfig ), sizeof( QueueHeader ) + sizeof( QueueParamSetup ), sizeof( QueueHeader ), // server query acknowledgement - sizeof( QueueHeader ), // source code not available + sizeof( QueueHeader ) + sizeof( QueueSourceCodeNotAvailable ), + sizeof( QueueHeader ), // symbol code not available sizeof( QueueHeader ) + sizeof( QueueCpuTopology ), sizeof( QueueHeader ), // single string data sizeof( QueueHeader ), // second string data @@ -666,6 +853,7 @@ static constexpr size_t QueueDataSize[] = { sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // external thread name sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // symbol code sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // source code + sizeof( 
QueueHeader ) + sizeof( QueueStringTransfer ), // fiber name }; static_assert( QueueItemSize == 32, "Queue item size not 32 bytes" ); diff --git a/Source/ThirdParty/tracy/common/TracySocket.cpp b/Source/ThirdParty/tracy/common/TracySocket.cpp index f16569b06..176bbc7aa 100644 --- a/Source/ThirdParty/tracy/common/TracySocket.cpp +++ b/Source/ThirdParty/tracy/common/TracySocket.cpp @@ -8,6 +8,7 @@ #include "TracyAlloc.hpp" #include "TracySocket.hpp" +#include "TracySystem.hpp" #ifdef _WIN32 # ifndef NOMINMAX @@ -454,7 +455,7 @@ static int addrinfo_and_socket_for_family( uint16_t port, int ai_family, struct hints.ai_family = ai_family; hints.ai_socktype = SOCK_STREAM; #ifndef TRACY_ONLY_LOCALHOST - const char* onlyLocalhost = getenv( "TRACY_ONLY_LOCALHOST" ); + const char* onlyLocalhost = GetEnvVar( "TRACY_ONLY_LOCALHOST" ); if( !onlyLocalhost || onlyLocalhost[0] != '1' ) { hints.ai_flags = AI_PASSIVE; @@ -475,7 +476,7 @@ bool ListenSocket::Listen( uint16_t port, int backlog ) struct addrinfo* res = nullptr; #if !defined TRACY_ONLY_IPV4 && !defined TRACY_ONLY_LOCALHOST - const char* onlyIPv4 = getenv( "TRACY_ONLY_IPV4" ); + const char* onlyIPv4 = GetEnvVar( "TRACY_ONLY_IPV4" ); if( !onlyIPv4 || onlyIPv4[0] != '1' ) { m_sock = addrinfo_and_socket_for_family( port, AF_INET6, &res ); @@ -488,7 +489,7 @@ bool ListenSocket::Listen( uint16_t port, int backlog ) m_sock = addrinfo_and_socket_for_family( port, AF_INET, &res ); if( m_sock == -1 ) return false; } -#if defined _WIN32 || defined __CYGWIN__ +#if defined _WIN32 unsigned long val = 0; setsockopt( m_sock, IPPROTO_IPV6, IPV6_V6ONLY, (const char*)&val, sizeof( val ) ); #elif defined BSD diff --git a/Source/ThirdParty/tracy/common/TracySocket.hpp b/Source/ThirdParty/tracy/common/TracySocket.hpp index 4fbb3278a..4b3075e29 100644 --- a/Source/ThirdParty/tracy/common/TracySocket.hpp +++ b/Source/ThirdParty/tracy/common/TracySocket.hpp @@ -2,6 +2,7 @@ #define __TRACYSOCKET_HPP__ #include +#include #include struct addrinfo; diff --git a/Source/ThirdParty/tracy/common/TracyStackFrames.cpp b/Source/ThirdParty/tracy/common/TracyStackFrames.cpp new file mode 100644 index 000000000..7b0abace3 --- /dev/null +++ b/Source/ThirdParty/tracy/common/TracyStackFrames.cpp @@ -0,0 +1,122 @@ +#include "TracyStackFrames.hpp" + +namespace tracy +{ + +const char* s_tracyStackFrames_[] = { + "tracy::Callstack", + "tracy::Callstack(int)", + "tracy::GpuCtxScope::{ctor}", + "tracy::Profiler::SendCallstack", + "tracy::Profiler::SendCallstack(int)", + "tracy::Profiler::SendCallstack(int, unsigned long)", + "tracy::Profiler::MemAllocCallstack", + "tracy::Profiler::MemAllocCallstack(void const*, unsigned long, int)", + "tracy::Profiler::MemFreeCallstack", + "tracy::Profiler::MemFreeCallstack(void const*, int)", + "tracy::ScopedZone::{ctor}", + "tracy::ScopedZone::ScopedZone(tracy::SourceLocationData const*, int, bool)", + "tracy::Profiler::Message", + nullptr +}; + +const char** s_tracyStackFrames = s_tracyStackFrames_; + +const StringMatch s_tracySkipSubframes_[] = { + { "/include/arm_neon.h", 19 }, + { "/include/adxintrin.h", 20 }, + { "/include/ammintrin.h", 20 }, + { "/include/amxbf16intrin.h", 24 }, + { "/include/amxint8intrin.h", 24 }, + { "/include/amxtileintrin.h", 24 }, + { "/include/avx2intrin.h", 21 }, + { "/include/avx5124fmapsintrin.h", 29 }, + { "/include/avx5124vnniwintrin.h", 29 }, + { "/include/avx512bf16intrin.h", 27 }, + { "/include/avx512bf16vlintrin.h", 29 }, + { "/include/avx512bitalgintrin.h", 29 }, + { "/include/avx512bwintrin.h", 25 }, + { 
"/include/avx512cdintrin.h", 25 }, + { "/include/avx512dqintrin.h", 25 }, + { "/include/avx512erintrin.h", 25 }, + { "/include/avx512fintrin.h", 24 }, + { "/include/avx512ifmaintrin.h", 27 }, + { "/include/avx512ifmavlintrin.h", 29 }, + { "/include/avx512pfintrin.h", 25 }, + { "/include/avx512vbmi2intrin.h", 28 }, + { "/include/avx512vbmi2vlintrin.h", 30 }, + { "/include/avx512vbmiintrin.h", 27 }, + { "/include/avx512vbmivlintrin.h", 29 }, + { "/include/avx512vlbwintrin.h", 27 }, + { "/include/avx512vldqintrin.h", 27 }, + { "/include/avx512vlintrin.h", 25 }, + { "/include/avx512vnniintrin.h", 27 }, + { "/include/avx512vnnivlintrin.h", 29 }, + { "/include/avx512vp2intersectintrin.h", 35 }, + { "/include/avx512vp2intersectvlintrin.h", 37 }, + { "/include/avx512vpopcntdqintrin.h", 32 }, + { "/include/avx512vpopcntdqvlintrin.h", 34 }, + { "/include/avxintrin.h", 20 }, + { "/include/avxvnniintrin.h", 24 }, + { "/include/bmi2intrin.h", 21 }, + { "/include/bmiintrin.h", 20 }, + { "/include/bmmintrin.h", 20 }, + { "/include/cetintrin.h", 20 }, + { "/include/cldemoteintrin.h", 25 }, + { "/include/clflushoptintrin.h", 27 }, + { "/include/clwbintrin.h", 21 }, + { "/include/clzerointrin.h", 23 }, + { "/include/emmintrin.h", 20 }, + { "/include/enqcmdintrin.h", 23 }, + { "/include/f16cintrin.h", 21 }, + { "/include/fma4intrin.h", 21 }, + { "/include/fmaintrin.h", 20 }, + { "/include/fxsrintrin.h", 21 }, + { "/include/gfniintrin.h", 21 }, + { "/include/hresetintrin.h", 23 }, + { "/include/ia32intrin.h", 21 }, + { "/include/immintrin.h", 20 }, + { "/include/keylockerintrin.h", 26 }, + { "/include/lwpintrin.h", 20 }, + { "/include/lzcntintrin.h", 22 }, + { "/include/mmintrin.h", 19 }, + { "/include/movdirintrin.h", 23 }, + { "/include/mwaitxintrin.h", 23 }, + { "/include/nmmintrin.h", 20 }, + { "/include/pconfigintrin.h", 24 }, + { "/include/pkuintrin.h", 20 }, + { "/include/pmmintrin.h", 20 }, + { "/include/popcntintrin.h", 23 }, + { "/include/prfchwintrin.h", 23 }, + { "/include/rdseedintrin.h", 23 }, + { "/include/rtmintrin.h", 20 }, + { "/include/serializeintrin.h", 26 }, + { "/include/sgxintrin.h", 20 }, + { "/include/shaintrin.h", 20 }, + { "/include/smmintrin.h", 20 }, + { "/include/tbmintrin.h", 20 }, + { "/include/tmmintrin.h", 20 }, + { "/include/tsxldtrkintrin.h", 25 }, + { "/include/uintrintrin.h", 22 }, + { "/include/vaesintrin.h", 21 }, + { "/include/vpclmulqdqintrin.h", 27 }, + { "/include/waitpkgintrin.h", 24 }, + { "/include/wbnoinvdintrin.h", 25 }, + { "/include/wmmintrin.h", 20 }, + { "/include/x86gprintrin.h", 23 }, + { "/include/x86intrin.h", 20 }, + { "/include/xmmintrin.h", 20 }, + { "/include/xopintrin.h", 20 }, + { "/include/xsavecintrin.h", 23 }, + { "/include/xsaveintrin.h", 22 }, + { "/include/xsaveoptintrin.h", 25 }, + { "/include/xsavesintrin.h", 23 }, + { "/include/xtestintrin.h", 22 }, + { "/bits/atomic_base.h", 19 }, + { "/atomic", 7 }, + {} +}; + +const StringMatch* s_tracySkipSubframes = s_tracySkipSubframes_; + +} diff --git a/Source/ThirdParty/tracy/common/TracyStackFrames.hpp b/Source/ThirdParty/tracy/common/TracyStackFrames.hpp new file mode 100644 index 000000000..9d4262c00 --- /dev/null +++ b/Source/ThirdParty/tracy/common/TracyStackFrames.hpp @@ -0,0 +1,22 @@ +#ifndef __TRACYSTACKFRAMES_HPP__ +#define __TRACYSTACKFRAMES_HPP__ + +#include + +namespace tracy +{ + +struct StringMatch +{ + const char* str; + size_t len; +}; + +extern const char** s_tracyStackFrames; +extern const StringMatch* s_tracySkipSubframes; + +static constexpr int s_tracySkipSubframesMinLen = 
7; + +} + +#endif diff --git a/Source/ThirdParty/tracy/common/TracySystem.cpp b/Source/ThirdParty/tracy/common/TracySystem.cpp index 25ccf9f8a..5ca8e1f45 100644 --- a/Source/ThirdParty/tracy/common/TracySystem.cpp +++ b/Source/ThirdParty/tracy/common/TracySystem.cpp @@ -1,16 +1,16 @@ -#if defined _MSC_VER || defined __CYGWIN__ || defined _WIN32 -# ifndef WIN32_LEAN_AND_MEAN -# define WIN32_LEAN_AND_MEAN -# endif -# ifndef NOMINMAX -# define NOMINMAX -# endif -#endif #ifdef _MSC_VER # pragma warning(disable:4996) #endif -#if defined _WIN32 || defined __CYGWIN__ +#if defined _WIN32 +# ifndef WIN32_LEAN_AND_MEAN +# define WIN32_LEAN_AND_MEAN +# endif +# ifndef NOMINMAX +# define NOMINMAX +# endif # include +# include +# include "TracyUwp.hpp" #else # include # include @@ -39,7 +39,7 @@ #include "TracySystem.hpp" -#if defined _WIN32 || defined __CYGWIN__ +#if defined _WIN32 extern "C" typedef HRESULT (WINAPI *t_SetThreadDescription)( HANDLE, PCWSTR ); extern "C" typedef HRESULT (WINAPI *t_GetThreadDescription)( HANDLE, PWSTR* ); #endif @@ -55,19 +55,19 @@ namespace tracy namespace detail { -TRACY_API uint64_t GetThreadHandleImpl() +TRACY_API uint32_t GetThreadHandleImpl() { -#if defined _WIN32 || defined __CYGWIN__ - static_assert( sizeof( decltype( GetCurrentThreadId() ) ) <= sizeof( uint64_t ), "Thread handle too big to fit in protocol" ); - return uint64_t( GetCurrentThreadId() ); +#if defined _WIN32 + static_assert( sizeof( decltype( GetCurrentThreadId() ) ) <= sizeof( uint32_t ), "Thread handle too big to fit in protocol" ); + return uint32_t( GetCurrentThreadId() ); #elif defined __APPLE__ uint64_t id; pthread_threadid_np( pthread_self(), &id ); - return id; + return uint32_t( id ); #elif defined __ANDROID__ - return (uint64_t)gettid(); + return (uint32_t)gettid(); #elif defined __linux__ - return (uint64_t)syscall( SYS_gettid ); + return (uint32_t)syscall( SYS_gettid ); #elif defined __FreeBSD__ long id; thr_self( &id ); @@ -78,9 +78,17 @@ TRACY_API uint64_t GetThreadHandleImpl() return lwp_gettid(); #elif defined __OpenBSD__ return getthrid(); +#elif defined __EMSCRIPTEN__ + // Not supported, but let it compile. + return 0; #else - static_assert( sizeof( decltype( pthread_self() ) ) <= sizeof( uint64_t ), "Thread handle too big to fit in protocol" ); - return uint64_t( pthread_self() ); + // To add support for a platform, retrieve and return the kernel thread identifier here. + // + // Note that pthread_t (as for example returned by pthread_self()) is *not* a kernel + // thread identifier. It is a pointer to a library-allocated data structure instead. + // Such pointers will be reused heavily, making the pthread_t non-unique. Additionally + // a 64-bit pointer cannot be reliably truncated to 32 bits. + #error "Unsupported platform!" 
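+ // Sketch only, not part of this patch: a port would obtain the kernel-level thread id through the
+ // platform's native call and narrow it to 32 bits, mirroring the Linux branch above, e.g.
+ //   return (uint32_t)platform_gettid();   // hypothetical platform-specific call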
#endif } @@ -90,18 +98,44 @@ TRACY_API uint64_t GetThreadHandleImpl() #ifdef TRACY_ENABLE struct ThreadNameData { - uint64_t id; + uint32_t id; const char* name; ThreadNameData* next; }; std::atomic& GetThreadNameData(); -TRACY_API void InitRPMallocThread(); +#endif + +#ifdef _MSC_VER +# pragma pack( push, 8 ) +struct THREADNAME_INFO +{ + DWORD dwType; + LPCSTR szName; + DWORD dwThreadID; + DWORD dwFlags; +}; +# pragma pack( pop ) + +void ThreadNameMsvcMagic( const THREADNAME_INFO& info ) +{ + __try + { + RaiseException( 0x406D1388, 0, sizeof(info)/sizeof(ULONG_PTR), (ULONG_PTR*)&info ); + } + __except(EXCEPTION_EXECUTE_HANDLER) + { + } +} #endif TRACY_API void SetThreadName( const char* name ) { -#if defined _WIN32 || defined __CYGWIN__ +#if defined _WIN32 +# ifdef TRACY_UWP + static auto _SetThreadDescription = &::SetThreadDescription; +# else static auto _SetThreadDescription = (t_SetThreadDescription)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "SetThreadDescription" ); +# endif if( _SetThreadDescription ) { wchar_t buf[256]; @@ -111,57 +145,45 @@ TRACY_API void SetThreadName( const char* name ) else { # if defined _MSC_VER - const DWORD MS_VC_EXCEPTION=0x406D1388; -# pragma pack( push, 8 ) - struct THREADNAME_INFO - { - DWORD dwType; - LPCSTR szName; - DWORD dwThreadID; - DWORD dwFlags; - }; -# pragma pack(pop) - - DWORD ThreadId = GetCurrentThreadId(); THREADNAME_INFO info; info.dwType = 0x1000; info.szName = name; - info.dwThreadID = ThreadId; + info.dwThreadID = GetCurrentThreadId(); info.dwFlags = 0; - - __try - { - RaiseException( MS_VC_EXCEPTION, 0, sizeof(info)/sizeof(ULONG_PTR), (ULONG_PTR*)&info ); - } - __except(EXCEPTION_EXECUTE_HANDLER) - { - } + ThreadNameMsvcMagic( info ); # endif } -#elif defined _GNU_SOURCE && !defined __EMSCRIPTEN__ && !defined __CYGWIN__ +#elif defined _GNU_SOURCE && !defined __EMSCRIPTEN__ { const auto sz = strlen( name ); if( sz <= 15 ) { +#if defined __APPLE__ + pthread_setname_np( name ); +#else pthread_setname_np( pthread_self(), name ); +#endif } else { char buf[16]; memcpy( buf, name, 15 ); buf[15] = '\0'; +#if defined __APPLE__ + pthread_setname_np( buf ); +#else pthread_setname_np( pthread_self(), buf ); +#endif } } #endif #ifdef TRACY_ENABLE { - InitRPMallocThread(); const auto sz = strlen( name ); char* buf = (char*)tracy_malloc( sz+1 ); memcpy( buf, name, sz ); buf[sz] = '\0'; - auto data = (ThreadNameData*)tracy_malloc( sizeof( ThreadNameData ) ); + auto data = (ThreadNameData*)tracy_malloc_fast( sizeof( ThreadNameData ) ); data->id = detail::GetThreadHandleImpl(); data->name = buf; data->next = GetThreadNameData().load( std::memory_order_relaxed ); @@ -170,7 +192,7 @@ TRACY_API void SetThreadName( const char* name ) #endif } -TRACY_API const char* GetThreadName( uint64_t id ) +TRACY_API const char* GetThreadName( uint32_t id ) { static char buf[256]; #ifdef TRACY_ENABLE @@ -184,8 +206,12 @@ TRACY_API const char* GetThreadName( uint64_t id ) ptr = ptr->next; } #else -# if defined _WIN32 || defined __CYGWIN__ +# if defined _WIN32 +# ifdef TRACY_UWP + static auto _GetThreadDescription = &::GetThreadDescription; +# else static auto _GetThreadDescription = (t_GetThreadDescription)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "GetThreadDescription" ); +# endif if( _GetThreadDescription ) { auto hnd = OpenThread( THREAD_QUERY_LIMITED_INFORMATION, FALSE, (DWORD)id ); @@ -210,7 +236,7 @@ TRACY_API const char* GetThreadName( uint64_t id ) int tid = (int) syscall( SYS_gettid ); # endif snprintf( path, sizeof( path ), 
"/proc/self/task/%d/comm", tid ); - sprintf( buf, "%" PRIu64, id ); + sprintf( buf, "%" PRIu32, id ); # ifndef __ANDROID__ pthread_setcancelstate( PTHREAD_CANCEL_DISABLE, &cs ); # endif @@ -232,8 +258,40 @@ TRACY_API const char* GetThreadName( uint64_t id ) return buf; # endif #endif - sprintf( buf, "%" PRIu64, id ); + sprintf( buf, "%" PRIu32, id ); return buf; } +TRACY_API const char* GetEnvVar( const char* name ) +{ +#if defined _WIN32 + // unfortunately getenv() on Windows is just fundamentally broken. It caches the entire + // environment block once on startup, then never refreshes it again. If any environment + // strings are added or modified after startup of the CRT, those changes will not be + // seen by getenv(). This removes the possibility of an app using this SDK from + // programmatically setting any of the behaviour controlling envvars here. + // + // To work around this, we'll instead go directly to the Win32 environment strings APIs + // to get the current value. + static char buffer[1024]; + DWORD const kBufferSize = DWORD(sizeof(buffer) / sizeof(buffer[0])); + DWORD count = GetEnvironmentVariableA(name, buffer, kBufferSize); + + if( count == 0 ) + return nullptr; + + if( count >= kBufferSize ) + { + char* buf = reinterpret_cast(_alloca(count + 1)); + count = GetEnvironmentVariableA(name, buf, count + 1); + memcpy(buffer, buf, kBufferSize); + buffer[kBufferSize - 1] = 0; + } + + return buffer; +#else + return getenv(name); +#endif +} + } diff --git a/Source/ThirdParty/tracy/common/TracySystem.hpp b/Source/ThirdParty/tracy/common/TracySystem.hpp index 8a699886b..edcc5cf31 100644 --- a/Source/ThirdParty/tracy/common/TracySystem.hpp +++ b/Source/ThirdParty/tracy/common/TracySystem.hpp @@ -32,7 +32,7 @@ enum class PlotFormatType : uint8_t Percentage }; -typedef void(*ParameterCallback)( uint32_t idx, int32_t val ); +typedef void(*ParameterCallback)( void* data, uint32_t idx, int32_t val ); struct TRACY_API SourceLocationData { @@ -80,20 +80,22 @@ private: namespace detail { -TRACY_API uint64_t GetThreadHandleImpl(); +TRACY_API uint32_t GetThreadHandleImpl(); } #ifdef TRACY_ENABLE -TRACY_API uint64_t GetThreadHandle(); +TRACY_API uint32_t GetThreadHandle(); #else -static inline uint64_t GetThreadHandle() +static inline uint32_t GetThreadHandle() { return detail::GetThreadHandleImpl(); } #endif TRACY_API void SetThreadName( const char* name ); -TRACY_API const char* GetThreadName( uint64_t id ); +TRACY_API const char* GetThreadName( uint32_t id ); + +TRACY_API const char* GetEnvVar(const char* name); } diff --git a/Source/ThirdParty/tracy/common/TracyUwp.hpp b/Source/ThirdParty/tracy/common/TracyUwp.hpp new file mode 100644 index 000000000..7dce96b96 --- /dev/null +++ b/Source/ThirdParty/tracy/common/TracyUwp.hpp @@ -0,0 +1,11 @@ +#ifndef __TRACYUWP_HPP__ +#define __TRACYUWP_HPP__ + +#ifdef _WIN32 +# include +# if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) && !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) +# define TRACY_UWP +# endif +#endif + +#endif diff --git a/Source/ThirdParty/tracy/common/TracyVersion.hpp b/Source/ThirdParty/tracy/common/TracyVersion.hpp new file mode 100644 index 000000000..983d1c51f --- /dev/null +++ b/Source/ThirdParty/tracy/common/TracyVersion.hpp @@ -0,0 +1,14 @@ +#ifndef __TRACYVERSION_HPP__ +#define __TRACYVERSION_HPP__ + +namespace tracy +{ +namespace Version +{ +enum { Major = 0 }; +enum { Minor = 9 }; +enum { Patch = 0 }; +} +} + +#endif diff --git a/Source/ThirdParty/tracy/common/TracyYield.hpp 
b/Source/ThirdParty/tracy/common/TracyYield.hpp new file mode 100644 index 000000000..703970dbb --- /dev/null +++ b/Source/ThirdParty/tracy/common/TracyYield.hpp @@ -0,0 +1,26 @@ +#ifndef __TRACYYIELD_HPP__ +#define __TRACYYIELD_HPP__ + +#if defined __SSE2__ || defined _M_AMD64 || (defined _M_IX86_FP && _M_IX86_FP == 2) +# include <emmintrin.h> +#else +# include <thread> +#endif + +namespace tracy +{ + +static tracy_force_inline void YieldThread() +{ +#if defined __SSE2__ || defined _M_AMD64 || (defined _M_IX86_FP && _M_IX86_FP == 2) + _mm_pause(); +#elif defined __aarch64__ + asm volatile( "isb" : : ); +#else + std::this_thread::yield(); +#endif +} + +} + +#endif diff --git a/Source/ThirdParty/tracy/libbacktrace/backtrace.hpp b/Source/ThirdParty/tracy/libbacktrace/backtrace.hpp index d999803c8..e4be297a9 100644 --- a/Source/ThirdParty/tracy/libbacktrace/backtrace.hpp +++ b/Source/ThirdParty/tracy/libbacktrace/backtrace.hpp @@ -53,13 +53,14 @@ struct backtrace_state; invalid after this function returns. As a special case, the ERRNUM argument will be passed as -1 if no - debug info can be found for the executable, but the function - requires debug info (e.g., backtrace_full, backtrace_pcinfo). The - MSG in this case will be something along the lines of "no debug - info". Similarly, ERRNUM will be passed as -1 if there is no - symbol table, but the function requires a symbol table (e.g., - backtrace_syminfo). This may be used as a signal that some other - approach should be tried. */ + debug info can be found for the executable, or if the debug info + exists but has an unsupported version, but the function requires + debug info (e.g., backtrace_full, backtrace_pcinfo). The MSG in + this case will be something along the lines of "no debug info". + Similarly, ERRNUM will be passed as -1 if there is no symbol table, + but the function requires a symbol table (e.g., backtrace_syminfo). + This may be used as a signal that some other approach should be + tried. */ typedef void (*backtrace_error_callback) (void *data, const char *msg, int errnum); diff --git a/Source/ThirdParty/tracy/libbacktrace/dwarf.cpp b/Source/ThirdParty/tracy/libbacktrace/dwarf.cpp index f76e03cfb..246cb9f36 100644 --- a/Source/ThirdParty/tracy/libbacktrace/dwarf.cpp +++ b/Source/ThirdParty/tracy/libbacktrace/dwarf.cpp @@ -52,6 +52,7 @@ enum dwarf_tag { DW_TAG_compile_unit = 0x11, DW_TAG_inlined_subroutine = 0x1d, DW_TAG_subprogram = 0x2e, + DW_TAG_skeleton_unit = 0x4a, }; enum dwarf_form { @@ -746,13 +747,13 @@ struct dwarf_data /* Report an error for a DWARF buffer. */ static void -dwarf_buf_error (struct dwarf_buf *buf, const char *msg) +dwarf_buf_error (struct dwarf_buf *buf, const char *msg, int errnum) { char b[200]; snprintf (b, sizeof b, "%s in %s at %d", msg, buf->name, (int) (buf->buf - buf->start)); - buf->error_callback (buf->data, b, 0); + buf->error_callback (buf->data, b, errnum); } /* Require at least COUNT bytes in BUF.
Return 1 if all is well, 0 on @@ -766,7 +767,7 @@ require (struct dwarf_buf *buf, size_t count) if (!buf->reported_underflow) { - dwarf_buf_error (buf, "DWARF underflow"); + dwarf_buf_error (buf, "DWARF underflow", 0); buf->reported_underflow = 1; } @@ -928,7 +929,7 @@ read_address (struct dwarf_buf *buf, int addrsize) case 8: return read_uint64 (buf); default: - dwarf_buf_error (buf, "unrecognized address size"); + dwarf_buf_error (buf, "unrecognized address size", 0); return 0; } } @@ -979,7 +980,7 @@ read_uleb128 (struct dwarf_buf *buf) ret |= ((uint64_t) (b & 0x7f)) << shift; else if (!overflow) { - dwarf_buf_error (buf, "LEB128 overflows uint64_t"); + dwarf_buf_error (buf, "LEB128 overflows uint64_t", 0); overflow = 1; } shift += 7; @@ -1014,7 +1015,7 @@ read_sleb128 (struct dwarf_buf *buf) val |= ((uint64_t) (b & 0x7f)) << shift; else if (!overflow) { - dwarf_buf_error (buf, "signed LEB128 overflows uint64_t"); + dwarf_buf_error (buf, "signed LEB128 overflows uint64_t", 0); overflow = 1; } shift += 7; @@ -1154,7 +1155,7 @@ read_attribute (enum dwarf_form form, uint64_t implicit_val, offset = read_offset (buf, is_dwarf64); if (offset >= dwarf_sections->size[DEBUG_STR]) { - dwarf_buf_error (buf, "DW_FORM_strp out of range"); + dwarf_buf_error (buf, "DW_FORM_strp out of range", 0); return 0; } val->encoding = ATTR_VAL_STRING; @@ -1169,7 +1170,7 @@ read_attribute (enum dwarf_form form, uint64_t implicit_val, offset = read_offset (buf, is_dwarf64); if (offset >= dwarf_sections->size[DEBUG_LINE_STR]) { - dwarf_buf_error (buf, "DW_FORM_line_strp out of range"); + dwarf_buf_error (buf, "DW_FORM_line_strp out of range", 0); return 0; } val->encoding = ATTR_VAL_STRING; @@ -1216,7 +1217,8 @@ read_attribute (enum dwarf_form form, uint64_t implicit_val, if (form == DW_FORM_implicit_const) { dwarf_buf_error (buf, - "DW_FORM_indirect to DW_FORM_implicit_const"); + "DW_FORM_indirect to DW_FORM_implicit_const", + 0); return 0; } return read_attribute ((enum dwarf_form) form, 0, buf, is_dwarf64, @@ -1349,7 +1351,7 @@ read_attribute (enum dwarf_form form, uint64_t implicit_val, } if (offset >= altlink->dwarf_sections.size[DEBUG_STR]) { - dwarf_buf_error (buf, "DW_FORM_strp_sup out of range"); + dwarf_buf_error (buf, "DW_FORM_strp_sup out of range", 0); return 0; } val->encoding = ATTR_VAL_STRING; @@ -1358,7 +1360,7 @@ read_attribute (enum dwarf_form form, uint64_t implicit_val, return 1; } default: - dwarf_buf_error (buf, "unrecognized DWARF form"); + dwarf_buf_error (buf, "unrecognized DWARF form", -1); return 0; } } @@ -1407,7 +1409,9 @@ resolve_string (const struct dwarf_sections *dwarf_sections, int is_dwarf64, offset = read_offset (&offset_buf, is_dwarf64); if (offset >= dwarf_sections->size[DEBUG_STR]) { - dwarf_buf_error (&offset_buf, "DW_FORM_strx offset out of range"); + dwarf_buf_error (&offset_buf, + "DW_FORM_strx offset out of range", + 0); return 0; } *string = (const char *) dwarf_sections->data[DEBUG_STR] + offset; @@ -2215,7 +2219,7 @@ add_ranges_from_rnglists ( break; default: - dwarf_buf_error (&rnglists_buf, "unrecognized DW_RLE value"); + dwarf_buf_error (&rnglists_buf, "unrecognized DW_RLE value", -1); return 0; } } @@ -2322,14 +2326,16 @@ find_address_ranges (struct backtrace_state *state, uintptr_t base_address, break; case DW_AT_stmt_list: - if (abbrev->tag == DW_TAG_compile_unit + if ((abbrev->tag == DW_TAG_compile_unit + || abbrev->tag == DW_TAG_skeleton_unit) && (val.encoding == ATTR_VAL_UINT || val.encoding == ATTR_VAL_REF_SECTION)) u->lineoff = val.u.uint; break; case 
DW_AT_name: - if (abbrev->tag == DW_TAG_compile_unit) + if (abbrev->tag == DW_TAG_compile_unit + || abbrev->tag == DW_TAG_skeleton_unit) { name_val = val; have_name_val = 1; @@ -2337,7 +2343,8 @@ find_address_ranges (struct backtrace_state *state, uintptr_t base_address, break; case DW_AT_comp_dir: - if (abbrev->tag == DW_TAG_compile_unit) + if (abbrev->tag == DW_TAG_compile_unit + || abbrev->tag == DW_TAG_skeleton_unit) { comp_dir_val = val; have_comp_dir_val = 1; @@ -2345,19 +2352,22 @@ find_address_ranges (struct backtrace_state *state, uintptr_t base_address, break; case DW_AT_str_offsets_base: - if (abbrev->tag == DW_TAG_compile_unit + if ((abbrev->tag == DW_TAG_compile_unit + || abbrev->tag == DW_TAG_skeleton_unit) && val.encoding == ATTR_VAL_REF_SECTION) u->str_offsets_base = val.u.uint; break; case DW_AT_addr_base: - if (abbrev->tag == DW_TAG_compile_unit + if ((abbrev->tag == DW_TAG_compile_unit + || abbrev->tag == DW_TAG_skeleton_unit) && val.encoding == ATTR_VAL_REF_SECTION) u->addr_base = val.u.uint; break; case DW_AT_rnglists_base: - if (abbrev->tag == DW_TAG_compile_unit + if ((abbrev->tag == DW_TAG_compile_unit + || abbrev->tag == DW_TAG_skeleton_unit) && val.encoding == ATTR_VAL_REF_SECTION) u->rnglists_base = val.u.uint; break; @@ -2385,7 +2395,8 @@ find_address_ranges (struct backtrace_state *state, uintptr_t base_address, } if (abbrev->tag == DW_TAG_compile_unit - || abbrev->tag == DW_TAG_subprogram) + || abbrev->tag == DW_TAG_subprogram + || abbrev->tag == DW_TAG_skeleton_unit) { if (!add_ranges (state, dwarf_sections, base_address, is_bigendian, u, pcrange.lowpc, &pcrange, @@ -2393,9 +2404,10 @@ find_address_ranges (struct backtrace_state *state, uintptr_t base_address, (void *) addrs)) return 0; - /* If we found the PC range in the DW_TAG_compile_unit, we - can stop now. */ - if (abbrev->tag == DW_TAG_compile_unit + /* If we found the PC range in the DW_TAG_compile_unit or + DW_TAG_skeleton_unit, we can stop now. */ + if ((abbrev->tag == DW_TAG_compile_unit + || abbrev->tag == DW_TAG_skeleton_unit) && (pcrange.have_ranges || (pcrange.have_lowpc && pcrange.have_highpc))) return 1; @@ -2482,7 +2494,7 @@ build_address_map (struct backtrace_state *state, uintptr_t base_address, version = read_uint16 (&unit_buf); if (version < 2 || version > 5) { - dwarf_buf_error (&unit_buf, "unrecognized DWARF version"); + dwarf_buf_error (&unit_buf, "unrecognized DWARF version", -1); goto fail; } @@ -2554,6 +2566,9 @@ build_address_map (struct backtrace_state *state, uintptr_t base_address, u->comp_dir = NULL; u->abs_filename = NULL; u->lineoff = 0; + u->str_offsets_base = 0; + u->addr_base = 0; + u->rnglists_base = 0; /* The actual line number mappings will be read as needed. 
*/ u->lines = NULL; @@ -2761,7 +2776,8 @@ read_v2_paths (struct backtrace_state *state, struct unit *u, { dwarf_buf_error (hdr_buf, ("invalid directory index in " - "line number program header")); + "line number program header"), + 0); return 0; } dir_len = strlen (dir); @@ -2830,7 +2846,8 @@ read_lnct (struct backtrace_state *state, struct dwarf_data *ddata, { dwarf_buf_error (hdr_buf, ("invalid directory index in " - "line number program header")); + "line number program header"), + 0); return 0; } dir = hdr->dirs[val.u.uint]; @@ -2845,7 +2862,8 @@ read_lnct (struct backtrace_state *state, struct dwarf_data *ddata, if (path == NULL) { dwarf_buf_error (hdr_buf, - "missing file name in line number program header"); + "missing file name in line number program header", + 0); return 0; } @@ -2972,7 +2990,7 @@ read_line_header (struct backtrace_state *state, struct dwarf_data *ddata, hdr->version = read_uint16 (line_buf); if (hdr->version < 2 || hdr->version > 5) { - dwarf_buf_error (line_buf, "unsupported line number version"); + dwarf_buf_error (line_buf, "unsupported line number version", -1); return 0; } @@ -2986,7 +3004,8 @@ read_line_header (struct backtrace_state *state, struct dwarf_data *ddata, if (read_byte (line_buf) != 0) { dwarf_buf_error (line_buf, - "non-zero segment_selector_size not supported"); + "non-zero segment_selector_size not supported", + -1); return 0; } } @@ -3127,7 +3146,8 @@ read_line_program (struct backtrace_state *state, struct dwarf_data *ddata, { dwarf_buf_error (line_buf, ("invalid directory index " - "in line number program")); + "in line number program"), + 0); return 0; } dir_len = strlen (dir); @@ -3185,19 +3205,15 @@ read_line_program (struct backtrace_state *state, struct dwarf_data *ddata, uint64_t fileno; fileno = read_uleb128 (line_buf); - if (fileno == 0) - filename = ""; - else + if (fileno >= hdr->filenames_count) { - if (fileno >= hdr->filenames_count) - { - dwarf_buf_error (line_buf, - ("invalid file number in " - "line number program")); - return 0; - } - filename = hdr->filenames[fileno]; + dwarf_buf_error (line_buf, + ("invalid file number in " + "line number program"), + 0); + return 0; } + filename = hdr->filenames[fileno]; } break; case DW_LNS_set_column: @@ -3428,7 +3444,9 @@ read_referenced_name (struct dwarf_data *ddata, struct unit *u, code = read_uleb128 (&unit_buf); if (code == 0) { - dwarf_buf_error (&unit_buf, "invalid abstract origin or specification"); + dwarf_buf_error (&unit_buf, + "invalid abstract origin or specification", + 0); return NULL; } @@ -3601,7 +3619,8 @@ read_function_entry (struct backtrace_state *state, struct dwarf_data *ddata, /* The compile unit sets the base address for any address ranges in the function entries. 
*/ - if (abbrev->tag == DW_TAG_compile_unit + if ((abbrev->tag == DW_TAG_compile_unit + || abbrev->tag == DW_TAG_skeleton_unit) && abbrev->attrs[i].name == DW_AT_low_pc) { if (val.encoding == ATTR_VAL_ADDRESS) @@ -3623,20 +3642,15 @@ read_function_entry (struct backtrace_state *state, struct dwarf_data *ddata, case DW_AT_call_file: if (val.encoding == ATTR_VAL_UINT) { - if (val.u.uint == 0) - function->caller_filename = ""; - else + if (val.u.uint >= lhdr->filenames_count) { - if (val.u.uint >= lhdr->filenames_count) - { - dwarf_buf_error (unit_buf, - ("invalid file number in " - "DW_AT_call_file attribute")); - return 0; - } - function->caller_filename = - lhdr->filenames[val.u.uint]; + dwarf_buf_error (unit_buf, - ("invalid file number in " - "DW_AT_call_file attribute"), - 0); + return 0; } + function->caller_filename = lhdr->filenames[val.u.uint]; } break; @@ -3884,7 +3898,7 @@ read_function_info (struct backtrace_state *state, struct dwarf_data *ddata, Returns whatever CALLBACK returns, or 0 to keep going. */ static int -report_inlined_functions (uintptr_t pc, struct function *function, +report_inlined_functions (uintptr_t pc, struct function *function, const char* comp_dir, backtrace_full_callback callback, void *data, const char **filename, int *lineno) { @@ -3938,13 +3952,22 @@ report_inlined_functions (uintptr_t pc, struct function *function, inlined = match->function; /* Report any calls inlined into this one. */ - ret = report_inlined_functions (pc, inlined, callback, data, + ret = report_inlined_functions (pc, inlined, comp_dir, callback, data, filename, lineno); if (ret != 0) return ret; /* Report this inlined call. */ - ret = callback (data, pc, match->low, *filename, *lineno, inlined->name); + if (*filename[0] != '/' && comp_dir) + { + char buf[1024]; + snprintf (buf, 1024, "%s/%s", comp_dir, *filename); + ret = callback (data, pc, match->low, buf, *lineno, inlined->name); + } + else + { + ret = callback (data, pc, match->low, *filename, *lineno, inlined->name); + } if (ret != 0) return ret; @@ -4211,12 +4234,21 @@ dwarf_lookup_pc (struct backtrace_state *state, struct dwarf_data *ddata, filename = ln->filename; lineno = ln->lineno; - ret = report_inlined_functions (pc, function, callback, data, + ret = report_inlined_functions (pc, function, entry->u->comp_dir, callback, data, &filename, &lineno); if (ret != 0) return ret; - return callback (data, pc, fmatch->low, filename, lineno, function->name); + if (filename[0] != '/' && entry->u->comp_dir) + { + char buf[1024]; + snprintf (buf, 1024, "%s/%s", entry->u->comp_dir, filename); + return callback (data, pc, fmatch->low, buf, lineno, function->name); + } + else + { + return callback (data, pc, fmatch->low, filename, lineno, function->name); + } } diff --git a/Source/ThirdParty/tracy/libbacktrace/elf.cpp b/Source/ThirdParty/tracy/libbacktrace/elf.cpp index 50715bf95..9e62f090d 100644 --- a/Source/ThirdParty/tracy/libbacktrace/elf.cpp +++ b/Source/ThirdParty/tracy/libbacktrace/elf.cpp @@ -46,6 +46,9 @@ POSSIBILITY OF SUCH DAMAGE. */ #include "backtrace.hpp" #include "internal.hpp" +#include "../client/TracyFastVector.hpp" +#include "../common/TracyAlloc.hpp" + #ifndef S_ISLNK #ifndef S_IFLNK #define S_IFLNK 0120000 @@ -70,6 +73,10 @@ POSSIBILITY OF SUCH DAMAGE. */ namespace tracy { +#ifdef TRACY_DEBUGINFOD +int GetDebugInfoDescriptor( const char* buildid_data, size_t buildid_size, const char* filename ); +#endif + #if !defined(HAVE_DECL_STRNLEN) || !HAVE_DECL_STRNLEN /* If strnlen is not declared, provide our own version.
*/ @@ -867,6 +874,7 @@ elf_readlink (struct backtrace_state *state, const char *filename, static int elf_open_debugfile_by_buildid (struct backtrace_state *state, const char *buildid_data, size_t buildid_size, + const char *filename, backtrace_error_callback error_callback, void *data) { @@ -913,7 +921,14 @@ elf_open_debugfile_by_buildid (struct backtrace_state *state, That seems kind of pointless to me--why would it have the right name but not the right build ID?--so skipping the check. */ +#ifdef TRACY_DEBUGINFOD + if (ret == -1) + return GetDebugInfoDescriptor( buildid_data, buildid_size, filename ); + else + return ret; +#else return ret; +#endif } /* Try to open a file whose name is PREFIX (length PREFIX_LEN) @@ -1803,7 +1818,7 @@ elf_zlib_inflate (const unsigned char *pin, size_t sin, uint16_t *zdebug_table, /* An uncompressed block. */ /* If we've read ahead more than a byte, back up. */ - while (bits > 8) + while (bits >= 8) { --pin; bits -= 8; @@ -4429,7 +4444,7 @@ elf_add (struct backtrace_state *state, const char *filename, int descriptor, int d; d = elf_open_debugfile_by_buildid (state, buildid_data, buildid_size, - error_callback, data); + filename, error_callback, data); if (d >= 0) { int ret; @@ -4812,12 +4827,34 @@ struct phdr_data /* Callback passed to dl_iterate_phdr. Load debug info from shared libraries. */ +struct PhdrIterate +{ + char* dlpi_name; + ElfW(Addr) dlpi_addr; +}; +FastVector<PhdrIterate> s_phdrData(16); + +static int +phdr_callback_mock (struct dl_phdr_info *info, size_t size ATTRIBUTE_UNUSED, + void *pdata) +{ + auto ptr = s_phdrData.push_next(); + if (info->dlpi_name) + { + size_t sz = strlen (info->dlpi_name) + 1; + ptr->dlpi_name = (char*)tracy_malloc (sz); + memcpy (ptr->dlpi_name, info->dlpi_name, sz); + } + else ptr->dlpi_name = nullptr; + ptr->dlpi_addr = info->dlpi_addr; + return 0; +} + static int #ifdef __i386__ __attribute__ ((__force_align_arg_pointer__)) #endif -phdr_callback (struct dl_phdr_info *info, size_t size ATTRIBUTE_UNUSED, - void *pdata) +phdr_callback (struct PhdrIterate *info, void *pdata) { struct phdr_data *pd = (struct phdr_data *) pdata; const char *filename; @@ -4896,7 +4933,14 @@ backtrace_initialize (struct backtrace_state *state, const char *filename, pd.exe_filename = filename; pd.exe_descriptor = ret < 0 ?
descriptor : -1; - dl_iterate_phdr (phdr_callback, (void *) &pd); + assert (s_phdrData.empty()); + dl_iterate_phdr (phdr_callback_mock, nullptr); + for (auto& v : s_phdrData) + { + phdr_callback (&v, (void *) &pd); + tracy_free (v.dlpi_name); + } + s_phdrData.clear(); if (!state->threaded) { diff --git a/Source/ThirdParty/tracy/libbacktrace/macho.cpp b/Source/ThirdParty/tracy/libbacktrace/macho.cpp index cb50dc5f7..6cccdabaa 100644 --- a/Source/ThirdParty/tracy/libbacktrace/macho.cpp +++ b/Source/ThirdParty/tracy/libbacktrace/macho.cpp @@ -1271,7 +1271,7 @@ backtrace_initialize (struct backtrace_state *state, const char *filename, mff = macho_nodebug; if (!macho_add (state, name, d, 0, NULL, base_address, 0, error_callback, data, &mff, &mfs)) - return 0; + continue; if (mff != macho_nodebug) macho_fileline_fn = mff; @@ -1292,7 +1292,7 @@ backtrace_initialize (struct backtrace_state *state, const char *filename, else { if (found_sym) - backtrace_atomic_store_pointer (&state->syminfo_fn, macho_syminfo); + backtrace_atomic_store_pointer (&state->syminfo_fn, &macho_syminfo); else (void) __sync_bool_compare_and_swap (&state->syminfo_fn, NULL, macho_nosyms); @@ -1338,7 +1338,7 @@ backtrace_initialize (struct backtrace_state *state, const char *filename, else { if (found_sym) - backtrace_atomic_store_pointer (&state->syminfo_fn, macho_syminfo); + backtrace_atomic_store_pointer (&state->syminfo_fn, &macho_syminfo); else (void) __sync_bool_compare_and_swap (&state->syminfo_fn, NULL, macho_nosyms); diff --git a/Source/ThirdParty/tracy/tracy.Build.cs b/Source/ThirdParty/tracy/tracy.Build.cs index 76d8854f6..d6119a5ca 100644 --- a/Source/ThirdParty/tracy/tracy.Build.cs +++ b/Source/ThirdParty/tracy/tracy.Build.cs @@ -34,11 +34,12 @@ public class tracy : ThirdPartyModule options.SourcePaths.Clear(); options.SourceFiles.Clear(); - options.SourceFiles.Add(Path.Combine(FolderPath, "Tracy.h")); + options.SourceFiles.Add(Path.Combine(FolderPath, "tracy", "Tracy.hpp")); options.SourceFiles.Add(Path.Combine(FolderPath, "TracyClient.cpp")); options.PublicDefinitions.Add("TRACY_ENABLE"); options.PrivateDefinitions.Add("TRACY_NO_INVARIANT_CHECK"); + options.PrivateDefinitions.Add("TRACY_NO_FRAME_IMAGE"); if (options.Platform.Target == TargetPlatform.Windows) { options.PrivateDefinitions.Add("TRACY_DBGHELP_LOCK=DbgHelp"); @@ -54,7 +55,7 @@ public class tracy : ThirdPartyModule { base.GetFilesToDeploy(files); - files.Add(Path.Combine(FolderPath, "Tracy.h")); + files.Add(Path.Combine(FolderPath, "tracy", "Tracy.hpp")); files.Add(Path.Combine(FolderPath, "common", "TracySystem.hpp")); files.Add(Path.Combine(FolderPath, "client", "TracyCallstack.h")); } diff --git a/Source/ThirdParty/tracy/Tracy.h b/Source/ThirdParty/tracy/tracy/Tracy.hpp similarity index 69% rename from Source/ThirdParty/tracy/Tracy.h rename to Source/ThirdParty/tracy/tracy/Tracy.hpp index 0b3469cd5..8ef26d59e 100644 --- a/Source/ThirdParty/tracy/Tracy.h +++ b/Source/ThirdParty/tracy/tracy/Tracy.hpp @@ -24,12 +24,14 @@ #define ZoneColorV(x,y) #define ZoneValue(x) #define ZoneValueV(x,y) +#define ZoneIsActive false +#define ZoneIsActiveV(x) false #define FrameMark #define FrameMarkNamed(x) #define TracyPlot(x,y) -#define TracyPlotConfig(x,y) +#define TracyPlotConfig(x,y,z,w,a) #define TracyMessage(x,y) #define TracyMessageL(x) @@ -75,15 +77,19 @@ #define TracyMessageCS(x,y,z,w) #define TracyMessageLCS(x,y,z) -#define TracyParameterRegister(x) +#define TracySourceCallbackRegister(x,y) +#define TracyParameterRegister(x,y) #define 
TracyParameterSetup(x,y,z,w) +#define TracyFiberEnter(x) +#define TracyFiberLeave + #else #include -#include "common/TracySystem.hpp" -#include "client/TracyCallstack.h" +#include "../common/TracyQueue.hpp" +#include "../common/TracySystem.hpp" namespace tracy { @@ -91,10 +97,12 @@ class TRACY_API Profiler { public: static void SendFrameMark( const char* name ); + static void SendFrameMark( const char* name, QueueType type ); + static void SendFrameImage( const void* image, uint16_t w, uint16_t h, uint8_t offset, bool flip ); static void PlotData( const char* name, int64_t val ); static void PlotData( const char* name, float val ); static void PlotData( const char* name, double val ); - static void ConfigurePlot( const char* name, PlotFormatType type ); + static void ConfigurePlot( const char* name, PlotFormatType type, bool step, bool fill, uint32_t color ); static void Message( const char* txt, size_t size, int callstack ); static void Message( const char* txt, int callstack ); static void MessageColor( const char* txt, size_t size, uint32_t color, int callstack ); @@ -108,27 +116,29 @@ public: static void MemFreeNamed( const void* ptr, bool secure, const char* name ); static void MemAllocCallstackNamed( const void* ptr, size_t size, int depth, bool secure, const char* name ); static void MemFreeCallstackNamed( const void* ptr, int depth, bool secure, const char* name ); + static void SendCallstack( int depth ); static void ParameterRegister( ParameterCallback cb ); + static void ParameterRegister( ParameterCallback cb, void* data ); static void ParameterSetup( uint32_t idx, const char* name, bool isBool, int32_t val ); }; } #if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK -# define ZoneNamed( varname, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active ); -# define ZoneNamedN( varname, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active ); -# define ZoneNamedC( varname, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active ); -# define ZoneNamedNC( varname, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active ); +# define ZoneNamed( varname, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active ) +# define ZoneNamedN( varname, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active ) +# define ZoneNamedC( varname, color, active ) static constexpr 
tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active ) +# define ZoneNamedNC( varname, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active ) -# define ZoneTransient( varname, active ) tracy::ScopedZone varname( __LINE__, __FILE__, strlen( __FILE__ ), __FUNCTION__, strlen( __FUNCTION__ ), nullptr, 0, TRACY_CALLSTACK, active ); -# define ZoneTransientN( varname, name, active ) tracy::ScopedZone varname( __LINE__, __FILE__, strlen( __FILE__ ), __FUNCTION__, strlen( __FUNCTION__ ), name, strlen( name ), TRACY_CALLSTACK, active ); +# define ZoneTransient( varname, active ) tracy::ScopedZone varname( __LINE__, __FILE__, strlen( __FILE__ ), __FUNCTION__, strlen( __FUNCTION__ ), nullptr, 0, TRACY_CALLSTACK, active ) +# define ZoneTransientN( varname, name, active ) tracy::ScopedZone varname( __LINE__, __FILE__, strlen( __FILE__ ), __FUNCTION__, strlen( __FUNCTION__ ), name, strlen( name ), TRACY_CALLSTACK, active ) #else -# define ZoneNamed( varname, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), active ); -# define ZoneNamedN( varname, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), active ); -# define ZoneNamedC( varname, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), active ); -# define ZoneNamedNC( varname, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), active ); +# define ZoneNamed( varname, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), active ) +# define ZoneNamedN( varname, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), active ) +# define ZoneNamedC( varname, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), active ) +# define ZoneNamedNC( varname, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( 
&TracyConcat(__tracy_source_location,__LINE__), active ) -# define ZoneTransient( varname, active ) tracy::ScopedZone varname( __LINE__, __FILE__, strlen( __FILE__ ), __FUNCTION__, strlen( __FUNCTION__ ), nullptr, 0, active ); -# define ZoneTransientN( varname, name, active ) tracy::ScopedZone varname( __LINE__, __FILE__, strlen( __FILE__ ), __FUNCTION__, strlen( __FUNCTION__ ), name, strlen( name ), active ); +# define ZoneTransient( varname, active ) tracy::ScopedZone varname( __LINE__, __FILE__, strlen( __FILE__ ), __FUNCTION__, strlen( __FUNCTION__ ), nullptr, 0, active ) +# define ZoneTransientN( varname, name, active ) tracy::ScopedZone varname( __LINE__, __FILE__, strlen( __FILE__ ), __FUNCTION__, strlen( __FUNCTION__ ), name, strlen( name ), active ) #endif #define ZoneScoped ZoneNamed( ___tracy_scoped_zone, true ) @@ -136,83 +146,98 @@ public: #define ZoneScopedC( color ) ZoneNamedC( ___tracy_scoped_zone, color, true ) #define ZoneScopedNC( name, color ) ZoneNamedNC( ___tracy_scoped_zone, name, color, true ) -#define ZoneText( txt, size ) ___tracy_scoped_zone.Text( txt, size ); -#define ZoneTextV( varname, txt, size ) varname.Text( txt, size ); -#define ZoneName( txt, size ) ___tracy_scoped_zone.Name( txt, size ); -#define ZoneNameV( varname, txt, size ) varname.Name( txt, size ); -#define ZoneColor( color ) ___tracy_scoped_zone.Color( color ); -#define ZoneColorV( varname, color ) varname.Color( color ); -#define ZoneValue( value ) ___tracy_scoped_zone.Value( value ); -#define ZoneValueV( varname, value ) varname.Value( value ); +#define ZoneText( txt, size ) ___tracy_scoped_zone.Text( txt, size ) +#define ZoneTextV( varname, txt, size ) varname.Text( txt, size ) +#define ZoneName( txt, size ) ___tracy_scoped_zone.Name( txt, size ) +#define ZoneNameV( varname, txt, size ) varname.Name( txt, size ) +#define ZoneColor( color ) ___tracy_scoped_zone.Color( color ) +#define ZoneColorV( varname, color ) varname.Color( color ) +#define ZoneValue( value ) ___tracy_scoped_zone.Value( value ) +#define ZoneValueV( varname, value ) varname.Value( value ) +#define ZoneIsActive ___tracy_scoped_zone.IsActive() +#define ZoneIsActiveV( varname ) varname.IsActive() -#define FrameMark tracy::Profiler::SendFrameMark( nullptr ); -#define FrameMarkNamed( name ) tracy::Profiler::SendFrameMark( name ); +#define FrameMark tracy::Profiler::SendFrameMark( nullptr ) +#define FrameMarkNamed( name ) tracy::Profiler::SendFrameMark( name ) +#define FrameMarkStart( name ) tracy::Profiler::SendFrameMark( name, tracy::QueueType::FrameMarkMsgStart ) +#define FrameMarkEnd( name ) tracy::Profiler::SendFrameMark( name, tracy::QueueType::FrameMarkMsgEnd ) -#define TracyPlot( name, val ) tracy::Profiler::PlotData( name, val ); -#define TracyPlotConfig( name, type ) tracy::Profiler::ConfigurePlot( name, type ); +#define FrameImage( image, width, height, offset, flip ) tracy::Profiler::SendFrameImage( image, width, height, offset, flip ) -#define TracyAppInfo( txt, size ) tracy::Profiler::MessageAppInfo( txt, size ); +#define TracyLockable( type, varname ) tracy::Lockable<type> varname { [] () -> const tracy::SourceLocationData* { static constexpr tracy::SourceLocationData srcloc { nullptr, #type " " #varname, __FILE__, __LINE__, 0 }; return &srcloc; }() } +#define TracyLockableN( type, varname, desc ) tracy::Lockable<type> varname { [] () -> const tracy::SourceLocationData* { static constexpr tracy::SourceLocationData srcloc { nullptr, desc, __FILE__, __LINE__, 0 }; return &srcloc; }() } +#define TracySharedLockable( type, varname )
tracy::SharedLockable<type> varname { [] () -> const tracy::SourceLocationData* { static constexpr tracy::SourceLocationData srcloc { nullptr, #type " " #varname, __FILE__, __LINE__, 0 }; return &srcloc; }() } +#define TracySharedLockableN( type, varname, desc ) tracy::SharedLockable<type> varname { [] () -> const tracy::SourceLocationData* { static constexpr tracy::SourceLocationData srcloc { nullptr, desc, __FILE__, __LINE__, 0 }; return &srcloc; }() } +#define LockableBase( type ) tracy::Lockable<type> +#define SharedLockableBase( type ) tracy::SharedLockable<type> +#define LockMark( varname ) static constexpr tracy::SourceLocationData __tracy_lock_location_##varname { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; varname.Mark( &__tracy_lock_location_##varname ) +#define LockableName( varname, txt, size ) varname.CustomName( txt, size ) + +#define TracyPlot( name, val ) tracy::Profiler::PlotData( name, val ) +#define TracyPlotConfig( name, type, step, fill, color ) tracy::Profiler::ConfigurePlot( name, type, step, fill, color ) + +#define TracyAppInfo( txt, size ) tracy::Profiler::MessageAppInfo( txt, size ) #if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK -# define TracyMessage( txt, size ) tracy::Profiler::Message( txt, size, TRACY_CALLSTACK ); -# define TracyMessageL( txt ) tracy::Profiler::Message( txt, TRACY_CALLSTACK ); -# define TracyMessageC( txt, size, color ) tracy::Profiler::MessageColor( txt, size, color, TRACY_CALLSTACK ); -# define TracyMessageLC( txt, color ) tracy::Profiler::MessageColor( txt, color, TRACY_CALLSTACK ); +# define TracyMessage( txt, size ) tracy::Profiler::Message( txt, size, TRACY_CALLSTACK ) +# define TracyMessageL( txt ) tracy::Profiler::Message( txt, TRACY_CALLSTACK ) +# define TracyMessageC( txt, size, color ) tracy::Profiler::MessageColor( txt, size, color, TRACY_CALLSTACK ) +# define TracyMessageLC( txt, color ) tracy::Profiler::MessageColor( txt, color, TRACY_CALLSTACK ) -# define TracyAlloc( ptr, size ) tracy::Profiler::MemAllocCallstack( ptr, size, TRACY_CALLSTACK, false ); -# define TracyFree( ptr ) tracy::Profiler::MemFreeCallstack( ptr, TRACY_CALLSTACK, false ); -# define TracySecureAlloc( ptr, size ) tracy::Profiler::MemAllocCallstack( ptr, size, TRACY_CALLSTACK, true ); -# define TracySecureFree( ptr ) tracy::Profiler::MemFreeCallstack( ptr, TRACY_CALLSTACK, true ); +# define TracyAlloc( ptr, size ) tracy::Profiler::MemAllocCallstack( ptr, size, TRACY_CALLSTACK, false ) +# define TracyFree( ptr ) tracy::Profiler::MemFreeCallstack( ptr, TRACY_CALLSTACK, false ) +# define TracySecureAlloc( ptr, size ) tracy::Profiler::MemAllocCallstack( ptr, size, TRACY_CALLSTACK, true ) +# define TracySecureFree( ptr ) tracy::Profiler::MemFreeCallstack( ptr, TRACY_CALLSTACK, true ) -# define TracyAllocN( ptr, size, name ) tracy::Profiler::MemAllocCallstackNamed( ptr, size, TRACY_CALLSTACK, false, name ); -# define TracyFreeN( ptr, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, TRACY_CALLSTACK, false, name ); -# define TracySecureAllocN( ptr, size, name ) tracy::Profiler::MemAllocCallstackNamed( ptr, size, TRACY_CALLSTACK, true, name ); -# define TracySecureFreeN( ptr, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, TRACY_CALLSTACK, true, name ); +# define TracyAllocN( ptr, size, name ) tracy::Profiler::MemAllocCallstackNamed( ptr, size, TRACY_CALLSTACK, false, name ) +# define TracyFreeN( ptr, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, TRACY_CALLSTACK, false, name ) +# define TracySecureAllocN( ptr, size, name )
tracy::Profiler::MemAllocCallstackNamed( ptr, size, TRACY_CALLSTACK, true, name ) +# define TracySecureFreeN( ptr, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, TRACY_CALLSTACK, true, name ) #else -# define TracyMessage( txt, size ) tracy::Profiler::Message( txt, size, 0 ); -# define TracyMessageL( txt ) tracy::Profiler::Message( txt, 0 ); -# define TracyMessageC( txt, size, color ) tracy::Profiler::MessageColor( txt, size, color, 0 ); -# define TracyMessageLC( txt, color ) tracy::Profiler::MessageColor( txt, color, 0 ); +# define TracyMessage( txt, size ) tracy::Profiler::Message( txt, size, 0 ) +# define TracyMessageL( txt ) tracy::Profiler::Message( txt, 0 ) +# define TracyMessageC( txt, size, color ) tracy::Profiler::MessageColor( txt, size, color, 0 ) +# define TracyMessageLC( txt, color ) tracy::Profiler::MessageColor( txt, color, 0 ) -# define TracyAlloc( ptr, size ) tracy::Profiler::MemAlloc( ptr, size, false ); -# define TracyFree( ptr ) tracy::Profiler::MemFree( ptr, false ); -# define TracySecureAlloc( ptr, size ) tracy::Profiler::MemAlloc( ptr, size, true ); -# define TracySecureFree( ptr ) tracy::Profiler::MemFree( ptr, true ); +# define TracyAlloc( ptr, size ) tracy::Profiler::MemAlloc( ptr, size, false ) +# define TracyFree( ptr ) tracy::Profiler::MemFree( ptr, false ) +# define TracySecureAlloc( ptr, size ) tracy::Profiler::MemAlloc( ptr, size, true ) +# define TracySecureFree( ptr ) tracy::Profiler::MemFree( ptr, true ) -# define TracyAllocN( ptr, size, name ) tracy::Profiler::MemAllocNamed( ptr, size, false, name ); -# define TracyFreeN( ptr, name ) tracy::Profiler::MemFreeNamed( ptr, false, name ); -# define TracySecureAllocN( ptr, size, name ) tracy::Profiler::MemAllocNamed( ptr, size, true, name ); -# define TracySecureFreeN( ptr, name ) tracy::Profiler::MemFreeNamed( ptr, true, name ); +# define TracyAllocN( ptr, size, name ) tracy::Profiler::MemAllocNamed( ptr, size, false, name ) +# define TracyFreeN( ptr, name ) tracy::Profiler::MemFreeNamed( ptr, false, name ) +# define TracySecureAllocN( ptr, size, name ) tracy::Profiler::MemAllocNamed( ptr, size, true, name ) +# define TracySecureFreeN( ptr, name ) tracy::Profiler::MemFreeNamed( ptr, true, name ) #endif #ifdef TRACY_HAS_CALLSTACK -# define ZoneNamedS( varname, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), depth, active ); -# define ZoneNamedNS( varname, name, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), depth, active ); -# define ZoneNamedCS( varname, color, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), depth, active ); -# define ZoneNamedNCS( varname, name, color, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), depth, active ); +# define ZoneNamedS( varname, depth, active ) static constexpr tracy::SourceLocationData 
TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), depth, active ) +# define ZoneNamedNS( varname, name, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), depth, active ) +# define ZoneNamedCS( varname, color, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), depth, active ) +# define ZoneNamedNCS( varname, name, color, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), depth, active ) -# define ZoneTransientS( varname, depth, active ) tracy::ScopedZone varname( __LINE__, __FILE__, strlen( __FILE__ ), __FUNCTION__, strlen( __FUNCTION__ ), nullptr, 0, depth, active ); -# define ZoneTransientNS( varname, name, depth, active ) tracy::ScopedZone varname( __LINE__, __FILE__, strlen( __FILE__ ), __FUNCTION__, strlen( __FUNCTION__ ), name, strlen( name ), depth, active ); +# define ZoneTransientS( varname, depth, active ) tracy::ScopedZone varname( __LINE__, __FILE__, strlen( __FILE__ ), __FUNCTION__, strlen( __FUNCTION__ ), nullptr, 0, depth, active ) +# define ZoneTransientNS( varname, name, depth, active ) tracy::ScopedZone varname( __LINE__, __FILE__, strlen( __FILE__ ), __FUNCTION__, strlen( __FUNCTION__ ), name, strlen( name ), depth, active ) # define ZoneScopedS( depth ) ZoneNamedS( ___tracy_scoped_zone, depth, true ) # define ZoneScopedNS( name, depth ) ZoneNamedNS( ___tracy_scoped_zone, name, depth, true ) # define ZoneScopedCS( color, depth ) ZoneNamedCS( ___tracy_scoped_zone, color, depth, true ) # define ZoneScopedNCS( name, color, depth ) ZoneNamedNCS( ___tracy_scoped_zone, name, color, depth, true ) -# define TracyAllocS( ptr, size, depth ) tracy::Profiler::MemAllocCallstack( ptr, size, depth, false ); -# define TracyFreeS( ptr, depth ) tracy::Profiler::MemFreeCallstack( ptr, depth, false ); -# define TracySecureAllocS( ptr, size, depth ) tracy::Profiler::MemAllocCallstack( ptr, size, depth, true ); -# define TracySecureFreeS( ptr, depth ) tracy::Profiler::MemFreeCallstack( ptr, depth, true ); +# define TracyAllocS( ptr, size, depth ) tracy::Profiler::MemAllocCallstack( ptr, size, depth, false ) +# define TracyFreeS( ptr, depth ) tracy::Profiler::MemFreeCallstack( ptr, depth, false ) +# define TracySecureAllocS( ptr, size, depth ) tracy::Profiler::MemAllocCallstack( ptr, size, depth, true ) +# define TracySecureFreeS( ptr, depth ) tracy::Profiler::MemFreeCallstack( ptr, depth, true ) -# define TracyAllocNS( ptr, size, depth, name ) tracy::Profiler::MemAllocCallstackNamed( ptr, size, depth, false, name ); -# define TracyFreeNS( ptr, depth, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, depth, false, name ); -# define TracySecureAllocNS( ptr, size, depth, name ) tracy::Profiler::MemAllocCallstackNamed( ptr, size, depth, true, name ); -# define TracySecureFreeNS( ptr, depth, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, depth, true, name ); +# define TracyAllocNS( ptr, size, depth, name 
) tracy::Profiler::MemAllocCallstackNamed( ptr, size, depth, false, name ) +# define TracyFreeNS( ptr, depth, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, depth, false, name ) +# define TracySecureAllocNS( ptr, size, depth, name ) tracy::Profiler::MemAllocCallstackNamed( ptr, size, depth, true, name ) +# define TracySecureFreeNS( ptr, depth, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, depth, true, name ) -# define TracyMessageS( txt, size, depth ) tracy::Profiler::Message( txt, size, depth ); -# define TracyMessageLS( txt, depth ) tracy::Profiler::Message( txt, depth ); -# define TracyMessageCS( txt, size, color, depth ) tracy::Profiler::MessageColor( txt, size, color, depth ); -# define TracyMessageLCS( txt, color, depth ) tracy::Profiler::MessageColor( txt, color, depth ); +# define TracyMessageS( txt, size, depth ) tracy::Profiler::Message( txt, size, depth ) +# define TracyMessageLS( txt, depth ) tracy::Profiler::Message( txt, depth ) +# define TracyMessageCS( txt, size, color, depth ) tracy::Profiler::MessageColor( txt, size, color, depth ) +# define TracyMessageLCS( txt, color, depth ) tracy::Profiler::MessageColor( txt, color, depth ) #else # define ZoneNamedS( varname, depth, active ) ZoneNamed( varname, active ) # define ZoneNamedNS( varname, name, depth, active ) ZoneNamedN( varname, name, active ) @@ -232,10 +257,10 @@ public: # define TracySecureAllocS( ptr, size, depth ) TracySecureAlloc( ptr, size ) # define TracySecureFreeS( ptr, depth ) TracySecureFree( ptr ) -# define TracyAllocNS( ptr, size, depth, name ) TracyAlloc( ptr, size, name ) -# define TracyFreeNS( ptr, depth, name ) TracyFree( ptr, name ) -# define TracySecureAllocNS( ptr, size, depth, name ) TracySecureAlloc( ptr, size, name ) -# define TracySecureFreeNS( ptr, depth, name ) TracySecureFree( ptr, name ) +# define TracyAllocNS( ptr, size, depth, name ) TracyAllocN( ptr, size, name ) +# define TracyFreeNS( ptr, depth, name ) TracyFreeN( ptr, name ) +# define TracySecureAllocNS( ptr, size, depth, name ) TracySecureAllocN( ptr, size, name ) +# define TracySecureFreeNS( ptr, depth, name ) TracySecureFreeN( ptr, name ) # define TracyMessageS( txt, size, depth ) TracyMessage( txt, size ) # define TracyMessageLS( txt, depth ) TracyMessageL( txt ) @@ -243,8 +268,15 @@ public: # define TracyMessageLCS( txt, color, depth ) TracyMessageLC( txt, color ) #endif -#define TracyParameterRegister( cb ) tracy::Profiler::ParameterRegister( cb ); -#define TracyParameterSetup( idx, name, isBool, val ) tracy::Profiler::ParameterSetup( idx, name, isBool, val ); +#define TracySourceCallbackRegister( cb, data ) tracy::Profiler::SourceCallbackRegister( cb, data ) +#define TracyParameterRegister( cb, data ) tracy::Profiler::ParameterRegister( cb, data ) +#define TracyParameterSetup( idx, name, isBool, val ) tracy::Profiler::ParameterSetup( idx, name, isBool, val ) +#define TracyIsConnected tracy::GetProfiler().IsConnected() + +#ifdef TRACY_FIBERS +# define TracyFiberEnter( fiber ) tracy::Profiler::EnterFiber( fiber ) +# define TracyFiberLeave tracy::Profiler::LeaveFiber() +#endif #endif
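For reference, a minimal usage sketch of the client API after this patch (not part of the diff). It assumes TRACY_ENABLE is defined by the build and that the renamed tracy/Tracy.hpp header is on the include path; the zone, plot, and frame names below are illustrative only.

#include "tracy/Tracy.hpp"

void InitProfiling()
{
    // New five-argument plot configuration: format, step, fill, color.
    TracyPlotConfig( "Particles", tracy::PlotFormatType::Number, false, true, 0 );
}

void UpdateParticles()
{
    ZoneScopedN( "UpdateParticles" );   // named CPU zone; the macros no longer supply a trailing semicolon
    if( ZoneIsActive )                  // new: query whether this zone is actually being recorded
        ZoneValue( 1024 );
    TracyPlot( "Particles", 1024.0 );   // report a per-frame value
}

void RenderFrame()
{
    FrameMarkStart( "Render" );         // new: paired frame-region marks
    UpdateParticles();
    FrameMarkEnd( "Render" );
    FrameMark;                          // end-of-frame mark, unchanged
}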