diff --git a/Source/Engine/Engine/EngineService.cpp b/Source/Engine/Engine/EngineService.cpp
index ac58ba40c..9a01b08b5 100644
--- a/Source/Engine/Engine/EngineService.cpp
+++ b/Source/Engine/Engine/EngineService.cpp
@@ -4,7 +4,7 @@
 #include "Engine/Core/Log.h"
 #include "Engine/Core/Collections/Array.h"
 #include "Engine/Core/Collections/Sorting.h"
-#include
+#include
 
 static bool CompareEngineServices(EngineService* const& a, EngineService* const& b)
 {
diff --git a/Source/Engine/Platform/Base/ThreadBase.cpp b/Source/Engine/Platform/Base/ThreadBase.cpp
index 5739dbf3e..be45ceeab 100644
--- a/Source/Engine/Platform/Base/ThreadBase.cpp
+++ b/Source/Engine/Platform/Base/ThreadBase.cpp
@@ -7,7 +7,7 @@
 #include "Engine/Scripting/ManagedCLR/MCore.h"
 #if TRACY_ENABLE
 #include "Engine/Core/Math/Math.h"
-#include
+#include
 #endif
 
 Delegate ThreadBase::ThreadStarting;
diff --git a/Source/Engine/Profiler/ProfilerCPU.h b/Source/Engine/Profiler/ProfilerCPU.h
index 4525b6714..000e41ccb 100644
--- a/Source/Engine/Profiler/ProfilerCPU.h
+++ b/Source/Engine/Profiler/ProfilerCPU.h
@@ -8,7 +8,7 @@
 #include "Engine/Core/Math/Math.h"
 #include "Engine/Core/Collections/Array.h"
 #include "Engine/Scripting/ScriptingType.h"
-#include
+#include
 
 #if COMPILE_WITH_PROFILER
diff --git a/Source/Engine/Scripting/Plugins/PluginManager.cpp b/Source/Engine/Scripting/Plugins/PluginManager.cpp
index 4d205d6d6..16b0c5483 100644
--- a/Source/Engine/Scripting/Plugins/PluginManager.cpp
+++ b/Source/Engine/Scripting/Plugins/PluginManager.cpp
@@ -127,7 +127,7 @@ void PluginManagerService::InvokeDeinitialize(Plugin* plugin)
         return;
     StringAnsiView typeName = plugin->GetType().GetName();
     PROFILE_CPU();
-    ZoneName(typeName.Get(), typeName.Length())
+    ZoneName(typeName.Get(), typeName.Length());
 
     LOG(Info, "Unloading plugin {}", plugin->ToString());
diff --git a/Source/ThirdParty/tracy/TracyClient.cpp b/Source/ThirdParty/tracy/TracyClient.cpp
index 7e170e5de..3548c5752 100644
--- a/Source/ThirdParty/tracy/TracyClient.cpp
+++ b/Source/ThirdParty/tracy/TracyClient.cpp
@@ -26,6 +26,8 @@
 #include "client/TracySysTrace.cpp"
 #include "common/TracySocket.cpp"
 #include "client/tracy_rpmalloc.cpp"
+#include "client/TracyAlloc.cpp"
+
 #if TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 3 || TRACY_HAS_CALLSTACK == 4 || TRACY_HAS_CALLSTACK == 6
 #  include "libbacktrace/alloc.cpp"
@@ -40,6 +42,7 @@
 #  else
 #    include "libbacktrace/elf.cpp"
 #  endif
+#  include "common/TracyStackFrames.cpp"
 #endif
 
 #ifdef _MSC_VER
diff --git a/Source/ThirdParty/tracy/client/TracyAlloc.cpp b/Source/ThirdParty/tracy/client/TracyAlloc.cpp
new file mode 100644
index 000000000..545a6062b
--- /dev/null
+++ b/Source/ThirdParty/tracy/client/TracyAlloc.cpp
@@ -0,0 +1,42 @@
+#include "../common/TracyAlloc.hpp"
+
+#ifdef TRACY_USE_RPMALLOC
+
+#include <atomic>
+
+#include "../common/TracyYield.hpp"
+
+namespace tracy
+{
+
+extern thread_local bool RpThreadInitDone;
+extern std::atomic<int> RpInitDone;
+extern std::atomic<int> RpInitLock;
+
+tracy_no_inline static void InitRpmallocPlumbing()
+{
+    const auto done = RpInitDone.load( std::memory_order_acquire );
+    if( !done )
+    {
+        int expected = 0;
+        while( !RpInitLock.compare_exchange_weak( expected, 1, std::memory_order_release, std::memory_order_relaxed ) ) { expected = 0; YieldThread(); }
+        const auto done = RpInitDone.load( std::memory_order_acquire );
+        if( !done )
+        {
+            rpmalloc_initialize();
+            RpInitDone.store( 1, std::memory_order_release );
+        }
+        RpInitLock.store( 0, std::memory_order_release );
+    }
+    rpmalloc_thread_initialize();
RpThreadInitDone = true; +} + +TRACY_API void InitRpmalloc() +{ + if( !RpThreadInitDone ) InitRpmallocPlumbing(); +} + +} + +#endif diff --git a/Source/ThirdParty/tracy/client/TracyArmCpuTable.hpp b/Source/ThirdParty/tracy/client/TracyArmCpuTable.hpp index ff7d976c8..2b4459764 100644 --- a/Source/ThirdParty/tracy/client/TracyArmCpuTable.hpp +++ b/Source/ThirdParty/tracy/client/TracyArmCpuTable.hpp @@ -38,7 +38,7 @@ static const char* DecodeArmPart( uint32_t impl, uint32_t part ) static char buf[16]; switch( impl ) { - case 0x41: + case 0x41: // ARM switch( part ) { case 0x810: return "810"; @@ -61,8 +61,8 @@ static const char* DecodeArmPart( uint32_t impl, uint32_t part ) case 0xc09: return " Cortex-A9"; case 0xc0c: return " Cortex-A12"; case 0xc0d: return " Rockchip RK3288"; - case 0xc0f: return " Cortex-A15"; case 0xc0e: return " Cortex-A17"; + case 0xc0f: return " Cortex-A15"; case 0xc14: return " Cortex-R4"; case 0xc15: return " Cortex-R5"; case 0xc17: return " Cortex-R7"; @@ -92,14 +92,21 @@ static const char* DecodeArmPart( uint32_t impl, uint32_t part ) case 0xd13: return " Cortex-R52"; case 0xd20: return " Cortex-M23"; case 0xd21: return " Cortex-M33"; - case 0xd40: return " Zeus"; + case 0xd22: return " Cortex-M55"; + case 0xd40: return " Neoverse V1"; case 0xd41: return " Cortex-A78"; + case 0xd42: return " Cortex-A78AE"; case 0xd43: return " Cortex-A65AE"; case 0xd44: return " Cortex-X1"; + case 0xd47: return " Cortex-A710"; + case 0xd48: return " Cortex-X2"; + case 0xd49: return " Neoverse N2"; case 0xd4a: return " Neoverse E1"; + case 0xd4b: return " Cortex-A78C"; + case 0xd4c: return " Cortex-X1C"; default: break; } - case 0x42: + case 0x42: // Broadcom switch( part ) { case 0xf: return " Brahma B15"; @@ -107,7 +114,7 @@ static const char* DecodeArmPart( uint32_t impl, uint32_t part ) case 0x516: return " ThunderX2"; default: break; } - case 0x43: + case 0x43: // Cavium switch( part ) { case 0xa0: return " ThunderX"; @@ -121,30 +128,31 @@ static const char* DecodeArmPart( uint32_t impl, uint32_t part ) case 0xb3: return " OcteonTX2 F95"; case 0xb4: return " OcteonTX2 F95N"; case 0xb5: return " OcteonTX2 F95MM"; + case 0xb6: return " OcteonTX2 F95O"; case 0xb8: return " ThunderX3 T110"; default: break; } - case 0x44: + case 0x44: // DEC switch( part ) { case 0xa10: return " SA110"; case 0xa11: return " SA1100"; default: break; } - case 0x46: + case 0x46: // Fujitsu switch( part ) { case 0x1: return " A64FX"; default: break; } - case 0x48: + case 0x48: // HiSilicon switch( part ) { case 0xd01: return " TSV100"; case 0xd40: return " Kirin 980"; default: break; } - case 0x4e: + case 0x4e: // Nvidia switch( part ) { case 0x0: return " Denver"; @@ -152,13 +160,13 @@ static const char* DecodeArmPart( uint32_t impl, uint32_t part ) case 0x4: return " Carmel"; default: break; } - case 0x50: + case 0x50: // Applied Micro switch( part ) { case 0x0: return " X-Gene"; default: break; } - case 0x51: + case 0x51: // Qualcomm switch( part ) { case 0xf: return " Scorpion"; @@ -174,18 +182,27 @@ static const char* DecodeArmPart( uint32_t impl, uint32_t part ) case 0x802: return " Kryo 385 Gold"; case 0x803: return " Kryo 385 Silver"; case 0x804: return " Kryo 485 Gold"; + case 0x805: return " Kryo 4xx/5xx Silver"; case 0xc00: return " Falkor"; case 0xc01: return " Saphira"; default: break; } - case 0x53: + case 0x53: // Samsung switch( part ) { case 0x1: return " Exynos M1/M2"; case 0x2: return " Exynos M3"; + case 0x3: return " Exynos M4"; + case 0x4: return " Exynos M5"; default: break; } - 
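Note: DecodeArmPart() keys on the CPU implementer and part-number pair whose vendors the new comments spell out (0x41 = ARM, 0x61 = Apple, 0xc0 = Ampere Computing, and so on). As a minimal sketch, assuming the standard Arm MIDR_EL1 field layout, this is how such a pair is typically extracted from a raw MIDR value before a table lookup like the one above; the SplitMidr helper is illustrative and not part of this patch:

    #include <cstdint>

    // MIDR_EL1: bits [31:24] = implementer, bits [15:4] = primary part number.
    static void SplitMidr( uint64_t midr, uint32_t& impl, uint32_t& part )
    {
        impl = uint32_t( ( midr >> 24 ) & 0xFF );  // e.g. 0x41 for ARM Ltd, 0x61 for Apple
        part = uint32_t( ( midr >> 4 ) & 0xFFF );  // e.g. 0xd41 for Cortex-A78
    }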
case 0x56: + case 0x54: // Texas Instruments + switch( part ) + { + case 0x925: return " TI925"; + default: break; + } + case 0x56: // Marvell switch( part ) { case 0x131: return " Feroceon 88FR131"; @@ -193,7 +210,7 @@ static const char* DecodeArmPart( uint32_t impl, uint32_t part ) case 0x584: return " PJ4B-MP / PJ4C"; default: break; } - case 0x61: + case 0x61: // Apple switch( part ) { case 0x1: return " Cyclone"; @@ -203,21 +220,33 @@ static const char* DecodeArmPart( uint32_t impl, uint32_t part ) case 0x5: return " Twister/Elba/Malta"; case 0x6: return " Hurricane"; case 0x7: return " Hurricane/Myst"; + case 0x22: return " M1 Icestorm"; + case 0x23: return " M1 Firestorm"; + case 0x24: return " M1 Icestorm Pro"; + case 0x25: return " M1 Firestorm Pro"; + case 0x28: return " M1 Icestorm Max"; + case 0x29: return " M1 Firestorm Max"; default: break; } - case 0x66: + case 0x66: // Faraday switch( part ) { case 0x526: return " FA526"; case 0x626: return " FA626"; default: break; } - case 0x68: + case 0x68: // HXT switch( part ) { case 0x0: return " Phecda"; default: break; } + case 0xc0: // Ampere Computing + switch( part ) + { + case 0xac3: return " Ampere1"; + default: break; + } default: break; } sprintf( buf, " 0x%x", part ); @@ -267,6 +296,15 @@ static const char* DecodeIosDevice( const char* id ) "iPhone12,3", "iPhone 11 Pro", "iPhone12,5", "iPhone 11 Pro Max", "iPhone12,8", "iPhone SE 2nd Gen", + "iPhone13,1", "iPhone 12 Mini", + "iPhone13,2", "iPhone 12", + "iPhone13,3", "iPhone 12 Pro", + "iPhone13,4", "iPhone 12 Pro Max", + "iPhone14,2", "iPhone 13 Pro", + "iPhone14,3", "iPhone 13 Pro Max", + "iPhone14,4", "iPhone 13 Mini", + "iPhone14,5", "iPhone 13", + "iPhone14,6", "iPhone SE 3rd Gen", "iPad1,1", "iPad (A1219/A1337)", "iPad2,1", "iPad 2 (A1395)", "iPad2,2", "iPad 2 (A1396)", @@ -325,6 +363,20 @@ static const char* DecodeIosDevice( const char* id ) "iPad11,2", "iPad Mini 5th gen (A2124/A2125/A2126)", "iPad11,3", "iPad Air 3rd gen (A2152)", "iPad11,4", "iPad Air 3rd gen (A2123/A2153/A2154)", + "iPad11,6", "iPad 8th gen (WiFi)", + "iPad11,7", "iPad 8th gen (WiFi+Cellular)", + "iPad13,1", "iPad Air 4th gen (WiFi)", + "iPad13,2", "iPad Air 4th gen (WiFi+Cellular)", + "iPad13,4", "iPad Pro 11\" 3rd gen", + "iPad13,5", "iPad Pro 11\" 3rd gen", + "iPad13,6", "iPad Pro 11\" 3rd gen", + "iPad13,7", "iPad Pro 11\" 3rd gen", + "iPad13,8", "iPad Pro 12.9\" 5th gen", + "iPad13,9", "iPad Pro 12.9\" 5th gen", + "iPad13,10", "iPad Pro 12.9\" 5th gen", + "iPad13,11", "iPad Pro 12.9\" 5th gen", + "iPad13,16", "iPad Air 5th Gen (WiFi)", + "iPad13,17", "iPad Air 5th Gen (WiFi+Cellular)", "iPod1,1", "iPod Touch", "iPod2,1", "iPod Touch 2nd gen", "iPod3,1", "iPod Touch 3rd gen", diff --git a/Source/ThirdParty/tracy/client/TracyCallstack.cpp b/Source/ThirdParty/tracy/client/TracyCallstack.cpp index 10698cb19..ca19a543b 100644 --- a/Source/ThirdParty/tracy/client/TracyCallstack.cpp +++ b/Source/ThirdParty/tracy/client/TracyCallstack.cpp @@ -1,9 +1,12 @@ +#include #include #include #include #include "TracyCallstack.hpp" #include "TracyFastVector.hpp" +#include "TracyStringHelpers.hpp" #include "../common/TracyAlloc.hpp" +#include "TracyDebug.hpp" #ifdef TRACY_HAS_CALLSTACK @@ -13,6 +16,7 @@ # endif # include # include +# include # ifdef _MSC_VER # pragma warning( push ) # pragma warning( disable : 4091 ) @@ -23,8 +27,11 @@ # endif #elif TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 3 || TRACY_HAS_CALLSTACK == 4 || TRACY_HAS_CALLSTACK == 6 # include "../libbacktrace/backtrace.hpp" +# include # 
include # include +# include +# include "TracyFastVector.hpp" #elif TRACY_HAS_CALLSTACK == 5 # include # include @@ -45,31 +52,50 @@ extern "C" }; #endif +#if TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 3 || TRACY_HAS_CALLSTACK == 4 || TRACY_HAS_CALLSTACK == 5 || TRACY_HAS_CALLSTACK == 6 +// If you want to use your own demangling functionality (e.g. for another language), +// define TRACY_DEMANGLE and provide your own implementation of the __tracy_demangle +// function. The input parameter is a function name. The demangle function must +// identify whether this name is mangled, and fail if it is not. Failure is indicated +// by returning nullptr. If demangling succeeds, a pointer to the C string containing +// demangled function must be returned. The demangling function is responsible for +// managing memory for this string. It is expected that it will be internally reused. +// When a call to ___tracy_demangle is made, previous contents of the string memory +// do not need to be preserved. Function may return string of any length, but the +// profiler can choose to truncate it. +extern "C" const char* ___tracy_demangle( const char* mangled ); + +#ifndef TRACY_DEMANGLE +constexpr size_t ___tracy_demangle_buffer_len = 1024*1024; +char* ___tracy_demangle_buffer; + +void ___tracy_init_demangle_buffer() +{ + ___tracy_demangle_buffer = (char*)tracy::tracy_malloc( ___tracy_demangle_buffer_len ); +} + +void ___tracy_free_demangle_buffer() +{ + tracy::tracy_free( ___tracy_demangle_buffer ); +} + +extern "C" const char* ___tracy_demangle( const char* mangled ) +{ + if( !mangled || mangled[0] != '_' ) return nullptr; + if( strlen( mangled ) > ___tracy_demangle_buffer_len ) return nullptr; + int status; + size_t len = ___tracy_demangle_buffer_len; + return abi::__cxa_demangle( mangled, ___tracy_demangle_buffer, &len, &status ); +} +#endif +#endif + namespace tracy { -static inline char* CopyString( const char* src, size_t sz ) -{ - assert( strlen( src ) == sz ); - auto dst = (char*)tracy_malloc( sz + 1 ); - memcpy( dst, src, sz ); - dst[sz] = '\0'; - return dst; -} - -static inline char* CopyString( const char* src ) -{ - const auto sz = strlen( src ); - auto dst = (char*)tracy_malloc( sz + 1 ); - memcpy( dst, src, sz ); - dst[sz] = '\0'; - return dst; -} - - #if TRACY_HAS_CALLSTACK == 1 -enum { MaxCbTrace = 16 }; +enum { MaxCbTrace = 64 }; enum { MaxNameSize = 8*1024 }; int cb_num; @@ -77,24 +103,19 @@ CallstackEntry cb_data[MaxCbTrace]; extern "C" { - typedef unsigned long (__stdcall *t_RtlWalkFrameChain)( void**, unsigned long, unsigned long ); - t_RtlWalkFrameChain RtlWalkFrameChain = 0; + typedef DWORD (__stdcall *t_SymAddrIncludeInlineTrace)( HANDLE hProcess, DWORD64 Address ); + typedef BOOL (__stdcall *t_SymQueryInlineTrace)( HANDLE hProcess, DWORD64 StartAddress, DWORD StartContext, DWORD64 StartRetAddress, DWORD64 CurAddress, LPDWORD CurContext, LPDWORD CurFrameIndex ); + typedef BOOL (__stdcall *t_SymFromInlineContext)( HANDLE hProcess, DWORD64 Address, ULONG InlineContext, PDWORD64 Displacement, PSYMBOL_INFO Symbol ); + typedef BOOL (__stdcall *t_SymGetLineFromInlineContext)( HANDLE hProcess, DWORD64 qwAddr, ULONG InlineContext, DWORD64 qwModuleBaseAddress, PDWORD pdwDisplacement, PIMAGEHLP_LINE64 Line64 ); + + TRACY_API ___tracy_t_RtlWalkFrameChain ___tracy_RtlWalkFrameChain = 0; + t_SymAddrIncludeInlineTrace _SymAddrIncludeInlineTrace = 0; + t_SymQueryInlineTrace _SymQueryInlineTrace = 0; + t_SymFromInlineContext _SymFromInlineContext = 0; + t_SymGetLineFromInlineContext 
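Note: the comment block introduced above documents the new demangling hook: a build can define TRACY_DEMANGLE and supply its own ___tracy_demangle(), returning nullptr for names it does not recognize and otherwise a pointer to an internally managed, reusable string. A hedged sketch of such an override follows; the "mylang$" prefix and the buffer size are invented purely for illustration:

    #include <cstring>

    // Compile the Tracy client with TRACY_DEMANGLE defined so the built-in
    // __cxa_demangle-based fallback shown above is not compiled in.
    extern "C" const char* ___tracy_demangle( const char* mangled )
    {
        static char buf[64*1024];   // reused between calls, as the contract above allows
        if( !mangled || strncmp( mangled, "mylang$", 7 ) != 0 ) return nullptr;  // not ours -> fail
        strncpy( buf, mangled + 7, sizeof( buf ) - 1 );
        buf[sizeof( buf ) - 1] = '\0';
        return buf;                 // the profiler may truncate this string
    }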
_SymGetLineFromInlineContext = 0; } -#if defined __MINGW32__ && API_VERSION_NUMBER < 12 -extern "C" { -// Actual required API_VERSION_NUMBER is unknown because it is undocumented. These functions are not present in at least v11. -DWORD IMAGEAPI SymAddrIncludeInlineTrace(HANDLE hProcess, DWORD64 Address); -BOOL IMAGEAPI SymQueryInlineTrace(HANDLE hProcess, DWORD64 StartAddress, DWORD StartContext, DWORD64 StartRetAddress, - DWORD64 CurAddress, LPDWORD CurContext, LPDWORD CurFrameIndex); -BOOL IMAGEAPI SymFromInlineContext(HANDLE hProcess, DWORD64 Address, ULONG InlineContext, PDWORD64 Displacement, - PSYMBOL_INFO Symbol); -BOOL IMAGEAPI SymGetLineFromInlineContext(HANDLE hProcess, DWORD64 qwAddr, ULONG InlineContext, - DWORD64 qwModuleBaseAddress, PDWORD pdwDisplacement, PIMAGEHLP_LINE64 Line64); -}; -#endif -#ifndef __CYGWIN__ struct ModuleCache { uint64_t start; @@ -103,11 +124,30 @@ struct ModuleCache }; static FastVector* s_modCache; -#endif + + +struct KernelDriver +{ + uint64_t addr; + const char* mod; + const char* path; +}; + +KernelDriver* s_krnlCache = nullptr; +size_t s_krnlCacheCnt; + + +void InitCallstackCritical() +{ + ___tracy_RtlWalkFrameChain = (___tracy_t_RtlWalkFrameChain)GetProcAddress( GetModuleHandleA( "ntdll.dll" ), "RtlWalkFrameChain" ); +} void InitCallstack() { - RtlWalkFrameChain = (t_RtlWalkFrameChain)GetProcAddress( GetModuleHandleA( "ntdll.dll" ), "RtlWalkFrameChain" ); + _SymAddrIncludeInlineTrace = (t_SymAddrIncludeInlineTrace)GetProcAddress( GetModuleHandleA( "dbghelp.dll" ), "SymAddrIncludeInlineTrace" ); + _SymQueryInlineTrace = (t_SymQueryInlineTrace)GetProcAddress( GetModuleHandleA( "dbghelp.dll" ), "SymQueryInlineTrace" ); + _SymFromInlineContext = (t_SymFromInlineContext)GetProcAddress( GetModuleHandleA( "dbghelp.dll" ), "SymFromInlineContext" ); + _SymGetLineFromInlineContext = (t_SymGetLineFromInlineContext)GetProcAddress( GetModuleHandleA( "dbghelp.dll" ), "SymGetLineFromInlineContext" ); #ifdef TRACY_DBGHELP_LOCK DBGHELP_INIT; @@ -118,14 +158,63 @@ void InitCallstack() SymInitialize( GetCurrentProcess(), nullptr, true ); SymSetOptions( SYMOPT_LOAD_LINES ); -#ifndef __CYGWIN__ - HMODULE mod[1024]; DWORD needed; - HANDLE proc = GetCurrentProcess(); + LPVOID dev[4096]; + if( EnumDeviceDrivers( dev, sizeof(dev), &needed ) != 0 ) + { + char windir[MAX_PATH]; + if( !GetWindowsDirectoryA( windir, sizeof( windir ) ) ) memcpy( windir, "c:\\windows", 11 ); + const auto windirlen = strlen( windir ); + + const auto sz = needed / sizeof( LPVOID ); + s_krnlCache = (KernelDriver*)tracy_malloc( sizeof(KernelDriver) * sz ); + int cnt = 0; + for( size_t i=0; i", 2 ); + s_krnlCache[cnt] = KernelDriver { (uint64_t)dev[i], buf }; + + const auto len = GetDeviceDriverFileNameA( dev[i], fn, sizeof( fn ) ); + if( len != 0 ) + { + char full[MAX_PATH]; + char* path = fn; + + if( memcmp( fn, "\\SystemRoot\\", 12 ) == 0 ) + { + memcpy( full, windir, windirlen ); + strcpy( full + windirlen, fn + 11 ); + path = full; + } + + SymLoadModuleEx( GetCurrentProcess(), nullptr, path, nullptr, (DWORD64)dev[i], 0, nullptr, 0 ); + + const auto psz = strlen( path ); + auto pptr = (char*)tracy_malloc_fast( psz+1 ); + memcpy( pptr, path, psz ); + pptr[psz] = '\0'; + s_krnlCache[cnt].path = pptr; + } + + cnt++; + } + } + s_krnlCacheCnt = cnt; + std::sort( s_krnlCache, s_krnlCache + s_krnlCacheCnt, []( const KernelDriver& lhs, const KernelDriver& rhs ) { return lhs.addr > rhs.addr; } ); + } s_modCache = (FastVector*)tracy_malloc( sizeof( FastVector ) ); new(s_modCache) FastVector( 512 ); + 
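Note: s_krnlCache is filled here and sorted by descending start address, which is why the later lookups (GetKernelModulePath(), GetModuleNameAndPrepareSymbols()) can use std::lower_bound with a reversed comparator to find the driver owning an address. A standalone sketch of that pattern, using a hypothetical Entry type in place of the real KernelDriver record:

    #include <algorithm>
    #include <cstdint>

    struct Entry { uint64_t addr; const char* name; };

    // begin..end must be sorted by descending addr; returns the entry with the
    // highest start address that is <= query, i.e. the module the address falls into.
    static const Entry* FindByAddress( const Entry* begin, const Entry* end, uint64_t query )
    {
        auto it = std::lower_bound( begin, end, query,
            []( const Entry& lhs, uint64_t rhs ) { return lhs.addr > rhs; } );
        return it == end ? nullptr : &*it;
    }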
HANDLE proc = GetCurrentProcess(); + HMODULE mod[1024]; if( EnumProcessModules( proc, mod, sizeof( mod ), &needed ) != 0 ) { const auto sz = needed / sizeof( HMODULE ); @@ -146,7 +235,7 @@ void InitCallstack() auto cache = s_modCache->push_next(); cache->start = base; cache->end = base + info.SizeOfImage; - cache->name = (char*)tracy_malloc( namelen+3 ); + cache->name = (char*)tracy_malloc_fast( namelen+3 ); cache->name[0] = '['; memcpy( cache->name+1, ptr, namelen ); cache->name[namelen+1] = ']'; @@ -155,19 +244,14 @@ void InitCallstack() } } } -#endif #ifdef TRACY_DBGHELP_LOCK DBGHELP_UNLOCK; #endif } -TRACY_API uintptr_t* CallTrace( int depth ) +void EndCallstack() { - auto trace = (uintptr_t*)tracy_malloc( ( 1 + depth ) * sizeof( uintptr_t ) ); - const auto num = RtlWalkFrameChain( (void**)( trace + 1 ), depth, 0 ); - *trace = num; - return trace; } const char* DecodeCallstackPtrFast( uint64_t ptr ) @@ -198,11 +282,30 @@ const char* DecodeCallstackPtrFast( uint64_t ptr ) return ret; } -static const char* GetModuleName( uint64_t addr ) +const char* GetKernelModulePath( uint64_t addr ) { - if( ( addr & 0x8000000000000000 ) != 0 ) return "[kernel]"; + assert( addr >> 63 != 0 ); + if( !s_krnlCache ) return nullptr; + auto it = std::lower_bound( s_krnlCache, s_krnlCache + s_krnlCacheCnt, addr, []( const KernelDriver& lhs, const uint64_t& rhs ) { return lhs.addr > rhs; } ); + if( it == s_krnlCache + s_krnlCacheCnt ) return nullptr; + return it->path; +} + +static const char* GetModuleNameAndPrepareSymbols( uint64_t addr ) +{ + if( ( addr >> 63 ) != 0 ) + { + if( s_krnlCache ) + { + auto it = std::lower_bound( s_krnlCache, s_krnlCache + s_krnlCacheCnt, addr, []( const KernelDriver& lhs, const uint64_t& rhs ) { return lhs.addr > rhs; } ); + if( it != s_krnlCache + s_krnlCacheCnt ) + { + return it->mod; + } + } + return ""; + } -#ifndef __CYGWIN__ for( auto& v : *s_modCache ) { if( addr >= v.start && addr < v.end ) @@ -215,6 +318,7 @@ static const char* GetModuleName( uint64_t addr ) DWORD needed; HANDLE proc = GetCurrentProcess(); + InitRpmalloc(); if( EnumProcessModules( proc, mod, sizeof( mod ), &needed ) != 0 ) { const auto sz = needed / sizeof( HMODULE ); @@ -230,6 +334,8 @@ static const char* GetModuleName( uint64_t addr ) const auto res = GetModuleFileNameA( mod[i], name, 1021 ); if( res > 0 ) { + // since this is the first time we encounter this module, load its symbols (needed for modules loaded after SymInitialize) + SymLoadModuleEx(proc, NULL, name, NULL, (DWORD64)info.lpBaseOfDll, info.SizeOfImage, NULL, 0); auto ptr = name + res; while( ptr > name && *ptr != '\\' && *ptr != '/' ) ptr--; if( ptr > name ) ptr++; @@ -237,7 +343,7 @@ static const char* GetModuleName( uint64_t addr ) auto cache = s_modCache->push_next(); cache->start = base; cache->end = base + info.SizeOfImage; - cache->name = (char*)tracy_malloc( namelen+3 ); + cache->name = (char*)tracy_malloc_fast( namelen+3 ); cache->name[0] = '['; memcpy( cache->name+1, ptr, namelen ); cache->name[namelen+1] = ']'; @@ -248,8 +354,6 @@ static const char* GetModuleName( uint64_t addr ) } } } -#endif - return "[unknown]"; } @@ -263,69 +367,21 @@ CallstackSymbolData DecodeSymbolAddress( uint64_t ptr ) DBGHELP_LOCK; #endif const auto res = SymGetLineFromAddr64( GetCurrentProcess(), ptr, &displacement, &line ); -#ifdef TRACY_DBGHELP_LOCK - DBGHELP_UNLOCK; -#endif - if( res == 0 ) + if( res == 0 || line.LineNumber >= 0xF00000 ) { sym.file = "[unknown]"; sym.line = 0; + sym.needFree = false; } else { - sym.file = line.FileName; + sym.file 
= CopyString( line.FileName ); sym.line = line.LineNumber; - } - sym.needFree = false; - return sym; -} - -CallstackSymbolData DecodeCodeAddress( uint64_t ptr ) -{ - CallstackSymbolData sym; - const auto proc = GetCurrentProcess(); - bool done = false; - - IMAGEHLP_LINE64 line; - DWORD displacement = 0; - line.SizeOfStruct = sizeof(IMAGEHLP_LINE64); - -#ifdef TRACY_DBGHELP_LOCK - DBGHELP_LOCK; -#endif -#ifndef __CYGWIN__ - DWORD inlineNum = SymAddrIncludeInlineTrace( proc, ptr ); - DWORD ctx = 0; - DWORD idx; - BOOL doInline = FALSE; - if( inlineNum != 0 ) doInline = SymQueryInlineTrace( proc, ptr, 0, ptr, ptr, &ctx, &idx ); - if( doInline ) - { - if( SymGetLineFromInlineContext( proc, ptr, ctx, 0, &displacement, &line ) != 0 ) - { - sym.file = line.FileName; - sym.line = line.LineNumber; - done = true; - } - } -#endif - if( !done ) - { - if( SymGetLineFromAddr64( proc, ptr, &displacement, &line ) == 0 ) - { - sym.file = "[unknown]"; - sym.line = 0; - } - else - { - sym.file = line.FileName; - sym.line = line.LineNumber; - } + sym.needFree = true; } #ifdef TRACY_DBGHELP_LOCK DBGHELP_UNLOCK; #endif - sym.needFree = false; return sym; } @@ -333,16 +389,25 @@ CallstackEntryData DecodeCallstackPtr( uint64_t ptr ) { int write; const auto proc = GetCurrentProcess(); + InitRpmalloc(); + #ifdef TRACY_DBGHELP_LOCK DBGHELP_LOCK; #endif -#ifndef __CYGWIN__ - DWORD inlineNum = SymAddrIncludeInlineTrace( proc, ptr ); - if( inlineNum > MaxCbTrace - 1 ) inlineNum = MaxCbTrace - 1; - DWORD ctx = 0; - DWORD idx; + + const auto moduleName = GetModuleNameAndPrepareSymbols(ptr); + +#if !defined TRACY_NO_CALLSTACK_INLINES BOOL doInline = FALSE; - if( inlineNum != 0 ) doInline = SymQueryInlineTrace( proc, ptr, 0, ptr, ptr, &ctx, &idx ); + DWORD ctx = 0; + DWORD inlineNum = 0; + if( _SymAddrIncludeInlineTrace ) + { + inlineNum = _SymAddrIncludeInlineTrace( proc, ptr ); + if( inlineNum > MaxCbTrace - 1 ) inlineNum = MaxCbTrace - 1; + DWORD idx; + if( inlineNum != 0 ) doInline = _SymQueryInlineTrace( proc, ptr, 0, ptr, ptr, &ctx, &idx ); + } if( doInline ) { write = inlineNum; @@ -360,7 +425,6 @@ CallstackEntryData DecodeCallstackPtr( uint64_t ptr ) si->SizeOfStruct = sizeof( SYMBOL_INFO ); si->MaxNameLen = MaxNameSize; - const auto moduleName = GetModuleName( ptr ); const auto symValid = SymFromAddr( proc, ptr, nullptr, si ) != 0; IMAGEHLP_LINE64 line; @@ -369,7 +433,8 @@ CallstackEntryData DecodeCallstackPtr( uint64_t ptr ) { const char* filename; - if( SymGetLineFromAddr64( proc, ptr, &displacement, &line ) == 0 ) + const auto res = SymGetLineFromAddr64( proc, ptr, &displacement, &line ); + if( res == 0 || line.LineNumber >= 0xF00000 ) { filename = "[unknown]"; cb_data[write].line = 0; @@ -380,8 +445,8 @@ CallstackEntryData DecodeCallstackPtr( uint64_t ptr ) cb_data[write].line = line.LineNumber; } - cb_data[write].name = symValid ? CopyString( si->Name, si->NameLen ) : CopyString( moduleName ); - cb_data[write].file = CopyString( filename ); + cb_data[write].name = symValid ? CopyStringFast( si->Name, si->NameLen ) : CopyStringFast( moduleName ); + cb_data[write].file = CopyStringFast( filename ); if( symValid ) { cb_data[write].symLen = si->Size; @@ -394,15 +459,15 @@ CallstackEntryData DecodeCallstackPtr( uint64_t ptr ) } } -#ifndef __CYGWIN__ +#if !defined TRACY_NO_CALLSTACK_INLINES if( doInline ) { for( DWORD i=0; iName, si->NameLen ) : CopyString( moduleName ); - cb.file = CopyString( filename ); + cb.name = symInlineValid ? 
CopyStringFast( si->Name, si->NameLen ) : CopyStringFast( moduleName ); + cb.file = CopyStringFast( filename ); if( symInlineValid ) { cb.symLen = si->Size; @@ -439,54 +504,285 @@ CallstackEntryData DecodeCallstackPtr( uint64_t ptr ) #elif TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 3 || TRACY_HAS_CALLSTACK == 4 || TRACY_HAS_CALLSTACK == 6 -enum { MaxCbTrace = 16 }; +enum { MaxCbTrace = 64 }; struct backtrace_state* cb_bts; int cb_num; CallstackEntry cb_data[MaxCbTrace]; int cb_fixup; -void InitCallstack() -{ - cb_bts = backtrace_create_state( nullptr, 0, nullptr, nullptr ); -} +#ifdef TRACY_DEBUGINFOD +debuginfod_client* s_debuginfod; -static int FastCallstackDataCb( void* data, uintptr_t pc, uintptr_t lowaddr, const char* fn, int lineno, const char* function ) +struct DebugInfo { - if( function ) + uint8_t* buildid; + size_t buildid_size; + char* filename; + int fd; +}; + +FastVector s_di_known( 16 ); +#endif + +#ifdef __linux +struct KernelSymbol +{ + uint64_t addr; + const char* name; + const char* mod; +}; + +KernelSymbol* s_kernelSym = nullptr; +size_t s_kernelSymCnt; + +static void InitKernelSymbols() +{ + FILE* f = fopen( "/proc/kallsyms", "rb" ); + if( !f ) return; + tracy::FastVector tmpSym( 1024 ); + size_t linelen = 16 * 1024; // linelen must be big enough to prevent reallocs in getline() + auto linebuf = (char*)tracy_malloc( linelen ); + ssize_t sz; + while( ( sz = getline( &linebuf, &linelen, f ) ) != -1 ) { - strcpy( (char*)data, function ); + auto ptr = linebuf; + uint64_t addr = 0; + while( *ptr != ' ' ) + { + auto v = *ptr; + if( v >= '0' && v <= '9' ) + { + v -= '0'; + } + else if( v >= 'a' && v <= 'f' ) + { + v -= 'a'; + v += 10; + } + else if( v >= 'A' && v <= 'F' ) + { + v -= 'A'; + v += 10; + } + else + { + assert( false ); + } + assert( ( v & ~0xF ) == 0 ); + addr <<= 4; + addr |= v; + ptr++; + } + if( addr == 0 ) continue; + ptr++; + if( *ptr != 'T' && *ptr != 't' ) continue; + ptr += 2; + const auto namestart = ptr; + while( *ptr != '\t' && *ptr != '\n' ) ptr++; + const auto nameend = ptr; + const char* modstart = nullptr; + const char* modend; + if( *ptr == '\t' ) + { + ptr += 2; + modstart = ptr; + while( *ptr != ']' ) ptr++; + modend = ptr; + } + + auto strname = (char*)tracy_malloc_fast( nameend - namestart + 1 ); + memcpy( strname, namestart, nameend - namestart ); + strname[nameend-namestart] = '\0'; + + char* strmod = nullptr; + if( modstart ) + { + strmod = (char*)tracy_malloc_fast( modend - modstart + 1 ); + memcpy( strmod, modstart, modend - modstart ); + strmod[modend-modstart] = '\0'; + } + + auto sym = tmpSym.push_next(); + sym->addr = addr; + sym->name = strname; + sym->mod = strmod; + } + tracy_free_fast( linebuf ); + fclose( f ); + if( tmpSym.empty() ) return; + + std::sort( tmpSym.begin(), tmpSym.end(), []( const KernelSymbol& lhs, const KernelSymbol& rhs ) { return lhs.addr > rhs.addr; } ); + s_kernelSymCnt = tmpSym.size(); + s_kernelSym = (KernelSymbol*)tracy_malloc_fast( sizeof( KernelSymbol ) * s_kernelSymCnt ); + memcpy( s_kernelSym, tmpSym.data(), sizeof( KernelSymbol ) * s_kernelSymCnt ); + TracyDebug( "Loaded %zu kernel symbols\n", s_kernelSymCnt ); +} +#endif + +char* NormalizePath( const char* path ) +{ + if( path[0] != '/' ) return nullptr; + + const char* ptr = path; + const char* end = path; + while( *end ) end++; + + char* res = (char*)tracy_malloc( end - ptr + 1 ); + size_t rsz = 0; + + while( ptr < end ) + { + const char* next = ptr; + while( next < end && *next != '/' ) next++; + size_t lsz = next - ptr; + switch( lsz ) + { 
+ case 2: + if( memcmp( ptr, "..", 2 ) == 0 ) + { + const char* back = res + rsz - 1; + while( back > res && *back != '/' ) back--; + rsz = back - res; + ptr = next + 1; + continue; + } + break; + case 1: + if( *ptr == '.' ) + { + ptr = next + 1; + continue; + } + break; + case 0: + ptr = next + 1; + continue; + } + if( rsz != 1 ) res[rsz++] = '/'; + memcpy( res+rsz, ptr, lsz ); + rsz += lsz; + ptr = next + 1; + } + + if( rsz == 0 ) + { + memcpy( res, "/", 2 ); } else { - const char* symname = nullptr; - auto vptr = (void*)pc; - Dl_info dlinfo; - if( dladdr( vptr, &dlinfo ) ) - { - symname = dlinfo.dli_sname; - } - if( symname ) - { - strcpy( (char*)data, symname ); - } - else - { - *(char*)data = '\0'; - } + res[rsz] = '\0'; } - return 1; + return res; } -static void FastCallstackErrorCb( void* data, const char* /*msg*/, int /*errnum*/ ) +void InitCallstackCritical() { - *(char*)data = '\0'; +} + +void InitCallstack() +{ + cb_bts = backtrace_create_state( nullptr, 0, nullptr, nullptr ); + ___tracy_init_demangle_buffer(); + +#ifdef __linux + InitKernelSymbols(); +#endif +#ifdef TRACY_DEBUGINFOD + s_debuginfod = debuginfod_begin(); +#endif +} + +#ifdef TRACY_DEBUGINFOD +void ClearDebugInfoVector( FastVector& vec ) +{ + for( auto& v : vec ) + { + tracy_free( v.buildid ); + tracy_free( v.filename ); + if( v.fd >= 0 ) close( v.fd ); + } + vec.clear(); +} + +DebugInfo* FindDebugInfo( FastVector& vec, const uint8_t* buildid_data, size_t buildid_size ) +{ + for( auto& v : vec ) + { + if( v.buildid_size == buildid_size && memcmp( v.buildid, buildid_data, buildid_size ) == 0 ) + { + return &v; + } + } + return nullptr; +} + +int GetDebugInfoDescriptor( const char* buildid_data, size_t buildid_size, const char* filename ) +{ + auto buildid = (uint8_t*)buildid_data; + auto it = FindDebugInfo( s_di_known, buildid, buildid_size ); + if( it ) return it->fd >= 0 ? dup( it->fd ) : -1; + + int fd = debuginfod_find_debuginfo( s_debuginfod, buildid, buildid_size, nullptr ); + it = s_di_known.push_next(); + it->buildid_size = buildid_size; + it->buildid = (uint8_t*)tracy_malloc( buildid_size ); + memcpy( it->buildid, buildid, buildid_size ); + const auto fnsz = strlen( filename ) + 1; + it->filename = (char*)tracy_malloc( fnsz ); + memcpy( it->filename, filename, fnsz ); + it->fd = fd >= 0 ? 
fd : -1; + TracyDebug( "DebugInfo descriptor query: %i, fn: %s\n", fd, filename ); + return it->fd; +} + +const uint8_t* GetBuildIdForImage( const char* image, size_t& size ) +{ + assert( image ); + for( auto& v : s_di_known ) + { + if( strcmp( image, v.filename ) == 0 ) + { + size = v.buildid_size; + return v.buildid; + } + } + return nullptr; +} + +debuginfod_client* GetDebuginfodClient() +{ + return s_debuginfod; +} +#endif + +void EndCallstack() +{ + ___tracy_free_demangle_buffer(); +#ifdef TRACY_DEBUGINFOD + ClearDebugInfoVector( s_di_known ); + debuginfod_end( s_debuginfod ); +#endif } const char* DecodeCallstackPtrFast( uint64_t ptr ) { static char ret[1024]; - backtrace_pcinfo( cb_bts, ptr, FastCallstackDataCb, FastCallstackErrorCb, ret ); + auto vptr = (void*)ptr; + const char* symname = nullptr; + Dl_info dlinfo; + if( dladdr( vptr, &dlinfo ) && dlinfo.dli_sname ) + { + symname = dlinfo.dli_sname; + } + if( symname ) + { + strcpy( ret, symname ); + } + else + { + *ret = '\0'; + } return ret; } @@ -501,7 +797,8 @@ static int SymbolAddressDataCb( void* data, uintptr_t pc, uintptr_t lowaddr, con } else { - sym.file = CopyString( fn ); + sym.file = NormalizePath( fn ); + if( !sym.file ) sym.file = CopyString( fn ); sym.line = lineno; sym.needFree = true; } @@ -524,16 +821,8 @@ CallstackSymbolData DecodeSymbolAddress( uint64_t ptr ) return sym; } -CallstackSymbolData DecodeCodeAddress( uint64_t ptr ) -{ - return DecodeSymbolAddress( ptr ); -} - static int CallstackDataCb( void* /*data*/, uintptr_t pc, uintptr_t lowaddr, const char* fn, int lineno, const char* function ) { - enum { DemangleBufLen = 64*1024 }; - char demangled[DemangleBufLen]; - cb_data[cb_num].symLen = 0; cb_data[cb_num].symAddr = (uint64_t)lowaddr; @@ -548,38 +837,30 @@ static int CallstackDataCb( void* /*data*/, uintptr_t pc, uintptr_t lowaddr, con { symname = dlinfo.dli_sname; symoff = (char*)pc - (char*)dlinfo.dli_saddr; - - if( symname && symname[0] == '_' ) - { - size_t len = DemangleBufLen; - int status; - abi::__cxa_demangle( symname, demangled, &len, &status ); - if( status == 0 ) - { - symname = demangled; - } - } + const char* demangled = ___tracy_demangle( symname ); + if( demangled ) symname = demangled; } if( !symname ) symname = "[unknown]"; if( symoff == 0 ) { - cb_data[cb_num].name = CopyString( symname ); + const auto len = std::min( strlen( symname ), std::numeric_limits::max() ); + cb_data[cb_num].name = CopyStringFast( symname, len ); } else { char buf[32]; const auto offlen = sprintf( buf, " + %td", symoff ); - const auto namelen = strlen( symname ); - auto name = (char*)tracy_malloc( namelen + offlen + 1 ); + const auto namelen = std::min( strlen( symname ), std::numeric_limits::max() - offlen ); + auto name = (char*)tracy_malloc_fast( namelen + offlen + 1 ); memcpy( name, symname, namelen ); memcpy( name + namelen, buf, offlen ); name[namelen + offlen] = '\0'; cb_data[cb_num].name = name; } - cb_data[cb_num].file = CopyString( "[unknown]" ); + cb_data[cb_num].file = CopyStringFast( "[unknown]" ); cb_data[cb_num].line = 0; } else @@ -591,20 +872,14 @@ static int CallstackDataCb( void* /*data*/, uintptr_t pc, uintptr_t lowaddr, con } else { - if( function[0] == '_' ) - { - size_t len = DemangleBufLen; - int status; - abi::__cxa_demangle( function, demangled, &len, &status ); - if( status == 0 ) - { - function = demangled; - } - } + const char* demangled = ___tracy_demangle( function ); + if( demangled ) function = demangled; } - cb_data[cb_num].name = CopyString( function ); - cb_data[cb_num].file 
= CopyString( fn ); + const auto len = std::min( strlen( function ), std::numeric_limits::max() ); + cb_data[cb_num].name = CopyStringFast( function, len ); + cb_data[cb_num].file = NormalizePath( fn ); + if( !cb_data[cb_num].file ) cb_data[cb_num].file = CopyStringFast( fn ); cb_data[cb_num].line = lineno; } @@ -622,12 +897,12 @@ static void CallstackErrorCb( void* /*data*/, const char* /*msg*/, int /*errnum* { for( int i=0; i 0 ); + InitRpmalloc(); + if( ptr >> 63 == 0 ) + { + cb_num = 0; + backtrace_pcinfo( cb_bts, ptr, CallstackDataCb, CallstackErrorCb, nullptr ); + assert( cb_num > 0 ); - backtrace_syminfo( cb_bts, ptr, SymInfoCallback, SymInfoError, nullptr ); + backtrace_syminfo( cb_bts, ptr, SymInfoCallback, SymInfoError, nullptr ); - const char* symloc = nullptr; - Dl_info dlinfo; - if( dladdr( (void*)ptr, &dlinfo ) ) symloc = dlinfo.dli_fname; + const char* symloc = nullptr; + Dl_info dlinfo; + if( dladdr( (void*)ptr, &dlinfo ) ) symloc = dlinfo.dli_fname; - return { cb_data, uint8_t( cb_num ), symloc ? symloc : "[unknown]" }; + return { cb_data, uint8_t( cb_num ), symloc ? symloc : "[unknown]" }; + } +#ifdef __linux + else if( s_kernelSym ) + { + auto it = std::lower_bound( s_kernelSym, s_kernelSym + s_kernelSymCnt, ptr, []( const KernelSymbol& lhs, const uint64_t& rhs ) { return lhs.addr > rhs; } ); + if( it != s_kernelSym + s_kernelSymCnt ) + { + cb_data[0].name = CopyStringFast( it->name ); + cb_data[0].file = CopyStringFast( "" ); + cb_data[0].line = 0; + cb_data[0].symLen = 0; + cb_data[0].symAddr = it->addr; + return { cb_data, 1, it->mod ? it->mod : "" }; + } + } +#endif + + cb_data[0].name = CopyStringFast( "[unknown]" ); + cb_data[0].file = CopyStringFast( "" ); + cb_data[0].line = 0; + cb_data[0].symLen = 0; + cb_data[0].symAddr = 0; + return { cb_data, 1, "" }; } #elif TRACY_HAS_CALLSTACK == 5 +void InitCallstackCritical() +{ +} + void InitCallstack() { + ___tracy_init_demangle_buffer(); +} + +void EndCallstack() +{ + ___tracy_free_demangle_buffer(); } const char* DecodeCallstackPtrFast( uint64_t ptr ) @@ -693,12 +1004,7 @@ CallstackSymbolData DecodeSymbolAddress( uint64_t ptr ) Dl_info dlinfo; if( dladdr( (void*)ptr, &dlinfo ) ) symloc = dlinfo.dli_fname; if( !symloc ) symloc = "[unknown]"; - return CallstackSymbolData { symloc, 0, false }; -} - -CallstackSymbolData DecodeCodeAddress( uint64_t ptr ) -{ - return DecodeSymbolAddress( ptr ); + return CallstackSymbolData { symloc, 0, false, 0 }; } CallstackEntryData DecodeCallstackPtr( uint64_t ptr ) @@ -706,7 +1012,6 @@ CallstackEntryData DecodeCallstackPtr( uint64_t ptr ) static CallstackEntry cb; cb.line = 0; - char* demangled = nullptr; const char* symname = nullptr; const char* symloc = nullptr; auto vptr = (void*)ptr; @@ -720,17 +1025,8 @@ CallstackEntryData DecodeCallstackPtr( uint64_t ptr ) symname = dlinfo.dli_sname; symoff = (char*)ptr - (char*)dlinfo.dli_saddr; symaddr = dlinfo.dli_saddr; - - if( symname && symname[0] == '_' ) - { - size_t len = 0; - int status; - demangled = abi::__cxa_demangle( symname, nullptr, &len, &status ); - if( status == 0 ) - { - symname = demangled; - } - } + const char* demangled = ___tracy_demangle( symname ); + if( demangled ) symname = demangled; } if( !symname ) symname = "[unknown]"; @@ -738,13 +1034,14 @@ CallstackEntryData DecodeCallstackPtr( uint64_t ptr ) if( symoff == 0 ) { - cb.name = CopyString( symname ); + const auto len = std::min( strlen( symname ), std::numeric_limits::max() ); + cb.name = CopyString( symname, len ); } else { char buf[32]; const auto offlen = 
sprintf( buf, " + %td", symoff ); - const auto namelen = strlen( symname ); + const auto namelen = std::min( strlen( symname ), std::numeric_limits::max() - offlen ); auto name = (char*)tracy_malloc( namelen + offlen + 1 ); memcpy( name, symname, namelen ); memcpy( name + namelen, buf, offlen ); @@ -756,8 +1053,6 @@ CallstackEntryData DecodeCallstackPtr( uint64_t ptr ) cb.symLen = 0; cb.symAddr = (uint64_t)symaddr; - if( demangled ) free( demangled ); - return { &cb, 1, symloc }; } diff --git a/Source/ThirdParty/tracy/client/TracyCallstack.h b/Source/ThirdParty/tracy/client/TracyCallstack.h index 87d8ce721..2c7ecad9f 100644 --- a/Source/ThirdParty/tracy/client/TracyCallstack.h +++ b/Source/ThirdParty/tracy/client/TracyCallstack.h @@ -1,28 +1,35 @@ #ifndef __TRACYCALLSTACK_H__ #define __TRACYCALLSTACK_H__ -#if !defined _WIN32 && !defined __CYGWIN__ -# include -#endif +#ifndef TRACY_NO_CALLSTACK -#if defined _WIN32 || defined __CYGWIN__ -# define TRACY_HAS_CALLSTACK 1 -#elif defined __ANDROID__ -# if !defined __arm__ || __ANDROID_API__ >= 21 -# define TRACY_HAS_CALLSTACK 2 -# else -# define TRACY_HAS_CALLSTACK 5 +# if !defined _WIN32 +# include # endif -#elif defined __linux -# if defined _GNU_SOURCE && defined __GLIBC__ -# define TRACY_HAS_CALLSTACK 3 -# else -# define TRACY_HAS_CALLSTACK 2 + +# if defined _WIN32 +# include "../common/TracyUwp.hpp" +# ifndef TRACY_UWP +# define TRACY_HAS_CALLSTACK 1 +# endif +# elif defined __ANDROID__ +# if !defined __arm__ || __ANDROID_API__ >= 21 +# define TRACY_HAS_CALLSTACK 2 +# else +# define TRACY_HAS_CALLSTACK 5 +# endif +# elif defined __linux +# if defined _GNU_SOURCE && defined __GLIBC__ +# define TRACY_HAS_CALLSTACK 3 +# else +# define TRACY_HAS_CALLSTACK 2 +# endif +# elif defined __APPLE__ +# define TRACY_HAS_CALLSTACK 4 +# elif defined BSD +# define TRACY_HAS_CALLSTACK 6 # endif -#elif defined __APPLE__ -# define TRACY_HAS_CALLSTACK 4 -#elif defined BSD -# define TRACY_HAS_CALLSTACK 6 + #endif #endif diff --git a/Source/ThirdParty/tracy/client/TracyCallstack.hpp b/Source/ThirdParty/tracy/client/TracyCallstack.hpp index 923eccc04..8cfede8fb 100644 --- a/Source/ThirdParty/tracy/client/TracyCallstack.hpp +++ b/Source/ThirdParty/tracy/client/TracyCallstack.hpp @@ -12,6 +12,10 @@ #ifdef TRACY_HAS_CALLSTACK +#ifdef TRACY_DEBUGINFOD +# include +#endif + #include #include @@ -25,6 +29,7 @@ struct CallstackSymbolData const char* file; uint32_t line; bool needFree; + uint64_t symAddr; }; struct CallstackEntry @@ -44,19 +49,33 @@ struct CallstackEntryData }; CallstackSymbolData DecodeSymbolAddress( uint64_t ptr ); -CallstackSymbolData DecodeCodeAddress( uint64_t ptr ); const char* DecodeCallstackPtrFast( uint64_t ptr ); CallstackEntryData DecodeCallstackPtr( uint64_t ptr ); void InitCallstack(); +void InitCallstackCritical(); +void EndCallstack(); +const char* GetKernelModulePath( uint64_t addr ); + +#ifdef TRACY_DEBUGINFOD +const uint8_t* GetBuildIdForImage( const char* image, size_t& size ); +debuginfod_client* GetDebuginfodClient(); +#endif #if TRACY_HAS_CALLSTACK == 1 -TRACY_API uintptr_t* CallTrace( int depth ); +extern "C" +{ + typedef unsigned long (__stdcall *___tracy_t_RtlWalkFrameChain)( void**, unsigned long, unsigned long ); + TRACY_API extern ___tracy_t_RtlWalkFrameChain ___tracy_RtlWalkFrameChain; +} static tracy_force_inline void* Callstack( int depth ) { assert( depth >= 1 && depth < 63 ); - return CallTrace( depth ); + auto trace = (uintptr_t*)tracy_malloc( ( 1 + depth ) * sizeof( uintptr_t ) ); + const auto num = 
___tracy_RtlWalkFrameChain( (void**)( trace + 1 ), depth, 0 ); + *trace = num; + return trace; } #elif TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 5 diff --git a/Source/ThirdParty/tracy/client/TracyCpuid.hpp b/Source/ThirdParty/tracy/client/TracyCpuid.hpp new file mode 100644 index 000000000..9820be00b --- /dev/null +++ b/Source/ThirdParty/tracy/client/TracyCpuid.hpp @@ -0,0 +1,12 @@ +#ifndef __TRACYCPUID_HPP__ +#define __TRACYCPUID_HPP__ + +// Prior to GCC 11 the cpuid.h header did not have any include guards and thus +// including it more than once would cause a compiler error due to symbol +// redefinitions. In order to support older GCC versions, we have to wrap this +// include between custom include guards to prevent this issue. +// See also https://github.com/wolfpld/tracy/issues/452 + +#include + +#endif diff --git a/Source/ThirdParty/tracy/client/TracyDebug.hpp b/Source/ThirdParty/tracy/client/TracyDebug.hpp new file mode 100644 index 000000000..8723356f4 --- /dev/null +++ b/Source/ThirdParty/tracy/client/TracyDebug.hpp @@ -0,0 +1,11 @@ +#ifndef __TRACYPRINT_HPP__ +#define __TRACYPRINT_HPP__ + +#ifdef TRACY_VERBOSE +# include +# define TracyDebug(...) fprintf( stderr, __VA_ARGS__ ); +#else +# define TracyDebug(...) +#endif + +#endif diff --git a/Source/ThirdParty/tracy/client/TracyFastVector.hpp b/Source/ThirdParty/tracy/client/TracyFastVector.hpp index fc4108016..8cf1651eb 100644 --- a/Source/ThirdParty/tracy/client/TracyFastVector.hpp +++ b/Source/ThirdParty/tracy/client/TracyFastVector.hpp @@ -101,7 +101,7 @@ private: const auto size = size_t( m_write - m_ptr ); T* ptr = (T*)tracy_malloc( sizeof( T ) * cap ); memcpy( ptr, m_ptr, size * sizeof( T ) ); - tracy_free( m_ptr ); + tracy_free_fast( m_ptr ); m_ptr = ptr; m_write = m_ptr + size; m_end = m_ptr + cap; diff --git a/Source/ThirdParty/tracy/client/TracyLock.hpp b/Source/ThirdParty/tracy/client/TracyLock.hpp index e513cdc5d..296a41ba1 100644 --- a/Source/ThirdParty/tracy/client/TracyLock.hpp +++ b/Source/ThirdParty/tracy/client/TracyLock.hpp @@ -98,7 +98,6 @@ public: auto item = Profiler::QueueSerial(); MemWrite( &item->hdr.type, QueueType::LockRelease ); - MemWrite( &item->lockRelease.thread, GetThreadHandle() ); MemWrite( &item->lockRelease.id, m_id ); MemWrite( &item->lockRelease.time, Profiler::GetTime() ); Profiler::QueueSerialFinish(); @@ -313,7 +312,6 @@ public: auto item = Profiler::QueueSerial(); MemWrite( &item->hdr.type, QueueType::LockRelease ); - MemWrite( &item->lockRelease.thread, GetThreadHandle() ); MemWrite( &item->lockRelease.id, m_id ); MemWrite( &item->lockRelease.time, Profiler::GetTime() ); Profiler::QueueSerialFinish(); @@ -395,9 +393,9 @@ public: auto item = Profiler::QueueSerial(); MemWrite( &item->hdr.type, QueueType::LockSharedRelease ); - MemWrite( &item->lockRelease.thread, GetThreadHandle() ); - MemWrite( &item->lockRelease.id, m_id ); - MemWrite( &item->lockRelease.time, Profiler::GetTime() ); + MemWrite( &item->lockReleaseShared.thread, GetThreadHandle() ); + MemWrite( &item->lockReleaseShared.id, m_id ); + MemWrite( &item->lockReleaseShared.time, Profiler::GetTime() ); Profiler::QueueSerialFinish(); } diff --git a/Source/ThirdParty/tracy/client/TracyProfiler.cpp b/Source/ThirdParty/tracy/client/TracyProfiler.cpp index 5462a3573..46a9d36e4 100644 --- a/Source/ThirdParty/tracy/client/TracyProfiler.cpp +++ b/Source/ThirdParty/tracy/client/TracyProfiler.cpp @@ -9,24 +9,18 @@ # include # include # include +# include "../common/TracyUwp.hpp" #else # include # include #endif -#ifdef 
__CYGWIN__ -# include -# include -# include -#endif - #ifdef _GNU_SOURCE # include #endif #ifdef __linux__ # include -# include # include # include # include @@ -44,6 +38,7 @@ #ifdef __ANDROID__ # include +# include # include # include # include @@ -62,16 +57,20 @@ #include #include "../common/TracyAlign.hpp" +#include "../common/TracyAlloc.hpp" #include "../common/TracySocket.hpp" #include "../common/TracySystem.hpp" +#include "../common/TracyYield.hpp" #include "tracy_rpmalloc.hpp" #include "TracyCallstack.hpp" +#include "TracyDebug.hpp" #include "TracyScoped.hpp" #include "TracyProfiler.hpp" #include "TracyThread.hpp" #include "TracyArmCpuTable.hpp" #include "TracySysTrace.hpp" + #ifdef TRACY_PORT # ifndef TRACY_DATA_PORT # define TRACY_DATA_PORT TRACY_PORT @@ -91,7 +90,7 @@ # endif #endif -#if defined _WIN32 || defined __CYGWIN__ +#if defined _WIN32 # include extern "C" typedef LONG (WINAPI *t_RtlGetVersion)( PRTL_OSVERSIONINFOW ); extern "C" typedef BOOL (WINAPI *t_GetLogicalProcessorInformationEx)( LOGICAL_PROCESSOR_RELATIONSHIP, PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, PDWORD ); @@ -104,55 +103,156 @@ extern "C" typedef BOOL (WINAPI *t_GetLogicalProcessorInformationEx)( LOGICAL_PR # include #endif -#if !defined _WIN32 && !defined __CYGWIN__ && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) -# include +#if !defined _WIN32 && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) +# include "TracyCpuid.hpp" #endif -#if !( ( ( defined _WIN32 || defined __CYGWIN__ ) && _WIN32_WINNT >= _WIN32_WINNT_VISTA ) || defined __linux__ ) +#if !( ( defined _WIN32 && _WIN32_WINNT >= _WIN32_WINNT_VISTA ) || defined __linux__ ) # include #endif namespace tracy { -namespace +#ifdef __ANDROID__ +// Implementation helpers of EnsureReadable(address). +// This is so far only needed on Android, where it is common for libraries to be mapped +// with only executable, not readable, permissions. Typical example (line from /proc/self/maps): +/* +746b63b000-746b6dc000 --xp 00042000 07:48 35 /apex/com.android.runtime/lib64/bionic/libc.so +*/ +// See https://github.com/wolfpld/tracy/issues/125 . +// To work around this, we parse /proc/self/maps and we use mprotect to set read permissions +// on any mappings that contain symbols addresses hit by HandleSymbolCodeQuery. + +namespace { +// Holds some information about a single memory mapping. +struct MappingInfo { + // Start of address range. Inclusive. + uintptr_t start_address; + // End of address range. Exclusive, so the mapping is the half-open interval + // [start, end) and its length in bytes is `end - start`. As in /proc/self/maps. + uintptr_t end_address; + // Read/Write/Executable permissions. + bool perm_r, perm_w, perm_x; +}; +} // anonymous namespace + + // Internal implementation helper for LookUpMapping(address). + // + // Parses /proc/self/maps returning a vector. + // /proc/self/maps is assumed to be sorted by ascending address, so the resulting + // vector is sorted by ascending address too. 
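Note: the helpers that follow implement the EnsureReadable(address) scheme described in the comments above. A sketch of the intended call pattern, assuming the caller wants to copy instruction bytes for a symbol-code query; CopySymbolCode is a hypothetical wrapper, and the real caller (HandleSymbolCodeQuery) is outside this hunk:

    #include <cstdint>
    #include <cstring>

    static bool CopySymbolCode( uint64_t symAddr, size_t symLen, char* out )
    {
        // Without this, reading from an execute-only mapping (common for Android
        // system libraries) would fault inside the profiler.
        if( !EnsureReadable( (uintptr_t)symAddr ) ) return false;
        memcpy( out, (const void*)(uintptr_t)symAddr, symLen );
        return true;
    }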
+static std::vector ParseMappings() { -# if ( defined _WIN32 || defined __CYGWIN__ ) && _WIN32_WINNT >= _WIN32_WINNT_VISTA - BOOL CALLBACK InitOnceCallback( PINIT_ONCE /*initOnce*/, PVOID /*Parameter*/, PVOID* /*Context*/) + std::vector result; + FILE* file = fopen( "/proc/self/maps", "r" ); + if( !file ) return result; + char line[1024]; + while( fgets( line, sizeof( line ), file ) ) { - rpmalloc_initialize(); - return TRUE; + uintptr_t start_addr; + uintptr_t end_addr; + if( sscanf( line, "%lx-%lx", &start_addr, &end_addr ) != 2 ) continue; + char* first_space = strchr( line, ' ' ); + if( !first_space ) continue; + char* perm = first_space + 1; + char* second_space = strchr( perm, ' ' ); + if( !second_space || second_space - perm != 4 ) continue; + result.emplace_back(); + auto& mapping = result.back(); + mapping.start_address = start_addr; + mapping.end_address = end_addr; + mapping.perm_r = perm[0] == 'r'; + mapping.perm_w = perm[1] == 'w'; + mapping.perm_x = perm[2] == 'x'; } - INIT_ONCE InitOnce = INIT_ONCE_STATIC_INIT; -# elif defined __linux__ - void InitOnceCallback() - { - rpmalloc_initialize(); - } - pthread_once_t once_control = PTHREAD_ONCE_INIT; -# else - void InitOnceCallback() - { - rpmalloc_initialize(); - } - std::once_flag once_flag; -# endif + fclose( file ); + return result; } -struct RPMallocInit +// Internal implementation helper for LookUpMapping(address). +// +// Takes as input an `address` and a known vector `mappings`, assumed to be +// sorted by increasing addresses, as /proc/self/maps seems to be. +// Returns a pointer to the MappingInfo describing the mapping that this +// address belongs to, or nullptr if the address isn't in `mappings`. +static MappingInfo* LookUpMapping(std::vector& mappings, uintptr_t address) { - RPMallocInit() - { -# if ( defined _WIN32 || defined __CYGWIN__ ) && _WIN32_WINNT >= _WIN32_WINNT_VISTA - InitOnceExecuteOnce( &InitOnce, InitOnceCallback, nullptr, nullptr ); -# elif defined __linux__ - pthread_once( &once_control, InitOnceCallback ); -# else - std::call_once( once_flag, InitOnceCallback ); -# endif - rpmalloc_thread_initialize(); + // Comparison function for std::lower_bound. Returns true if all addresses in `m1` + // are lower than `addr`. + auto Compare = []( const MappingInfo& m1, uintptr_t addr ) { + // '<=' because the address ranges are half-open intervals, [start, end). + return m1.end_address <= addr; + }; + auto iter = std::lower_bound( mappings.begin(), mappings.end(), address, Compare ); + if( iter == mappings.end() || iter->start_address > address) { + return nullptr; } -}; + return &*iter; +} + +// Internal implementation helper for EnsureReadable(address). +// +// Takes as input an `address` and returns a pointer to a MappingInfo +// describing the mapping that this address belongs to, or nullptr if +// the address isn't in any known mapping. +// +// This function is stateful and not reentrant (assumes to be called from +// only one thread). It holds a vector of mappings parsed from /proc/self/maps. +// +// Attempts to react to mappings changes by re-parsing /proc/self/maps. +static MappingInfo* LookUpMapping(uintptr_t address) +{ + // Static state managed by this function. Not constant, we mutate that state as + // we turn some mappings readable. Initially parsed once here, updated as needed below. + static std::vector s_mappings = ParseMappings(); + MappingInfo* mapping = LookUpMapping( s_mappings, address ); + if( mapping ) return mapping; + + // This address isn't in any known mapping. 
Try parsing again, maybe + // mappings changed. + s_mappings = ParseMappings(); + return LookUpMapping( s_mappings, address ); +} + +// Internal implementation helper for EnsureReadable(address). +// +// Attempts to make the specified `mapping` readable if it isn't already. +// Returns true if and only if the mapping is readable. +static bool EnsureReadable( MappingInfo& mapping ) +{ + if( mapping.perm_r ) + { + // The mapping is already readable. + return true; + } + int prot = PROT_READ; + if( mapping.perm_w ) prot |= PROT_WRITE; + if( mapping.perm_x ) prot |= PROT_EXEC; + if( mprotect( reinterpret_cast( mapping.start_address ), + mapping.end_address - mapping.start_address, prot ) == -1 ) + { + // Failed to make the mapping readable. Shouldn't happen, hasn't + // been observed yet. If it happened in practice, we should consider + // adding a bool to MappingInfo to track this to avoid retrying mprotect + // everytime on such mappings. + return false; + } + // The mapping is now readable. Update `mapping` so the next call will be fast. + mapping.perm_r = true; + return true; +} + +// Attempts to set the read permission on the entire mapping containing the +// specified address. Returns true if and only if the mapping is now readable. +static bool EnsureReadable( uintptr_t address ) +{ + MappingInfo* mapping = LookUpMapping(address); + return mapping && EnsureReadable( *mapping ); +} + +#endif // defined __ANDROID__ #ifndef TRACY_DELAYED_INIT @@ -168,7 +268,7 @@ struct ProducerWrapper struct ThreadHandleWrapper { - uint64_t val; + uint32_t val; }; #endif @@ -177,7 +277,7 @@ struct ThreadHandleWrapper static inline void CpuId( uint32_t* regs, uint32_t leaf ) { memset(regs, 0, sizeof(uint32_t) * 4); -#if defined _WIN32 || defined __CYGWIN__ +#if defined _WIN32 __cpuidex( (int*)regs, leaf, 0 ); #else __get_cpuid( leaf, regs, regs+1, regs+2, regs+3 ); @@ -186,7 +286,7 @@ static inline void CpuId( uint32_t* regs, uint32_t leaf ) static void InitFailure( const char* msg ) { -#if defined _WIN32 || defined __CYGWIN__ +#if defined _WIN32 bool hasConsole = false; bool reopen = false; const auto attached = AttachConsole( ATTACH_PARENT_PROCESS ); @@ -214,33 +314,59 @@ static void InitFailure( const char* msg ) } else { +# ifndef TRACY_UWP MessageBoxA( nullptr, msg, "Tracy Profiler initialization failure", MB_ICONSTOP ); +# endif } #else fprintf( stderr, "Tracy Profiler initialization failure: %s\n", msg ); #endif - exit( 0 ); + exit( 1 ); } +static bool CheckHardwareSupportsInvariantTSC() +{ +#if defined TRACY_NO_INVARIANT_CHECK + return true; +#else + const char* noCheck = GetEnvVar( "TRACY_NO_INVARIANT_CHECK" ); + if( noCheck && noCheck[0] == '1' ) return true; + + uint32_t regs[4]; + CpuId( regs, 1 ); + if( !( regs[3] & ( 1 << 4 ) ) ) + { +#if !defined TRACY_TIMER_QPC && !defined TRACY_TIMER_FALLBACK + InitFailure( "CPU doesn't support RDTSC instruction." ); +#else + return false; +#endif + } + CpuId( regs, 0x80000007 ); + if( regs[3] & ( 1 << 8 ) ) return true; + + return false; +#endif +} + +#if defined TRACY_TIMER_FALLBACK && defined TRACY_HW_TIMER +bool HardwareSupportsInvariantTSC() +{ + static bool cachedResult = CheckHardwareSupportsInvariantTSC(); + return cachedResult; +} +#endif + static int64_t SetupHwTimer() { #if !defined TRACY_TIMER_QPC && !defined TRACY_TIMER_FALLBACK - uint32_t regs[4]; - CpuId( regs, 1 ); - if( !( regs[3] & ( 1 << 4 ) ) ) InitFailure( "CPU doesn't support RDTSC instruction." 
); -#if !defined TRACY_NO_INVARIANT_CHECK - CpuId( regs, 0x80000007 ); - if( !( regs[3] & ( 1 << 8 ) ) ) + if( !CheckHardwareSupportsInvariantTSC() ) { - const char* noCheck = getenv( "TRACY_NO_INVARIANT_CHECK" ); - if( !noCheck || noCheck[0] != '1' ) - { -#if defined _WIN32 || defined __CYGWIN__ - InitFailure( "CPU doesn't support invariant TSC.\nDefine TRACY_NO_INVARIANT_CHECK=1 to ignore this error, *if you know what you are doing*.\nAlternatively you may rebuild the application with the TRACY_TIMER_QPC or TRACY_TIMER_FALLBACK define to use lower resolution timer." ); +#if defined _WIN32 + InitFailure( "CPU doesn't support invariant TSC.\nDefine TRACY_NO_INVARIANT_CHECK=1 to ignore this error, *if you know what you are doing*.\nAlternatively you may rebuild the application with the TRACY_TIMER_QPC or TRACY_TIMER_FALLBACK define to use lower resolution timer." ); #else - InitFailure( "CPU doesn't support invariant TSC.\nDefine TRACY_NO_INVARIANT_CHECK=1 to ignore this error, *if you know what you are doing*.\nAlternatively you may rebuild the application with the TRACY_TIMER_FALLBACK define to use lower resolution timer." ); + InitFailure( "CPU doesn't support invariant TSC.\nDefine TRACY_NO_INVARIANT_CHECK=1 to ignore this error, *if you know what you are doing*.\nAlternatively you may rebuild the application with the TRACY_TIMER_FALLBACK define to use lower resolution timer." ); #endif - } } #endif #endif @@ -270,7 +396,7 @@ static const char* GetProcessName() auto buf = getprogname(); if( buf ) processName = buf; # endif -#elif defined _GNU_SOURCE || defined __CYGWIN__ +#elif defined __linux__ && defined _GNU_SOURCE if( program_invocation_short_name ) processName = program_invocation_short_name; #elif defined __APPLE__ || defined BSD auto buf = getprogname(); @@ -287,7 +413,7 @@ static const char* GetProcessExecutablePath() return buf; #elif defined __ANDROID__ return nullptr; -#elif defined _GNU_SOURCE || defined __CYGWIN__ +#elif defined __linux__ && defined _GNU_SOURCE return program_invocation_name; #elif defined __APPLE__ static char buf[1024]; @@ -341,13 +467,15 @@ static const char* GetHostInfo() { static char buf[1024]; auto ptr = buf; -#if defined _WIN32 || defined __CYGWIN__ - t_RtlGetVersion RtlGetVersion = (t_RtlGetVersion)GetProcAddress( GetModuleHandleA( "ntdll.dll" ), "RtlGetVersion" ); - if( !RtlGetVersion ) +#if defined _WIN32 +# ifdef TRACY_UWP + auto GetVersion = &::GetVersionEx; +# else + auto GetVersion = (t_RtlGetVersion)GetProcAddress( GetModuleHandleA( "ntdll.dll" ), "RtlGetVersion" ); +# endif + if( !GetVersion ) { -# ifdef __CYGWIN__ - ptr += sprintf( ptr, "OS: Windows (Cygwin)\n" ); -# elif defined __MINGW32__ +# ifdef __MINGW32__ ptr += sprintf( ptr, "OS: Windows (MingW)\n" ); # else ptr += sprintf( ptr, "OS: Windows\n" ); @@ -356,11 +484,9 @@ static const char* GetHostInfo() else { RTL_OSVERSIONINFOW ver = { sizeof( RTL_OSVERSIONINFOW ) }; - RtlGetVersion( &ver ); + GetVersion( &ver ); -# ifdef __CYGWIN__ - ptr += sprintf( ptr, "OS: Windows %i.%i.%i (Cygwin)\n", ver.dwMajorVersion, ver.dwMinorVersion, ver.dwBuildNumber ); -# elif defined __MINGW32__ +# ifdef __MINGW32__ ptr += sprintf( ptr, "OS: Windows %i.%i.%i (MingW)\n", (int)ver.dwMajorVersion, (int)ver.dwMinorVersion, (int)ver.dwBuildNumber ); # else ptr += sprintf( ptr, "OS: Windows %i.%i.%i\n", ver.dwMajorVersion, ver.dwMinorVersion, ver.dwBuildNumber ); @@ -403,21 +529,24 @@ static const char* GetHostInfo() #elif defined __clang__ ptr += sprintf( ptr, "Compiler: clang %i.%i.%i\n", __clang_major__, 
__clang_minor__, __clang_patchlevel__ ); #elif defined __GNUC__ - ptr += sprintf( ptr, "Compiler: gcc %i.%i\n", __GNUC__, __GNUC_MINOR__ ); + ptr += sprintf( ptr, "Compiler: gcc %i.%i.%i\n", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__ ); #else ptr += sprintf( ptr, "Compiler: unknown\n" ); #endif -#if defined _WIN32 || defined __CYGWIN__ -# ifndef __CYGWIN__ +#if defined _WIN32 InitWinSock(); -# endif + char hostname[512]; gethostname( hostname, 512 ); +# ifdef TRACY_UWP + const char* user = ""; +# else DWORD userSz = UNLEN+1; char user[UNLEN+1]; GetUserNameA( user, &userSz ); +# endif ptr += sprintf( ptr, "User: %s@%s\n", user, hostname ); #else @@ -456,7 +585,7 @@ static const char* GetHostInfo() #if defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 uint32_t regs[4]; - char cpuModel[4*4*3]; + char cpuModel[4*4*3+1] = {}; auto modelPtr = cpuModel; for( uint32_t i=0x80000002; i<0x80000005; ++i ) { @@ -526,10 +655,17 @@ static const char* GetHostInfo() #else ptr += sprintf( ptr, "CPU: unknown\n" ); #endif +#ifdef __ANDROID__ + char deviceModel[PROP_VALUE_MAX+1]; + char deviceManufacturer[PROP_VALUE_MAX+1]; + __system_property_get( "ro.product.model", deviceModel ); + __system_property_get( "ro.product.manufacturer", deviceManufacturer ); + ptr += sprintf( ptr, "Device: %s %s\n", deviceManufacturer, deviceModel ); +#endif ptr += sprintf( ptr, "CPU cores: %i\n", std::thread::hardware_concurrency() ); -#if defined _WIN32 || defined __CYGWIN__ +#if defined _WIN32 MEMORYSTATUSEX statex; statex.dwLength = sizeof( statex ); GlobalMemoryStatusEx( &statex ); @@ -561,7 +697,7 @@ static const char* GetHostInfo() static uint64_t GetPid() { -#if defined _WIN32 || defined __CYGWIN__ +#if defined _WIN32 return uint64_t( GetCurrentProcessId() ); #else return uint64_t( getpid() ); @@ -576,12 +712,12 @@ void Profiler::AckServerQuery() AppendDataUnsafe( &item, QueueDataSize[(int)QueueType::AckServerQueryNoop] ); } -void Profiler::AckSourceCodeNotAvailable() +void Profiler::AckSymbolCodeNotAvailable() { QueueItem item; - MemWrite( &item.hdr.type, QueueType::AckSourceCodeNotAvailable ); - NeedDataSize( QueueDataSize[(int)QueueType::AckSourceCodeNotAvailable] ); - AppendDataUnsafe( &item, QueueDataSize[(int)QueueType::AckSourceCodeNotAvailable] ); + MemWrite( &item.hdr.type, QueueType::AckSymbolCodeNotAvailable ); + NeedDataSize( QueueDataSize[(int)QueueType::AckSymbolCodeNotAvailable] ); + AppendDataUnsafe( &item, QueueDataSize[(int)QueueType::AckSymbolCodeNotAvailable] ); } static BroadcastMessage& GetBroadcastMessage( const char* procname, size_t pnsz, int& len, int port ) @@ -591,6 +727,7 @@ static BroadcastMessage& GetBroadcastMessage( const char* procname, size_t pnsz, msg.broadcastVersion = BroadcastVersion; msg.protocolVersion = ProtocolVersion; msg.listenPort = port; + msg.pid = GetPid(); memcpy( msg.programName, procname, pnsz ); memset( msg.programName + pnsz, 0, WelcomeMessageProgramNameSize - pnsz ); @@ -599,8 +736,9 @@ static BroadcastMessage& GetBroadcastMessage( const char* procname, size_t pnsz, return msg; } -#if defined _WIN32 || defined __CYGWIN__ +#if defined _WIN32 && !defined TRACY_UWP && !defined TRACY_NO_CRASH_HANDLER static DWORD s_profilerThreadId = 0; +static DWORD s_symbolThreadId = 0; static char s_crashText[1024]; LONG WINAPI CrashFilter( PEXCEPTION_POINTERS pExp ) @@ -659,10 +797,10 @@ LONG WINAPI CrashFilter( PEXCEPTION_POINTERS pExp ) { GetProfiler().SendCallstack( 60, "KiUserExceptionDispatcher" ); - TracyLfqPrepare( QueueType::CrashReport ); + 
TracyQueuePrepare( QueueType::CrashReport ); item->crashReport.time = Profiler::GetTime(); item->crashReport.text = (uint64_t)s_crashText; - TracyLfqCommit; + TracyQueueCommit( crashReportThread ); } HANDLE h = CreateToolhelp32Snapshot( TH32CS_SNAPTHREAD, 0 ); @@ -680,7 +818,7 @@ LONG WINAPI CrashFilter( PEXCEPTION_POINTERS pExp ) do { - if( te.th32OwnerProcessID == pid && te.th32ThreadID != tid && te.th32ThreadID != s_profilerThreadId ) + if( te.th32OwnerProcessID == pid && te.th32ThreadID != tid && te.th32ThreadID != s_profilerThreadId && te.th32ThreadID != s_symbolThreadId ) { HANDLE th = OpenThread( THREAD_SUSPEND_RESUME, FALSE, te.th32ThreadID ); if( th != INVALID_HANDLE_VALUE ) @@ -702,14 +840,30 @@ LONG WINAPI CrashFilter( PEXCEPTION_POINTERS pExp ) GetProfiler().RequestShutdown(); while( !GetProfiler().HasShutdownFinished() ) { std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); }; - TerminateProcess( GetCurrentProcess(), 1 ); - return EXCEPTION_CONTINUE_SEARCH; } #endif -#ifdef __linux__ +static Profiler* s_instance = nullptr; +static Thread* s_thread; +#ifndef TRACY_NO_FRAME_IMAGE +static Thread* s_compressThread; +#endif +#ifdef TRACY_HAS_CALLSTACK +static Thread* s_symbolThread; +std::atomic s_symbolThreadGone { false }; +#endif +#ifdef TRACY_HAS_SYSTEM_TRACING +static Thread* s_sysTraceThread = nullptr; +#endif + +#if defined __linux__ && !defined TRACY_NO_CRASH_HANDLER +# ifndef TRACY_CRASH_SIGNAL +# define TRACY_CRASH_SIGNAL SIGPWR +# endif + static long s_profilerTid = 0; +static long s_symbolTid = 0; static char s_crashText[1024]; static std::atomic s_alreadyCrashed( false ); @@ -898,10 +1052,10 @@ static void CrashHandler( int signal, siginfo_t* info, void* /*ucontext*/ ) { GetProfiler().SendCallstack( 60, "__kernel_rt_sigreturn" ); - TracyLfqPrepare( QueueType::CrashReport ); + TracyQueuePrepare( QueueType::CrashReport ); item->crashReport.time = Profiler::GetTime(); item->crashReport.text = (uint64_t)s_crashText; - TracyLfqCommit; + TracyQueueCommit( crashReportThread ); } DIR* dp = opendir( "/proc/self/task" ); @@ -914,17 +1068,17 @@ static void CrashHandler( int signal, siginfo_t* info, void* /*ucontext*/ ) { if( ep->d_name[0] == '.' 
) continue; int tid = atoi( ep->d_name ); - if( tid != selfTid && tid != s_profilerTid ) + if( tid != selfTid && tid != s_profilerTid && tid != s_symbolTid ) { - syscall( SYS_tkill, tid, SIGPWR ); + syscall( SYS_tkill, tid, TRACY_CRASH_SIGNAL ); } } closedir( dp ); - { - TracyLfqPrepare( QueueType::Crash ); - TracyLfqCommit; - } + if( selfTid == s_symbolTid ) s_symbolThreadGone.store( true, std::memory_order_release ); + + TracyLfqPrepare( QueueType::Crash ); + TracyLfqCommit; std::this_thread::sleep_for( std::chrono::milliseconds( 500 ) ); GetProfiler().RequestShutdown(); @@ -937,18 +1091,9 @@ static void CrashHandler( int signal, siginfo_t* info, void* /*ucontext*/ ) enum { QueuePrealloc = 256 * 1024 }; -static Profiler* s_instance = nullptr; -static Thread* s_thread; - -#ifdef TRACY_HAS_SYSTEM_TRACING -static Thread* s_sysTraceThread = nullptr; -#endif - -TRACY_API bool ProfilerAvailable() { return s_instance != nullptr; } - TRACY_API int64_t GetFrequencyQpc() { -#if defined _WIN32 || defined __CYGWIN__ +#if defined _WIN32 LARGE_INTEGER t; QueryPerformanceFrequency( &t ); return t.QuadPart; @@ -960,18 +1105,10 @@ TRACY_API int64_t GetFrequencyQpc() #ifdef TRACY_DELAYED_INIT struct ThreadNameData; TRACY_API moodycamel::ConcurrentQueue& GetQueue(); -TRACY_API void InitRPMallocThread(); - -void InitRPMallocThread() -{ - RPMallocInit rpinit; - rpmalloc_thread_initialize(); -} struct ProfilerData { int64_t initTime = SetupHwTimer(); - RPMallocInit rpmalloc_init; moodycamel::ConcurrentQueue queue; Profiler profiler; std::atomic lockCounter { 0 }; @@ -989,7 +1126,6 @@ struct ProducerWrapper struct ProfilerThreadData { ProfilerThreadData( ProfilerData& data ) : token( data ), gpuCtx( { nullptr } ) {} - RPMallocInit rpmalloc_init; ProducerWrapper token; GpuCtxWrapper gpuCtx; # ifdef TRACY_ON_DEMAND @@ -997,23 +1133,34 @@ struct ProfilerThreadData # endif }; +std::atomic RpInitDone { 0 }; +std::atomic RpInitLock { 0 }; +thread_local bool RpThreadInitDone = false; +thread_local bool RpThreadShutdown = false; + # ifdef TRACY_MANUAL_LIFETIME ProfilerData* s_profilerData = nullptr; +static ProfilerThreadData& GetProfilerThreadData(); TRACY_API void StartupProfiler() { - s_profilerData = new ProfilerData; + s_profilerData = (ProfilerData*)tracy_malloc( sizeof( ProfilerData ) ); + new (s_profilerData) ProfilerData(); s_profilerData->profiler.SpawnWorkerThreads(); + GetProfilerThreadData().token = ProducerWrapper( *s_profilerData ); } static ProfilerData& GetProfilerData() { - assert(s_profilerData); + assert( s_profilerData ); return *s_profilerData; } TRACY_API void ShutdownProfiler() { - delete s_profilerData; + s_profilerData->~ProfilerData(); + tracy_free( s_profilerData ); s_profilerData = nullptr; rpmalloc_finalize(); + RpThreadInitDone = false; + RpInitDone.store( 0, std::memory_order_release ); } # else static std::atomic profilerDataLock { 0 }; @@ -1025,11 +1172,11 @@ static ProfilerData& GetProfilerData() if( !ptr ) { int expected = 0; - while( !profilerDataLock.compare_exchange_strong( expected, 1, std::memory_order_release, std::memory_order_relaxed ) ) { expected = 0; } + while( !profilerDataLock.compare_exchange_weak( expected, 1, std::memory_order_release, std::memory_order_relaxed ) ) { expected = 0; YieldThread(); } ptr = profilerData.load( std::memory_order_acquire ); if( !ptr ) { - ptr = (ProfilerData*)malloc( sizeof( ProfilerData ) ); + ptr = (ProfilerData*)tracy_malloc( sizeof( ProfilerData ) ); new (ptr) ProfilerData(); profilerData.store( ptr, std::memory_order_release ); } @@ 
-1039,11 +1186,60 @@ static ProfilerData& GetProfilerData() } # endif +// GCC prior to 8.4 had a bug with function-inline thread_local variables. Versions of glibc beginning with +// 2.18 may attempt to work around this issue, which manifests as a crash while running static destructors +// if this function is compiled into a shared object. Unfortunately, centos7 ships with glibc 2.17. If running +// on old GCC, use the old-fashioned way as a workaround +// See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85400 +#if !defined(__clang__) && defined(__GNUC__) && ((__GNUC__ < 8) || ((__GNUC__ == 8) && (__GNUC_MINOR__ < 4))) +struct ProfilerThreadDataKey +{ +public: + ProfilerThreadDataKey() + { + int val = pthread_key_create(&m_key, sDestructor); + static_cast(val); // unused + assert(val == 0); + } + ~ProfilerThreadDataKey() + { + int val = pthread_key_delete(m_key); + static_cast(val); // unused + assert(val == 0); + } + ProfilerThreadData& get() + { + void* p = pthread_getspecific(m_key); + if (!p) + { + p = (ProfilerThreadData*)tracy_malloc( sizeof( ProfilerThreadData ) ); + new (p) ProfilerThreadData(GetProfilerData()); + pthread_setspecific(m_key, p); + } + return *static_cast(p); + } +private: + pthread_key_t m_key; + + static void sDestructor(void* p) + { + ((ProfilerThreadData*)p)->~ProfilerThreadData(); + tracy_free(p); + } +}; + +static ProfilerThreadData& GetProfilerThreadData() +{ + static ProfilerThreadDataKey key; + return key.get(); +} +#else static ProfilerThreadData& GetProfilerThreadData() { thread_local ProfilerThreadData data( GetProfilerData() ); return data; } +#endif TRACY_API moodycamel::ConcurrentQueue::ExplicitProducer* GetToken() { return GetProfilerThreadData().token.ptr; } TRACY_API Profiler& GetProfiler() { return GetProfilerData().profiler; } @@ -1052,7 +1248,7 @@ TRACY_API int64_t GetInitTime() { return GetProfilerData().initTime; } TRACY_API std::atomic& GetLockCounter() { return GetProfilerData().lockCounter; } TRACY_API std::atomic& GetGpuCtxCounter() { return GetProfilerData().gpuCtxCounter; } TRACY_API GpuCtxWrapper& GetGpuCtx() { return GetProfilerThreadData().gpuCtx; } -TRACY_API uint64_t GetThreadHandle() { return detail::GetThreadHandleImpl(); } +TRACY_API uint32_t GetThreadHandle() { return detail::GetThreadHandleImpl(); } std::atomic& GetThreadNameData() { return GetProfilerData().threadNameData; } # ifdef TRACY_ON_DEMAND @@ -1067,18 +1263,12 @@ namespace # endif #else -TRACY_API void InitRPMallocThread() -{ - rpmalloc_thread_initialize(); -} // MSVC static initialization order solution. gcc/clang uses init_order() to avoid all this. // 1a. But s_queue is needed for initialization of variables in point 2. extern moodycamel::ConcurrentQueue s_queue; -thread_local RPMallocInit init_order(106) s_rpmalloc_thread_init; - // 2. If these variables would be in the .CRT$XCB section, they would be initialized only in main thread. 
thread_local moodycamel::ProducerToken init_order(107) s_token_detail( s_queue ); thread_local ProducerWrapper init_order(108) s_token { s_queue.get_explicit_producer( s_token_detail ) }; @@ -1091,7 +1281,10 @@ thread_local ThreadHandleWrapper init_order(104) s_threadHandle { detail::GetThr # endif static InitTimeWrapper init_order(101) s_initTime { SetupHwTimer() }; -static RPMallocInit init_order(102) s_rpmalloc_init; +std::atomic init_order(102) RpInitDone( 0 ); +std::atomic init_order(102) RpInitLock( 0 ); +thread_local bool RpThreadInitDone = false; +thread_local bool RpThreadShutdown = false; moodycamel::ConcurrentQueue init_order(103) s_queue( QueuePrealloc ); std::atomic init_order(104) s_lockCounter( 0 ); std::atomic init_order(104) s_gpuCtxCounter( 0 ); @@ -1115,12 +1308,7 @@ TRACY_API int64_t GetInitTime() { return s_initTime.val; } TRACY_API std::atomic& GetLockCounter() { return s_lockCounter; } TRACY_API std::atomic& GetGpuCtxCounter() { return s_gpuCtxCounter; } TRACY_API GpuCtxWrapper& GetGpuCtx() { return s_gpuCtx; } -# ifdef __CYGWIN__ -// Hackfix for cygwin reporting memory frees without matching allocations. WTF? -TRACY_API uint64_t GetThreadHandle() { return detail::GetThreadHandleImpl(); } -# else -TRACY_API uint64_t GetThreadHandle() { return s_threadHandle.val; } -# endif +TRACY_API uint32_t GetThreadHandle() { return s_threadHandle.val; } std::atomic& GetThreadNameData() { return s_threadNameData; } @@ -1129,6 +1317,9 @@ TRACY_API LuaZoneState& GetLuaZoneState() { return s_luaZoneState; } # endif #endif +TRACY_API bool ProfilerAvailable() { return s_instance != nullptr; } +TRACY_API bool ProfilerAllocatorAvailable() { return !RpThreadShutdown; } + Profiler::Profiler() : m_timeBegin( 0 ) , m_mainThread( detail::GetThreadHandleImpl() ) @@ -1149,6 +1340,11 @@ Profiler::Profiler() , m_lz4Buf( (char*)tracy_malloc( LZ4Size + sizeof( lz4sz_t ) ) ) , m_serialQueue( 1024*1024 ) , m_serialDequeue( 1024*1024 ) +#ifndef TRACY_NO_FRAME_IMAGE + , m_fiQueue( 16 ) + , m_fiDequeue( 16 ) +#endif + , m_symbolQueue( 8*1024 ) , m_frameCount( 0 ) , m_isConnected( false ) #ifdef TRACY_ON_DEMAND @@ -1156,7 +1352,10 @@ Profiler::Profiler() , m_deferredQueue( 64*1024 ) #endif , m_paramCallback( nullptr ) + , m_sourceCallback( nullptr ) + , m_queryImage( nullptr ) , m_queryData( nullptr ) + , m_crashHandlerInstalled( false ) { assert( !s_instance ); s_instance = this; @@ -1175,14 +1374,14 @@ Profiler::Profiler() ReportTopology(); #ifndef TRACY_NO_EXIT - const char* noExitEnv = getenv( "TRACY_NO_EXIT" ); + const char* noExitEnv = GetEnvVar( "TRACY_NO_EXIT" ); if( noExitEnv && noExitEnv[0] == '1' ) { m_noExit = true; } #endif - const char* userPort = getenv( "TRACY_PORT" ); + const char* userPort = GetEnvVar( "TRACY_PORT" ); if( userPort ) { m_userPort = atoi( userPort ); @@ -1195,9 +1394,6 @@ Profiler::Profiler() void Profiler::SpawnWorkerThreads() { - s_thread = (Thread*)tracy_malloc( sizeof( Thread ) ); - new(s_thread) Thread( LaunchWorker, this ); - #ifdef TRACY_HAS_SYSTEM_TRACING if( SysTraceStart( m_samplingPeriod ) ) { @@ -1207,29 +1403,47 @@ void Profiler::SpawnWorkerThreads() } #endif -#if defined _WIN32 || defined __CYGWIN__ - s_profilerThreadId = GetThreadId( s_thread->Handle() ); - AddVectoredExceptionHandler( 1, CrashFilter ); + s_thread = (Thread*)tracy_malloc( sizeof( Thread ) ); + new(s_thread) Thread( LaunchWorker, this ); + +#ifndef TRACY_NO_FRAME_IMAGE + s_compressThread = (Thread*)tracy_malloc( sizeof( Thread ) ); + new(s_compressThread) Thread( LaunchCompressWorker, this 
); #endif -#ifdef __linux__ +#ifdef TRACY_HAS_CALLSTACK + s_symbolThread = (Thread*)tracy_malloc( sizeof( Thread ) ); + new(s_symbolThread) Thread( LaunchSymbolWorker, this ); +#endif + +#if defined _WIN32 && !defined TRACY_UWP && !defined TRACY_NO_CRASH_HANDLER + s_profilerThreadId = GetThreadId( s_thread->Handle() ); + s_symbolThreadId = GetThreadId( s_symbolThread->Handle() ); + m_exceptionHandler = AddVectoredExceptionHandler( 1, CrashFilter ); +#endif + +#if defined __linux__ && !defined TRACY_NO_CRASH_HANDLER struct sigaction threadFreezer = {}; threadFreezer.sa_handler = ThreadFreezer; - sigaction( SIGPWR, &threadFreezer, nullptr ); + sigaction( TRACY_CRASH_SIGNAL, &threadFreezer, &m_prevSignal.pwr ); struct sigaction crashHandler = {}; crashHandler.sa_sigaction = CrashHandler; crashHandler.sa_flags = SA_SIGINFO; - sigaction( SIGILL, &crashHandler, nullptr ); - sigaction( SIGFPE, &crashHandler, nullptr ); - sigaction( SIGSEGV, &crashHandler, nullptr ); - sigaction( SIGPIPE, &crashHandler, nullptr ); - sigaction( SIGBUS, &crashHandler, nullptr ); - sigaction( SIGABRT, &crashHandler, nullptr ); + sigaction( SIGILL, &crashHandler, &m_prevSignal.ill ); + sigaction( SIGFPE, &crashHandler, &m_prevSignal.fpe ); + sigaction( SIGSEGV, &crashHandler, &m_prevSignal.segv ); + sigaction( SIGPIPE, &crashHandler, &m_prevSignal.pipe ); + sigaction( SIGBUS, &crashHandler, &m_prevSignal.bus ); + sigaction( SIGABRT, &crashHandler, &m_prevSignal.abrt ); +#endif + +#ifndef TRACY_NO_CRASH_HANDLER + m_crashHandlerInstalled = true; #endif #ifdef TRACY_HAS_CALLSTACK - InitCallstack(); + InitCallstackCritical(); #endif m_timeBegin.store( GetTime(), std::memory_order_relaxed ); @@ -1239,6 +1453,23 @@ Profiler::~Profiler() { m_shutdown.store( true, std::memory_order_relaxed ); +#if defined _WIN32 && !defined TRACY_UWP + if( m_crashHandlerInstalled ) RemoveVectoredExceptionHandler( m_exceptionHandler ); +#endif + +#ifdef __linux__ + if( m_crashHandlerInstalled ) + { + sigaction( TRACY_CRASH_SIGNAL, &m_prevSignal.pwr, nullptr ); + sigaction( SIGILL, &m_prevSignal.ill, nullptr ); + sigaction( SIGFPE, &m_prevSignal.fpe, nullptr ); + sigaction( SIGSEGV, &m_prevSignal.segv, nullptr ); + sigaction( SIGPIPE, &m_prevSignal.pipe, nullptr ); + sigaction( SIGBUS, &m_prevSignal.bus, nullptr ); + sigaction( SIGABRT, &m_prevSignal.abrt, nullptr ); + } +#endif + #ifdef TRACY_HAS_SYSTEM_TRACING if( s_sysTraceThread ) { @@ -1248,9 +1479,23 @@ Profiler::~Profiler() } #endif +#ifdef TRACY_HAS_CALLSTACK + s_symbolThread->~Thread(); + tracy_free( s_symbolThread ); +#endif + +#ifndef TRACY_NO_FRAME_IMAGE + s_compressThread->~Thread(); + tracy_free( s_compressThread ); +#endif + s_thread->~Thread(); tracy_free( s_thread ); +#ifdef TRACY_HAS_CALLSTACK + EndCallstack(); +#endif + tracy_free( m_lz4Buf ); tracy_free( m_buffer ); LZ4_freeStream( (LZ4_stream_t*)m_stream ); @@ -1301,7 +1546,9 @@ void Profiler::Worker() while( m_timeBegin.load( std::memory_order_relaxed ) == 0 ) std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); +#ifdef TRACY_USE_RPMALLOC rpmalloc_thread_initialize(); +#endif m_exectime = 0; const auto execname = GetProcessExecutablePath(); @@ -1322,16 +1569,22 @@ void Profiler::Worker() const uint64_t pid = GetPid(); -#ifdef TRACY_ON_DEMAND - uint8_t onDemand = 1; -#else - uint8_t onDemand = 0; -#endif + uint8_t flags = 0; +#ifdef TRACY_ON_DEMAND + flags |= WelcomeFlag::OnDemand; +#endif #ifdef __APPLE__ - uint8_t isApple = 1; -#else - uint8_t isApple = 0; + flags |= WelcomeFlag::IsApple; +#endif +#ifndef 
TRACY_NO_CODE_TRANSFER + flags |= WelcomeFlag::CodeTransfer; +#endif +#ifdef _WIN32 + flags |= WelcomeFlag::CombineSamples; +# ifndef TRACY_NO_CONTEXT_SWITCH + flags |= WelcomeFlag::IdentifySamples; +# endif #endif #if defined __i386 || defined _M_IX86 @@ -1346,12 +1599,6 @@ void Profiler::Worker() uint8_t cpuArch = CpuArchUnknown; #endif -#ifdef TRACY_NO_CODE_TRANSFER - uint8_t codeTransfer = 0; -#else - uint8_t codeTransfer = 1; -#endif - #if defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 uint32_t regs[4]; char manufacturer[12]; @@ -1377,10 +1624,8 @@ void Profiler::Worker() MemWrite( &welcome.exectime, m_exectime ); MemWrite( &welcome.pid, pid ); MemWrite( &welcome.samplingPeriod, m_samplingPeriod ); - MemWrite( &welcome.onDemand, onDemand ); - MemWrite( &welcome.isApple, isApple ); + MemWrite( &welcome.flags, flags ); MemWrite( &welcome.cpuArch, cpuArch ); - MemWrite( &welcome.codeTransfer, codeTransfer ); memcpy( welcome.cpuManufacturer, manufacturer, 12 ); MemWrite( &welcome.cpuId, cpuId ); memcpy( welcome.programName, procname, pnsz ); @@ -1606,7 +1851,7 @@ void Profiler::Worker() keepAlive = 0; } - else + else if( !m_sock->HasData() ) { keepAlive++; std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); @@ -1618,9 +1863,10 @@ void Profiler::Worker() } bool connActive = true; - while( m_sock->HasData() && connActive ) + while( m_sock->HasData() ) { connActive = HandleServerQuery(); + if( !connActive ) break; } if( !connActive ) break; } @@ -1681,6 +1927,11 @@ void Profiler::Worker() } // End of connections loop + // Wait for symbols thread to terminate. Symbol resolution will continue in this thread. +#ifdef TRACY_HAS_CALLSTACK + while( s_symbolThreadGone.load() == false ) { YieldThread(); } +#endif + // Client is exiting. Send items remaining in queues. 
for(;;) { @@ -1705,6 +1956,16 @@ void Profiler::Worker() return; } } + +#ifdef TRACY_HAS_CALLSTACK + for(;;) + { + auto si = m_symbolQueue.front(); + if( !si ) break; + HandleSymbolQueueItem( *si ); + m_symbolQueue.pop(); + } +#endif } // Send client termination notice to the server @@ -1718,35 +1979,113 @@ void Profiler::Worker() // Handle remaining server queries for(;;) { - if( m_sock->HasData() ) + while( m_sock->HasData() ) { - while( m_sock->HasData() ) + if( !HandleServerQuery() ) { - if( !HandleServerQuery() ) - { - m_shutdownFinished.store( true, std::memory_order_relaxed ); - return; - } - } - while( Dequeue( token ) == DequeueStatus::DataDequeued ) {} - while( DequeueSerial() == DequeueStatus::DataDequeued ) {} - if( m_bufferOffset != m_bufferStart ) - { - if( !CommitData() ) - { - m_shutdownFinished.store( true, std::memory_order_relaxed ); - return; - } + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; } } - else +#ifdef TRACY_HAS_CALLSTACK + for(;;) { - if( m_bufferOffset != m_bufferStart ) CommitData(); - std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + auto si = m_symbolQueue.front(); + if( !si ) break; + HandleSymbolQueueItem( *si ); + m_symbolQueue.pop(); + } +#endif + const auto status = Dequeue( token ); + const auto serialStatus = DequeueSerial(); + if( status == DequeueStatus::ConnectionLost || serialStatus == DequeueStatus::ConnectionLost ) + { + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; + } + if( m_bufferOffset != m_bufferStart ) + { + if( !CommitData() ) + { + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; + } } } } +#ifndef TRACY_NO_FRAME_IMAGE +void Profiler::CompressWorker() +{ + ThreadExitHandler threadExitHandler; + SetThreadName( "Tracy DXT1" ); + while( m_timeBegin.load( std::memory_order_relaxed ) == 0 ) std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + +#ifdef TRACY_USE_RPMALLOC + rpmalloc_thread_initialize(); +#endif + + for(;;) + { + const auto shouldExit = ShouldExit(); + + { + bool lockHeld = true; + while( !m_fiLock.try_lock() ) + { + if( m_shutdownManual.load( std::memory_order_relaxed ) ) + { + lockHeld = false; + break; + } + } + if( !m_fiQueue.empty() ) m_fiQueue.swap( m_fiDequeue ); + if( lockHeld ) + { + m_fiLock.unlock(); + } + } + + const auto sz = m_fiDequeue.size(); + if( sz > 0 ) + { + auto fi = m_fiDequeue.data(); + auto end = fi + sz; + while( fi != end ) + { + const auto w = fi->w; + const auto h = fi->h; + const auto csz = size_t( w * h / 2 ); + auto etc1buf = (char*)tracy_malloc( csz ); + CompressImageDxt1( (const char*)fi->image, etc1buf, w, h ); + tracy_free( fi->image ); + + TracyLfqPrepare( QueueType::FrameImage ); + MemWrite( &item->frameImageFat.image, (uint64_t)etc1buf ); + MemWrite( &item->frameImageFat.frame, fi->frame ); + MemWrite( &item->frameImageFat.w, w ); + MemWrite( &item->frameImageFat.h, h ); + uint8_t flip = fi->flip; + MemWrite( &item->frameImageFat.flip, flip ); + TracyLfqCommit; + + fi++; + } + m_fiDequeue.clear(); + } + else + { + std::this_thread::sleep_for( std::chrono::milliseconds( 20 ) ); + } + + if( shouldExit ) + { + return; + } + } +} +#endif + static void FreeAssociatedMemory( const QueueItem& item ) { if( item.hdr.idx >= (int)QueueType::Terminate ) return; @@ -1796,6 +2135,7 @@ static void FreeAssociatedMemory( const QueueItem& item ) tracy_free( (void*)ptr ); break; case QueueType::CallstackSample: + case QueueType::CallstackSampleContextSwitch: ptr = MemRead( &item.callstackSampleFat.ptr ); 
tracy_free( (void*)ptr ); break; @@ -1803,6 +2143,36 @@ static void FreeAssociatedMemory( const QueueItem& item ) ptr = MemRead( &item.frameImageFat.image ); tracy_free( (void*)ptr ); break; +#ifdef TRACY_HAS_CALLSTACK + case QueueType::CallstackFrameSize: + { + InitRpmalloc(); + auto size = MemRead( &item.callstackFrameSizeFat.size ); + auto data = (const CallstackEntry*)MemRead( &item.callstackFrameSizeFat.data ); + for( uint8_t i=0; i( &item.symbolInformationFat.needFree ); + if( needFree ) + { + ptr = MemRead( &item.symbolInformationFat.fileString ); + tracy_free( (void*)ptr ); + } + break; + } + case QueueType::SymbolCodeMetadata: + ptr = MemRead( &item.symbolCodeMetadata.ptr ); + tracy_free( (void*)ptr ); + break; +#endif #ifndef TRACY_ON_DEMAND case QueueType::LockName: ptr = MemRead( &item.lockNameFat.name ); @@ -1819,6 +2189,18 @@ static void FreeAssociatedMemory( const QueueItem& item ) // Don't free memory associated with deferred messages. break; #endif +#ifdef TRACY_HAS_SYSTEM_TRACING + case QueueType::ExternalNameMetadata: + ptr = MemRead( &item.externalNameMetadata.name ); + tracy_free( (void*)ptr ); + ptr = MemRead( &item.externalNameMetadata.threadName ); + tracy_free_fast( (void*)ptr ); + break; +#endif + case QueueType::SourceCodeMetadata: + ptr = MemRead( &item.sourceCodeMetadata.ptr ); + tracy_free( (void*)ptr ); + break; default: break; } @@ -1861,21 +2243,14 @@ Profiler::DequeueStatus Profiler::Dequeue( moodycamel::ConsumerToken& token ) { bool connectionLost = false; const auto sz = GetQueue().try_dequeue_bulk_single( token, - [this, &connectionLost] ( const uint64_t& threadId ) + [this, &connectionLost] ( const uint32_t& threadId ) { - if( threadId != m_threadCtx ) - { - QueueItem item; - MemWrite( &item.hdr.type, QueueType::ThreadContext ); - MemWrite( &item.threadCtx.thread, threadId ); - if( !AppendData( &item, QueueDataSize[(int)QueueType::ThreadContext] ) ) connectionLost = true; - m_threadCtx = threadId; - m_refTimeThread = 0; - } + if( ThreadCtxCheck( threadId ) == ThreadCtxStatus::ConnectionLost ) connectionLost = true; }, [this, &connectionLost] ( QueueItem* item, size_t sz ) { if( connectionLost ) return; + InitRpmalloc(); assert( sz > 0 ); int64_t refThread = m_refTimeThread; int64_t refCtx = m_refTimeCtx; @@ -1894,28 +2269,28 @@ Profiler::DequeueStatus Profiler::Dequeue( moodycamel::ConsumerToken& token ) ptr = MemRead( &item->zoneTextFat.text ); size = MemRead( &item->zoneTextFat.size ); SendSingleString( (const char*)ptr, size ); - tracy_free( (void*)ptr ); + tracy_free_fast( (void*)ptr ); break; case QueueType::Message: case QueueType::MessageCallstack: ptr = MemRead( &item->messageFat.text ); size = MemRead( &item->messageFat.size ); SendSingleString( (const char*)ptr, size ); - tracy_free( (void*)ptr ); + tracy_free_fast( (void*)ptr ); break; case QueueType::MessageColor: case QueueType::MessageColorCallstack: ptr = MemRead( &item->messageColorFat.text ); size = MemRead( &item->messageColorFat.size ); SendSingleString( (const char*)ptr, size ); - tracy_free( (void*)ptr ); + tracy_free_fast( (void*)ptr ); break; case QueueType::MessageAppInfo: ptr = MemRead( &item->messageFat.text ); size = MemRead( &item->messageFat.size ); SendSingleString( (const char*)ptr, size ); #ifndef TRACY_ON_DEMAND - tracy_free( (void*)ptr ); + tracy_free_fast( (void*)ptr ); #endif break; case QueueType::ZoneBeginAllocSrcLoc: @@ -1927,13 +2302,13 @@ Profiler::DequeueStatus Profiler::Dequeue( moodycamel::ConsumerToken& token ) MemWrite( &item->zoneBegin.time, dt ); ptr = 
MemRead( &item->zoneBegin.srcloc ); SendSourceLocationPayload( ptr ); - tracy_free( (void*)ptr ); + tracy_free_fast( (void*)ptr ); break; } case QueueType::Callstack: ptr = MemRead( &item->callstackFat.ptr ); SendCallstackPayload( ptr ); - tracy_free( (void*)ptr ); + tracy_free_fast( (void*)ptr ); break; case QueueType::CallstackAlloc: ptr = MemRead( &item->callstackAllocFat.nativePtr ); @@ -1941,17 +2316,18 @@ Profiler::DequeueStatus Profiler::Dequeue( moodycamel::ConsumerToken& token ) { CutCallstack( (void*)ptr, "lua_pcall" ); SendCallstackPayload( ptr ); - tracy_free( (void*)ptr ); + tracy_free_fast( (void*)ptr ); } ptr = MemRead( &item->callstackAllocFat.ptr ); SendCallstackAlloc( ptr ); - tracy_free( (void*)ptr ); + tracy_free_fast( (void*)ptr ); break; case QueueType::CallstackSample: + case QueueType::CallstackSampleContextSwitch: { ptr = MemRead( &item->callstackSampleFat.ptr ); SendCallstackPayload64( ptr ); - tracy_free( (void*)ptr ); + tracy_free_fast( (void*)ptr ); int64_t t = MemRead( &item->callstackSampleFat.time ); int64_t dt = t - refCtx; refCtx = t; @@ -1965,7 +2341,7 @@ Profiler::DequeueStatus Profiler::Dequeue( moodycamel::ConsumerToken& token ) const auto h = MemRead( &item->frameImageFat.h ); const auto csz = size_t( w * h / 2 ); SendLongString( ptr, (const char*)ptr, csz, QueueType::FrameImageData ); - tracy_free( (void*)ptr ); + tracy_free_fast( (void*)ptr ); break; } case QueueType::ZoneBegin: @@ -2003,7 +2379,7 @@ Profiler::DequeueStatus Profiler::Dequeue( moodycamel::ConsumerToken& token ) MemWrite( &item->gpuZoneBegin.cpuTime, dt ); ptr = MemRead( &item->gpuZoneBegin.srcloc ); SendSourceLocationPayload( ptr ); - tracy_free( (void*)ptr ); + tracy_free_fast( (void*)ptr ); break; } case QueueType::GpuZoneEnd: @@ -2019,15 +2395,17 @@ Profiler::DequeueStatus Profiler::Dequeue( moodycamel::ConsumerToken& token ) size = MemRead( &item->gpuContextNameFat.size ); SendSingleString( (const char*)ptr, size ); #ifndef TRACY_ON_DEMAND - tracy_free( (void*)ptr ); + tracy_free_fast( (void*)ptr ); #endif break; - case QueueType::PlotData: + case QueueType::PlotDataInt: + case QueueType::PlotDataFloat: + case QueueType::PlotDataDouble: { - int64_t t = MemRead( &item->plotData.time ); + int64_t t = MemRead( &item->plotDataInt.time ); int64_t dt = t - refThread; refThread = t; - MemWrite( &item->plotData.time, dt ); + MemWrite( &item->plotDataInt.time, dt ); break; } case QueueType::ContextSwitch: @@ -2054,6 +2432,79 @@ Profiler::DequeueStatus Profiler::Dequeue( moodycamel::ConsumerToken& token ) MemWrite( &item->gpuTime.gpuTime, dt ); break; } +#ifdef TRACY_HAS_CALLSTACK + case QueueType::CallstackFrameSize: + { + auto data = (const CallstackEntry*)MemRead( &item->callstackFrameSizeFat.data ); + auto datasz = MemRead( &item->callstackFrameSizeFat.size ); + auto imageName = (const char*)MemRead( &item->callstackFrameSizeFat.imageName ); + SendSingleString( imageName ); + AppendData( item++, QueueDataSize[idx] ); + + for( uint8_t i=0; i( &item->symbolInformationFat.fileString ); + auto needFree = MemRead( &item->symbolInformationFat.needFree ); + SendSingleString( fileString ); + if( needFree ) tracy_free_fast( (void*)fileString ); + break; + } + case QueueType::SymbolCodeMetadata: + { + auto symbol = MemRead( &item->symbolCodeMetadata.symbol ); + auto ptr = (const char*)MemRead( &item->symbolCodeMetadata.ptr ); + auto size = MemRead( &item->symbolCodeMetadata.size ); + SendLongString( symbol, ptr, size, QueueType::SymbolCode ); + tracy_free_fast( (void*)ptr ); + ++item; + 
continue; + } +#endif +#ifdef TRACY_HAS_SYSTEM_TRACING + case QueueType::ExternalNameMetadata: + { + auto thread = MemRead( &item->externalNameMetadata.thread ); + auto name = (const char*)MemRead( &item->externalNameMetadata.name ); + auto threadName = (const char*)MemRead( &item->externalNameMetadata.threadName ); + SendString( thread, threadName, QueueType::ExternalThreadName ); + SendString( thread, name, QueueType::ExternalName ); + tracy_free_fast( (void*)threadName ); + tracy_free_fast( (void*)name ); + ++item; + continue; + } +#endif + case QueueType::SourceCodeMetadata: + { + auto ptr = (const char*)MemRead( &item->sourceCodeMetadata.ptr ); + auto size = MemRead( &item->sourceCodeMetadata.size ); + auto id = MemRead( &item->sourceCodeMetadata.id ); + SendLongString( (uint64_t)id, ptr, size, QueueType::SourceCode ); + tracy_free_fast( (void*)ptr ); + ++item; + continue; + } default: assert( false ); break; @@ -2137,6 +2588,16 @@ Profiler::DequeueStatus Profiler::DequeueContextSwitches( tracy::moodycamel::Con return ( timeStop == -1 || sz > 0 ) ? DequeueStatus::DataDequeued : DequeueStatus::QueueEmpty; } +#define ThreadCtxCheckSerial( _name ) \ + uint32_t thread = MemRead( &item->_name.thread ); \ + switch( ThreadCtxCheck( thread ) ) \ + { \ + case ThreadCtxStatus::Same: break; \ + case ThreadCtxStatus::Changed: assert( m_refTimeThread == 0 ); refThread = 0; break; \ + case ThreadCtxStatus::ConnectionLost: return DequeueStatus::ConnectionLost; \ + default: assert( false ); break; \ + } + Profiler::DequeueStatus Profiler::DequeueSerial() { { @@ -2159,8 +2620,12 @@ Profiler::DequeueStatus Profiler::DequeueSerial() const auto sz = m_serialDequeue.size(); if( sz > 0 ) { + InitRpmalloc(); int64_t refSerial = m_refTimeSerial; int64_t refGpu = m_refTimeGpu; +#ifdef TRACY_FIBERS + int64_t refThread = m_refTimeThread; +#endif auto item = m_serialDequeue.data(); auto end = item + sz; while( item != end ) @@ -2174,7 +2639,7 @@ Profiler::DequeueStatus Profiler::DequeueSerial() case QueueType::CallstackSerial: ptr = MemRead( &item->callstackFat.ptr ); SendCallstackPayload( ptr ); - tracy_free( (void*)ptr ); + tracy_free_fast( (void*)ptr ); break; case QueueType::LockWait: case QueueType::LockSharedWait: @@ -2209,7 +2674,7 @@ Profiler::DequeueStatus Profiler::DequeueSerial() uint16_t size = MemRead( &item->lockNameFat.size ); SendSingleString( (const char*)ptr, size ); #ifndef TRACY_ON_DEMAND - tracy_free( (void*)ptr ); + tracy_free_fast( (void*)ptr ); #endif break; } @@ -2253,7 +2718,7 @@ Profiler::DequeueStatus Profiler::DequeueSerial() MemWrite( &item->gpuZoneBegin.cpuTime, dt ); ptr = MemRead( &item->gpuZoneBegin.srcloc ); SendSourceLocationPayload( ptr ); - tracy_free( (void*)ptr ); + tracy_free_fast( (void*)ptr ); break; } case QueueType::GpuZoneEndSerial: @@ -2278,20 +2743,170 @@ Profiler::DequeueStatus Profiler::DequeueSerial() uint16_t size = MemRead( &item->gpuContextNameFat.size ); SendSingleString( (const char*)ptr, size ); #ifndef TRACY_ON_DEMAND - tracy_free( (void*)ptr ); + tracy_free_fast( (void*)ptr ); #endif break; } +#ifdef TRACY_FIBERS + case QueueType::ZoneBegin: + case QueueType::ZoneBeginCallstack: + { + ThreadCtxCheckSerial( zoneBeginThread ); + int64_t t = MemRead( &item->zoneBegin.time ); + int64_t dt = t - refThread; + refThread = t; + MemWrite( &item->zoneBegin.time, dt ); + break; + } + case QueueType::ZoneBeginAllocSrcLoc: + case QueueType::ZoneBeginAllocSrcLocCallstack: + { + ThreadCtxCheckSerial( zoneBeginThread ); + int64_t t = MemRead( &item->zoneBegin.time ); + 
int64_t dt = t - refThread; + refThread = t; + MemWrite( &item->zoneBegin.time, dt ); + ptr = MemRead( &item->zoneBegin.srcloc ); + SendSourceLocationPayload( ptr ); + tracy_free_fast( (void*)ptr ); + break; + } + case QueueType::ZoneEnd: + { + ThreadCtxCheckSerial( zoneEndThread ); + int64_t t = MemRead( &item->zoneEnd.time ); + int64_t dt = t - refThread; + refThread = t; + MemWrite( &item->zoneEnd.time, dt ); + break; + } + case QueueType::ZoneText: + case QueueType::ZoneName: + { + ThreadCtxCheckSerial( zoneTextFatThread ); + ptr = MemRead( &item->zoneTextFat.text ); + uint16_t size = MemRead( &item->zoneTextFat.size ); + SendSingleString( (const char*)ptr, size ); + tracy_free_fast( (void*)ptr ); + break; + } + case QueueType::Message: + case QueueType::MessageCallstack: + { + ThreadCtxCheckSerial( messageFatThread ); + ptr = MemRead( &item->messageFat.text ); + uint16_t size = MemRead( &item->messageFat.size ); + SendSingleString( (const char*)ptr, size ); + tracy_free_fast( (void*)ptr ); + break; + } + case QueueType::MessageColor: + case QueueType::MessageColorCallstack: + { + ThreadCtxCheckSerial( messageColorFatThread ); + ptr = MemRead( &item->messageColorFat.text ); + uint16_t size = MemRead( &item->messageColorFat.size ); + SendSingleString( (const char*)ptr, size ); + tracy_free_fast( (void*)ptr ); + break; + } + case QueueType::Callstack: + { + ThreadCtxCheckSerial( callstackFatThread ); + ptr = MemRead( &item->callstackFat.ptr ); + SendCallstackPayload( ptr ); + tracy_free_fast( (void*)ptr ); + break; + } + case QueueType::CallstackAlloc: + { + ThreadCtxCheckSerial( callstackAllocFatThread ); + ptr = MemRead( &item->callstackAllocFat.nativePtr ); + if( ptr != 0 ) + { + CutCallstack( (void*)ptr, "lua_pcall" ); + SendCallstackPayload( ptr ); + tracy_free_fast( (void*)ptr ); + } + ptr = MemRead( &item->callstackAllocFat.ptr ); + SendCallstackAlloc( ptr ); + tracy_free_fast( (void*)ptr ); + break; + } + case QueueType::FiberEnter: + { + ThreadCtxCheckSerial( fiberEnter ); + int64_t t = MemRead( &item->fiberEnter.time ); + int64_t dt = t - refThread; + refThread = t; + MemWrite( &item->fiberEnter.time, dt ); + break; + } + case QueueType::FiberLeave: + { + ThreadCtxCheckSerial( fiberLeave ); + int64_t t = MemRead( &item->fiberLeave.time ); + int64_t dt = t - refThread; + refThread = t; + MemWrite( &item->fiberLeave.time, dt ); + break; + } +#endif default: assert( false ); break; } } +#ifdef TRACY_FIBERS + else + { + switch( (QueueType)idx ) + { + case QueueType::ZoneColor: + { + ThreadCtxCheckSerial( zoneColorThread ); + break; + } + case QueueType::ZoneValue: + { + ThreadCtxCheckSerial( zoneValueThread ); + break; + } + case QueueType::ZoneValidation: + { + ThreadCtxCheckSerial( zoneValidationThread ); + break; + } + case QueueType::MessageLiteral: + case QueueType::MessageLiteralCallstack: + { + ThreadCtxCheckSerial( messageLiteralThread ); + break; + } + case QueueType::MessageLiteralColor: + case QueueType::MessageLiteralColorCallstack: + { + ThreadCtxCheckSerial( messageColorLiteralThread ); + break; + } + case QueueType::CrashReport: + { + ThreadCtxCheckSerial( crashReportThread ); + break; + } + default: + break; + } + } +#endif if( !AppendData( item, QueueDataSize[idx] ) ) return DequeueStatus::ConnectionLost; item++; } m_refTimeSerial = refSerial; m_refTimeGpu = refGpu; +#ifdef TRACY_FIBERS + m_refTimeThread = refThread; +#endif m_serialDequeue.clear(); } else @@ -2301,6 +2916,18 @@ Profiler::DequeueStatus Profiler::DequeueSerial() return DequeueStatus::DataDequeued; } 
+Profiler::ThreadCtxStatus Profiler::ThreadCtxCheck( uint32_t threadId ) +{ + if( m_threadCtx == threadId ) return ThreadCtxStatus::Same; + QueueItem item; + MemWrite( &item.hdr.type, QueueType::ThreadContext ); + MemWrite( &item.threadCtx.thread, threadId ); + if( !AppendData( &item, QueueDataSize[(int)QueueType::ThreadContext] ) ) return ThreadCtxStatus::ConnectionLost; + m_threadCtx = threadId; + m_refTimeThread = 0; + return ThreadCtxStatus::Changed; +} + bool Profiler::CommitData() { bool ret = SendData( m_buffer + m_bufferStart, m_bufferOffset - m_bufferStart ); @@ -2323,7 +2950,8 @@ void Profiler::SendString( uint64_t str, const char* ptr, size_t len, QueueType type == QueueType::PlotName || type == QueueType::FrameName || type == QueueType::ExternalName || - type == QueueType::ExternalThreadName ); + type == QueueType::ExternalThreadName || + type == QueueType::FiberName ); QueueItem item; MemWrite( &item.hdr.type, type ); @@ -2495,43 +3123,207 @@ void Profiler::SendCallstackAlloc( uint64_t _ptr ) AppendDataUnsafe( ptr, len ); } -void Profiler::SendCallstackFrame( uint64_t ptr ) +void Profiler::QueueCallstackFrame( uint64_t ptr ) { #ifdef TRACY_HAS_CALLSTACK - const auto frameData = DecodeCallstackPtr( ptr ); - - { - SendSingleString( frameData.imageName ); - - QueueItem item; - MemWrite( &item.hdr.type, QueueType::CallstackFrameSize ); - MemWrite( &item.callstackFrameSize.ptr, ptr ); - MemWrite( &item.callstackFrameSize.size, frameData.size ); - - AppendData( &item, QueueDataSize[(int)QueueType::CallstackFrameSize] ); - } - - for( uint8_t i=0; i> 63 != 0 ) + { + SendSingleString( "" ); + QueueItem item; + MemWrite( &item.hdr.type, QueueType::SymbolInformation ); + MemWrite( &item.symbolInformation.line, 0 ); + MemWrite( &item.symbolInformation.symAddr, symbol ); + AppendData( &item, QueueDataSize[(int)QueueType::SymbolInformation] ); + } + else + { + m_symbolQueue.emplace( SymbolQueueItem { SymbolQueueItemType::SymbolQuery, symbol } ); + } +#else + AckServerQuery(); +#endif +} + +void Profiler::QueueExternalName( uint64_t ptr ) +{ +#ifdef TRACY_HAS_SYSTEM_TRACING + m_symbolQueue.emplace( SymbolQueueItem { SymbolQueueItemType::ExternalName, ptr } ); +#endif +} + +void Profiler::QueueKernelCode( uint64_t symbol, uint32_t size ) +{ + assert( symbol >> 63 != 0 ); +#ifdef TRACY_HAS_CALLSTACK + m_symbolQueue.emplace( SymbolQueueItem { SymbolQueueItemType::KernelCode, symbol, size } ); +#else + AckSymbolCodeNotAvailable(); +#endif +} + +void Profiler::QueueSourceCodeQuery( uint32_t id ) +{ + assert( m_exectime != 0 ); + assert( m_queryData ); + m_symbolQueue.emplace( SymbolQueueItem { SymbolQueueItemType::SourceCode, uint64_t( m_queryData ), uint64_t( m_queryImage ), id } ); + m_queryData = nullptr; + m_queryImage = nullptr; +} + +#ifdef TRACY_HAS_CALLSTACK +void Profiler::HandleSymbolQueueItem( const SymbolQueueItem& si ) +{ + switch( si.type ) + { + case SymbolQueueItemType::CallstackFrame: + { + const auto frameData = DecodeCallstackPtr( si.ptr ); + auto data = tracy_malloc_fast( sizeof( CallstackEntry ) * frameData.size ); + memcpy( data, frameData.data, sizeof( CallstackEntry ) * frameData.size ); + TracyLfqPrepare( QueueType::CallstackFrameSize ); + MemWrite( &item->callstackFrameSizeFat.ptr, si.ptr ); + MemWrite( &item->callstackFrameSizeFat.size, frameData.size ); + MemWrite( &item->callstackFrameSizeFat.data, (uint64_t)data ); + MemWrite( &item->callstackFrameSizeFat.imageName, (uint64_t)frameData.imageName ); + TracyLfqCommit; + break; + } + case 
SymbolQueueItemType::SymbolQuery: + { +#ifdef __ANDROID__ + // On Android it's common for code to be in mappings that are only executable + // but not readable. + if( !EnsureReadable( si.ptr ) ) + { + TracyLfqPrepare( QueueType::AckServerQueryNoop ); + TracyLfqCommit; + break; + } +#endif + const auto sym = DecodeSymbolAddress( si.ptr ); + TracyLfqPrepare( QueueType::SymbolInformation ); + MemWrite( &item->symbolInformationFat.line, sym.line ); + MemWrite( &item->symbolInformationFat.symAddr, si.ptr ); + MemWrite( &item->symbolInformationFat.fileString, (uint64_t)sym.file ); + MemWrite( &item->symbolInformationFat.needFree, (uint8_t)sym.needFree ); + TracyLfqCommit; + break; + } +#ifdef TRACY_HAS_SYSTEM_TRACING + case SymbolQueueItemType::ExternalName: + { + const char* threadName; + const char* name; + SysTraceGetExternalName( si.ptr, threadName, name ); + TracyLfqPrepare( QueueType::ExternalNameMetadata ); + MemWrite( &item->externalNameMetadata.thread, si.ptr ); + MemWrite( &item->externalNameMetadata.name, (uint64_t)name ); + MemWrite( &item->externalNameMetadata.threadName, (uint64_t)threadName ); + TracyLfqCommit; + break; + } +#endif + case SymbolQueueItemType::KernelCode: + { +#ifdef _WIN32 + auto mod = GetKernelModulePath( si.ptr ); + if( mod ) + { + auto fn = DecodeCallstackPtrFast( si.ptr ); + if( *fn ) + { + auto hnd = LoadLibraryExA( mod, nullptr, DONT_RESOLVE_DLL_REFERENCES ); + if( hnd ) + { + auto ptr = (const void*)GetProcAddress( hnd, fn ); + if( ptr ) + { + auto buf = (char*)tracy_malloc( si.extra ); + memcpy( buf, ptr, si.extra ); + FreeLibrary( hnd ); + TracyLfqPrepare( QueueType::SymbolCodeMetadata ); + MemWrite( &item->symbolCodeMetadata.symbol, si.ptr ); + MemWrite( &item->symbolCodeMetadata.ptr, (uint64_t)buf ); + MemWrite( &item->symbolCodeMetadata.size, (uint32_t)si.extra ); + TracyLfqCommit; + break; + } + FreeLibrary( hnd ); + } + } + } +#endif + TracyLfqPrepare( QueueType::AckSymbolCodeNotAvailable ); + TracyLfqCommit; + break; + } + case SymbolQueueItemType::SourceCode: + HandleSourceCodeQuery( (char*)si.ptr, (char*)si.extra, si.id ); + break; + default: + assert( false ); + break; + } +} + +void Profiler::SymbolWorker() +{ +#if defined __linux__ && !defined TRACY_NO_CRASH_HANDLER + s_symbolTid = syscall( SYS_gettid ); +#endif + + ThreadExitHandler threadExitHandler; + SetThreadName( "Tracy Symbol Worker" ); +#ifdef TRACY_USE_RPMALLOC + InitRpmalloc(); +#endif + InitCallstack(); + while( m_timeBegin.load( std::memory_order_relaxed ) == 0 ) std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + + for(;;) + { + const auto shouldExit = ShouldExit(); +#ifdef TRACY_ON_DEMAND + if( !IsConnected() ) + { + if( shouldExit ) + { + s_symbolThreadGone.store( true, std::memory_order_release ); + return; + } + while( m_symbolQueue.front() ) m_symbolQueue.pop(); + std::this_thread::sleep_for( std::chrono::milliseconds( 20 ) ); + continue; + } +#endif + auto si = m_symbolQueue.front(); + if( si ) + { + HandleSymbolQueueItem( *si ); + m_symbolQueue.pop(); + } + else + { + if( shouldExit ) + { + s_symbolThreadGone.store( true, std::memory_order_release ); + return; + } + std::this_thread::sleep_for( std::chrono::milliseconds( 20 ) ); + } + } +} +#endif bool Profiler::HandleServerQuery() { @@ -2569,7 +3361,7 @@ bool Profiler::HandleServerQuery() case ServerQueryTerminate: return false; case ServerQueryCallstackFrame: - SendCallstackFrame( ptr ); + QueueCallstackFrame( ptr ); break; case ServerQueryFrameName: SendString( ptr, (const char*)ptr, QueueType::FrameName ); 
@@ -2579,28 +3371,29 @@ bool Profiler::HandleServerQuery() return false; #ifdef TRACY_HAS_SYSTEM_TRACING case ServerQueryExternalName: - SysTraceSendExternalName( ptr ); + QueueExternalName( ptr ); break; #endif case ServerQueryParameter: HandleParameter( ptr ); break; case ServerQuerySymbol: - HandleSymbolQuery( ptr ); + QueueSymbolQuery( ptr ); break; #ifndef TRACY_NO_CODE_TRANSFER case ServerQuerySymbolCode: HandleSymbolCodeQuery( ptr, extra ); break; #endif - case ServerQueryCodeLocation: - SendCodeLocation( ptr ); - break; case ServerQuerySourceCode: - HandleSourceCodeQuery(); + QueueSourceCodeQuery( uint32_t( ptr ) ); break; case ServerQueryDataTransfer: - assert( !m_queryData ); + if( m_queryData ) + { + assert( !m_queryImage ); + m_queryImage = m_queryData; + } m_queryDataPtr = m_queryData = (char*)tracy_malloc( ptr + 11 ); AckServerQuery(); break; @@ -2610,6 +3403,11 @@ bool Profiler::HandleServerQuery() m_queryDataPtr += 12; AckServerQuery(); break; +#ifdef TRACY_FIBERS + case ServerQueryFiberName: + SendString( ptr, (const char*)ptr, QueueType::FiberName ); + break; +#endif default: assert( false ); break; @@ -2702,23 +3500,32 @@ void Profiler::HandleDisconnect() void Profiler::CalibrateTimer() { -#ifdef TRACY_HW_TIMER - std::atomic_signal_fence( std::memory_order_acq_rel ); - const auto t0 = std::chrono::high_resolution_clock::now(); - const auto r0 = GetTime(); - std::atomic_signal_fence( std::memory_order_acq_rel ); - std::this_thread::sleep_for( std::chrono::milliseconds( 200 ) ); - std::atomic_signal_fence( std::memory_order_acq_rel ); - const auto t1 = std::chrono::high_resolution_clock::now(); - const auto r1 = GetTime(); - std::atomic_signal_fence( std::memory_order_acq_rel ); - - const auto dt = std::chrono::duration_cast( t1 - t0 ).count(); - const auto dr = r1 - r0; - - m_timerMul = double( dt ) / double( dr ); -#else m_timerMul = 1.; + +#ifdef TRACY_HW_TIMER + +# if !defined TRACY_TIMER_QPC && defined TRACY_TIMER_FALLBACK + const bool needCalibration = HardwareSupportsInvariantTSC(); +# else + const bool needCalibration = true; +# endif + if( needCalibration ) + { + std::atomic_signal_fence( std::memory_order_acq_rel ); + const auto t0 = std::chrono::high_resolution_clock::now(); + const auto r0 = GetTime(); + std::atomic_signal_fence( std::memory_order_acq_rel ); + std::this_thread::sleep_for( std::chrono::milliseconds( 200 ) ); + std::atomic_signal_fence( std::memory_order_acq_rel ); + const auto t1 = std::chrono::high_resolution_clock::now(); + const auto r1 = GetTime(); + std::atomic_signal_fence( std::memory_order_acq_rel ); + + const auto dt = std::chrono::duration_cast( t1 - t0 ).count(); + const auto dr = r1 - r0; + + m_timerMul = double( dt ) / double( dr ); + } #endif } @@ -2784,8 +3591,12 @@ void Profiler::ReportTopology() uint32_t thread; }; -#if defined _WIN32 || defined __CYGWIN__ +#if defined _WIN32 +# ifdef TRACY_UWP + t_GetLogicalProcessorInformationEx _GetLogicalProcessorInformationEx = &::GetLogicalProcessorInformationEx; +# else t_GetLogicalProcessorInformationEx _GetLogicalProcessorInformationEx = (t_GetLogicalProcessorInformationEx)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "GetLogicalProcessorInformationEx" ); +# endif if( !_GetLogicalProcessorInformationEx ) return; DWORD psz = 0; @@ -2919,10 +3730,11 @@ void Profiler::SendFrameMark( const char* name ) #ifdef TRACY_ON_DEMAND if( !GetProfiler().IsConnected() ) return; #endif - TracyLfqPrepare( QueueType::FrameMarkMsg ); + auto item = QueueSerial(); + MemWrite( &item->hdr.type, 
QueueType::FrameMarkMsg ); MemWrite( &item->frameMark.time, GetTime() ); MemWrite( &item->frameMark.name, uint64_t( name ) ); - TracyLfqCommit; + QueueSerialFinish(); } void Profiler::SendFrameMark( const char* name, QueueType type ) @@ -2938,16 +3750,39 @@ void Profiler::SendFrameMark( const char* name, QueueType type ) QueueSerialFinish(); } +void Profiler::SendFrameImage( const void* image, uint16_t w, uint16_t h, uint8_t offset, bool flip ) +{ +#ifndef TRACY_NO_FRAME_IMAGE + auto& profiler = GetProfiler(); + assert( profiler.m_frameCount.load( std::memory_order_relaxed ) < std::numeric_limits::max() ); +# ifdef TRACY_ON_DEMAND + if( !profiler.IsConnected() ) return; +# endif + const auto sz = size_t( w ) * size_t( h ) * 4; + auto ptr = (char*)tracy_malloc( sz ); + memcpy( ptr, image, sz ); + + profiler.m_fiLock.lock(); + auto fi = profiler.m_fiQueue.prepare_next(); + fi->image = ptr; + fi->frame = uint32_t( profiler.m_frameCount.load( std::memory_order_relaxed ) - offset ); + fi->w = w; + fi->h = h; + fi->flip = flip; + profiler.m_fiQueue.commit_next(); + profiler.m_fiLock.unlock(); +#endif +} + void Profiler::PlotData( const char* name, int64_t val ) { #ifdef TRACY_ON_DEMAND if( !GetProfiler().IsConnected() ) return; #endif - TracyLfqPrepare( QueueType::PlotData ); - MemWrite( &item->plotData.name, (uint64_t)name ); - MemWrite( &item->plotData.time, GetTime() ); - MemWrite( &item->plotData.type, PlotDataType::Int ); - MemWrite( &item->plotData.data.i, val ); + TracyLfqPrepare( QueueType::PlotDataInt ); + MemWrite( &item->plotDataInt.name, (uint64_t)name ); + MemWrite( &item->plotDataInt.time, GetTime() ); + MemWrite( &item->plotDataInt.val, val ); TracyLfqCommit; } @@ -2956,11 +3791,10 @@ void Profiler::PlotData( const char* name, float val ) #ifdef TRACY_ON_DEMAND if( !GetProfiler().IsConnected() ) return; #endif - TracyLfqPrepare( QueueType::PlotData ); - MemWrite( &item->plotData.name, (uint64_t)name ); - MemWrite( &item->plotData.time, GetTime() ); - MemWrite( &item->plotData.type, PlotDataType::Float ); - MemWrite( &item->plotData.data.f, val ); + TracyLfqPrepare( QueueType::PlotDataFloat ); + MemWrite( &item->plotDataFloat.name, (uint64_t)name ); + MemWrite( &item->plotDataFloat.time, GetTime() ); + MemWrite( &item->plotDataFloat.val, val ); TracyLfqCommit; } @@ -2969,19 +3803,21 @@ void Profiler::PlotData( const char* name, double val ) #ifdef TRACY_ON_DEMAND if( !GetProfiler().IsConnected() ) return; #endif - TracyLfqPrepare( QueueType::PlotData ); - MemWrite( &item->plotData.name, (uint64_t)name ); - MemWrite( &item->plotData.time, GetTime() ); - MemWrite( &item->plotData.type, PlotDataType::Double ); - MemWrite( &item->plotData.data.d, val ); + TracyLfqPrepare( QueueType::PlotDataDouble ); + MemWrite( &item->plotDataDouble.name, (uint64_t)name ); + MemWrite( &item->plotDataDouble.time, GetTime() ); + MemWrite( &item->plotDataDouble.val, val ); TracyLfqCommit; } -void Profiler::ConfigurePlot( const char* name, PlotFormatType type ) +void Profiler::ConfigurePlot( const char* name, PlotFormatType type, bool step, bool fill, uint32_t color ) { TracyLfqPrepare( QueueType::PlotConfig ); MemWrite( &item->plotConfig.name, (uint64_t)name ); MemWrite( &item->plotConfig.type, (uint8_t)type ); + MemWrite( &item->plotConfig.step, (uint8_t)step ); + MemWrite( &item->plotConfig.fill, (uint8_t)fill ); + MemWrite( &item->plotConfig.color, color ); #ifdef TRACY_ON_DEMAND GetProfiler().DeferItem( *item ); @@ -2990,7 +3826,7 @@ void Profiler::ConfigurePlot( const char* name, PlotFormatType type 
) TracyLfqCommit; } - void Profiler::Message( const char* txt, size_t size, int callstack ) +void Profiler::Message( const char* txt, size_t size, int callstack ) { assert( size < std::numeric_limits::max() ); #ifdef TRACY_ON_DEMAND @@ -2998,17 +3834,17 @@ void Profiler::ConfigurePlot( const char* name, PlotFormatType type ) #endif if( callstack != 0 ) { - InitRPMallocThread(); tracy::GetProfiler().SendCallstack( callstack ); } - TracyLfqPrepare( callstack == 0 ? QueueType::Message : QueueType::MessageCallstack ); auto ptr = (char*)tracy_malloc( size ); memcpy( ptr, txt, size ); + + TracyQueuePrepare( callstack == 0 ? QueueType::Message : QueueType::MessageCallstack ); MemWrite( &item->messageFat.time, GetTime() ); MemWrite( &item->messageFat.text, (uint64_t)ptr ); MemWrite( &item->messageFat.size, (uint16_t)size ); - TracyLfqCommit; + TracyQueueCommit( messageFatThread ); } void Profiler::Message( const char* txt, int callstack ) @@ -3018,14 +3854,13 @@ void Profiler::Message( const char* txt, int callstack ) #endif if( callstack != 0 ) { - InitRPMallocThread(); tracy::GetProfiler().SendCallstack( callstack ); } - TracyLfqPrepare( callstack == 0 ? QueueType::MessageLiteral : QueueType::MessageLiteralCallstack ); + TracyQueuePrepare( callstack == 0 ? QueueType::MessageLiteral : QueueType::MessageLiteralCallstack ); MemWrite( &item->messageLiteral.time, GetTime() ); MemWrite( &item->messageLiteral.text, (uint64_t)txt ); - TracyLfqCommit; + TracyQueueCommit( messageLiteralThread ); } void Profiler::MessageColor( const char* txt, size_t size, uint32_t color, int callstack ) @@ -3036,20 +3871,20 @@ void Profiler::MessageColor( const char* txt, size_t size, uint32_t color, int c #endif if( callstack != 0 ) { - InitRPMallocThread(); tracy::GetProfiler().SendCallstack( callstack ); } - TracyLfqPrepare( callstack == 0 ? QueueType::MessageColor : QueueType::MessageColorCallstack ); auto ptr = (char*)tracy_malloc( size ); memcpy( ptr, txt, size ); + + TracyQueuePrepare( callstack == 0 ? QueueType::MessageColor : QueueType::MessageColorCallstack ); MemWrite( &item->messageColorFat.time, GetTime() ); MemWrite( &item->messageColorFat.text, (uint64_t)ptr ); MemWrite( &item->messageColorFat.r, uint8_t( ( color ) & 0xFF ) ); MemWrite( &item->messageColorFat.g, uint8_t( ( color >> 8 ) & 0xFF ) ); MemWrite( &item->messageColorFat.b, uint8_t( ( color >> 16 ) & 0xFF ) ); MemWrite( &item->messageColorFat.size, (uint16_t)size ); - TracyLfqCommit; + TracyQueueCommit( messageColorFatThread ); } void Profiler::MessageColor( const char* txt, uint32_t color, int callstack ) @@ -3059,23 +3894,21 @@ void Profiler::MessageColor( const char* txt, uint32_t color, int callstack ) #endif if( callstack != 0 ) { - InitRPMallocThread(); tracy::GetProfiler().SendCallstack( callstack ); } - TracyLfqPrepare( callstack == 0 ? QueueType::MessageLiteralColor : QueueType::MessageLiteralColorCallstack ); + TracyQueuePrepare( callstack == 0 ? 
QueueType::MessageLiteralColor : QueueType::MessageLiteralColorCallstack ); MemWrite( &item->messageColorLiteral.time, GetTime() ); MemWrite( &item->messageColorLiteral.text, (uint64_t)txt ); MemWrite( &item->messageColorLiteral.r, uint8_t( ( color ) & 0xFF ) ); MemWrite( &item->messageColorLiteral.g, uint8_t( ( color >> 8 ) & 0xFF ) ); MemWrite( &item->messageColorLiteral.b, uint8_t( ( color >> 16 ) & 0xFF ) ); - TracyLfqCommit; + TracyQueueCommit( messageColorLiteralThread ); } void Profiler::MessageAppInfo( const char* txt, size_t size ) { assert( size < std::numeric_limits::max() ); - InitRPMallocThread(); auto ptr = (char*)tracy_malloc( size ); memcpy( ptr, txt, size ); TracyLfqPrepare( QueueType::MessageAppInfo ); @@ -3126,7 +3959,6 @@ void Profiler::MemAllocCallstack( const void* ptr, size_t size, int depth, bool # endif const auto thread = GetThreadHandle(); - InitRPMallocThread(); auto callstack = Callstack( depth ); profiler.m_serialLock.lock(); @@ -3134,6 +3966,7 @@ void Profiler::MemAllocCallstack( const void* ptr, size_t size, int depth, bool SendMemAlloc( QueueType::MemAllocCallstack, thread, ptr, size ); profiler.m_serialLock.unlock(); #else + static_cast(depth); // unused MemAlloc( ptr, size, secure ); #endif } @@ -3141,6 +3974,11 @@ void Profiler::MemAllocCallstack( const void* ptr, size_t size, int depth, bool void Profiler::MemFreeCallstack( const void* ptr, int depth, bool secure ) { if( secure && !ProfilerAvailable() ) return; + if( !ProfilerAllocatorAvailable() ) + { + MemFree( ptr, secure ); + return; + } #ifdef TRACY_HAS_CALLSTACK auto& profiler = GetProfiler(); # ifdef TRACY_ON_DEMAND @@ -3148,7 +3986,6 @@ void Profiler::MemFreeCallstack( const void* ptr, int depth, bool secure ) # endif const auto thread = GetThreadHandle(); - InitRPMallocThread(); auto callstack = Callstack( depth ); profiler.m_serialLock.lock(); @@ -3156,6 +3993,7 @@ void Profiler::MemFreeCallstack( const void* ptr, int depth, bool secure ) SendMemFree( QueueType::MemFreeCallstack, thread, ptr ); profiler.m_serialLock.unlock(); #else + static_cast(depth); // unused MemFree( ptr, secure ); #endif } @@ -3198,7 +4036,6 @@ void Profiler::MemAllocCallstackNamed( const void* ptr, size_t size, int depth, # endif const auto thread = GetThreadHandle(); - InitRPMallocThread(); auto callstack = Callstack( depth ); profiler.m_serialLock.lock(); @@ -3207,6 +4044,8 @@ void Profiler::MemAllocCallstackNamed( const void* ptr, size_t size, int depth, SendMemAlloc( QueueType::MemAllocCallstackNamed, thread, ptr, size ); profiler.m_serialLock.unlock(); #else + static_cast(depth); // unused + static_cast(name); // unused MemAlloc( ptr, size, secure ); #endif } @@ -3221,7 +4060,6 @@ void Profiler::MemFreeCallstackNamed( const void* ptr, int depth, bool secure, c # endif const auto thread = GetThreadHandle(); - InitRPMallocThread(); auto callstack = Callstack( depth ); profiler.m_serialLock.lock(); @@ -3230,6 +4068,8 @@ void Profiler::MemFreeCallstackNamed( const void* ptr, int depth, bool secure, c SendMemFree( QueueType::MemFreeCallstackNamed, thread, ptr ); profiler.m_serialLock.unlock(); #else + static_cast(depth); // unused + static_cast(name); // unused MemFree( ptr, secure ); #endif } @@ -3238,13 +4078,22 @@ void Profiler::SendCallstack( int depth ) { #ifdef TRACY_HAS_CALLSTACK auto ptr = Callstack( depth ); - TracyLfqPrepare( QueueType::Callstack ); + TracyQueuePrepare( QueueType::Callstack ); MemWrite( &item->callstackFat.ptr, (uint64_t)ptr ); - TracyLfqCommit; + TracyQueueCommit( callstackFatThread ); +#else 
+ static_cast(depth); // unused #endif } void Profiler::ParameterRegister( ParameterCallback cb ) { GetProfiler().m_paramCallback = cb; } +void Profiler::ParameterRegister( ParameterCallback cb, void* data ) +{ + auto& profiler = GetProfiler(); + profiler.m_paramCallback = cb; + profiler.m_paramCallbackData = data; +} + void Profiler::ParameterSetup( uint32_t idx, const char* name, bool isBool, int32_t val ) { TracyLfqPrepare( QueueType::ParamSetup ); @@ -3263,11 +4112,12 @@ void Profiler::ParameterSetup( uint32_t idx, const char* name, bool isBool, int3 void Profiler::SendCallstack( int depth, const char* skipBefore ) { #ifdef TRACY_HAS_CALLSTACK - TracyLfqPrepare( QueueType::Callstack ); auto ptr = Callstack( depth ); CutCallstack( ptr, skipBefore ); + + TracyQueuePrepare( QueueType::Callstack ); MemWrite( &item->callstackFat.ptr, (uint64_t)ptr ); - TracyLfqCommit; + TracyQueueCommit( callstackFatThread ); #endif } @@ -3322,246 +4172,126 @@ void Profiler::HandleParameter( uint64_t payload ) assert( m_paramCallback ); const auto idx = uint32_t( payload >> 32 ); const auto val = int32_t( payload & 0xFFFFFFFF ); - m_paramCallback( idx, val ); + m_paramCallback( m_paramCallbackData, idx, val ); AckServerQuery(); } -#ifdef __ANDROID__ -// Implementation helpers of EnsureReadable(address). -// This is so far only needed on Android, where it is common for libraries to be mapped -// with only executable, not readable, permissions. Typical example (line from /proc/self/maps): -/* -746b63b000-746b6dc000 --xp 00042000 07:48 35 /apex/com.android.runtime/lib64/bionic/libc.so -*/ -// See https://github.com/wolfpld/tracy/issues/125 . -// To work around this, we parse /proc/self/maps and we use mprotect to set read permissions -// on any mappings that contain symbols addresses hit by HandleSymbolCodeQuery. - -namespace { -// Holds some information about a single memory mapping. -struct MappingInfo { - // Start of address range. Inclusive. - uintptr_t start_address; - // End of address range. Exclusive, so the mapping is the half-open interval - // [start, end) and its length in bytes is `end - start`. As in /proc/self/maps. - uintptr_t end_address; - // Read/Write/Executable permissions. - bool perm_r, perm_w, perm_x; -}; -} // anonymous namespace - -// Internal implementation helper for LookUpMapping(address). -// -// Parses /proc/self/maps returning a vector. -// /proc/self/maps is assumed to be sorted by ascending address, so the resulting -// vector is sorted by ascending address too. -static std::vector ParseMappings() -{ - std::vector result; - FILE* file = fopen( "/proc/self/maps", "r" ); - if( !file ) return result; - char line[1024]; - while( fgets( line, sizeof( line ), file ) ) - { - uintptr_t start_addr; - uintptr_t end_addr; - if( sscanf( line, "%lx-%lx", &start_addr, &end_addr ) != 2 ) continue; - char* first_space = strchr( line, ' ' ); - if( !first_space ) continue; - char* perm = first_space + 1; - char* second_space = strchr( perm, ' ' ); - if( !second_space || second_space - perm != 4 ) continue; - result.emplace_back(); - auto& mapping = result.back(); - mapping.start_address = start_addr; - mapping.end_address = end_addr; - mapping.perm_r = perm[0] == 'r'; - mapping.perm_w = perm[1] == 'w'; - mapping.perm_x = perm[2] == 'x'; - } - fclose( file ); - return result; -} - -// Internal implementation helper for LookUpMapping(address). -// -// Takes as input an `address` and a known vector `mappings`, assumed to be -// sorted by increasing addresses, as /proc/self/maps seems to be. 
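For reference, a standalone sketch of the technique the comments above describe: parse /proc/self/maps, find the mapping that contains a given address, and use mprotect() to add read permission to an executable-only mapping. This is illustrative only and not part of the patch; the names Mapping, ParseSelfMaps and MakeReadable are hypothetical, and unlike the real helpers it re-parses the maps file on every call instead of caching the result.

#include <cstdio>
#include <cstdint>
#include <vector>
#include <sys/mman.h>

struct Mapping { uintptr_t start, end; bool r, w, x; };

// Parse /proc/self/maps into half-open [start, end) ranges with their permissions.
static std::vector<Mapping> ParseSelfMaps()
{
    std::vector<Mapping> out;
    FILE* f = std::fopen( "/proc/self/maps", "r" );
    if( !f ) return out;
    char line[1024];
    while( std::fgets( line, sizeof( line ), f ) )
    {
        unsigned long start, end;
        char perm[5] = {};
        if( std::sscanf( line, "%lx-%lx %4s", &start, &end, perm ) != 3 ) continue;
        out.push_back( { (uintptr_t)start, (uintptr_t)end, perm[0] == 'r', perm[1] == 'w', perm[2] == 'x' } );
    }
    std::fclose( f );
    return out;
}

// Add PROT_READ to the mapping containing addr, preserving its other permissions.
static bool MakeReadable( uintptr_t addr )
{
    for( const auto& m : ParseSelfMaps() )
    {
        if( addr < m.start || addr >= m.end ) continue;
        if( m.r ) return true;
        const int prot = PROT_READ | ( m.w ? PROT_WRITE : 0 ) | ( m.x ? PROT_EXEC : 0 );
        return mprotect( (void*)m.start, m.end - m.start, prot ) == 0;
    }
    return false;
}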
-// Returns a pointer to the MappingInfo describing the mapping that this -// address belongs to, or nullptr if the address isn't in `mappings`. -static MappingInfo* LookUpMapping(std::vector& mappings, uintptr_t address) -{ - // Comparison function for std::lower_bound. Returns true if all addresses in `m1` - // are lower than `addr`. - auto Compare = []( const MappingInfo& m1, uintptr_t addr ) { - // '<=' because the address ranges are half-open intervals, [start, end). - return m1.end_address <= addr; - }; - auto iter = std::lower_bound( mappings.begin(), mappings.end(), address, Compare ); - if( iter == mappings.end() || iter->start_address > address) { - return nullptr; - } - return &*iter; -} - -// Internal implementation helper for EnsureReadable(address). -// -// Takes as input an `address` and returns a pointer to a MappingInfo -// describing the mapping that this address belongs to, or nullptr if -// the address isn't in any known mapping. -// -// This function is stateful and not reentrant (assumes to be called from -// only one thread). It holds a vector of mappings parsed from /proc/self/maps. -// -// Attempts to react to mappings changes by re-parsing /proc/self/maps. -static MappingInfo* LookUpMapping(uintptr_t address) -{ - // Static state managed by this function. Not constant, we mutate that state as - // we turn some mappings readable. Initially parsed once here, updated as needed below. - static std::vector s_mappings = ParseMappings(); - MappingInfo* mapping = LookUpMapping( s_mappings, address ); - if( mapping ) return mapping; - - // This address isn't in any known mapping. Try parsing again, maybe - // mappings changed. - s_mappings = ParseMappings(); - return LookUpMapping( s_mappings, address ); -} - -// Internal implementation helper for EnsureReadable(address). -// -// Attempts to make the specified `mapping` readable if it isn't already. -// Returns true if and only if the mapping is readable. -static bool EnsureReadable( MappingInfo& mapping ) -{ - if( mapping.perm_r ) - { - // The mapping is already readable. - return true; - } - int prot = PROT_READ; - if( mapping.perm_w ) prot |= PROT_WRITE; - if( mapping.perm_x ) prot |= PROT_EXEC; - if( mprotect( reinterpret_cast( mapping.start_address ), - mapping.end_address - mapping.start_address, prot ) == -1 ) - { - // Failed to make the mapping readable. Shouldn't happen, hasn't - // been observed yet. If it happened in practice, we should consider - // adding a bool to MappingInfo to track this to avoid retrying mprotect - // everytime on such mappings. - return false; - } - // The mapping is now readable. Update `mapping` so the next call will be fast. - mapping.perm_r = true; - return true; -} - -// Attempts to set the read permission on the entire mapping containing the -// specified address. Returns true if and only if the mapping is now readable. -static bool EnsureReadable( uintptr_t address ) -{ - MappingInfo* mapping = LookUpMapping(address); - return mapping && EnsureReadable( *mapping ); -} - -#endif // defined __ANDROID__ - -void Profiler::HandleSymbolQuery( uint64_t symbol ) -{ -#ifdef TRACY_HAS_CALLSTACK -#ifdef __ANDROID__ - // On Android it's common for code to be in mappings that are only executable - // but not readable. 
- if( !EnsureReadable( symbol ) ) - { - return; - } -#endif - const auto sym = DecodeSymbolAddress( symbol ); - - SendSingleString( sym.file ); - - QueueItem item; - MemWrite( &item.hdr.type, QueueType::SymbolInformation ); - MemWrite( &item.symbolInformation.line, sym.line ); - MemWrite( &item.symbolInformation.symAddr, symbol ); - - AppendData( &item, QueueDataSize[(int)QueueType::SymbolInformation] ); - - if( sym.needFree ) tracy_free( (void*)sym.file ); -#endif -} - void Profiler::HandleSymbolCodeQuery( uint64_t symbol, uint32_t size ) { -#ifdef __ANDROID__ - // On Android it's common for code to be in mappings that are only executable - // but not readable. - if( !EnsureReadable( symbol ) ) + if( symbol >> 63 != 0 ) { - return; + QueueKernelCode( symbol, size ); } + else + { +#ifdef __ANDROID__ + // On Android it's common for code to be in mappings that are only executable + // but not readable. + if( !EnsureReadable( symbol ) ) + { + AckSymbolCodeNotAvailable(); + return; + } #endif - SendLongString( symbol, (const char*)symbol, size, QueueType::SymbolCode ); + SendLongString( symbol, (const char*)symbol, size, QueueType::SymbolCode ); + } } -void Profiler::HandleSourceCodeQuery() +void Profiler::HandleSourceCodeQuery( char* data, char* image, uint32_t id ) { - assert( m_exectime != 0 ); - assert( m_queryData ); - + bool ok = false; struct stat st; - if( stat( m_queryData, &st ) == 0 && (uint64_t)st.st_mtime < m_exectime && st.st_size < ( TargetFrameSize - 16 ) ) + if( stat( data, &st ) == 0 && (uint64_t)st.st_mtime < m_exectime ) { - FILE* f = fopen( m_queryData, "rb" ); - tracy_free( m_queryData ); - if( f ) + if( st.st_size < ( TargetFrameSize - 16 ) ) { - auto ptr = (char*)tracy_malloc( st.st_size ); - auto rd = fread( ptr, 1, st.st_size, f ); - fclose( f ); - if( rd == (size_t)st.st_size ) + FILE* f = fopen( data, "rb" ); + if( f ) { - SendLongString( (uint64_t)ptr, ptr, rd, QueueType::SourceCode ); + auto ptr = (char*)tracy_malloc_fast( st.st_size ); + auto rd = fread( ptr, 1, st.st_size, f ); + fclose( f ); + if( rd == (size_t)st.st_size ) + { + TracyLfqPrepare( QueueType::SourceCodeMetadata ); + MemWrite( &item->sourceCodeMetadata.ptr, (uint64_t)ptr ); + MemWrite( &item->sourceCodeMetadata.size, (uint32_t)rd ); + MemWrite( &item->sourceCodeMetadata.id, id ); + TracyLfqCommit; + ok = true; + } } - else - { - AckSourceCodeNotAvailable(); - } - tracy_free( ptr ); } - else + } + +#ifdef TRACY_DEBUGINFOD + else if( image && data[0] == '/' ) + { + size_t size; + auto buildid = GetBuildIdForImage( image, size ); + if( buildid ) { - AckSourceCodeNotAvailable(); + auto d = debuginfod_find_source( GetDebuginfodClient(), buildid, size, data, nullptr ); + TracyDebug( "DebugInfo source query: %s, fn: %s, image: %s\n", d >= 0 ? 
" ok " : "fail", data, image ); + if( d >= 0 ) + { + struct stat st; + fstat( d, &st ); + if( st.st_size < ( TargetFrameSize - 16 ) ) + { + lseek( d, 0, SEEK_SET ); + auto ptr = (char*)tracy_malloc_fast( st.st_size ); + auto rd = read( d, ptr, st.st_size ); + if( rd == (size_t)st.st_size ) + { + TracyLfqPrepare( QueueType::SourceCodeMetadata ); + MemWrite( &item->sourceCodeMetadata.ptr, (uint64_t)ptr ); + MemWrite( &item->sourceCodeMetadata.size, (uint32_t)rd ); + MemWrite( &item->sourceCodeMetadata.id, id ); + TracyLfqCommit; + ok = true; + } + } + close( d ); + } } } else { - tracy_free( m_queryData ); - AckSourceCodeNotAvailable(); + TracyDebug( "DebugInfo invalid query fn: %s, image: %s\n", data, image ); } - m_queryData = nullptr; -} - -void Profiler::SendCodeLocation( uint64_t ptr ) -{ -#ifdef TRACY_HAS_CALLSTACK - const auto sym = DecodeCodeAddress( ptr ); - - SendSingleString( sym.file ); - - QueueItem item; - MemWrite( &item.hdr.type, QueueType::CodeInformation ); - MemWrite( &item.codeInformation.ptr, ptr ); - MemWrite( &item.codeInformation.line, sym.line ); - - AppendData( &item, QueueDataSize[(int)QueueType::CodeInformation] ); - - if( sym.needFree ) tracy_free( (void*)sym.file ); #endif + + if( !ok && m_sourceCallback ) + { + size_t sz; + char* ptr = m_sourceCallback( m_sourceCallbackData, data, sz ); + if( ptr ) + { + if( sz < ( TargetFrameSize - 16 ) ) + { + TracyLfqPrepare( QueueType::SourceCodeMetadata ); + MemWrite( &item->sourceCodeMetadata.ptr, (uint64_t)ptr ); + MemWrite( &item->sourceCodeMetadata.size, (uint32_t)sz ); + MemWrite( &item->sourceCodeMetadata.id, id ); + TracyLfqCommit; + ok = true; + } + } + } + + if( !ok ) + { + TracyLfqPrepare( QueueType::AckSourceCodeNotAvailable ); + MemWrite( &item->sourceCodeNotAvailable, id ); + TracyLfqCommit; + } + + tracy_free_fast( data ); + tracy_free_fast( image ); } -#if ( defined _WIN32 || defined __CYGWIN__ ) && defined TRACY_TIMER_QPC +#if defined _WIN32 && defined TRACY_TIMER_QPC int64_t Profiler::GetTimeQpc() { LARGE_INTEGER t; @@ -3572,4 +4302,486 @@ int64_t Profiler::GetTimeQpc() } +#if 0 +#ifdef __cplusplus +extern "C" { #endif + +TRACY_API TracyCZoneCtx ___tracy_emit_zone_begin( const struct ___tracy_source_location_data* srcloc, int active ) +{ + ___tracy_c_zone_context ctx; +#ifdef TRACY_ON_DEMAND + ctx.active = active && tracy::GetProfiler().IsConnected(); +#else + ctx.active = active; +#endif + if( !ctx.active ) return ctx; + const auto id = tracy::GetProfiler().GetNextZoneId(); + ctx.id = id; + +#ifndef TRACY_NO_VERIFY + { + TracyQueuePrepareC( tracy::QueueType::ZoneValidation ); + tracy::MemWrite( &item->zoneValidation.id, id ); + TracyQueueCommitC( zoneValidationThread ); + } +#endif + { + TracyQueuePrepareC( tracy::QueueType::ZoneBegin ); + tracy::MemWrite( &item->zoneBegin.time, tracy::Profiler::GetTime() ); + tracy::MemWrite( &item->zoneBegin.srcloc, (uint64_t)srcloc ); + TracyQueueCommitC( zoneBeginThread ); + } + return ctx; +} + +TRACY_API TracyCZoneCtx ___tracy_emit_zone_begin_callstack( const struct ___tracy_source_location_data* srcloc, int depth, int active ) +{ + ___tracy_c_zone_context ctx; +#ifdef TRACY_ON_DEMAND + ctx.active = active && tracy::GetProfiler().IsConnected(); +#else + ctx.active = active; +#endif + if( !ctx.active ) return ctx; + const auto id = tracy::GetProfiler().GetNextZoneId(); + ctx.id = id; + +#ifndef TRACY_NO_VERIFY + { + TracyQueuePrepareC( tracy::QueueType::ZoneValidation ); + tracy::MemWrite( &item->zoneValidation.id, id ); + TracyQueueCommitC( zoneValidationThread ); + 
} +#endif + tracy::GetProfiler().SendCallstack( depth ); + { + TracyQueuePrepareC( tracy::QueueType::ZoneBeginCallstack ); + tracy::MemWrite( &item->zoneBegin.time, tracy::Profiler::GetTime() ); + tracy::MemWrite( &item->zoneBegin.srcloc, (uint64_t)srcloc ); + TracyQueueCommitC( zoneBeginThread ); + } + return ctx; +} + +TRACY_API TracyCZoneCtx ___tracy_emit_zone_begin_alloc( uint64_t srcloc, int active ) +{ + ___tracy_c_zone_context ctx; +#ifdef TRACY_ON_DEMAND + ctx.active = active && tracy::GetProfiler().IsConnected(); +#else + ctx.active = active; +#endif + if( !ctx.active ) + { + tracy::tracy_free( (void*)srcloc ); + return ctx; + } + const auto id = tracy::GetProfiler().GetNextZoneId(); + ctx.id = id; + +#ifndef TRACY_NO_VERIFY + { + TracyQueuePrepareC( tracy::QueueType::ZoneValidation ); + tracy::MemWrite( &item->zoneValidation.id, id ); + TracyQueueCommitC( zoneValidationThread ); + } +#endif + { + TracyQueuePrepareC( tracy::QueueType::ZoneBeginAllocSrcLoc ); + tracy::MemWrite( &item->zoneBegin.time, tracy::Profiler::GetTime() ); + tracy::MemWrite( &item->zoneBegin.srcloc, srcloc ); + TracyQueueCommitC( zoneBeginThread ); + } + return ctx; +} + +TRACY_API TracyCZoneCtx ___tracy_emit_zone_begin_alloc_callstack( uint64_t srcloc, int depth, int active ) +{ + ___tracy_c_zone_context ctx; +#ifdef TRACY_ON_DEMAND + ctx.active = active && tracy::GetProfiler().IsConnected(); +#else + ctx.active = active; +#endif + if( !ctx.active ) + { + tracy::tracy_free( (void*)srcloc ); + return ctx; + } + const auto id = tracy::GetProfiler().GetNextZoneId(); + ctx.id = id; + +#ifndef TRACY_NO_VERIFY + { + TracyQueuePrepareC( tracy::QueueType::ZoneValidation ); + tracy::MemWrite( &item->zoneValidation.id, id ); + TracyQueueCommitC( zoneValidationThread ); + } +#endif + tracy::GetProfiler().SendCallstack( depth ); + { + TracyQueuePrepareC( tracy::QueueType::ZoneBeginAllocSrcLocCallstack ); + tracy::MemWrite( &item->zoneBegin.time, tracy::Profiler::GetTime() ); + tracy::MemWrite( &item->zoneBegin.srcloc, srcloc ); + TracyQueueCommitC( zoneBeginThread ); + } + return ctx; +} + +TRACY_API void ___tracy_emit_zone_end( TracyCZoneCtx ctx ) +{ + if( !ctx.active ) return; +#ifndef TRACY_NO_VERIFY + { + TracyQueuePrepareC( tracy::QueueType::ZoneValidation ); + tracy::MemWrite( &item->zoneValidation.id, ctx.id ); + TracyQueueCommitC( zoneValidationThread ); + } +#endif + { + TracyQueuePrepareC( tracy::QueueType::ZoneEnd ); + tracy::MemWrite( &item->zoneEnd.time, tracy::Profiler::GetTime() ); + TracyQueueCommitC( zoneEndThread ); + } +} + +TRACY_API void ___tracy_emit_zone_text( TracyCZoneCtx ctx, const char* txt, size_t size ) +{ + assert( size < std::numeric_limits::max() ); + if( !ctx.active ) return; + auto ptr = (char*)tracy::tracy_malloc( size ); + memcpy( ptr, txt, size ); +#ifndef TRACY_NO_VERIFY + { + TracyQueuePrepareC( tracy::QueueType::ZoneValidation ); + tracy::MemWrite( &item->zoneValidation.id, ctx.id ); + TracyQueueCommitC( zoneValidationThread ); + } +#endif + { + TracyQueuePrepareC( tracy::QueueType::ZoneText ); + tracy::MemWrite( &item->zoneTextFat.text, (uint64_t)ptr ); + tracy::MemWrite( &item->zoneTextFat.size, (uint16_t)size ); + TracyQueueCommitC( zoneTextFatThread ); + } +} + +TRACY_API void ___tracy_emit_zone_name( TracyCZoneCtx ctx, const char* txt, size_t size ) +{ + assert( size < std::numeric_limits::max() ); + if( !ctx.active ) return; + auto ptr = (char*)tracy::tracy_malloc( size ); + memcpy( ptr, txt, size ); +#ifndef TRACY_NO_VERIFY + { + TracyQueuePrepareC( 
tracy::QueueType::ZoneValidation ); + tracy::MemWrite( &item->zoneValidation.id, ctx.id ); + TracyQueueCommitC( zoneValidationThread ); + } +#endif + { + TracyQueuePrepareC( tracy::QueueType::ZoneName ); + tracy::MemWrite( &item->zoneTextFat.text, (uint64_t)ptr ); + tracy::MemWrite( &item->zoneTextFat.size, (uint16_t)size ); + TracyQueueCommitC( zoneTextFatThread ); + } +} + +TRACY_API void ___tracy_emit_zone_color( TracyCZoneCtx ctx, uint32_t color ) { + if( !ctx.active ) return; +#ifndef TRACY_NO_VERIFY + { + TracyQueuePrepareC( tracy::QueueType::ZoneValidation ); + tracy::MemWrite( &item->zoneValidation.id, ctx.id ); + TracyQueueCommitC( zoneValidationThread ); + } +#endif + { + TracyQueuePrepareC( tracy::QueueType::ZoneColor ); + tracy::MemWrite( &item->zoneColor.r, uint8_t( ( color ) & 0xFF ) ); + tracy::MemWrite( &item->zoneColor.g, uint8_t( ( color >> 8 ) & 0xFF ) ); + tracy::MemWrite( &item->zoneColor.b, uint8_t( ( color >> 16 ) & 0xFF ) ); + TracyQueueCommitC( zoneColorThread ); + } +} + +TRACY_API void ___tracy_emit_zone_value( TracyCZoneCtx ctx, uint64_t value ) +{ + if( !ctx.active ) return; +#ifndef TRACY_NO_VERIFY + { + TracyQueuePrepareC( tracy::QueueType::ZoneValidation ); + tracy::MemWrite( &item->zoneValidation.id, ctx.id ); + TracyQueueCommitC( zoneValidationThread ); + } +#endif + { + TracyQueuePrepareC( tracy::QueueType::ZoneValue ); + tracy::MemWrite( &item->zoneValue.value, value ); + TracyQueueCommitC( zoneValueThread ); + } +} + +TRACY_API void ___tracy_emit_memory_alloc( const void* ptr, size_t size, int secure ) { tracy::Profiler::MemAlloc( ptr, size, secure != 0 ); } +TRACY_API void ___tracy_emit_memory_alloc_callstack( const void* ptr, size_t size, int depth, int secure ) { tracy::Profiler::MemAllocCallstack( ptr, size, depth, secure != 0 ); } +TRACY_API void ___tracy_emit_memory_free( const void* ptr, int secure ) { tracy::Profiler::MemFree( ptr, secure != 0 ); } +TRACY_API void ___tracy_emit_memory_free_callstack( const void* ptr, int depth, int secure ) { tracy::Profiler::MemFreeCallstack( ptr, depth, secure != 0 ); } +TRACY_API void ___tracy_emit_memory_alloc_named( const void* ptr, size_t size, int secure, const char* name ) { tracy::Profiler::MemAllocNamed( ptr, size, secure != 0, name ); } +TRACY_API void ___tracy_emit_memory_alloc_callstack_named( const void* ptr, size_t size, int depth, int secure, const char* name ) { tracy::Profiler::MemAllocCallstackNamed( ptr, size, depth, secure != 0, name ); } +TRACY_API void ___tracy_emit_memory_free_named( const void* ptr, int secure, const char* name ) { tracy::Profiler::MemFreeNamed( ptr, secure != 0, name ); } +TRACY_API void ___tracy_emit_memory_free_callstack_named( const void* ptr, int depth, int secure, const char* name ) { tracy::Profiler::MemFreeCallstackNamed( ptr, depth, secure != 0, name ); } +TRACY_API void ___tracy_emit_frame_mark( const char* name ) { tracy::Profiler::SendFrameMark( name ); } +TRACY_API void ___tracy_emit_frame_mark_start( const char* name ) { tracy::Profiler::SendFrameMark( name, tracy::QueueType::FrameMarkMsgStart ); } +TRACY_API void ___tracy_emit_frame_mark_end( const char* name ) { tracy::Profiler::SendFrameMark( name, tracy::QueueType::FrameMarkMsgEnd ); } +TRACY_API void ___tracy_emit_frame_image( const void* image, uint16_t w, uint16_t h, uint8_t offset, int flip ) { tracy::Profiler::SendFrameImage( image, w, h, offset, flip ); } +TRACY_API void ___tracy_emit_plot( const char* name, double val ) { tracy::Profiler::PlotData( name, val ); } +TRACY_API void 
___tracy_emit_message( const char* txt, size_t size, int callstack ) { tracy::Profiler::Message( txt, size, callstack ); } +TRACY_API void ___tracy_emit_messageL( const char* txt, int callstack ) { tracy::Profiler::Message( txt, callstack ); } +TRACY_API void ___tracy_emit_messageC( const char* txt, size_t size, uint32_t color, int callstack ) { tracy::Profiler::MessageColor( txt, size, color, callstack ); } +TRACY_API void ___tracy_emit_messageLC( const char* txt, uint32_t color, int callstack ) { tracy::Profiler::MessageColor( txt, color, callstack ); } +TRACY_API void ___tracy_emit_message_appinfo( const char* txt, size_t size ) { tracy::Profiler::MessageAppInfo( txt, size ); } + +TRACY_API uint64_t ___tracy_alloc_srcloc( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz ) { + return tracy::Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz ); +} + +TRACY_API uint64_t ___tracy_alloc_srcloc_name( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz ) { + return tracy::Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz ); +} + +TRACY_API void ___tracy_emit_gpu_zone_begin( const struct ___tracy_gpu_zone_begin_data data ) +{ + TracyLfqPrepareC( tracy::QueueType::GpuZoneBegin ); + tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() ); + tracy::MemWrite( &item->gpuNewContext.thread, tracy::GetThreadHandle() ); + tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc ); + tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId ); + tracy::MemWrite( &item->gpuZoneBegin.context, data.context ); + TracyLfqCommitC; +} + +TRACY_API void ___tracy_emit_gpu_zone_begin_callstack( const struct ___tracy_gpu_zone_begin_callstack_data data ) +{ + tracy::GetProfiler().SendCallstack( data.depth ); + TracyLfqPrepareC( tracy::QueueType::GpuZoneBeginCallstack ); + tracy::MemWrite( &item->gpuZoneBegin.thread, tracy::GetThreadHandle() ); + tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() ); + tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId ); + tracy::MemWrite( &item->gpuZoneBegin.context, data.context ); + tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc ); + TracyLfqCommitC; +} + +TRACY_API void ___tracy_emit_gpu_zone_begin_alloc( const struct ___tracy_gpu_zone_begin_data data ) +{ + TracyLfqPrepareC( tracy::QueueType::GpuZoneBeginAllocSrcLoc ); + tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() ); + tracy::MemWrite( &item->gpuNewContext.thread, tracy::GetThreadHandle() ); + tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc ); + tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId ); + tracy::MemWrite( &item->gpuZoneBegin.context, data.context ); + TracyLfqCommitC; +} + +TRACY_API void ___tracy_emit_gpu_zone_begin_alloc_callstack( const struct ___tracy_gpu_zone_begin_callstack_data data ) +{ + tracy::GetProfiler().SendCallstack( data.depth ); + TracyLfqPrepareC( tracy::QueueType::GpuZoneBeginAllocSrcLocCallstack ); + tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() ); + tracy::MemWrite( &item->gpuNewContext.thread, tracy::GetThreadHandle() ); + tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc ); + tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId ); + tracy::MemWrite( &item->gpuZoneBegin.context, data.context ); + TracyLfqCommitC; +} + +TRACY_API void ___tracy_emit_gpu_time( const struct 
___tracy_gpu_time_data data ) +{ + TracyLfqPrepareC( tracy::QueueType::GpuTime ); + tracy::MemWrite( &item->gpuTime.gpuTime, data.gpuTime ); + tracy::MemWrite( &item->gpuTime.queryId, data.queryId ); + tracy::MemWrite( &item->gpuTime.context, data.context ); + TracyLfqCommitC; +} + +TRACY_API void ___tracy_emit_gpu_zone_end( const struct ___tracy_gpu_zone_end_data data ) +{ + TracyLfqPrepareC( tracy::QueueType::GpuZoneEnd ); + tracy::MemWrite( &item->gpuZoneEnd.cpuTime, tracy::Profiler::GetTime() ); + memset( &item->gpuZoneEnd.thread, 0, sizeof( item->gpuZoneEnd.thread ) ); + tracy::MemWrite( &item->gpuZoneEnd.queryId, data.queryId ); + tracy::MemWrite( &item->gpuZoneEnd.context, data.context ); + TracyLfqCommitC; +} + +TRACY_API void ___tracy_emit_gpu_new_context( ___tracy_gpu_new_context_data data ) +{ + TracyLfqPrepareC( tracy::QueueType::GpuNewContext ); + tracy::MemWrite( &item->gpuNewContext.cpuTime, tracy::Profiler::GetTime() ); + tracy::MemWrite( &item->gpuNewContext.thread, tracy::GetThreadHandle() ); + tracy::MemWrite( &item->gpuNewContext.gpuTime, data.gpuTime ); + tracy::MemWrite( &item->gpuNewContext.period, data.period ); + tracy::MemWrite( &item->gpuNewContext.context, data.context ); + tracy::MemWrite( &item->gpuNewContext.flags, data.flags ); + tracy::MemWrite( &item->gpuNewContext.type, data.type ); + TracyLfqCommitC; +} + +TRACY_API void ___tracy_emit_gpu_context_name( const struct ___tracy_gpu_context_name_data data ) +{ + auto ptr = (char*)tracy::tracy_malloc( data.len ); + memcpy( ptr, data.name, data.len ); + + TracyLfqPrepareC( tracy::QueueType::GpuContextName ); + tracy::MemWrite( &item->gpuContextNameFat.context, data.context ); + tracy::MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)ptr ); + tracy::MemWrite( &item->gpuContextNameFat.size, data.len ); + TracyLfqCommitC; +} + +TRACY_API void ___tracy_emit_gpu_calibration( const struct ___tracy_gpu_calibration_data data ) +{ + TracyLfqPrepareC( tracy::QueueType::GpuCalibration ); + tracy::MemWrite( &item->gpuCalibration.cpuTime, tracy::Profiler::GetTime() ); + tracy::MemWrite( &item->gpuCalibration.gpuTime, data.gpuTime ); + tracy::MemWrite( &item->gpuCalibration.cpuDelta, data.cpuDelta ); + tracy::MemWrite( &item->gpuCalibration.context, data.context ); + TracyLfqCommitC; +} + +TRACY_API void ___tracy_emit_gpu_zone_begin_serial( const struct ___tracy_gpu_zone_begin_data data ) +{ + auto item = tracy::Profiler::QueueSerial(); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuZoneBeginSerial ); + tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() ); + tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc ); + tracy::MemWrite( &item->gpuZoneBegin.thread, tracy::GetThreadHandle() ); + tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId ); + tracy::MemWrite( &item->gpuZoneBegin.context, data.context ); + tracy::Profiler::QueueSerialFinish(); +} + +TRACY_API void ___tracy_emit_gpu_zone_begin_callstack_serial( const struct ___tracy_gpu_zone_begin_callstack_data data ) +{ + auto item = tracy::Profiler::QueueSerialCallstack( tracy::Callstack( data.depth ) ); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuZoneBeginCallstackSerial ); + tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() ); + tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc ); + tracy::MemWrite( &item->gpuZoneBegin.thread, tracy::GetThreadHandle() ); + tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId ); + tracy::MemWrite( &item->gpuZoneBegin.context, data.context ); + 
tracy::Profiler::QueueSerialFinish(); +} + +TRACY_API void ___tracy_emit_gpu_zone_begin_alloc_serial( const struct ___tracy_gpu_zone_begin_data data ) +{ + auto item = tracy::Profiler::QueueSerial(); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuZoneBeginAllocSrcLocSerial ); + tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() ); + tracy::MemWrite( &item->gpuNewContext.thread, tracy::GetThreadHandle() ); + tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc ); + tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId ); + tracy::MemWrite( &item->gpuZoneBegin.context, data.context ); + tracy::Profiler::QueueSerialFinish(); +} + +TRACY_API void ___tracy_emit_gpu_zone_begin_alloc_callstack_serial( const struct ___tracy_gpu_zone_begin_callstack_data data ) +{ + auto item = tracy::Profiler::QueueSerialCallstack( tracy::Callstack( data.depth ) ); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuZoneBeginAllocSrcLocCallstackSerial ); + tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() ); + tracy::MemWrite( &item->gpuNewContext.thread, tracy::GetThreadHandle() ); + tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc ); + tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId ); + tracy::MemWrite( &item->gpuZoneBegin.context, data.context ); + tracy::Profiler::QueueSerialFinish(); +} + +TRACY_API void ___tracy_emit_gpu_time_serial( const struct ___tracy_gpu_time_data data ) +{ + auto item = tracy::Profiler::QueueSerial(); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuTime ); + tracy::MemWrite( &item->gpuTime.gpuTime, data.gpuTime ); + tracy::MemWrite( &item->gpuTime.queryId, data.queryId ); + tracy::MemWrite( &item->gpuTime.context, data.context ); + tracy::Profiler::QueueSerialFinish(); +} + +TRACY_API void ___tracy_emit_gpu_zone_end_serial( const struct ___tracy_gpu_zone_end_data data ) +{ + auto item = tracy::Profiler::QueueSerial(); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuZoneEndSerial ); + tracy::MemWrite( &item->gpuZoneEnd.cpuTime, tracy::Profiler::GetTime() ); + memset( &item->gpuZoneEnd.thread, 0, sizeof( item->gpuZoneEnd.thread ) ); + tracy::MemWrite( &item->gpuZoneEnd.queryId, data.queryId ); + tracy::MemWrite( &item->gpuZoneEnd.context, data.context ); + tracy::Profiler::QueueSerialFinish(); +} + +TRACY_API void ___tracy_emit_gpu_new_context_serial( ___tracy_gpu_new_context_data data ) +{ + auto item = tracy::Profiler::QueueSerial(); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuNewContext ); + tracy::MemWrite( &item->gpuNewContext.cpuTime, tracy::Profiler::GetTime() ); + tracy::MemWrite( &item->gpuNewContext.thread, tracy::GetThreadHandle() ); + tracy::MemWrite( &item->gpuNewContext.gpuTime, data.gpuTime ); + tracy::MemWrite( &item->gpuNewContext.period, data.period ); + tracy::MemWrite( &item->gpuNewContext.context, data.context ); + tracy::MemWrite( &item->gpuNewContext.flags, data.flags ); + tracy::MemWrite( &item->gpuNewContext.type, data.type ); + tracy::Profiler::QueueSerialFinish(); +} + +TRACY_API void ___tracy_emit_gpu_context_name_serial( const struct ___tracy_gpu_context_name_data data ) +{ + auto ptr = (char*)tracy::tracy_malloc( data.len ); + memcpy( ptr, data.name, data.len ); + + auto item = tracy::Profiler::QueueSerial(); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuContextName ); + tracy::MemWrite( &item->gpuContextNameFat.context, data.context ); + tracy::MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)ptr ); + tracy::MemWrite( 
&item->gpuContextNameFat.size, data.len ); + tracy::Profiler::QueueSerialFinish(); +} + +TRACY_API void ___tracy_emit_gpu_calibration_serial( const struct ___tracy_gpu_calibration_data data ) +{ + auto item = tracy::Profiler::QueueSerial(); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuCalibration ); + tracy::MemWrite( &item->gpuCalibration.cpuTime, tracy::Profiler::GetTime() ); + tracy::MemWrite( &item->gpuCalibration.gpuTime, data.gpuTime ); + tracy::MemWrite( &item->gpuCalibration.cpuDelta, data.cpuDelta ); + tracy::MemWrite( &item->gpuCalibration.context, data.context ); + tracy::Profiler::QueueSerialFinish(); +} + +TRACY_API int ___tracy_connected( void ) +{ + return tracy::GetProfiler().IsConnected(); +} + +#ifdef TRACY_FIBERS +TRACY_API void ___tracy_fiber_enter( const char* fiber ){ tracy::Profiler::EnterFiber( fiber ); } +TRACY_API void ___tracy_fiber_leave( void ){ tracy::Profiler::LeaveFiber(); } +#endif + +# ifdef TRACY_MANUAL_LIFETIME +TRACY_API void ___tracy_startup_profiler( void ) +{ + tracy::StartupProfiler(); +} + +TRACY_API void ___tracy_shutdown_profiler( void ) +{ + tracy::ShutdownProfiler(); +} +# endif + +#ifdef __cplusplus +} +#endif +#endif + + diff --git a/Source/ThirdParty/tracy/client/TracyProfiler.hpp b/Source/ThirdParty/tracy/client/TracyProfiler.hpp index cdd6154b6..99ae63e4f 100644 --- a/Source/ThirdParty/tracy/client/TracyProfiler.hpp +++ b/Source/ThirdParty/tracy/client/TracyProfiler.hpp @@ -8,6 +8,7 @@ #include #include "tracy_concurrentqueue.h" +#include "tracy_SPSCQueue.h" #include "TracyCallstack.hpp" #include "TracySysTime.hpp" #include "TracyFastVector.hpp" @@ -17,7 +18,7 @@ #include "../common/TracyMutex.hpp" #include "../common/TracyProtocol.hpp" -#if defined _WIN32 || defined __CYGWIN__ +#if defined _WIN32 # include #endif #ifdef __APPLE__ @@ -25,11 +26,15 @@ # include #endif -#if !defined TRACY_TIMER_FALLBACK && ( defined _WIN32 || defined __CYGWIN__ || ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) || ( defined TARGET_OS_IOS && TARGET_OS_IOS == 1 ) ) +#if ( defined _WIN32 || ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) || ( defined TARGET_OS_IOS && TARGET_OS_IOS == 1 ) ) # define TRACY_HW_TIMER #endif -#if !defined TRACY_HW_TIMER +#ifdef __linux__ +# include +#endif + +#if defined TRACY_TIMER_FALLBACK || !defined TRACY_HW_TIMER # include #endif @@ -55,11 +60,27 @@ TRACY_API Profiler& GetProfiler(); TRACY_API std::atomic& GetLockCounter(); TRACY_API std::atomic& GetGpuCtxCounter(); TRACY_API GpuCtxWrapper& GetGpuCtx(); -TRACY_API uint64_t GetThreadHandle(); -TRACY_API void InitRPMallocThread(); +TRACY_API uint32_t GetThreadHandle(); TRACY_API bool ProfilerAvailable(); +TRACY_API bool ProfilerAllocatorAvailable(); TRACY_API int64_t GetFrequencyQpc(); +#if defined TRACY_TIMER_FALLBACK && defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) +TRACY_API bool HardwareSupportsInvariantTSC(); // check, if we need fallback scenario +#else +# if defined TRACY_HW_TIMER +tracy_force_inline bool HardwareSupportsInvariantTSC() +{ + return true; // this is checked at startup +} +# else +tracy_force_inline bool HardwareSupportsInvariantTSC() +{ + return false; +} +# endif +#endif + #ifdef TRACY_ON_DEMAND struct LuaZoneState { @@ -90,6 +111,29 @@ struct LuaZoneState __tail.store( __magic + 1, std::memory_order_release ); +#ifdef TRACY_FIBERS +# define TracyQueuePrepare( _type ) \ + auto item = Profiler::QueueSerial(); \ + MemWrite( 
&item->hdr.type, _type ); +# define TracyQueueCommit( _name ) \ + MemWrite( &item->_name.thread, GetThreadHandle() ); \ + Profiler::QueueSerialFinish(); +# define TracyQueuePrepareC( _type ) \ + auto item = tracy::Profiler::QueueSerial(); \ + tracy::MemWrite( &item->hdr.type, _type ); +# define TracyQueueCommitC( _name ) \ + tracy::MemWrite( &item->_name.thread, tracy::GetThreadHandle() ); \ + tracy::Profiler::QueueSerialFinish(); +#else +# define TracyQueuePrepare( _type ) TracyLfqPrepare( _type ) +# define TracyQueueCommit( _name ) TracyLfqCommit +# define TracyQueuePrepareC( _type ) TracyLfqPrepareC( _type ) +# define TracyQueueCommitC( _name ) TracyLfqCommitC +#endif + + +typedef char*(*SourceContentsCallback)( void* data, const char* filename, size_t& size ); + class TRACY_API Profiler { struct FrameImageQueueItem @@ -98,10 +142,26 @@ class TRACY_API Profiler uint32_t frame; uint16_t w; uint16_t h; - uint8_t offset; bool flip; }; + enum class SymbolQueueItemType + { + CallstackFrame, + SymbolQuery, + ExternalName, + KernelCode, + SourceCode + }; + + struct SymbolQueueItem + { + SymbolQueueItemType type; + uint64_t ptr; + uint64_t extra; + uint32_t id; + }; + public: Profiler(); ~Profiler(); @@ -112,25 +172,33 @@ public: { #ifdef TRACY_HW_TIMER # if defined TARGET_OS_IOS && TARGET_OS_IOS == 1 - return mach_absolute_time(); -# elif defined _WIN32 || defined __CYGWIN__ + if( HardwareSupportsInvariantTSC() ) return mach_absolute_time(); +# elif defined _WIN32 # ifdef TRACY_TIMER_QPC return GetTimeQpc(); # else - return int64_t( __rdtsc() ); + if( HardwareSupportsInvariantTSC() ) return int64_t( __rdtsc() ); # endif # elif defined __i386 || defined _M_IX86 - uint32_t eax, edx; - asm volatile ( "rdtsc" : "=a" (eax), "=d" (edx) ); - return ( uint64_t( edx ) << 32 ) + uint64_t( eax ); + if( HardwareSupportsInvariantTSC() ) + { + uint32_t eax, edx; + asm volatile ( "rdtsc" : "=a" (eax), "=d" (edx) ); + return ( uint64_t( edx ) << 32 ) + uint64_t( eax ); + } # elif defined __x86_64__ || defined _M_X64 - uint64_t rax, rdx; - asm volatile ( "rdtsc" : "=a" (rax), "=d" (rdx) ); - return (int64_t)(( rdx << 32 ) + rax); + if( HardwareSupportsInvariantTSC() ) + { + uint64_t rax, rdx; + asm volatile ( "rdtsc" : "=a" (rax), "=d" (rdx) ); + return (int64_t)(( rdx << 32 ) + rax); + } # else # error "TRACY_HW_TIMER detection logic needs fixing" # endif -#else +#endif + +#if !defined TRACY_HW_TIMER || defined TRACY_TIMER_FALLBACK # if defined __linux__ && defined CLOCK_MONOTONIC_RAW struct timespec ts; clock_gettime( CLOCK_MONOTONIC_RAW, &ts ); @@ -138,6 +206,10 @@ public: # else return std::chrono::duration_cast( std::chrono::high_resolution_clock::now().time_since_epoch() ).count(); # endif +#endif + +#if !defined TRACY_TIMER_FALLBACK + return 0; // unreachable branch #endif } @@ -168,12 +240,37 @@ public: p.m_serialLock.unlock(); } + static tracy_force_inline void SourceCallbackRegister( SourceContentsCallback cb, void* data ) + { + auto& profiler = GetProfiler(); + profiler.m_sourceCallback = cb; + profiler.m_sourceCallbackData = data; + } + +#ifdef TRACY_FIBERS + static tracy_force_inline void EnterFiber( const char* fiber ) + { + TracyQueuePrepare( QueueType::FiberEnter ); + MemWrite( &item->fiberEnter.time, GetTime() ); + MemWrite( &item->fiberEnter.fiber, (uint64_t)fiber ); + TracyQueueCommit( fiberEnter ); + } + + static tracy_force_inline void LeaveFiber() + { + TracyQueuePrepare( QueueType::FiberLeave ); + MemWrite( &item->fiberLeave.time, GetTime() ); + TracyQueueCommit( fiberLeave ); + } 
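The new SourceCallbackRegister / SourceContentsCallback hook introduced in this header lets an application hand source file contents to the profiler when they are not available on disk (for example, sources packed into an archive). A minimal wiring sketch, not part of the patch: the callback signature comes from the typedef above, GetEmbeddedSource is a hypothetical application-side lookup, and the assumption that the returned buffer should come from tracy_malloc (so the profiler can free it after sending) follows the tracy_free_fast handling in HandleSourceCodeQuery.

#include <cstring>
#include "tracy/client/TracyProfiler.hpp" // adjust to however Tracy is vendored

// Hypothetical application-side lookup; returns nullptr when the file is unknown.
static const char* GetEmbeddedSource( const char* filename, size_t& size )
{
    (void)filename;
    size = 0;
    return nullptr;
}

// Matches SourceContentsCallback: char* ( void* data, const char* filename, size_t& size ).
static char* SourceProvider( void* /*userData*/, const char* filename, size_t& size )
{
    size_t srcSize = 0;
    const char* src = GetEmbeddedSource( filename, srcSize );
    if( !src ) { size = 0; return nullptr; }            // profiler reports "source not available"
    auto buf = (char*)tracy::tracy_malloc( srcSize );   // assumed allocator, see lead-in
    memcpy( buf, src, srcSize );
    size = srcSize;
    return buf;
}

void RegisterTracySourceProvider()
{
    tracy::Profiler::SourceCallbackRegister( SourceProvider, nullptr );
}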
+#endif + static void SendFrameMark( const char* name ); static void SendFrameMark( const char* name, QueueType type ); + static void SendFrameImage( const void* image, uint16_t w, uint16_t h, uint8_t offset, bool flip ); static void PlotData( const char* name, int64_t val ); static void PlotData( const char* name, float val ); static void PlotData( const char* name, double val ); - static void ConfigurePlot( const char* name, PlotFormatType type ); + static void ConfigurePlot( const char* name, PlotFormatType type, bool step, bool fill, uint32_t color ); static void Message( const char* txt, size_t size, int callstack ); static void Message( const char* txt, int callstack ); static void MessageColor( const char* txt, size_t size, uint32_t color, int callstack ); @@ -189,6 +286,7 @@ public: static void MemFreeCallstackNamed( const void* ptr, int depth, bool secure, const char* name ); static void SendCallstack( int depth ); static void ParameterRegister( ParameterCallback cb ); + static void ParameterRegister( ParameterCallback cb, void* data ); static void ParameterSetup( uint32_t idx, const char* name, bool isBool, int32_t val ); void SendCallstack( int depth, const char* skipBefore ); @@ -296,15 +394,28 @@ public: private: enum class DequeueStatus { DataDequeued, ConnectionLost, QueueEmpty }; + enum class ThreadCtxStatus { Same, Changed, ConnectionLost }; static void LaunchWorker( void* ptr ) { ((Profiler*)ptr)->Worker(); } void Worker(); +#ifndef TRACY_NO_FRAME_IMAGE + static void LaunchCompressWorker( void* ptr ) { ((Profiler*)ptr)->CompressWorker(); } + void CompressWorker(); +#endif + +#ifdef TRACY_HAS_CALLSTACK + static void LaunchSymbolWorker( void* ptr ) { ((Profiler*)ptr)->SymbolWorker(); } + void SymbolWorker(); + void HandleSymbolQueueItem( const SymbolQueueItem& si ); +#endif + void ClearQueues( tracy::moodycamel::ConsumerToken& token ); void ClearSerial(); DequeueStatus Dequeue( tracy::moodycamel::ConsumerToken& token ); DequeueStatus DequeueContextSwitches( tracy::moodycamel::ConsumerToken& token, int64_t& timeStop ); DequeueStatus DequeueSerial(); + ThreadCtxStatus ThreadCtxCheck( uint32_t threadId ); bool CommitData(); tracy_force_inline bool AppendData( const void* data, size_t len ) @@ -338,18 +449,21 @@ private: void SendCallstackPayload( uint64_t ptr ); void SendCallstackPayload64( uint64_t ptr ); void SendCallstackAlloc( uint64_t ptr ); - void SendCallstackFrame( uint64_t ptr ); - void SendCodeLocation( uint64_t ptr ); + + void QueueCallstackFrame( uint64_t ptr ); + void QueueSymbolQuery( uint64_t symbol ); + void QueueExternalName( uint64_t ptr ); + void QueueKernelCode( uint64_t symbol, uint32_t size ); + void QueueSourceCodeQuery( uint32_t id ); bool HandleServerQuery(); void HandleDisconnect(); void HandleParameter( uint64_t payload ); - void HandleSymbolQuery( uint64_t symbol ); void HandleSymbolCodeQuery( uint64_t symbol, uint32_t size ); - void HandleSourceCodeQuery(); + void HandleSourceCodeQuery( char* data, char* image, uint32_t id ); void AckServerQuery(); - void AckSourceCodeNotAvailable(); + void AckSymbolCodeNotAvailable(); void CalibrateTimer(); void CalibrateDelay(); @@ -362,10 +476,12 @@ private: MemWrite( &item->hdr.type, QueueType::CallstackSerial ); MemWrite( &item->callstackFat.ptr, (uint64_t)ptr ); GetProfiler().m_serialQueue.commit_next(); +#else + static_cast(ptr); // unused #endif } - static tracy_force_inline void SendMemAlloc( QueueType type, const uint64_t thread, const void* ptr, size_t size ) + static tracy_force_inline void 
SendMemAlloc( QueueType type, const uint32_t thread, const void* ptr, size_t size ) { assert( type == QueueType::MemAlloc || type == QueueType::MemAllocCallstack || type == QueueType::MemAllocNamed || type == QueueType::MemAllocCallstackNamed ); @@ -388,7 +504,7 @@ private: GetProfiler().m_serialQueue.commit_next(); } - static tracy_force_inline void SendMemFree( QueueType type, const uint64_t thread, const void* ptr ) + static tracy_force_inline void SendMemFree( QueueType type, const uint32_t thread, const void* ptr ) { assert( type == QueueType::MemFree || type == QueueType::MemFreeCallstack || type == QueueType::MemFreeNamed || type == QueueType::MemFreeCallstackNamed ); @@ -409,7 +525,7 @@ private: GetProfiler().m_serialQueue.commit_next(); } -#if ( defined _WIN32 || defined __CYGWIN__ ) && defined TRACY_TIMER_QPC +#if defined _WIN32 && defined TRACY_TIMER_QPC static int64_t GetTimeQpc(); #endif @@ -417,7 +533,7 @@ private: uint64_t m_resolution; uint64_t m_delay; std::atomic m_timeBegin; - uint64_t m_mainThread; + uint32_t m_mainThread; uint64_t m_epoch, m_exectime; std::atomic m_shutdown; std::atomic m_shutdownManual; @@ -429,7 +545,7 @@ private: std::atomic m_zoneId; int64_t m_samplingPeriod; - uint64_t m_threadCtx; + uint32_t m_threadCtx; int64_t m_refTimeThread; int64_t m_refTimeSerial; int64_t m_refTimeCtx; @@ -445,6 +561,13 @@ private: FastVector m_serialQueue, m_serialDequeue; TracyMutex m_serialLock; +#ifndef TRACY_NO_FRAME_IMAGE + FastVector m_fiQueue, m_fiDequeue; + TracyMutex m_fiLock; +#endif + + SPSCQueue m_symbolQueue; + std::atomic m_frameCount; std::atomic m_isConnected; #ifdef TRACY_ON_DEMAND @@ -464,9 +587,23 @@ private: #endif ParameterCallback m_paramCallback; + void* m_paramCallbackData; + SourceContentsCallback m_sourceCallback; + void* m_sourceCallbackData; + char* m_queryImage; char* m_queryData; char* m_queryDataPtr; + +#if defined _WIN32 + void* m_exceptionHandler; +#endif +#ifdef __linux__ + struct { + struct sigaction pwr, ill, fpe, segv, pipe, bus, abrt; + } m_prevSignal; +#endif + bool m_crashHandlerInstalled; }; } diff --git a/Source/ThirdParty/tracy/client/TracyRingBuffer.hpp b/Source/ThirdParty/tracy/client/TracyRingBuffer.hpp index 29d935596..e9100e2d8 100644 --- a/Source/ThirdParty/tracy/client/TracyRingBuffer.hpp +++ b/Source/ThirdParty/tracy/client/TracyRingBuffer.hpp @@ -1,27 +1,44 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "TracyDebug.hpp" + namespace tracy { -template class RingBuffer { public: - RingBuffer( int fd ) - : m_fd( fd ) + RingBuffer( unsigned int size, int fd, int id, int cpu = -1 ) + : m_size( size ) + , m_id( id ) + , m_cpu( cpu ) + , m_fd( fd ) { const auto pageSize = uint32_t( getpagesize() ); - assert( Size >= pageSize ); - assert( __builtin_popcount( Size ) == 1 ); - m_mapSize = Size + pageSize; + assert( size >= pageSize ); + assert( __builtin_popcount( size ) == 1 ); + m_mapSize = size + pageSize; auto mapAddr = mmap( nullptr, m_mapSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0 ); - if( !mapAddr ) + if( mapAddr == MAP_FAILED ) { + TracyDebug( "mmap failed: errno %i (%s)\n", errno, strerror( errno ) ); m_fd = 0; + m_metadata = nullptr; close( fd ); return; } m_metadata = (perf_event_mmap_page*)mapAddr; assert( m_metadata->data_offset == pageSize ); m_buffer = ((char*)mapAddr) + pageSize; + m_tail = m_metadata->data_tail; } ~RingBuffer() @@ -49,36 +66,35 @@ public: } bool IsValid() const { return m_metadata != nullptr; } + int GetId() const { return m_id; } + int 
GetCpu() const { return m_cpu; } void Enable() { ioctl( m_fd, PERF_EVENT_IOC_ENABLE, 0 ); } - bool HasData() const - { - const auto head = LoadHead(); - return head > m_metadata->data_tail; - } - void Read( void* dst, uint64_t offset, uint64_t cnt ) { - auto src = ( m_metadata->data_tail + offset ) % Size; - if( src + cnt <= Size ) + const auto size = m_size; + auto src = ( m_tail + offset ) % size; + if( src + cnt <= size ) { memcpy( dst, m_buffer + src, cnt ); } else { - const auto s0 = Size - src; - memcpy( dst, m_buffer + src, s0 ); - memcpy( (char*)dst + s0, m_buffer, cnt - s0 ); + const auto s0 = size - src; + const auto buf = m_buffer; + memcpy( dst, buf + src, s0 ); + memcpy( (char*)dst + s0, buf, cnt - s0 ); } } void Advance( uint64_t cnt ) { - StoreTail( m_metadata->data_tail + cnt ); + m_tail += cnt; + StoreTail(); } bool CheckTscCaps() const @@ -88,26 +104,35 @@ public: int64_t ConvertTimeToTsc( int64_t timestamp ) const { - assert( m_metadata->cap_user_time_zero ); + if( !m_metadata->cap_user_time_zero ) return 0; const auto time = timestamp - m_metadata->time_zero; const auto quot = time / m_metadata->time_mult; const auto rem = time % m_metadata->time_mult; return ( quot << m_metadata->time_shift ) + ( rem << m_metadata->time_shift ) / m_metadata->time_mult; } -private: uint64_t LoadHead() const { return std::atomic_load_explicit( (const volatile std::atomic*)&m_metadata->data_head, std::memory_order_acquire ); } - void StoreTail( uint64_t tail ) + uint64_t GetTail() const { - std::atomic_store_explicit( (volatile std::atomic*)&m_metadata->data_tail, tail, std::memory_order_release ); + return m_tail; } - perf_event_mmap_page* m_metadata; +private: + void StoreTail() + { + std::atomic_store_explicit( (volatile std::atomic*)&m_metadata->data_tail, m_tail, std::memory_order_release ); + } + + unsigned int m_size; + uint64_t m_tail; char* m_buffer; + int m_id; + int m_cpu; + perf_event_mmap_page* m_metadata; size_t m_mapSize; int m_fd; diff --git a/Source/ThirdParty/tracy/client/TracyScoped.hpp b/Source/ThirdParty/tracy/client/TracyScoped.hpp index b18f02388..1e5b6c809 100644 --- a/Source/ThirdParty/tracy/client/TracyScoped.hpp +++ b/Source/ThirdParty/tracy/client/TracyScoped.hpp @@ -8,7 +8,7 @@ #include "../common/TracySystem.hpp" #include "../common/TracyAlign.hpp" #include "../common/TracyAlloc.hpp" -#include "TracyProfiler.hpp" +#include "../client/TracyLock.hpp" namespace tracy { @@ -56,10 +56,10 @@ ScopedZone::ScopedZone( const SourceLocationData* srcloc, bool is_active ) #ifdef TRACY_ON_DEMAND m_connectionId = GetProfiler().ConnectionId(); #endif - TracyLfqPrepare( QueueType::ZoneBegin ); + TracyQueuePrepare( QueueType::ZoneBegin ); MemWrite( &item->zoneBegin.time, Profiler::GetTime() ); MemWrite( &item->zoneBegin.srcloc, (uint64_t)srcloc ); - TracyLfqCommit; + TracyQueueCommit( zoneBeginThread ); } ScopedZone::ScopedZone( const SourceLocationData* srcloc, int depth, bool is_active ) @@ -75,10 +75,10 @@ ScopedZone::ScopedZone( const SourceLocationData* srcloc, int depth, bool is_act #endif GetProfiler().SendCallstack( depth ); - TracyLfqPrepare( QueueType::ZoneBeginCallstack ); + TracyQueuePrepare( QueueType::ZoneBeginCallstack ); MemWrite( &item->zoneBegin.time, Profiler::GetTime() ); MemWrite( &item->zoneBegin.srcloc, (uint64_t)srcloc ); - TracyLfqCommit; + TracyQueueCommit( zoneBeginThread ); } ScopedZone::ScopedZone( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, bool is_active ) @@ -92,11 
+92,11 @@ ScopedZone::ScopedZone( uint32_t line, const char* source, size_t sourceSz, cons #ifdef TRACY_ON_DEMAND m_connectionId = GetProfiler().ConnectionId(); #endif - TracyLfqPrepare( QueueType::ZoneBeginAllocSrcLoc ); + TracyQueuePrepare( QueueType::ZoneBeginAllocSrcLoc ); const auto srcloc = Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz ); MemWrite( &item->zoneBegin.time, Profiler::GetTime() ); MemWrite( &item->zoneBegin.srcloc, srcloc ); - TracyLfqCommit; + TracyQueueCommit( zoneBeginThread ); } ScopedZone::ScopedZone( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, int depth, bool is_active ) @@ -112,11 +112,11 @@ ScopedZone::ScopedZone( uint32_t line, const char* source, size_t sourceSz, cons #endif GetProfiler().SendCallstack( depth ); - TracyLfqPrepare( QueueType::ZoneBeginAllocSrcLocCallstack ); + TracyQueuePrepare( QueueType::ZoneBeginAllocSrcLocCallstack ); const auto srcloc = Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz ); MemWrite( &item->zoneBegin.time, Profiler::GetTime() ); MemWrite( &item->zoneBegin.srcloc, srcloc ); - TracyLfqCommit; + TracyQueueCommit( zoneBeginThread ); } ScopedZone::~ScopedZone() @@ -125,9 +125,9 @@ ScopedZone::~ScopedZone() #ifdef TRACY_ON_DEMAND if( GetProfiler().ConnectionId() != m_connectionId ) return; #endif - TracyLfqPrepare( QueueType::ZoneEnd ); + TracyQueuePrepare( QueueType::ZoneEnd ); MemWrite( &item->zoneEnd.time, Profiler::GetTime() ); - TracyLfqCommit; + TracyQueueCommit( zoneEndThread ); } void ScopedZone::Text( const char* txt, size_t size ) @@ -139,13 +139,13 @@ void ScopedZone::Text( const char* txt, size_t size ) #endif auto ptr = (char*)tracy_malloc( size ); memcpy( ptr, txt, size ); - TracyLfqPrepare( QueueType::ZoneText ); + TracyQueuePrepare( QueueType::ZoneText ); MemWrite( &item->zoneTextFat.text, (uint64_t)ptr ); MemWrite( &item->zoneTextFat.size, (uint16_t)size ); - TracyLfqCommit; + TracyQueueCommit( zoneTextFatThread ); } -void ScopedZone::Text(const Char* txt, size_t size) +void ScopedZone::Text( const Char* txt, size_t size ) { assert( size < std::numeric_limits::max() ); if( !m_active ) return; @@ -155,10 +155,10 @@ void ScopedZone::Text(const Char* txt, size_t size) auto ptr = (char*)tracy_malloc( size ); for( int i = 0; i < size; i++) ptr[i] = (char)txt[i]; - TracyLfqPrepare( QueueType::ZoneText ); + TracyQueuePrepare( QueueType::ZoneText ); MemWrite( &item->zoneTextFat.text, (uint64_t)ptr ); MemWrite( &item->zoneTextFat.size, (uint16_t)size ); - TracyLfqCommit; + TracyQueueCommit( zoneTextFatThread ); } void ScopedZone::Name( const char* txt, size_t size ) @@ -170,10 +170,10 @@ void ScopedZone::Name( const char* txt, size_t size ) #endif auto ptr = (char*)tracy_malloc( size ); memcpy( ptr, txt, size ); - TracyLfqPrepare( QueueType::ZoneName ); + TracyQueuePrepare( QueueType::ZoneName ); MemWrite( &item->zoneTextFat.text, (uint64_t)ptr ); MemWrite( &item->zoneTextFat.size, (uint16_t)size ); - TracyLfqCommit; + TracyQueueCommit( zoneTextFatThread ); } void ScopedZone::Name( const Char* txt, size_t size ) @@ -186,10 +186,10 @@ void ScopedZone::Name( const Char* txt, size_t size ) auto ptr = (char*)tracy_malloc( size ); for( int i = 0; i < size; i++) ptr[i] = (char)txt[i]; - TracyLfqPrepare( QueueType::ZoneName ); + TracyQueuePrepare( QueueType::ZoneName ); MemWrite( &item->zoneTextFat.text, (uint64_t)ptr ); MemWrite( &item->zoneTextFat.size, 
(uint16_t)size ); - TracyLfqCommit; + TracyQueueCommit( zoneTextFatThread ); } void ScopedZone::Color( uint32_t color ) @@ -198,11 +198,11 @@ void ScopedZone::Color( uint32_t color ) #ifdef TRACY_ON_DEMAND if( GetProfiler().ConnectionId() != m_connectionId ) return; #endif - TracyLfqPrepare( QueueType::ZoneColor ); + TracyQueuePrepare( QueueType::ZoneColor ); MemWrite( &item->zoneColor.r, uint8_t( ( color ) & 0xFF ) ); MemWrite( &item->zoneColor.g, uint8_t( ( color >> 8 ) & 0xFF ) ); MemWrite( &item->zoneColor.b, uint8_t( ( color >> 16 ) & 0xFF ) ); - TracyLfqCommit; + TracyQueueCommit( zoneColorThread ); } void ScopedZone::Value( uint64_t value ) @@ -211,13 +211,10 @@ void ScopedZone::Value( uint64_t value ) #ifdef TRACY_ON_DEMAND if( GetProfiler().ConnectionId() != m_connectionId ) return; #endif - TracyLfqPrepare( QueueType::ZoneValue ); + TracyQueuePrepare( QueueType::ZoneValue ); MemWrite( &item->zoneValue.value, value ); - TracyLfqCommit; + TracyQueueCommit( zoneValueThread ); } - -bool ScopedZone::IsActive() const { return m_active; } - } #endif diff --git a/Source/ThirdParty/tracy/client/TracyStringHelpers.hpp b/Source/ThirdParty/tracy/client/TracyStringHelpers.hpp new file mode 100644 index 000000000..1f8e4a659 --- /dev/null +++ b/Source/ThirdParty/tracy/client/TracyStringHelpers.hpp @@ -0,0 +1,40 @@ +#ifndef __TRACYSTRINGHELPERS_HPP__ +#define __TRACYSTRINGHELPERS_HPP__ + +#include +#include + +#include "../common/TracyAlloc.hpp" + +namespace tracy +{ + +static tracy_force_inline char* CopyString( const char* src, size_t sz ) +{ + auto dst = (char*)tracy_malloc( sz + 1 ); + memcpy( dst, src, sz ); + dst[sz] = '\0'; + return dst; +} + +static tracy_force_inline char* CopyString( const char* src ) +{ + return CopyString( src, strlen( src ) ); +} + +static tracy_force_inline char* CopyStringFast( const char* src, size_t sz ) +{ + auto dst = (char*)tracy_malloc_fast( sz + 1 ); + memcpy( dst, src, sz ); + dst[sz] = '\0'; + return dst; +} + +static tracy_force_inline char* CopyStringFast( const char* src ) +{ + return CopyStringFast( src, strlen( src ) ); +} + +} + +#endif diff --git a/Source/ThirdParty/tracy/client/TracySysTime.cpp b/Source/ThirdParty/tracy/client/TracySysTime.cpp index e5903467d..b690a9114 100644 --- a/Source/ThirdParty/tracy/client/TracySysTime.cpp +++ b/Source/ThirdParty/tracy/client/TracySysTime.cpp @@ -2,7 +2,7 @@ #ifdef TRACY_HAS_SYSTIME -# if defined _WIN32 || defined __CYGWIN__ +# if defined _WIN32 # include # elif defined __linux__ # include @@ -18,7 +18,7 @@ namespace tracy { -# if defined _WIN32 || defined __CYGWIN__ +# if defined _WIN32 static inline uint64_t ConvertTime( const FILETIME& t ) { @@ -62,7 +62,7 @@ void SysTime::ReadTimes() { host_cpu_load_info_data_t info; mach_msg_type_number_t cnt = HOST_CPU_LOAD_INFO_COUNT; - host_statistics( mach_host_self(), HOST_CPU_LOAD_INFO, reinterpret_cast( &info ), &cnt ); + host_statistics( mach_host_self(), HOST_CPU_LOAD_INFO, reinterpret_cast( &info ), &cnt ); used = info.cpu_ticks[CPU_STATE_USER] + info.cpu_ticks[CPU_STATE_NICE] + info.cpu_ticks[CPU_STATE_SYSTEM]; idle = info.cpu_ticks[CPU_STATE_IDLE]; } @@ -95,7 +95,7 @@ float SysTime::Get() const auto diffIdle = idle - oldIdle; const auto diffUsed = used - oldUsed; -#if defined _WIN32 || defined __CYGWIN__ +#if defined _WIN32 return diffUsed == 0 ? 
-1 : ( diffUsed - diffIdle ) * 100.f / diffUsed; #elif defined __linux__ || defined __APPLE__ || defined BSD const auto total = diffUsed + diffIdle; diff --git a/Source/ThirdParty/tracy/client/TracySysTime.hpp b/Source/ThirdParty/tracy/client/TracySysTime.hpp index fc6ba321a..cb5ebe736 100644 --- a/Source/ThirdParty/tracy/client/TracySysTime.hpp +++ b/Source/ThirdParty/tracy/client/TracySysTime.hpp @@ -1,7 +1,7 @@ #ifndef __TRACYSYSTIME_HPP__ #define __TRACYSYSTIME_HPP__ -#if defined _WIN32 || defined __CYGWIN__ || defined __linux__ || defined __APPLE__ +#if defined _WIN32 || defined __linux__ || defined __APPLE__ # define TRACY_HAS_SYSTIME #else # include diff --git a/Source/ThirdParty/tracy/client/TracySysTrace.cpp b/Source/ThirdParty/tracy/client/TracySysTrace.cpp index 972779770..23b1020a5 100644 --- a/Source/ThirdParty/tracy/client/TracySysTrace.cpp +++ b/Source/ThirdParty/tracy/client/TracySysTrace.cpp @@ -1,8 +1,38 @@ +#include "TracyDebug.hpp" +#include "TracyStringHelpers.hpp" #include "TracySysTrace.hpp" +#include "../common/TracySystem.hpp" #ifdef TRACY_HAS_SYSTEM_TRACING -# if defined _WIN32 || defined __CYGWIN__ +#ifndef TRACY_SAMPLING_HZ +# if defined _WIN32 +# define TRACY_SAMPLING_HZ 8000 +# elif defined __linux__ +# define TRACY_SAMPLING_HZ 10000 +# endif +#endif + +namespace tracy +{ + +static constexpr int GetSamplingFrequency() +{ +#if defined _WIN32 + return TRACY_SAMPLING_HZ > 8000 ? 8000 : ( TRACY_SAMPLING_HZ < 1 ? 1 : TRACY_SAMPLING_HZ ); +#else + return TRACY_SAMPLING_HZ > 1000000 ? 1000000 : ( TRACY_SAMPLING_HZ < 1 ? 1 : TRACY_SAMPLING_HZ ); +#endif +} + +static constexpr int GetSamplingPeriod() +{ + return 1000000000 / GetSamplingFrequency(); +} + +} + +# if defined _WIN32 # ifndef NOMINMAX # define NOMINMAX @@ -28,6 +58,7 @@ namespace tracy static const GUID PerfInfoGuid = { 0xce1dbfb4, 0x137e, 0x4da6, { 0x87, 0xb0, 0x3f, 0x59, 0xaa, 0x10, 0x2c, 0xbc } }; static const GUID DxgKrnlGuid = { 0x802ec45a, 0x1e99, 0x4b83, { 0x99, 0x20, 0x87, 0xc9, 0x82, 0x77, 0xba, 0x9d } }; +static const GUID ThreadV2Guid = { 0x3d6fa8d1, 0xfe05, 0x11d0, { 0x9d, 0xda, 0x00, 0xc0, 0x4f, 0xd7, 0xba, 0x7c } }; static TRACEHANDLE s_traceHandle; @@ -100,14 +131,6 @@ struct VSyncInfo uint64_t flipFenceId; }; -#ifdef __CYGWIN__ -extern "C" typedef DWORD (WINAPI *t_GetProcessIdOfThread)( HANDLE ); -extern "C" typedef DWORD (WINAPI *t_GetProcessImageFileNameA)( HANDLE, LPSTR, DWORD ); -extern "C" ULONG WMIAPI TraceSetInformation(TRACEHANDLE SessionHandle, TRACE_INFO_CLASS InformationClass, PVOID TraceInformation, ULONG InformationLength); -t_GetProcessIdOfThread GetProcessIdOfThread = (t_GetProcessIdOfThread)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "GetProcessIdOfThread" ); -t_GetProcessImageFileNameA GetProcessImageFileNameA = (t_GetProcessImageFileNameA)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "K32GetProcessImageFileNameA" ); -#endif - extern "C" typedef NTSTATUS (WINAPI *t_NtQueryInformationThread)( HANDLE, THREADINFOCLASS, PVOID, ULONG, PULONG ); extern "C" typedef BOOL (WINAPI *t_EnumProcessModules)( HANDLE, HMODULE*, DWORD, LPDWORD ); extern "C" typedef BOOL (WINAPI *t_GetModuleInformation)( HANDLE, HMODULE, LPMODULEINFO, DWORD ); @@ -138,10 +161,8 @@ void WINAPI EventRecordCallback( PEVENT_RECORD record ) TracyLfqPrepare( QueueType::ContextSwitch ); MemWrite( &item->contextSwitch.time, hdr.TimeStamp.QuadPart ); - memcpy( &item->contextSwitch.oldThread, &cswitch->oldThreadId, sizeof( cswitch->oldThreadId ) ); - memcpy( &item->contextSwitch.newThread, 
&cswitch->newThreadId, sizeof( cswitch->newThreadId ) ); - memset( ((char*)&item->contextSwitch.oldThread)+4, 0, 4 ); - memset( ((char*)&item->contextSwitch.newThread)+4, 0, 4 ); + MemWrite( &item->contextSwitch.oldThread, cswitch->oldThreadId ); + MemWrite( &item->contextSwitch.newThread, cswitch->newThreadId ); MemWrite( &item->contextSwitch.cpu, record->BufferContext.ProcessorNumber ); MemWrite( &item->contextSwitch.reason, cswitch->oldThreadWaitReason ); MemWrite( &item->contextSwitch.state, cswitch->oldThreadState ); @@ -153,8 +174,7 @@ void WINAPI EventRecordCallback( PEVENT_RECORD record ) TracyLfqPrepare( QueueType::ThreadWakeup ); MemWrite( &item->threadWakeup.time, hdr.TimeStamp.QuadPart ); - memcpy( &item->threadWakeup.thread, &rt->threadId, sizeof( rt->threadId ) ); - memset( ((char*)&item->threadWakeup.thread)+4, 0, 4 ); + MemWrite( &item->threadWakeup.thread, rt->threadId ); TracyLfqCommit; } else if( hdr.EventDescriptor.Opcode == 1 || hdr.EventDescriptor.Opcode == 3 ) @@ -174,7 +194,7 @@ void WINAPI EventRecordCallback( PEVENT_RECORD record ) if( hdr.EventDescriptor.Opcode == 32 ) { const auto sw = (const StackWalkEvent*)record->UserData; - if( sw->stackProcess == s_pid && ( sw->stack[0] & 0x8000000000000000 ) == 0 ) + if( sw->stackProcess == s_pid ) { const uint64_t sz = ( record->UserDataLength - 16 ) / 8; if( sz > 0 ) @@ -184,7 +204,7 @@ void WINAPI EventRecordCallback( PEVENT_RECORD record ) memcpy( trace+1, sw->stack, sizeof( uint64_t ) * sz ); TracyLfqPrepare( QueueType::CallstackSample ); MemWrite( &item->callstackSampleFat.time, sw->eventTimeStamp ); - MemWrite( &item->callstackSampleFat.thread, (uint64_t)sw->stackThread ); + MemWrite( &item->callstackSampleFat.thread, sw->stackThread ); MemWrite( &item->callstackSampleFat.ptr, (uint64_t)trace ); TracyLfqCommit; } @@ -196,20 +216,6 @@ void WINAPI EventRecordCallback( PEVENT_RECORD record ) } } -static constexpr const char* VsyncName[] = { - "[0] Vsync", - "[1] Vsync", - "[2] Vsync", - "[3] Vsync", - "[4] Vsync", - "[5] Vsync", - "[6] Vsync", - "[7] Vsync", - "Vsync" -}; - -static uint32_t VsyncTarget[8] = {}; - void WINAPI EventRecordCallbackVsync( PEVENT_RECORD record ) { #ifdef TRACY_ON_DEMAND @@ -222,30 +228,15 @@ void WINAPI EventRecordCallbackVsync( PEVENT_RECORD record ) const auto vs = (const VSyncInfo*)record->UserData; - int idx = 0; - do - { - if( VsyncTarget[idx] == 0 ) - { - VsyncTarget[idx] = vs->vidPnTargetId; - break; - } - else if( VsyncTarget[idx] == vs->vidPnTargetId ) - { - break; - } - } - while( ++idx < 8 ); - - TracyLfqPrepare( QueueType::FrameMarkMsg ); - MemWrite( &item->frameMark.time, hdr.TimeStamp.QuadPart ); - MemWrite( &item->frameMark.name, uint64_t( VsyncName[idx] ) ); + TracyLfqPrepare( QueueType::FrameVsync ); + MemWrite( &item->frameVsync.time, hdr.TimeStamp.QuadPart ); + MemWrite( &item->frameVsync.id, vs->vidPnTargetId ); TracyLfqCommit; } static void SetupVsync() { -#if _WIN32_WINNT >= _WIN32_WINNT_WINBLUE +#if _WIN32_WINNT >= _WIN32_WINNT_WINBLUE && !defined(__MINGW32__) const auto psz = sizeof( EVENT_TRACE_PROPERTIES ) + MAX_PATH; s_propVsync = (EVENT_TRACE_PROPERTIES*)tracy_malloc( psz ); memset( s_propVsync, 0, sizeof( EVENT_TRACE_PROPERTIES ) ); @@ -330,6 +321,11 @@ static void SetupVsync() #endif } +static constexpr int GetSamplingInterval() +{ + return GetSamplingPeriod() / 100; +} + bool SysTraceStart( int64_t& samplingPeriod ) { if( !_GetThreadDescription ) _GetThreadDescription = (t_GetThreadDescription)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), 
"GetThreadDescription" ); @@ -360,10 +356,10 @@ bool SysTraceStart( int64_t& samplingPeriod ) if( isOs64Bit ) { TRACE_PROFILE_INTERVAL interval = {}; - interval.Interval = 1250; // 8 kHz + interval.Interval = GetSamplingInterval(); const auto intervalStatus = TraceSetInformation( 0, TraceSampledProfileIntervalInfo, &interval, sizeof( interval ) ); if( intervalStatus != ERROR_SUCCESS ) return false; - samplingPeriod = 125*1000; + samplingPeriod = GetSamplingPeriod(); } const auto psz = sizeof( EVENT_TRACE_PROPERTIES ) + sizeof( KERNEL_LOGGER_NAME ); @@ -415,9 +411,11 @@ bool SysTraceStart( int64_t& samplingPeriod ) if( isOs64Bit ) { - CLASSIC_EVENT_ID stackId; - stackId.EventGuid = PerfInfoGuid; - stackId.Type = 46; + CLASSIC_EVENT_ID stackId[2] = {}; + stackId[0].EventGuid = PerfInfoGuid; + stackId[0].Type = 46; + stackId[1].EventGuid = ThreadV2Guid; + stackId[1].Type = 36; const auto stackStatus = TraceSetInformation( s_traceHandle, TraceStackTracingInfo, &stackId, sizeof( stackId ) ); if( stackStatus != ERROR_SUCCESS ) { @@ -476,7 +474,7 @@ void SysTraceWorker( void* ptr ) tracy_free( s_prop ); } -void SysTraceSendExternalName( uint64_t thread ) +void SysTraceGetExternalName( uint64_t thread, const char*& threadName, const char*& name ) { bool threadSent = false; auto hnd = OpenThread( THREAD_QUERY_INFORMATION, FALSE, DWORD( thread ) ); @@ -486,16 +484,19 @@ void SysTraceSendExternalName( uint64_t thread ) } if( hnd != 0 ) { - PWSTR tmp; - _GetThreadDescription( hnd, &tmp ); - char buf[256]; - if( tmp ) + if( _GetThreadDescription ) { - auto ret = wcstombs( buf, tmp, 256 ); - if( ret != 0 ) + PWSTR tmp; + _GetThreadDescription( hnd, &tmp ); + char buf[256]; + if( tmp ) { - GetProfiler().SendString( thread, buf, ret, QueueType::ExternalThreadName ); - threadSent = true; + auto ret = wcstombs( buf, tmp, 256 ); + if( ret != 0 ) + { + threadName = CopyString( buf, ret ); + threadSent = true; + } } } const auto pid = GetProcessIdOfThread( hnd ); @@ -525,7 +526,7 @@ void SysTraceSendExternalName( uint64_t thread ) const auto modlen = _GetModuleBaseNameA( phnd, modules[i], buf2, 1024 ); if( modlen != 0 ) { - GetProfiler().SendString( thread, buf2, modlen, QueueType::ExternalThreadName ); + threadName = CopyString( buf2, modlen ); threadSent = true; } } @@ -539,7 +540,7 @@ void SysTraceSendExternalName( uint64_t thread ) CloseHandle( hnd ); if( !threadSent ) { - GetProfiler().SendString( thread, "???", 3, QueueType::ExternalThreadName ); + threadName = CopyString( "???", 3 ); threadSent = true; } if( pid != 0 ) @@ -553,7 +554,7 @@ void SysTraceSendExternalName( uint64_t thread ) } if( pid == 4 ) { - GetProfiler().SendString( thread, "System", 6, QueueType::ExternalName ); + name = CopyStringFast( "System", 6 ); return; } else @@ -569,7 +570,7 @@ void SysTraceSendExternalName( uint64_t thread ) auto ptr = buf2 + sz - 1; while( ptr > buf2 && *ptr != '\\' ) ptr--; if( *ptr == '\\' ) ptr++; - GetProfiler().SendString( thread, ptr, QueueType::ExternalName ); + name = CopyStringFast( ptr ); return; } } @@ -579,9 +580,9 @@ void SysTraceSendExternalName( uint64_t thread ) if( !threadSent ) { - GetProfiler().SendString( thread, "???", 3, QueueType::ExternalThreadName ); + threadName = CopyString( "???", 3 ); } - GetProfiler().SendString( thread, "???", 3, QueueType::ExternalName ); + name = CopyStringFast( "???", 3 ); } } @@ -607,281 +608,139 @@ void SysTraceSendExternalName( uint64_t thread ) # include # include +# if defined __i386 || defined __x86_64__ +# include "TracyCpuid.hpp" +# endif + # include 
"TracyProfiler.hpp" # include "TracyRingBuffer.hpp" # include "TracyThread.hpp" -# ifdef __ANDROID__ -# include "TracySysTracePayload.hpp" -# endif - namespace tracy { -static const char BasePath[] = "/sys/kernel/debug/tracing/"; -static const char TracingOn[] = "tracing_on"; -static const char CurrentTracer[] = "current_tracer"; -static const char TraceOptions[] = "trace_options"; -static const char TraceClock[] = "trace_clock"; -static const char SchedSwitch[] = "events/sched/sched_switch/enable"; -static const char SchedWakeup[] = "events/sched/sched_wakeup/enable"; -static const char BufferSizeKb[] = "buffer_size_kb"; -static const char TracePipe[] = "trace_pipe"; - static std::atomic traceActive { false }; -static Thread* s_threadSampling = nullptr; static int s_numCpus = 0; +static int s_numBuffers = 0; +static int s_ctxBufferIdx = 0; -static constexpr size_t RingBufSize = 64*1024; -static RingBuffer* s_ring = nullptr; +static RingBuffer* s_ring = nullptr; + +static const int ThreadHashSize = 4 * 1024; +static uint32_t s_threadHash[ThreadHashSize] = {}; + +static bool CurrentProcOwnsThread( uint32_t tid ) +{ + const auto hash = tid & ( ThreadHashSize-1 ); + const auto hv = s_threadHash[hash]; + if( hv == tid ) return true; + if( hv == -tid ) return false; + + char path[256]; + sprintf( path, "/proc/self/task/%d", tid ); + struct stat st; + if( stat( path, &st ) == 0 ) + { + s_threadHash[hash] = tid; + return true; + } + else + { + s_threadHash[hash] = -tid; + return false; + } +} static int perf_event_open( struct perf_event_attr* hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags ) { return syscall( __NR_perf_event_open, hw_event, pid, cpu, group_fd, flags ); } -static void SetupSampling( int64_t& samplingPeriod ) +enum TraceEventId { -#ifndef CLOCK_MONOTONIC_RAW - return; -#endif + EventCallstack, + EventCpuCycles, + EventInstructionsRetired, + EventCacheReference, + EventCacheMiss, + EventBranchRetired, + EventBranchMiss, + EventVsync, + EventContextSwitch, + EventWakeup, +}; - samplingPeriod = 100*1000; - - s_numCpus = (int)std::thread::hardware_concurrency(); - s_ring = (RingBuffer*)tracy_malloc( sizeof( RingBuffer ) * s_numCpus ); - - perf_event_attr pe = {}; - - pe.type = PERF_TYPE_SOFTWARE; - pe.size = sizeof( perf_event_attr ); - pe.config = PERF_COUNT_SW_CPU_CLOCK; - - pe.sample_freq = 10000; - pe.sample_type = PERF_SAMPLE_TID | PERF_SAMPLE_TIME | PERF_SAMPLE_CALLCHAIN; -#if LINUX_VERSION_CODE >= KERNEL_VERSION( 4, 8, 0 ) - pe.sample_max_stack = 127; -#endif - pe.exclude_callchain_kernel = 1; - - pe.disabled = 1; - pe.freq = 1; -#if !defined TRACY_HW_TIMER || !( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) - pe.use_clockid = 1; - pe.clockid = CLOCK_MONOTONIC_RAW; -#endif - - for( int i=0; i(); - tracy_free( s_ring ); - return; - } - new( s_ring+i ) RingBuffer( fd ); - } - - s_threadSampling = (Thread*)tracy_malloc( sizeof( Thread ) ); - new(s_threadSampling) Thread( [] (void*) { - ThreadExitHandler threadExitHandler; - SetThreadName( "Tracy Sampling" ); - sched_param sp = { 5 }; - pthread_setschedparam( pthread_self(), SCHED_FIFO, &sp ); - uint32_t currentPid = (uint32_t)getpid(); -#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) - for( int i=0; i(); - tracy_free( s_ring ); - const char* err = "Tracy Profiler: sampling is disabled due to non-native scheduler clock. 
Are you running under a VM?"; - Profiler::MessageAppInfo( err, strlen( err ) ); - return; - } - } -#endif - for( int i=0; i> 63; - const auto m2 = test >> 47; - if( m1 == m2 ) break; - } - while( --cnt > 0 ); - for( uint64_t j=1; j> 63; - const auto m2 = test >> 47; - if( m1 != m2 ) trace[j] = 0; - } - - // skip kernel frames - uint64_t j; - for( j=0; j= 0 ) break; - } - if( j == cnt ) - { - tracy_free( trace ); - } - else - { - if( j > 0 ) - { - cnt -= j; - memmove( trace+1, trace+1+j, sizeof( uint64_t ) * cnt ); - } - memcpy( trace, &cnt, sizeof( uint64_t ) ); - -#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) - t0 = s_ring[i].ConvertTimeToTsc( t0 ); -#endif - - TracyLfqPrepare( QueueType::CallstackSample ); - MemWrite( &item->callstackSampleFat.time, t0 ); - MemWrite( &item->callstackSampleFat.thread, (uint64_t)tid ); - MemWrite( &item->callstackSampleFat.ptr, (uint64_t)trace ); - TracyLfqCommit; - } - } - } - s_ring[i].Advance( hdr.size ); - } - if( !traceActive.load( std::memory_order_relaxed) ) break; - if( !hadData ) - { - std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); - } - } - - for( int i=0; i(); - tracy_free( s_ring ); - }, nullptr ); -} - -#ifdef __ANDROID__ -static bool TraceWrite( const char* path, size_t psz, const char* val, size_t vsz ) -{ - // Explanation for "su root sh -c": there are 2 flavors of "su" in circulation - // on Android. The default Android su has the following syntax to run a command - // as root: - // su root 'command' - // and 'command' is exec'd not passed to a shell, so if shell interpretation is - // wanted, one needs to do: - // su root sh -c 'command' - // Besides that default Android 'su' command, some Android devices use a different - // su with a command-line interface closer to the familiar util-linux su found - // on Linux distributions. Fortunately, both the util-linux su and the one - // in https://github.com/topjohnwu/Magisk seem to be happy with the above - // `su root sh -c 'command'` command line syntax. 
- char tmp[256]; - sprintf( tmp, "su root sh -c 'echo \"%s\" > %s%s'", val, BasePath, path ); - return system( tmp ) == 0; -} -#else -static bool TraceWrite( const char* path, size_t psz, const char* val, size_t vsz ) -{ - char tmp[256]; - memcpy( tmp, BasePath, sizeof( BasePath ) - 1 ); - memcpy( tmp + sizeof( BasePath ) - 1, path, psz ); - - int fd = open( tmp, O_WRONLY ); - if( fd < 0 ) return false; - - for(;;) - { - ssize_t cnt = write( fd, val, vsz ); - if( cnt == (ssize_t)vsz ) + const int fd = perf_event_open( &pe, pid, 0, -1, PERF_FLAG_FD_CLOEXEC ); + if( fd != -1 ) { close( fd ); - return true; + break; } - if( cnt < 0 ) + pe.precise_ip--; + } + pe.config = config0; + while( pe.precise_ip != 0 ) + { + const int fd = perf_event_open( &pe, pid, 0, -1, PERF_FLAG_FD_CLOEXEC ); + if( fd != -1 ) { close( fd ); - return false; + break; } - vsz -= cnt; - val += cnt; + pe.precise_ip--; } + TracyDebug( " Probed precise_ip: %i\n", pe.precise_ip ); } -#endif -#ifdef __ANDROID__ -void SysTraceInjectPayload() +static void ProbePreciseIp( perf_event_attr& pe, pid_t pid ) { - int pipefd[2]; - if( pipe( pipefd ) == 0 ) + pe.precise_ip = 3; + while( pe.precise_ip != 0 ) { - const auto pid = fork(); - if( pid == 0 ) + const int fd = perf_event_open( &pe, pid, 0, -1, PERF_FLAG_FD_CLOEXEC ); + if( fd != -1 ) { - // child - close( pipefd[1] ); - if( dup2( pipefd[0], STDIN_FILENO ) >= 0 ) - { - close( pipefd[0] ); - execlp( "su", "su", "root", "sh", "-c", "cat > /data/tracy_systrace", (char*)nullptr ); - exit( 1 ); - } - } - else if( pid > 0 ) - { - // parent - close( pipefd[0] ); - -#ifdef __aarch64__ - write( pipefd[1], tracy_systrace_aarch64_data, tracy_systrace_aarch64_size ); -#else - write( pipefd[1], tracy_systrace_armv7_data, tracy_systrace_armv7_size ); -#endif - close( pipefd[1] ); - waitpid( pid, nullptr, 0 ); - - system( "su root sh -c 'chmod 700 /data/tracy_systrace'" ); + close( fd ); + break; } + pe.precise_ip--; } + TracyDebug( " Probed precise_ip: %i\n", pe.precise_ip ); } + +static bool IsGenuineIntel() +{ +#if defined __i386 || defined __x86_64__ + uint32_t regs[4] = {}; + __get_cpuid( 0, regs, regs+1, regs+2, regs+3 ); + char manufacturer[12]; + memcpy( manufacturer, regs+1, 4 ); + memcpy( manufacturer+4, regs+3, 4 ); + memcpy( manufacturer+8, regs+2, 4 ); + return memcmp( manufacturer, "GenuineIntel", 12 ) == 0; +#else + return false; #endif +} + +static const char* ReadFile( const char* path ) +{ + int fd = open( path, O_RDONLY ); + if( fd < 0 ) return nullptr; + + static char tmp[64]; + const auto cnt = read( fd, tmp, 63 ); + close( fd ); + if( cnt < 0 ) return nullptr; + tmp[cnt] = '\0'; + return tmp; +} bool SysTraceStart( int64_t& samplingPeriod ) { @@ -889,374 +748,776 @@ bool SysTraceStart( int64_t& samplingPeriod ) return false; #endif - if( !TraceWrite( TracingOn, sizeof( TracingOn ), "0", 2 ) ) return false; - if( !TraceWrite( CurrentTracer, sizeof( CurrentTracer ), "nop", 4 ) ) return false; - TraceWrite( TraceOptions, sizeof( TraceOptions ), "norecord-cmd", 13 ); - TraceWrite( TraceOptions, sizeof( TraceOptions ), "norecord-tgid", 14 ); - TraceWrite( TraceOptions, sizeof( TraceOptions ), "noirq-info", 11 ); - TraceWrite( TraceOptions, sizeof( TraceOptions ), "noannotate", 11 ); -#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) - if( !TraceWrite( TraceClock, sizeof( TraceClock ), "x86-tsc", 8 ) ) return false; + const auto paranoidLevelStr = ReadFile( "/proc/sys/kernel/perf_event_paranoid" ); + if( 
!paranoidLevelStr ) return false; +#ifdef TRACY_VERBOSE + int paranoidLevel = 2; + paranoidLevel = atoi( paranoidLevelStr ); + TracyDebug( "perf_event_paranoid: %i\n", paranoidLevel ); +#endif + + int switchId = -1, wakeupId = -1, vsyncId = -1; + const auto switchIdStr = ReadFile( "/sys/kernel/debug/tracing/events/sched/sched_switch/id" ); + if( switchIdStr ) switchId = atoi( switchIdStr ); + const auto wakeupIdStr = ReadFile( "/sys/kernel/debug/tracing/events/sched/sched_wakeup/id" ); + if( wakeupIdStr ) wakeupId = atoi( wakeupIdStr ); + const auto vsyncIdStr = ReadFile( "/sys/kernel/debug/tracing/events/drm/drm_vblank_event/id" ); + if( vsyncIdStr ) vsyncId = atoi( vsyncIdStr ); + + TracyDebug( "sched_switch id: %i\n", switchId ); + TracyDebug( "sched_wakeup id: %i\n", wakeupId ); + TracyDebug( "drm_vblank_event id: %i\n", vsyncId ); + +#ifdef TRACY_NO_SAMPLE_RETIREMENT + const bool noRetirement = true; #else - if( !TraceWrite( TraceClock, sizeof( TraceClock ), "mono_raw", 9 ) ) return false; -#endif - if( !TraceWrite( SchedSwitch, sizeof( SchedSwitch ), "1", 2 ) ) return false; - if( !TraceWrite( SchedWakeup, sizeof( SchedWakeup ), "1", 2 ) ) return false; - if( !TraceWrite( BufferSizeKb, sizeof( BufferSizeKb ), "4096", 5 ) ) return false; - -#if defined __ANDROID__ && ( defined __aarch64__ || defined __ARM_ARCH ) - SysTraceInjectPayload(); + const char* noRetirementEnv = GetEnvVar( "TRACY_NO_SAMPLE_RETIREMENT" ); + const bool noRetirement = noRetirementEnv && noRetirementEnv[0] == '1'; #endif - if( !TraceWrite( TracingOn, sizeof( TracingOn ), "1", 2 ) ) return false; +#ifdef TRACY_NO_SAMPLE_CACHE + const bool noCache = true; +#else + const char* noCacheEnv = GetEnvVar( "TRACY_NO_SAMPLE_CACHE" ); + const bool noCache = noCacheEnv && noCacheEnv[0] == '1'; +#endif + +#ifdef TRACY_NO_SAMPLE_BRANCH + const bool noBranch = true; +#else + const char* noBranchEnv = GetEnvVar( "TRACY_NO_SAMPLE_BRANCH" ); + const bool noBranch = noBranchEnv && noBranchEnv[0] == '1'; +#endif + +#ifdef TRACY_NO_CONTEXT_SWITCH + const bool noCtxSwitch = true; +#else + const char* noCtxSwitchEnv = GetEnvVar( "TRACY_NO_CONTEXT_SWITCH" ); + const bool noCtxSwitch = noCtxSwitchEnv && noCtxSwitchEnv[0] == '1'; +#endif + +#ifdef TRACY_NO_VSYNC_CAPTURE + const bool noVsync = true; +#else + const char* noVsyncEnv = GetEnvVar( "TRACY_NO_VSYNC_CAPTURE" ); + const bool noVsync = noVsyncEnv && noVsyncEnv[0] == '1'; +#endif + + samplingPeriod = GetSamplingPeriod(); + uint32_t currentPid = (uint32_t)getpid(); + + s_numCpus = (int)std::thread::hardware_concurrency(); + + const auto maxNumBuffers = s_numCpus * ( + 1 + // software sampling + 2 + // CPU cycles + instructions retired + 2 + // cache reference + miss + 2 + // branch retired + miss + 2 + // context switches + wakeups + 1 // vsync + ); + s_ring = (RingBuffer*)tracy_malloc( sizeof( RingBuffer ) * maxNumBuffers ); + s_numBuffers = 0; + + // software sampling + perf_event_attr pe = {}; + pe.type = PERF_TYPE_SOFTWARE; + pe.size = sizeof( perf_event_attr ); + pe.config = PERF_COUNT_SW_CPU_CLOCK; + pe.sample_freq = GetSamplingFrequency(); + pe.sample_type = PERF_SAMPLE_TID | PERF_SAMPLE_TIME | PERF_SAMPLE_CALLCHAIN; +#if LINUX_VERSION_CODE >= KERNEL_VERSION( 4, 8, 0 ) + pe.sample_max_stack = 127; +#endif + pe.disabled = 1; + pe.freq = 1; + pe.inherit = 1; +#if !defined TRACY_HW_TIMER || !( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) + pe.use_clockid = 1; + pe.clockid = CLOCK_MONOTONIC_RAW; +#endif + + TracyDebug( "Setup software sampling\n" ); + 
ProbePreciseIp( pe, currentPid ); + for( int i=0; i= KERNEL_VERSION( 4, 8, 0 ) + pe.sample_max_stack = 127; +#endif + pe.disabled = 1; + pe.inherit = 1; + pe.config = switchId; +#if !defined TRACY_HW_TIMER || !( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) + pe.use_clockid = 1; + pe.clockid = CLOCK_MONOTONIC_RAW; +#endif + + TracyDebug( "Setup context switch capture\n" ); + for( int i=0; i~Thread(); - tracy_free( s_threadSampling ); + const auto test = (int64_t)trace[cnt]; + const auto m1 = test >> 63; + const auto m2 = test >> 47; + if( m1 == m2 ) break; } -} - -static uint64_t ReadNumber( const char*& data ) -{ - auto ptr = data; - assert( *ptr >= '0' && *ptr <= '9' ); - uint64_t val = *ptr++ - '0'; - for(;;) + while( --cnt > 0 ); + for( uint64_t j=1; j 9 ) break; - val = val * 10 + v; - ptr++; + const auto test = (int64_t)trace[j]; + const auto m1 = test >> 63; + const auto m2 = test >> 47; + if( m1 != m2 ) trace[j] = 0; } - data = ptr; - return val; -} - -static uint8_t ReadState( char state ) -{ - switch( state ) - { - case 'D': return 101; - case 'I': return 102; - case 'R': return 103; - case 'S': return 104; - case 'T': return 105; - case 't': return 106; - case 'W': return 107; - case 'X': return 108; - case 'Z': return 109; - default: return 100; - } -} - -#if defined __ANDROID__ && defined __ANDROID_API__ && __ANDROID_API__ < 18 -/*- - * Copyright (c) 2011 The NetBSD Foundation, Inc. - * All rights reserved. - * - * This code is derived from software contributed to The NetBSD Foundation - * by Christos Zoulas. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS - * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -ssize_t getdelim(char **buf, size_t *bufsiz, int delimiter, FILE *fp) -{ - char *ptr, *eptr; - - if (*buf == NULL || *bufsiz == 0) { - *bufsiz = BUFSIZ; - if ((*buf = (char*)malloc(*bufsiz)) == NULL) - return -1; - } - - for (ptr = *buf, eptr = *buf + *bufsiz;;) { - int c = fgetc(fp); - if (c == -1) { - if (feof(fp)) - return ptr == *buf ? 
-1 : ptr - *buf; - else - return -1; - } - *ptr++ = c; - if (c == delimiter) { - *ptr = '\0'; - return ptr - *buf; - } - if (ptr + 2 >= eptr) { - char *nbuf; - size_t nbufsiz = *bufsiz * 2; - ssize_t d = ptr - *buf; - if ((nbuf = (char*)realloc(*buf, nbufsiz)) == NULL) - return -1; - *buf = nbuf; - *bufsiz = nbufsiz; - eptr = nbuf + nbufsiz; - ptr = nbuf + d; - } - } -} - -ssize_t getline(char **buf, size_t *bufsiz, FILE *fp) -{ - return getdelim(buf, bufsiz, '\n', fp); -} #endif -static void HandleTraceLine( const char* line ) -{ - line += 23; - while( *line != '[' ) line++; - line++; - const auto cpu = (uint8_t)ReadNumber( line ); - line++; // ']' - while( *line == ' ' ) line++; - -#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) - const auto time = ReadNumber( line ); -#else - const auto ts = ReadNumber( line ); - line++; // '.' - const auto tus = ReadNumber( line ); - const auto time = ts * 1000000000ll + tus * 1000ll; -#endif - - line += 2; // ': ' - if( memcmp( line, "sched_switch", 12 ) == 0 ) + for( uint64_t j=1; j<=cnt; j++ ) { - line += 14; - - while( memcmp( line, "prev_pid", 8 ) != 0 ) line++; - line += 9; - - const auto oldPid = ReadNumber( line ); - line++; - - while( memcmp( line, "prev_state", 10 ) != 0 ) line++; - line += 11; - - const auto oldState = (uint8_t)ReadState( *line ); - line += 5; - - while( memcmp( line, "next_pid", 8 ) != 0 ) line++; - line += 9; - - const auto newPid = ReadNumber( line ); - - uint8_t reason = 100; - - TracyLfqPrepare( QueueType::ContextSwitch ); - MemWrite( &item->contextSwitch.time, time ); - MemWrite( &item->contextSwitch.oldThread, oldPid ); - MemWrite( &item->contextSwitch.newThread, newPid ); - MemWrite( &item->contextSwitch.cpu, cpu ); - MemWrite( &item->contextSwitch.reason, reason ); - MemWrite( &item->contextSwitch.state, oldState ); - TracyLfqCommit; + if( trace[j] >= (uint64_t)-4095 ) // PERF_CONTEXT_MAX + { + memmove( trace+j, trace+j+1, sizeof( uint64_t ) * ( cnt - j ) ); + cnt--; + } } - else if( memcmp( line, "sched_wakeup", 12 ) == 0 ) - { - line += 14; - while( memcmp( line, "pid=", 4 ) != 0 ) line++; - line += 4; - - const auto pid = ReadNumber( line ); - - TracyLfqPrepare( QueueType::ThreadWakeup ); - MemWrite( &item->threadWakeup.time, time ); - MemWrite( &item->threadWakeup.thread, pid ); - TracyLfqCommit; - } + memcpy( trace, &cnt, sizeof( uint64_t ) ); + return trace; } -#ifdef __ANDROID__ -static void ProcessTraceLines( int fd ) +void SysTraceWorker( void* ptr ) { - // Linux pipe buffer is 64KB, additional 1KB is for unfinished lines - char* buf = (char*)tracy_malloc( (64+1)*1024 ); - char* line = buf; - + ThreadExitHandler threadExitHandler; + SetThreadName( "Tracy Sampling" ); + InitRpmalloc(); + sched_param sp = { 99 }; + if( pthread_setschedparam( pthread_self(), SCHED_FIFO, &sp ) != 0 ) TracyDebug( "Failed to increase SysTraceWorker thread priority!\n" ); + auto ctxBufferIdx = s_ctxBufferIdx; + auto ringArray = s_ring; + auto numBuffers = s_numBuffers; + for( int i=0; i buf && *line != '\n' ) line--; - if( line > buf ) + auto& ring = ringArray[i]; + const auto head = ring.LoadHead(); + const auto tail = ring.GetTail(); + if( head != tail ) { - line++; - const auto lsz = end - line; - memmove( buf, line, lsz ); - line = buf + lsz; + const auto end = head - tail; + ring.Advance( end ); } } + if( !traceActive.load( std::memory_order_relaxed ) ) break; + std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); continue; } #endif - const auto end = line 
+ rd; - line = buf; - for(;;) - { - auto next = (char*)memchr( line, '\n', end - line ); - if( !next ) - { - const auto lsz = end - line; - memmove( buf, line, lsz ); - line = buf + lsz; - break; - } - HandleTraceLine( line ); - line = ++next; - } - if( rd < 64*1024 ) - { - std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); - } - } - - tracy_free( buf ); -} - -void SysTraceWorker( void* ptr ) -{ - ThreadExitHandler threadExitHandler; - SetThreadName( "Tracy SysTrace" ); - int pipefd[2]; - if( pipe( pipefd ) == 0 ) - { - const auto pid = fork(); - if( pid == 0 ) - { - // child - close( pipefd[0] ); - dup2( open( "/dev/null", O_WRONLY ), STDERR_FILENO ); - if( dup2( pipefd[1], STDOUT_FILENO ) >= 0 ) - { - close( pipefd[1] ); - sched_param sp = { 4 }; - pthread_setschedparam( pthread_self(), SCHED_FIFO, &sp ); -#if defined __ANDROID__ && ( defined __aarch64__ || defined __ARM_ARCH ) - execlp( "su", "su", "root", "sh", "-c", "/data/tracy_systrace", (char*)nullptr ); -#endif - execlp( "su", "su", "root", "sh", "-c", "cat /sys/kernel/debug/tracing/trace_pipe", (char*)nullptr ); - exit( 1 ); - } - } - else if( pid > 0 ) - { - // parent - close( pipefd[1] ); - sched_param sp = { 5 }; - pthread_setschedparam( pthread_self(), SCHED_FIFO, &sp ); - ProcessTraceLines( pipefd[0] ); - close( pipefd[0] ); - waitpid( pid, nullptr, 0 ); - } - } -} -#else -static void ProcessTraceLines( int fd ) -{ - char* buf = (char*)tracy_malloc( 64*1024 ); - - struct pollfd pfd; - pfd.fd = fd; - pfd.events = POLLIN | POLLERR; - - for(;;) - { - while( poll( &pfd, 1, 0 ) <= 0 ) + bool hadData = false; + for( int i=0; i tail ); + hadData = true; + + const auto id = ring.GetId(); + assert( id != EventContextSwitch ); + const auto end = head - tail; + uint64_t pos = 0; + if( id == EventCallstack ) + { + while( pos < end ) + { + perf_event_header hdr; + ring.Read( &hdr, pos, sizeof( perf_event_header ) ); + if( hdr.type == PERF_RECORD_SAMPLE ) + { + auto offset = pos + sizeof( perf_event_header ); + + // Layout: + // u32 pid, tid + // u64 time + // u64 cnt + // u64 ip[cnt] + + uint32_t tid; + uint64_t t0; + uint64_t cnt; + + offset += sizeof( uint32_t ); + ring.Read( &tid, offset, sizeof( uint32_t ) ); + offset += sizeof( uint32_t ); + ring.Read( &t0, offset, sizeof( uint64_t ) ); + offset += sizeof( uint64_t ); + ring.Read( &cnt, offset, sizeof( uint64_t ) ); + offset += sizeof( uint64_t ); + + if( cnt > 0 ) + { +#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) + t0 = ring.ConvertTimeToTsc( t0 ); +#endif + auto trace = GetCallstackBlock( cnt, ring, offset ); + + TracyLfqPrepare( QueueType::CallstackSample ); + MemWrite( &item->callstackSampleFat.time, t0 ); + MemWrite( &item->callstackSampleFat.thread, tid ); + MemWrite( &item->callstackSampleFat.ptr, (uint64_t)trace ); + TracyLfqCommit; + } + } + pos += hdr.size; + } + } + else + { + while( pos < end ) + { + perf_event_header hdr; + ring.Read( &hdr, pos, sizeof( perf_event_header ) ); + if( hdr.type == PERF_RECORD_SAMPLE ) + { + auto offset = pos + sizeof( perf_event_header ); + + // Layout: + // u64 ip + // u64 time + + uint64_t ip, t0; + ring.Read( &ip, offset, sizeof( uint64_t ) ); + offset += sizeof( uint64_t ); + ring.Read( &t0, offset, sizeof( uint64_t ) ); + +#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) + t0 = ring.ConvertTimeToTsc( t0 ); +#endif + QueueType type; + switch( id ) + { + case EventCpuCycles: + type = 
QueueType::HwSampleCpuCycle; + break; + case EventInstructionsRetired: + type = QueueType::HwSampleInstructionRetired; + break; + case EventCacheReference: + type = QueueType::HwSampleCacheReference; + break; + case EventCacheMiss: + type = QueueType::HwSampleCacheMiss; + break; + case EventBranchRetired: + type = QueueType::HwSampleBranchRetired; + break; + case EventBranchMiss: + type = QueueType::HwSampleBranchMiss; + break; + default: + assert( false ); + break; + } + + TracyLfqPrepare( type ); + MemWrite( &item->hwSample.ip, ip ); + MemWrite( &item->hwSample.time, t0 ); + TracyLfqCommit; + } + pos += hdr.size; + } + } + assert( pos == end ); + ring.Advance( end ); } + if( !traceActive.load( std::memory_order_relaxed ) ) break; - const auto rd = read( fd, buf, 64*1024 ); - if( rd <= 0 ) break; + if( ctxBufferIdx != numBuffers ) + { + const auto ctxBufNum = numBuffers - ctxBufferIdx; -#ifdef TRACY_ON_DEMAND - if( !GetProfiler().IsConnected() ) continue; + int activeNum = 0; + uint16_t active[512]; + uint32_t end[512]; + uint32_t pos[512]; + for( int i=0; i 0 ) + { + hadData = true; + while( activeNum > 0 ) + { + int sel = -1; + int selPos; + int64_t t0 = std::numeric_limits::max(); + for( int i=0; i= 0 ) + { + auto& ring = ringArray[ctxBufferIdx + sel]; + auto rbPos = pos[sel]; + auto offset = rbPos; + perf_event_header hdr; + ring.Read( &hdr, offset, sizeof( perf_event_header ) ); + +#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) + t0 = ring.ConvertTimeToTsc( t0 ); #endif - auto line = buf; - const auto end = buf + rd; - for(;;) + const auto rid = ring.GetId(); + if( rid == EventContextSwitch ) + { + // Layout: + // u64 time + // u64 cnt + // u64 ip[cnt] + // u32 size + // u8 data[size] + // Data (not ABI stable, but has not changed since it was added, in 2009): + // u8 hdr[8] + // u8 prev_comm[16] + // u32 prev_pid + // u32 prev_prio + // lng prev_state + // u8 next_comm[16] + // u32 next_pid + // u32 next_prio + + offset += sizeof( perf_event_header ) + sizeof( uint64_t ); + + uint64_t cnt; + ring.Read( &cnt, offset, sizeof( uint64_t ) ); + offset += sizeof( uint64_t ); + const auto traceOffset = offset; + offset += sizeof( uint64_t ) * cnt + sizeof( uint32_t ) + 8 + 16; + + uint32_t prev_pid, next_pid; + long prev_state; + + ring.Read( &prev_pid, offset, sizeof( uint32_t ) ); + offset += sizeof( uint32_t ) + sizeof( uint32_t ); + ring.Read( &prev_state, offset, sizeof( long ) ); + offset += sizeof( long ) + 16; + ring.Read( &next_pid, offset, sizeof( uint32_t ) ); + + uint8_t reason = 100; + uint8_t state; + + if( prev_state & 0x0001 ) state = 104; + else if( prev_state & 0x0002 ) state = 101; + else if( prev_state & 0x0004 ) state = 105; + else if( prev_state & 0x0008 ) state = 106; + else if( prev_state & 0x0010 ) state = 108; + else if( prev_state & 0x0020 ) state = 109; + else if( prev_state & 0x0040 ) state = 110; + else if( prev_state & 0x0080 ) state = 102; + else state = 103; + + TracyLfqPrepare( QueueType::ContextSwitch ); + MemWrite( &item->contextSwitch.time, t0 ); + MemWrite( &item->contextSwitch.oldThread, prev_pid ); + MemWrite( &item->contextSwitch.newThread, next_pid ); + MemWrite( &item->contextSwitch.cpu, uint8_t( ring.GetCpu() ) ); + MemWrite( &item->contextSwitch.reason, reason ); + MemWrite( &item->contextSwitch.state, state ); + TracyLfqCommit; + + if( cnt > 0 && prev_pid != 0 && CurrentProcOwnsThread( prev_pid ) ) + { + auto trace = GetCallstackBlock( cnt, ring, traceOffset ); + + TracyLfqPrepare( 
QueueType::CallstackSampleContextSwitch ); + MemWrite( &item->callstackSampleFat.time, t0 ); + MemWrite( &item->callstackSampleFat.thread, prev_pid ); + MemWrite( &item->callstackSampleFat.ptr, (uint64_t)trace ); + TracyLfqCommit; + } + } + else if( rid == EventWakeup ) + { + // Layout: + // u64 time + // u32 size + // u8 data[size] + // Data: + // u8 hdr[8] + // u8 comm[16] + // u32 pid + // u32 prio + // u64 target_cpu + + offset += sizeof( perf_event_header ) + sizeof( uint64_t ) + sizeof( uint32_t ) + 8 + 16; + + uint32_t pid; + ring.Read( &pid, offset, sizeof( uint32_t ) ); + + TracyLfqPrepare( QueueType::ThreadWakeup ); + MemWrite( &item->threadWakeup.time, t0 ); + MemWrite( &item->threadWakeup.thread, pid ); + TracyLfqCommit; + } + else + { + assert( rid == EventVsync ); + // Layout: + // u64 time + // u32 size + // u8 data[size] + // Data (not ABI stable): + // u8 hdr[8] + // i32 crtc + // u32 seq + // i64 ktime + // u8 high precision + + offset += sizeof( perf_event_header ) + sizeof( uint64_t ) + sizeof( uint32_t ) + 8; + + int32_t crtc; + ring.Read( &crtc, offset, sizeof( int32_t ) ); + + // Note: The timestamp value t0 might be off by a number of microseconds from the + // true hardware vblank event. The ktime value should be used instead, but it is + // measured in CLOCK_MONOTONIC time. Tracy only supports the timestamp counter + // register (TSC) or CLOCK_MONOTONIC_RAW clock. +#if 0 + offset += sizeof( uint32_t ) * 2; + int64_t ktime; + ring.Read( &ktime, offset, sizeof( int64_t ) ); +#endif + + TracyLfqPrepare( QueueType::FrameVsync ); + MemWrite( &item->frameVsync.id, crtc ); + MemWrite( &item->frameVsync.time, t0 ); + TracyLfqCommit; + } + + rbPos += hdr.size; + if( rbPos == end[sel] ) + { + memmove( active+selPos, active+selPos+1, sizeof(*active) * ( activeNum - selPos - 1 ) ); + activeNum--; + } + else + { + pos[sel] = rbPos; + } + } + } + for( int i=0; i 0 && buf[sz-1] == '\n' ) buf[sz-1] = '\0'; - GetProfiler().SendString( thread, buf, QueueType::ExternalThreadName ); + threadName = CopyString( buf ); fclose( f ); } else { - GetProfiler().SendString( thread, "???", 3, QueueType::ExternalThreadName ); + threadName = CopyString( "???", 3 ); } sprintf( fn, "/proc/%" PRIu64 "/status", thread ); f = fopen( fn, "rb" ); if( f ) { + char* tmp = (char*)tracy_malloc_fast( 8*1024 ); + const auto fsz = (ptrdiff_t)fread( tmp, 1, 8*1024, f ); + fclose( f ); + int pid = -1; - size_t lsz = 1024; - auto line = (char*)tracy_malloc( lsz ); + auto line = tmp; for(;;) { - auto rd = getline( &line, &lsz, f ); - if( rd <= 0 ) break; if( memcmp( "Tgid:\t", line, 6 ) == 0 ) { pid = atoi( line + 6 ); break; } + while( line - tmp < fsz && *line != '\n' ) line++; + if( *line != '\n' ) break; + line++; } - tracy_free( line ); - fclose( f ); + tracy_free_fast( tmp ); + if( pid >= 0 ) { { @@ -1310,13 +1575,13 @@ void SysTraceSendExternalName( uint64_t thread ) char buf[256]; const auto sz = fread( buf, 1, 256, f ); if( sz > 0 && buf[sz-1] == '\n' ) buf[sz-1] = '\0'; - GetProfiler().SendString( thread, buf, QueueType::ExternalName ); + name = CopyStringFast( buf ); fclose( f ); return; } } } - GetProfiler().SendString( thread, "???", 3, QueueType::ExternalName ); + name = CopyStringFast( "???", 3 ); } } diff --git a/Source/ThirdParty/tracy/client/TracySysTrace.hpp b/Source/ThirdParty/tracy/client/TracySysTrace.hpp index 688cbf2ae..8c663cd7a 100644 --- a/Source/ThirdParty/tracy/client/TracySysTrace.hpp +++ b/Source/ThirdParty/tracy/client/TracySysTrace.hpp @@ -1,8 +1,11 @@ #ifndef 
__TRACYSYSTRACE_HPP__ #define __TRACYSYSTRACE_HPP__ -#if !defined TRACY_NO_SYSTEM_TRACING && ( defined _WIN32 || defined __CYGWIN__ || defined __linux__ ) -# define TRACY_HAS_SYSTEM_TRACING +#if !defined TRACY_NO_SYSTEM_TRACING && ( defined _WIN32 || defined __linux__ ) +# include "../common/TracyUwp.hpp" +# ifndef TRACY_UWP +# define TRACY_HAS_SYSTEM_TRACING +# endif #endif #ifdef TRACY_HAS_SYSTEM_TRACING @@ -16,7 +19,7 @@ bool SysTraceStart( int64_t& samplingPeriod ); void SysTraceStop(); void SysTraceWorker( void* ptr ); -void SysTraceSendExternalName( uint64_t thread ); +void SysTraceGetExternalName( uint64_t thread, const char*& threadName, const char*& name ); } diff --git a/Source/ThirdParty/tracy/client/TracySysTracePayload.hpp b/Source/ThirdParty/tracy/client/TracySysTracePayload.hpp deleted file mode 100644 index 7c292f9d0..000000000 --- a/Source/ThirdParty/tracy/client/TracySysTracePayload.hpp +++ /dev/null @@ -1,78 +0,0 @@ -// File: 'extra/systrace/tracy_systrace.armv7' (1149 bytes) -// File: 'extra/systrace/tracy_systrace.aarch64' (1650 bytes) - -// Exported using binary_to_compressed_c.cpp - -namespace tracy -{ - -static const unsigned int tracy_systrace_armv7_size = 1149; -static const unsigned int tracy_systrace_armv7_data[1152/4] = -{ - 0x464c457f, 0x00010101, 0x00000000, 0x00000000, 0x00280003, 0x00000001, 0x000001f0, 0x00000034, 0x00000000, 0x05000200, 0x00200034, 0x00280007, - 0x00000000, 0x00000006, 0x00000034, 0x00000034, 0x00000034, 0x000000e0, 0x000000e0, 0x00000004, 0x00000004, 0x00000003, 0x00000114, 0x00000114, - 0x00000114, 0x00000013, 0x00000013, 0x00000004, 0x00000001, 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x000003fd, 0x000003fd, 0x00000005, - 0x00001000, 0x00000001, 0x000003fd, 0x000013fd, 0x000013fd, 0x00000080, 0x000000b3, 0x00000006, 0x00001000, 0x00000002, 0x00000400, 0x00001400, - 0x00001400, 0x0000007d, 0x000000b0, 0x00000006, 0x00000004, 0x6474e551, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000006, - 0x00000004, 0x70000001, 0x000003a4, 0x000003a4, 0x000003a4, 0x00000008, 0x00000008, 0x00000004, 0x00000004, 0x7379732f, 0x2f6d6574, 0x2f6e6962, - 0x6b6e696c, 0x00007265, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000001, 0x00000000, 0x00000000, 0x00000012, 0x00000016, 0x00000000, - 0x00000000, 0x00000012, 0x6f6c6400, 0x006e6570, 0x4342494c, 0x62696c00, 0x732e6c64, 0x6c64006f, 0x006d7973, 0x00000001, 0x00000003, 0x00000001, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00010001, 0x0000000d, 0x00000010, 0x00000000, 0x00050d63, 0x00020000, 0x00000008, - 0x00000000, 0x000014bc, 0x00000116, 0x000014c0, 0x00000216, 0xe52de004, 0xe59fe004, 0xe08fe00e, 0xe5bef008, 0x000012dc, 0xe28fc600, 0xe28cca01, - 0xe5bcf2dc, 0xe28fc600, 0xe28cca01, 0xe5bcf2d4, 0xe92d4ff0, 0xe28db01c, 0xe24dd024, 0xe24dd801, 0xe59f017c, 0xe3a01001, 0xe3a08001, 0xe08f0000, - 0xebfffff0, 0xe59f116c, 0xe1a04000, 0xe08f1001, 0xebffffef, 0xe59f1160, 0xe1a06000, 0xe1a00004, 0xe08f1001, 0xebffffea, 0xe59f1150, 0xe1a07000, - 0xe1a00004, 0xe08f1001, 0xebffffe5, 0xe59f1140, 0xe1a05000, 0xe1a00004, 0xe08f1001, 0xebffffe0, 0xe58d0004, 0xe1a00004, 0xe59f1128, 0xe08f1001, - 0xebffffdb, 0xe59f1120, 0xe1a0a000, 0xe1a00004, 0xe08f1001, 0xebffffd6, 0xe1a04000, 0xe59f010c, 0xe3a01000, 0xe3a09000, 0xe08f0000, 0xe12fff36, - 0xe1a06000, 0xe3700001, 0xca000001, 0xe3a00000, 0xe12fff37, 0xe3a00009, 0xe3a01001, 0xe1cd01bc, 0xe3a00008, 0xe1cd01b4, 0xe3090680, 0xe3400098, - 0xe3a02000, 0xe58d000c, 0xe28d0010, 0xe58d7000, 0xe58d6018, 0xe58d8010, 0xe58d9008, 
0xe12fff35, 0xe3500000, 0xca00001d, 0xe28d7018, 0xe28d8010, - 0xe28d9020, 0xe1a00007, 0xe3a01001, 0xe3a02000, 0xe12fff35, 0xe3500000, 0xda00000a, 0xe1a00006, 0xe1a01009, 0xe3a02801, 0xe12fff3a, 0xe3500001, - 0xba00000e, 0xe1a02000, 0xe3a00001, 0xe1a01009, 0xe12fff34, 0xea000003, 0xe59d2004, 0xe28d0008, 0xe3a01000, 0xe12fff32, 0xe1a00008, 0xe3a01001, - 0xe3a02000, 0xe12fff35, 0xe3500001, 0xbaffffe4, 0xe59d1000, 0xe3a00000, 0xe12fff31, 0xe24bd01c, 0xe8bd8ff0, 0x00000198, 0x00000190, 0x00000181, - 0x00000172, 0x00000163, 0x00000159, 0x0000014a, 0x00000138, 0x7ffffe4c, 0x00000001, 0x6362696c, 0x006f732e, 0x6e65706f, 0x69786500, 0x6f700074, - 0x6e006c6c, 0x736f6e61, 0x7065656c, 0x61657200, 0x72770064, 0x00657469, 0x7379732f, 0x72656b2f, 0x2f6c656e, 0x75626564, 0x72742f67, 0x6e696361, - 0x72742f67, 0x5f656361, 0x65706970, 0x00000000, 0x00000003, 0x000014b0, 0x00000002, 0x00000010, 0x00000017, 0x000001b4, 0x00000014, 0x00000011, - 0x00000015, 0x00000000, 0x00000006, 0x00000128, 0x0000000b, 0x00000010, 0x00000005, 0x00000158, 0x0000000a, 0x0000001c, 0x6ffffef5, 0x00000174, - 0x00000001, 0x0000000d, 0x0000001e, 0x00000008, 0x6ffffffb, 0x00000001, 0x6ffffff0, 0x0000018c, 0x6ffffffe, 0x00000194, 0x6fffffff, 0x00000001, -}; - -static const unsigned int tracy_systrace_aarch64_size = 1650; -static const unsigned int tracy_systrace_aarch64_data[1652/4] = -{ - 0x464c457f, 0x00010102, 0x00000000, 0x00000000, 0x00b70003, 0x00000001, 0x000002e0, 0x00000000, 0x00000040, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00380040, 0x00400006, 0x00000000, 0x00000006, 0x00000005, 0x00000040, 0x00000000, 0x00000040, 0x00000000, 0x00000040, 0x00000000, - 0x00000150, 0x00000000, 0x00000150, 0x00000000, 0x00000008, 0x00000000, 0x00000003, 0x00000004, 0x00000190, 0x00000000, 0x00000190, 0x00000000, - 0x00000190, 0x00000000, 0x00000015, 0x00000000, 0x00000015, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000005, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x000004e1, 0x00000000, 0x000004e1, 0x00000000, 0x00001000, 0x00000000, 0x00000001, 0x00000006, - 0x000004e8, 0x00000000, 0x000014e8, 0x00000000, 0x000014e8, 0x00000000, 0x0000018a, 0x00000000, 0x00000190, 0x00000000, 0x00001000, 0x00000000, - 0x00000002, 0x00000006, 0x000004e8, 0x00000000, 0x000014e8, 0x00000000, 0x000014e8, 0x00000000, 0x00000160, 0x00000000, 0x00000160, 0x00000000, - 0x00000008, 0x00000000, 0x6474e551, 0x00000006, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000008, 0x00000000, 0x7379732f, 0x2f6d6574, 0x2f6e6962, 0x6b6e696c, 0x34367265, 0x00000000, 0x00000001, 0x00000001, - 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00090003, 0x000002e0, 0x00000000, 0x00000000, 0x00000000, 0x00000010, 0x00000012, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x0000000a, 0x00000012, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x62696c00, 0x732e6c64, 0x6c64006f, 0x006d7973, 0x706f6c64, 0x4c006e65, - 0x00434249, 0x00000000, 0x00000000, 0x00000000, 0x00010001, 0x00000001, 0x00000010, 0x00000000, 0x00050d63, 0x00020000, 0x00000017, 0x00000000, - 0x00001668, 0x00000000, 0x00000402, 0x00000002, 0x00000000, 0x00000000, 0x00001670, 0x00000000, 0x00000402, 0x00000003, 0x00000000, 0x00000000, - 0xa9bf7bf0, 0xb0000010, 0xf9433211, 0x91198210, 0xd61f0220, 0xd503201f, 0xd503201f, 0xd503201f, 0xb0000010, 0xf9433611, 0x9119a210, 0xd61f0220, 
- 0xb0000010, 0xf9433a11, 0x9119c210, 0xd61f0220, 0xa9bb67fc, 0xa9015ff8, 0xa90257f6, 0xa9034ff4, 0xa9047bfd, 0x910103fd, 0xd14043ff, 0xd10083ff, - 0x90000000, 0x91124000, 0x52800021, 0x52800039, 0x97ffffec, 0x90000001, 0x91126021, 0xaa0003f7, 0x97ffffec, 0x90000001, 0xaa0003f8, 0x91127421, - 0xaa1703e0, 0x97ffffe7, 0x90000001, 0xaa0003f3, 0x91128821, 0xaa1703e0, 0x97ffffe2, 0x90000001, 0xaa0003f4, 0x91129c21, 0xaa1703e0, 0x97ffffdd, - 0x90000001, 0xaa0003f5, 0x9112c421, 0xaa1703e0, 0x97ffffd8, 0x90000001, 0xaa0003f6, 0x9112d821, 0xaa1703e0, 0x97ffffd3, 0xaa0003f7, 0x90000000, - 0x9112f000, 0x2a1f03e1, 0xd63f0300, 0x2a0003f8, 0x36f80060, 0x2a1f03e0, 0xd63f0260, 0x90000009, 0x3dc12120, 0x52800128, 0x79003be8, 0x52800108, - 0x910043e0, 0x52800021, 0x2a1f03e2, 0xb9001bf8, 0xb90013f9, 0x79002be8, 0x3d8003e0, 0xd63f0280, 0x7100001f, 0x5400036c, 0x910063e0, 0x52800021, - 0x2a1f03e2, 0xd63f0280, 0x7100001f, 0x5400018d, 0x910083e1, 0x52a00022, 0x2a1803e0, 0xd63f02c0, 0xf100041f, 0x540001eb, 0xaa0003e2, 0x910083e1, - 0x52800020, 0xd63f02e0, 0x14000004, 0x910003e0, 0xaa1f03e1, 0xd63f02a0, 0x910043e0, 0x52800021, 0x2a1f03e2, 0xd63f0280, 0x7100041f, 0x54fffceb, - 0x2a1f03e0, 0xd63f0260, 0x914043ff, 0x910083ff, 0xa9447bfd, 0xa9434ff4, 0xa94257f6, 0xa9415ff8, 0xa8c567fc, 0xd65f03c0, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00989680, 0x00000000, 0x6362696c, 0x006f732e, 0x6e65706f, 0x69786500, 0x6f700074, 0x6e006c6c, 0x736f6e61, 0x7065656c, - 0x61657200, 0x72770064, 0x00657469, 0x7379732f, 0x72656b2f, 0x2f6c656e, 0x75626564, 0x72742f67, 0x6e696361, 0x72742f67, 0x5f656361, 0x65706970, - 0x00000000, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x6ffffef5, 0x00000000, 0x000001a8, 0x00000000, 0x00000005, 0x00000000, - 0x00000228, 0x00000000, 0x00000006, 0x00000000, 0x000001c8, 0x00000000, 0x0000000a, 0x00000000, 0x0000001c, 0x00000000, 0x0000000b, 0x00000000, - 0x00000018, 0x00000000, 0x00000015, 0x00000000, 0x00000000, 0x00000000, 0x00000003, 0x00000000, 0x00001650, 0x00000000, 0x00000002, 0x00000000, - 0x00000030, 0x00000000, 0x00000014, 0x00000000, 0x00000007, 0x00000000, 0x00000017, 0x00000000, 0x00000270, 0x00000000, 0x0000001e, 0x00000000, - 0x00000008, 0x00000000, 0x6ffffffb, 0x00000000, 0x00000001, 0x00000000, 0x6ffffffe, 0x00000000, 0x00000250, 0x00000000, 0x6fffffff, 0x00000000, - 0x00000001, 0x00000000, 0x6ffffff0, 0x00000000, 0x00000244, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x000002a0, 0x00000000, 0x000002a0, -}; - -} diff --git a/Source/ThirdParty/tracy/client/TracyThread.hpp b/Source/ThirdParty/tracy/client/TracyThread.hpp index edd255e87..5638756ac 100644 --- a/Source/ThirdParty/tracy/client/TracyThread.hpp +++ b/Source/ThirdParty/tracy/client/TracyThread.hpp @@ -1,7 +1,7 @@ #ifndef __TRACYTHREAD_HPP__ #define __TRACYTHREAD_HPP__ -#if defined _WIN32 || defined __CYGWIN__ +#if defined _WIN32 # include #else # include @@ -14,18 +14,23 @@ namespace tracy { +#ifdef TRACY_MANUAL_LIFETIME +extern thread_local bool RpThreadInitDone; +#endif + class ThreadExitHandler { public: ~ThreadExitHandler() { #ifdef TRACY_MANUAL_LIFETIME - rpmalloc_thread_finalize(); + rpmalloc_thread_finalize( 1 ); + RpThreadInitDone = false; 
#endif } }; -#if defined _WIN32 || defined __CYGWIN__ +#if defined _WIN32 class Thread { diff --git a/Source/ThirdParty/tracy/client/tracy_SPSCQueue.h b/Source/ThirdParty/tracy/client/tracy_SPSCQueue.h new file mode 100644 index 000000000..7f1752b56 --- /dev/null +++ b/Source/ThirdParty/tracy/client/tracy_SPSCQueue.h @@ -0,0 +1,148 @@ +/* +Copyright (c) 2020 Erik Rigtorp + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + */ + +#pragma once + +#include +#include +#include +#include +#include // std::enable_if, std::is_*_constructible + +#include "../common/TracyAlloc.hpp" + +#if defined (_MSC_VER) +#pragma warning(push) +#pragma warning(disable:4324) +#endif + +namespace tracy { + +template class SPSCQueue { +public: + explicit SPSCQueue(const size_t capacity) + : capacity_(capacity) { + capacity_++; // Needs one slack element + slots_ = (T*)tracy_malloc(sizeof(T) * (capacity_ + 2 * kPadding)); + + static_assert(alignof(SPSCQueue) == kCacheLineSize, ""); + static_assert(sizeof(SPSCQueue) >= 3 * kCacheLineSize, ""); + assert(reinterpret_cast(&readIdx_) - + reinterpret_cast(&writeIdx_) >= + static_cast(kCacheLineSize)); + } + + ~SPSCQueue() { + while (front()) { + pop(); + } + tracy_free(slots_); + } + + // non-copyable and non-movable + SPSCQueue(const SPSCQueue &) = delete; + SPSCQueue &operator=(const SPSCQueue &) = delete; + + template + void emplace(Args &&...args) noexcept( + std::is_nothrow_constructible::value) { + static_assert(std::is_constructible::value, + "T must be constructible with Args&&..."); + auto const writeIdx = writeIdx_.load(std::memory_order_relaxed); + auto nextWriteIdx = writeIdx + 1; + if (nextWriteIdx == capacity_) { + nextWriteIdx = 0; + } + while (nextWriteIdx == readIdxCache_) { + readIdxCache_ = readIdx_.load(std::memory_order_acquire); + } + new (&slots_[writeIdx + kPadding]) T(std::forward(args)...); + writeIdx_.store(nextWriteIdx, std::memory_order_release); + } + + T *front() noexcept { + auto const readIdx = readIdx_.load(std::memory_order_relaxed); + if (readIdx == writeIdxCache_) { + writeIdxCache_ = writeIdx_.load(std::memory_order_acquire); + if (writeIdxCache_ == readIdx) { + return nullptr; + } + } + return &slots_[readIdx + kPadding]; + } + + void pop() noexcept { + static_assert(std::is_nothrow_destructible::value, + "T must be nothrow destructible"); + auto const readIdx = readIdx_.load(std::memory_order_relaxed); + assert(writeIdx_.load(std::memory_order_acquire) != readIdx); + slots_[readIdx + kPadding].~T(); + auto nextReadIdx = readIdx + 1; + if (nextReadIdx == 
capacity_) { + nextReadIdx = 0; + } + readIdx_.store(nextReadIdx, std::memory_order_release); + } + + size_t size() const noexcept { + std::ptrdiff_t diff = writeIdx_.load(std::memory_order_acquire) - + readIdx_.load(std::memory_order_acquire); + if (diff < 0) { + diff += capacity_; + } + return static_cast(diff); + } + + bool empty() const noexcept { + return writeIdx_.load(std::memory_order_acquire) == + readIdx_.load(std::memory_order_acquire); + } + + size_t capacity() const noexcept { return capacity_ - 1; } + +private: + static constexpr size_t kCacheLineSize = 64; + + // Padding to avoid false sharing between slots_ and adjacent allocations + static constexpr size_t kPadding = (kCacheLineSize - 1) / sizeof(T) + 1; + +private: + size_t capacity_; + T *slots_; + + // Align to cache line size in order to avoid false sharing + // readIdxCache_ and writeIdxCache_ is used to reduce the amount of cache + // coherency traffic + alignas(kCacheLineSize) std::atomic writeIdx_ = {0}; + alignas(kCacheLineSize) size_t readIdxCache_ = 0; + alignas(kCacheLineSize) std::atomic readIdx_ = {0}; + alignas(kCacheLineSize) size_t writeIdxCache_ = 0; + + // Padding to avoid adjacent allocations to share cache line with + // writeIdxCache_ + char padding_[kCacheLineSize - sizeof(SPSCQueue::writeIdxCache_)]; +}; +} // namespace rigtorp + +#if defined (_MSC_VER) +#pragma warning(pop) +#endif diff --git a/Source/ThirdParty/tracy/client/tracy_concurrentqueue.h b/Source/ThirdParty/tracy/client/tracy_concurrentqueue.h index bf095bc36..3b88b20b6 100644 --- a/Source/ThirdParty/tracy/client/tracy_concurrentqueue.h +++ b/Source/ThirdParty/tracy/client/tracy_concurrentqueue.h @@ -201,7 +201,7 @@ namespace details ConcurrentQueueProducerTypelessBase* next; std::atomic inactive; ProducerToken* token; - uint64_t threadId; + uint32_t threadId; ConcurrentQueueProducerTypelessBase() : next(nullptr), inactive(false), token(nullptr), threadId(0) @@ -209,19 +209,19 @@ namespace details } }; - template - static inline bool circular_less_than(T a, T b) - { #ifdef _MSC_VER #pragma warning(push) #pragma warning(disable: 4554) #endif + template + static inline bool circular_less_than(T a, T b) + { static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "circular_less_than is intended to be used only with unsigned integer types"); - return static_cast(a - b) > static_cast(static_cast(1) << static_cast(sizeof(T) * CHAR_BIT - 1)); + return static_cast(a - b) > (static_cast(static_cast(1) << static_cast(sizeof(T) * CHAR_BIT - 1))); + } #ifdef _MSC_VER #pragma warning(pop) #endif - } template static inline char* align_for(char* ptr) diff --git a/Source/ThirdParty/tracy/client/tracy_rpmalloc.cpp b/Source/ThirdParty/tracy/client/tracy_rpmalloc.cpp index 8aae78e03..8efa626a9 100644 --- a/Source/ThirdParty/tracy/client/tracy_rpmalloc.cpp +++ b/Source/ThirdParty/tracy/client/tracy_rpmalloc.cpp @@ -1,6 +1,6 @@ #ifdef TRACY_ENABLE -/* rpmalloc.c - Memory allocator - Public Domain - 2016 Mattias Jansson +/* rpmalloc.c - Memory allocator - Public Domain - 2016-2020 Mattias Jansson * * This library provides a cross-platform lock free thread caching malloc implementation in C11. 
* The latest source code is always available at @@ -13,7 +13,26 @@ #include "tracy_rpmalloc.hpp" +#define BUILD_DYNAMIC_LINK 1 + +//////////// +/// /// Build time configurable limits +/// +////// + +#if defined(__clang__) +#pragma clang diagnostic ignored "-Wunused-macros" +#pragma clang diagnostic ignored "-Wunused-function" +#if __has_warning("-Wreserved-identifier") +#pragma clang diagnostic ignored "-Wreserved-identifier" +#endif +#elif defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wunused-macros" +#pragma GCC diagnostic ignored "-Wunused-function" +#pragma GCC diagnostic ignored "-Warray-bounds" +#endif + #ifndef HEAP_ARRAY_SIZE //! Size of heap hashmap #define HEAP_ARRAY_SIZE 47 @@ -47,59 +66,46 @@ #define ENABLE_PRELOAD 0 #endif #ifndef DISABLE_UNMAP -//! Disable unmapping memory pages +//! Disable unmapping memory pages (also enables unlimited cache) #define DISABLE_UNMAP 0 #endif +#ifndef ENABLE_UNLIMITED_CACHE +//! Enable unlimited global cache (no unmapping until finalization) +#define ENABLE_UNLIMITED_CACHE 0 +#endif +#ifndef ENABLE_ADAPTIVE_THREAD_CACHE +//! Enable adaptive thread cache size based on use heuristics +#define ENABLE_ADAPTIVE_THREAD_CACHE 0 +#endif #ifndef DEFAULT_SPAN_MAP_COUNT //! Default number of spans to map in call to map more virtual memory (default values yield 4MiB here) #define DEFAULT_SPAN_MAP_COUNT 64 #endif - -#if ENABLE_THREAD_CACHE -#ifndef ENABLE_UNLIMITED_CACHE -//! Unlimited thread and global cache -#define ENABLE_UNLIMITED_CACHE 0 -#endif -#ifndef ENABLE_UNLIMITED_THREAD_CACHE -//! Unlimited cache disables any thread cache limitations -#define ENABLE_UNLIMITED_THREAD_CACHE ENABLE_UNLIMITED_CACHE -#endif -#if !ENABLE_UNLIMITED_THREAD_CACHE -#ifndef THREAD_CACHE_MULTIPLIER -//! Multiplier for thread cache (cache limit will be span release count multiplied by this value) -#define THREAD_CACHE_MULTIPLIER 16 -#endif -#ifndef ENABLE_ADAPTIVE_THREAD_CACHE -//! Enable adaptive size of per-thread cache (still bounded by THREAD_CACHE_MULTIPLIER hard limit) -#define ENABLE_ADAPTIVE_THREAD_CACHE 0 -#endif -#endif -#endif - -#if ENABLE_GLOBAL_CACHE && ENABLE_THREAD_CACHE -#ifndef ENABLE_UNLIMITED_GLOBAL_CACHE -//! Unlimited cache disables any global cache limitations -#define ENABLE_UNLIMITED_GLOBAL_CACHE ENABLE_UNLIMITED_CACHE -#endif -#if !ENABLE_UNLIMITED_GLOBAL_CACHE -//! Multiplier for global cache (cache limit will be span release count multiplied by this value) -#define GLOBAL_CACHE_MULTIPLIER (THREAD_CACHE_MULTIPLIER * 6) -#endif -#else -# undef ENABLE_GLOBAL_CACHE -# define ENABLE_GLOBAL_CACHE 0 -#endif - -#if !ENABLE_THREAD_CACHE || ENABLE_UNLIMITED_THREAD_CACHE -# undef ENABLE_ADAPTIVE_THREAD_CACHE -# define ENABLE_ADAPTIVE_THREAD_CACHE 0 +#ifndef GLOBAL_CACHE_MULTIPLIER +//! 
Multiplier for global cache +#define GLOBAL_CACHE_MULTIPLIER 8 #endif #if DISABLE_UNMAP && !ENABLE_GLOBAL_CACHE -# error Must use global cache if unmap is disabled +#error Must use global cache if unmap is disabled #endif -#if defined( _WIN32 ) || defined( __WIN32__ ) || defined( _WIN64 ) +#if DISABLE_UNMAP +#undef ENABLE_UNLIMITED_CACHE +#define ENABLE_UNLIMITED_CACHE 1 +#endif + +#if !ENABLE_GLOBAL_CACHE +#undef ENABLE_UNLIMITED_CACHE +#define ENABLE_UNLIMITED_CACHE 0 +#endif + +#if !ENABLE_THREAD_CACHE +#undef ENABLE_ADAPTIVE_THREAD_CACHE +#define ENABLE_ADAPTIVE_THREAD_CACHE 0 +#endif + +#if defined(_WIN32) || defined(__WIN32__) || defined(_WIN64) # define PLATFORM_WINDOWS 1 # define PLATFORM_POSIX 0 #else @@ -107,13 +113,14 @@ # define PLATFORM_POSIX 1 #endif -#define _Static_assert static_assert - /// Platform and arch specifics -#ifndef FORCEINLINE -# if defined(_MSC_VER) && !defined(__clang__) +#if defined(_MSC_VER) && !defined(__clang__) +# pragma warning (disable: 5105) +# ifndef FORCEINLINE # define FORCEINLINE inline __forceinline -# else +# endif +#else +# ifndef FORCEINLINE # define FORCEINLINE inline __attribute__((__always_inline__)) # endif #endif @@ -123,25 +130,62 @@ # endif # include # if ENABLE_VALIDATE_ARGS -# include +# include # endif #else # include # include # include +# include +# if defined(__linux__) || defined(__ANDROID__) +# include +# if !defined(PR_SET_VMA) +# define PR_SET_VMA 0x53564d41 +# define PR_SET_VMA_ANON_NAME 0 +# endif +# endif # if defined(__APPLE__) +# include +# if !TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR # include # include +# endif # include # endif -# if defined(__HAIKU__) -# include +# if defined(__HAIKU__) || defined(__TINYC__) # include # endif #endif #include #include +#include + +#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) +#include +static DWORD fls_key; +#endif + +#if PLATFORM_POSIX +# include +# include +# ifdef __FreeBSD__ +# include +# define MAP_HUGETLB MAP_ALIGNED_SUPER +# ifndef PROT_MAX +# define PROT_MAX(f) 0 +# endif +# else +# define PROT_MAX(f) 0 +# endif +# ifdef __sun +extern int madvise(caddr_t, size_t, int); +# endif +# ifndef MAP_UNINITIALIZED +# define MAP_UNINITIALIZED 0 +# endif +#endif +#include #if ENABLE_ASSERTS # undef NDEBUG @@ -149,47 +193,105 @@ # define _DEBUG # endif # include +#define RPMALLOC_TOSTRING_M(x) #x +#define RPMALLOC_TOSTRING(x) RPMALLOC_TOSTRING_M(x) +#define rpmalloc_assert(truth, message) \ + do { \ + if (!(truth)) { \ + if (_memory_config.error_callback) { \ + _memory_config.error_callback( \ + message " (" RPMALLOC_TOSTRING(truth) ") at " __FILE__ ":" RPMALLOC_TOSTRING(__LINE__)); \ + } else { \ + assert((truth) && message); \ + } \ + } \ + } while (0) #else -# undef assert -# define assert(x) do {} while(0) +# define rpmalloc_assert(truth, message) do {} while(0) #endif #if ENABLE_STATISTICS # include #endif -#include +////// +/// +/// Atomic access abstraction (since MSVC does not do C11 yet) +/// +////// -namespace tracy -{ +#include typedef std::atomic atomic32_t; typedef std::atomic atomic64_t; typedef std::atomic atomicptr_t; -#define atomic_thread_fence_acquire() std::atomic_thread_fence(std::memory_order_acquire) -#define atomic_thread_fence_release() std::atomic_thread_fence(std::memory_order_release) - static FORCEINLINE int32_t atomic_load32(atomic32_t* src) { return std::atomic_load_explicit(src, std::memory_order_relaxed); } static FORCEINLINE void atomic_store32(atomic32_t* dst, int32_t val) { std::atomic_store_explicit(dst, val, 
std::memory_order_relaxed); } static FORCEINLINE int32_t atomic_incr32(atomic32_t* val) { return std::atomic_fetch_add_explicit(val, 1, std::memory_order_relaxed) + 1; } -#if ENABLE_STATISTICS || ENABLE_ADAPTIVE_THREAD_CACHE -static FORCEINLINE int32_t atomic_decr32(atomic32_t* val) { return atomic_fetch_add_explicit(val, -1, memory_order_relaxed) - 1; } -#endif +static FORCEINLINE int32_t atomic_decr32(atomic32_t* val) { return std::atomic_fetch_add_explicit(val, -1, std::memory_order_relaxed) - 1; } static FORCEINLINE int32_t atomic_add32(atomic32_t* val, int32_t add) { return std::atomic_fetch_add_explicit(val, add, std::memory_order_relaxed) + add; } +static FORCEINLINE int atomic_cas32_acquire(atomic32_t* dst, int32_t val, int32_t ref) { return std::atomic_compare_exchange_weak_explicit(dst, &ref, val, std::memory_order_acquire, std::memory_order_relaxed); } +static FORCEINLINE void atomic_store32_release(atomic32_t* dst, int32_t val) { std::atomic_store_explicit(dst, val, std::memory_order_release); } +static FORCEINLINE int64_t atomic_load64(atomic64_t* val) { return std::atomic_load_explicit(val, std::memory_order_relaxed); } +static FORCEINLINE int64_t atomic_add64(atomic64_t* val, int64_t add) { return std::atomic_fetch_add_explicit(val, add, std::memory_order_relaxed) + add; } static FORCEINLINE void* atomic_load_ptr(atomicptr_t* src) { return std::atomic_load_explicit(src, std::memory_order_relaxed); } static FORCEINLINE void atomic_store_ptr(atomicptr_t* dst, void* val) { std::atomic_store_explicit(dst, val, std::memory_order_relaxed); } -static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { return std::atomic_compare_exchange_weak_explicit(dst, &ref, val, std::memory_order_release, std::memory_order_acquire); } +static FORCEINLINE void atomic_store_ptr_release(atomicptr_t* dst, void* val) { std::atomic_store_explicit(dst, val, std::memory_order_release); } +static FORCEINLINE void* atomic_exchange_ptr_acquire(atomicptr_t* dst, void* val) { return std::atomic_exchange_explicit(dst, val, std::memory_order_acquire); } +static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { return std::atomic_compare_exchange_weak_explicit(dst, &ref, val, std::memory_order_relaxed, std::memory_order_relaxed); } #if defined(_MSC_VER) && !defined(__clang__) -# define EXPECTED(x) (x) -# define UNEXPECTED(x) (x) + +#define EXPECTED(x) (x) +#define UNEXPECTED(x) (x) + #else -# define EXPECTED(x) __builtin_expect((x), 1) -# define UNEXPECTED(x) __builtin_expect((x), 0) + +#define EXPECTED(x) __builtin_expect((x), 1) +#define UNEXPECTED(x) __builtin_expect((x), 0) + #endif +//////////// +/// +/// Statistics related functions (evaluate to nothing when statistics not enabled) +/// +////// + +#if ENABLE_STATISTICS +# define _rpmalloc_stat_inc(counter) atomic_incr32(counter) +# define _rpmalloc_stat_dec(counter) atomic_decr32(counter) +# define _rpmalloc_stat_add(counter, value) atomic_add32(counter, (int32_t)(value)) +# define _rpmalloc_stat_add64(counter, value) atomic_add64(counter, (int64_t)(value)) +# define _rpmalloc_stat_add_peak(counter, value, peak) do { int32_t _cur_count = atomic_add32(counter, (int32_t)(value)); if (_cur_count > (peak)) peak = _cur_count; } while (0) +# define _rpmalloc_stat_sub(counter, value) atomic_add32(counter, -(int32_t)(value)) +# define _rpmalloc_stat_inc_alloc(heap, class_idx) do { \ + int32_t alloc_current = atomic_incr32(&heap->size_class_use[class_idx].alloc_current); \ + if (alloc_current > 
heap->size_class_use[class_idx].alloc_peak) \ + heap->size_class_use[class_idx].alloc_peak = alloc_current; \ + atomic_incr32(&heap->size_class_use[class_idx].alloc_total); \ +} while(0) +# define _rpmalloc_stat_inc_free(heap, class_idx) do { \ + atomic_decr32(&heap->size_class_use[class_idx].alloc_current); \ + atomic_incr32(&heap->size_class_use[class_idx].free_total); \ +} while(0) +#else +# define _rpmalloc_stat_inc(counter) do {} while(0) +# define _rpmalloc_stat_dec(counter) do {} while(0) +# define _rpmalloc_stat_add(counter, value) do {} while(0) +# define _rpmalloc_stat_add64(counter, value) do {} while(0) +# define _rpmalloc_stat_add_peak(counter, value, peak) do {} while (0) +# define _rpmalloc_stat_sub(counter, value) do {} while(0) +# define _rpmalloc_stat_inc_alloc(heap, class_idx) do {} while(0) +# define _rpmalloc_stat_inc_free(heap, class_idx) do {} while(0) +#endif + + +/// /// Preconfigured limits and sizes -//! Granularity of a small allocation block +/// + +//! Granularity of a small allocation block (must be power of two) #define SMALL_GRANULARITY 16 //! Small granularity shift count #define SMALL_GRANULARITY_SHIFT 4 @@ -206,13 +308,24 @@ static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref //! Total number of small + medium size classes #define SIZE_CLASS_COUNT (SMALL_CLASS_COUNT + MEDIUM_CLASS_COUNT) //! Number of large block size classes -#define LARGE_CLASS_COUNT 32 +#define LARGE_CLASS_COUNT 63 //! Maximum size of a medium block #define MEDIUM_SIZE_LIMIT (SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY * MEDIUM_CLASS_COUNT)) //! Maximum size of a large block #define LARGE_SIZE_LIMIT ((LARGE_CLASS_COUNT * _memory_span_size) - SPAN_HEADER_SIZE) -//! Size of a span header (must be a multiple of SMALL_GRANULARITY) -#define SPAN_HEADER_SIZE 96 +//! Size of a span header (must be a multiple of SMALL_GRANULARITY and a power of two) +#define SPAN_HEADER_SIZE 128 +//! Number of spans in thread cache +#define MAX_THREAD_SPAN_CACHE 400 +//! Number of spans to transfer between thread and global cache +#define THREAD_SPAN_CACHE_TRANSFER 64 +//! Number of spans in thread cache for large spans (must be greater than LARGE_CLASS_COUNT / 2) +#define MAX_THREAD_SPAN_LARGE_CACHE 100 +//! Number of spans to transfer between thread and global cache for large spans +#define THREAD_SPAN_LARGE_CACHE_TRANSFER 6 + +static_assert((SMALL_GRANULARITY & (SMALL_GRANULARITY - 1)) == 0, "Small granularity must be power of two"); +static_assert((SPAN_HEADER_SIZE & (SPAN_HEADER_SIZE - 1)) == 0, "Span header size must be power of two"); #if ENABLE_VALIDATE_ARGS //! Maximum allocation size to avoid integer overflow @@ -225,11 +338,20 @@ static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref #define INVALID_POINTER ((void*)((uintptr_t)-1)) +#define SIZE_CLASS_LARGE SIZE_CLASS_COUNT +#define SIZE_CLASS_HUGE ((uint32_t)-1) + +//////////// +/// /// Data types +/// +////// + +namespace tracy +{ + //! A memory heap, per thread typedef struct heap_t heap_t; -//! Heap spans per size class -typedef struct heap_class_t heap_class_t; //! Span of memory pages typedef struct span_t span_t; //! Span list @@ -247,28 +369,32 @@ typedef struct global_cache_t global_cache_t; #define SPAN_FLAG_SUBSPAN 2U //! Flag indicating span has blocks with increased alignment #define SPAN_FLAG_ALIGNED_BLOCKS 4U +//! Flag indicating an unmapped master span +#define SPAN_FLAG_UNMAPPED_MASTER 8U #if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS struct span_use_t { //! 
Current number of spans used (actually used, not in cache) atomic32_t current; //! High water mark of spans used - uint32_t high; + atomic32_t high; #if ENABLE_STATISTICS + //! Number of spans in deferred list + atomic32_t spans_deferred; //! Number of spans transitioned to global cache - uint32_t spans_to_global; + atomic32_t spans_to_global; //! Number of spans transitioned from global cache - uint32_t spans_from_global; + atomic32_t spans_from_global; //! Number of spans transitioned to thread cache - uint32_t spans_to_cache; + atomic32_t spans_to_cache; //! Number of spans transitioned from thread cache - uint32_t spans_from_cache; + atomic32_t spans_from_cache; //! Number of spans transitioned to reserved state - uint32_t spans_to_reserved; + atomic32_t spans_to_reserved; //! Number of spans transitioned from reserved state - uint32_t spans_from_reserved; + atomic32_t spans_from_reserved; //! Number of raw memory map calls - uint32_t spans_map_calls; + atomic32_t spans_map_calls; #endif }; typedef struct span_use_t span_use_t; @@ -281,64 +407,59 @@ struct size_class_use_t { //! Peak number of allocations int32_t alloc_peak; //! Total number of allocations - int32_t alloc_total; + atomic32_t alloc_total; //! Total number of frees atomic32_t free_total; //! Number of spans in use - uint32_t spans_current; + atomic32_t spans_current; //! Number of spans transitioned to cache - uint32_t spans_peak; + int32_t spans_peak; //! Number of spans transitioned to cache - uint32_t spans_to_cache; + atomic32_t spans_to_cache; //! Number of spans transitioned from cache - uint32_t spans_from_cache; + atomic32_t spans_from_cache; //! Number of spans transitioned from reserved state - uint32_t spans_from_reserved; + atomic32_t spans_from_reserved; //! Number of spans mapped - uint32_t spans_map_calls; + atomic32_t spans_map_calls; + int32_t unused; }; typedef struct size_class_use_t size_class_use_t; #endif -typedef enum span_state_t { - SPAN_STATE_ACTIVE = 0, - SPAN_STATE_PARTIAL, - SPAN_STATE_FULL -} span_state_t; - -//A span can either represent a single span of memory pages with size declared by span_map_count configuration variable, -//or a set of spans in a continuous region, a super span. Any reference to the term "span" usually refers to both a single -//span or a super span. A super span can further be divided into multiple spans (or this, super spans), where the first -//(super)span is the master and subsequent (super)spans are subspans. The master span keeps track of how many subspans -//that are still alive and mapped in virtual memory, and once all subspans and master have been unmapped the entire -//superspan region is released and unmapped (on Windows for example, the entire superspan range has to be released -//in the same call to release the virtual memory range, but individual subranges can be decommitted individually -//to reduce physical memory use). +// A span can either represent a single span of memory pages with size declared by span_map_count configuration variable, +// or a set of spans in a continuous region, a super span. Any reference to the term "span" usually refers to both a single +// span or a super span. A super span can further be divided into multiple spans (or this, super spans), where the first +// (super)span is the master and subsequent (super)spans are subspans. 
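Editor's sketch of the master/subspan bookkeeping described in the comment above (illustrative only; assumes the renamed fields offset_from_master / total_spans and the remaining_spans counter that this patch introduces):

    // A subspan stores its distance to the master in whole spans, so the master
    // header can always be recovered from any subspan.
    static span_t* span_master(span_t* span) {
        if (span->flags & SPAN_FLAG_MASTER)
            return span;
        return (span_t*)pointer_offset(span, -(intptr_t)((uintptr_t)span->offset_from_master * _memory_span_size));
    }

    // Unmapping decrements the master's remaining_spans by the unmapped span count;
    // only when that reaches zero is the whole super span returned to the OS
    // (see _rpmalloc_span_unmap further down in this patch).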
The master span keeps track of how many subspans +// that are still alive and mapped in virtual memory, and once all subspans and master have been unmapped the entire +// superspan region is released and unmapped (on Windows for example, the entire superspan range has to be released +// in the same call to release the virtual memory range, but individual subranges can be decommitted individually +// to reduce physical memory use). struct span_t { //! Free list void* free_list; - //! State - uint32_t state; - //! Used count when not active (not including deferred free list) - uint32_t used_count; - //! Block count + //! Total block count of size class uint32_t block_count; //! Size class uint32_t size_class; //! Index of last block initialized in free list uint32_t free_list_limit; - //! Span list size when part of a cache list, or size of deferred free list when partial/full - uint32_t list_size; + //! Number of used blocks remaining when in partial state + uint32_t used_count; //! Deferred free list atomicptr_t free_list_deferred; + //! Size of deferred free list, or list of spans when part of a cache list + uint32_t list_size; //! Size of a block uint32_t block_size; //! Flags and counters uint32_t flags; //! Number of spans uint32_t span_count; - //! Total span counter for master spans, distance for subspans - uint32_t total_spans_or_distance; + //! Total span counter for master spans + uint32_t total_spans; + //! Offset from master span for subspans + uint32_t offset_from_master; //! Remaining span counter, for master spans atomic32_t remaining_spans; //! Alignment offset @@ -350,53 +471,89 @@ struct span_t { //! Previous span span_t* prev; }; -_Static_assert(sizeof(span_t) <= SPAN_HEADER_SIZE, "span size mismatch"); +static_assert(sizeof(span_t) <= SPAN_HEADER_SIZE, "span size mismatch"); -struct heap_class_t { +struct span_cache_t { + size_t count; + span_t* span[MAX_THREAD_SPAN_CACHE]; +}; +typedef struct span_cache_t span_cache_t; + +struct span_large_cache_t { + size_t count; + span_t* span[MAX_THREAD_SPAN_LARGE_CACHE]; +}; +typedef struct span_large_cache_t span_large_cache_t; + +struct heap_size_class_t { //! Free list of active span void* free_list; - //! Double linked list of partially used spans with free blocks for each size class. - // Current active span is at head of list. Previous span pointer in head points to tail span of list. + //! Double linked list of partially used spans with free blocks. + // Previous span pointer in head points to tail span of list. span_t* partial_span; + //! Early level cache of fully free spans + span_t* cache; }; +typedef struct heap_size_class_t heap_size_class_t; +// Control structure for a heap, either a thread heap or a first class heap if enabled struct heap_t { - //! Active and semi-used span data per size class - heap_class_t span_class[SIZE_CLASS_COUNT]; + //! Owning thread ID + uintptr_t owner_thread; + //! Free lists for each size class + heap_size_class_t size_class[SIZE_CLASS_COUNT]; #if ENABLE_THREAD_CACHE - //! List of free spans (single linked list) - span_t* span_cache[LARGE_CLASS_COUNT]; - //! List of deferred free spans of class 0 (single linked list) - atomicptr_t span_cache_deferred; -#endif -#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS - //! Current and high water mark of spans used per span count - span_use_t span_use[LARGE_CLASS_COUNT]; + //! Arrays of fully freed spans, single span + span_cache_t span_cache; #endif + //! List of deferred free spans (single linked list) + atomicptr_t span_free_deferred; + //! 
Number of full spans + size_t full_span_count; //! Mapped but unused spans span_t* span_reserve; //! Master span for mapped but unused spans span_t* span_reserve_master; //! Number of mapped but unused spans - size_t spans_reserved; + uint32_t spans_reserved; + //! Child count + atomic32_t child_count; //! Next heap in id list heap_t* next_heap; //! Next heap in orphan list heap_t* next_orphan; - //! Memory pages alignment offset - size_t align_offset; //! Heap ID int32_t id; + //! Finalization state flag + int finalize; + //! Master heap owning the memory pages + heap_t* master_heap; +#if ENABLE_THREAD_CACHE + //! Arrays of fully freed spans, large spans with > 1 span count + span_large_cache_t span_large_cache[LARGE_CLASS_COUNT - 1]; +#endif +#if RPMALLOC_FIRST_CLASS_HEAPS + //! Double linked list of fully utilized spans with free blocks for each size class. + // Previous span pointer in head points to tail span of list. + span_t* full_span[SIZE_CLASS_COUNT]; + //! Double linked list of large and huge spans allocated by this heap + span_t* large_huge_span; +#endif +#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS + //! Current and high water mark of spans used per span count + span_use_t span_use[LARGE_CLASS_COUNT]; +#endif #if ENABLE_STATISTICS - //! Number of bytes transitioned thread -> global - size_t thread_to_global; - //! Number of bytes transitioned global -> thread - size_t global_to_thread; //! Allocation stats per size class size_class_use_t size_class_use[SIZE_CLASS_COUNT + 1]; + //! Number of bytes transitioned thread -> global + atomic64_t thread_to_global; + //! Number of bytes transitioned global -> thread + atomic64_t global_to_thread; #endif }; +// Size class for defining a block size bucket struct size_class_t { //! Size of blocks in this class uint32_t block_size; @@ -405,20 +562,40 @@ struct size_class_t { //! Class index this class is merged with uint16_t class_idx; }; -_Static_assert(sizeof(size_class_t) == 8, "Size class size mismatch"); +static_assert(sizeof(size_class_t) == 8, "Size class size mismatch"); struct global_cache_t { - //! Cache list pointer - atomicptr_t cache; - //! Cache size - atomic32_t size; - //! ABA counter - atomic32_t counter; + //! Cache lock + atomic32_t lock; + //! Cache count + uint32_t count; +#if ENABLE_STATISTICS + //! Insert count + size_t insert_count; + //! Extract count + size_t extract_count; +#endif + //! Cached spans + span_t* span[GLOBAL_CACHE_MULTIPLIER * MAX_THREAD_SPAN_CACHE]; + //! Unlimited cache overflow + span_t* overflow; }; +//////////// +/// /// Global data +/// +////// + +//! Default span size (64KiB) +#define _memory_default_span_size (64 * 1024) +#define _memory_default_span_size_shift 16 +#define _memory_default_span_mask (~((uintptr_t)(_memory_span_size - 1))) + //! Initialized flag static int _rpmalloc_initialized; +//! Main thread ID +static uintptr_t _rpmalloc_main_thread_id; //! Configuration static rpmalloc_config_t _memory_config; //! Memory page size @@ -435,17 +612,15 @@ static size_t _memory_span_size_shift; //! Mask to get to start of a memory span static uintptr_t _memory_span_mask; #else -//! Hardwired span size (64KiB) -#define _memory_span_size (64 * 1024) -#define _memory_span_size_shift 16 -#define _memory_span_mask (~((uintptr_t)(_memory_span_size - 1))) +//! Hardwired span size +#define _memory_span_size _memory_default_span_size +#define _memory_span_size_shift _memory_default_span_size_shift +#define _memory_span_mask _memory_default_span_mask #endif //! 
Number of spans to map in each map call static size_t _memory_span_map_count; -//! Number of spans to release from thread cache to global cache (single spans) -static size_t _memory_span_release_count; -//! Number of spans to release from thread cache to global cache (large multiple spans) -static size_t _memory_span_release_count_large; +//! Number of spans to keep reserved in each heap +static size_t _memory_heap_reserve_count; //! Global size classes static size_class_t _memory_size_class[SIZE_CLASS_COUNT]; //! Run-time size limit of medium blocks @@ -458,21 +633,37 @@ static int _memory_huge_pages; //! Global span cache static global_cache_t _memory_span_cache[LARGE_CLASS_COUNT]; #endif +//! Global reserved spans +static span_t* _memory_global_reserve; +//! Global reserved count +static size_t _memory_global_reserve_count; +//! Global reserved master +static span_t* _memory_global_reserve_master; //! All heaps -static atomicptr_t _memory_heaps[HEAP_ARRAY_SIZE]; +static heap_t* _memory_heaps[HEAP_ARRAY_SIZE]; +//! Used to restrict access to mapping memory for huge pages +static atomic32_t _memory_global_lock; //! Orphaned heaps -static atomicptr_t _memory_orphan_heaps; -//! Running orphan counter to avoid ABA issues in linked list -static atomic32_t _memory_orphan_counter; +static heap_t* _memory_orphan_heaps; +#if RPMALLOC_FIRST_CLASS_HEAPS +//! Orphaned heaps (first class heaps) +static heap_t* _memory_first_class_orphan_heaps; +#endif #if ENABLE_STATISTICS +//! Allocations counter +static atomic64_t _allocation_counter; +//! Deallocations counter +static atomic64_t _deallocation_counter; //! Active heap count static atomic32_t _memory_active_heaps; //! Number of currently mapped memory pages static atomic32_t _mapped_pages; //! Peak number of concurrently mapped memory pages static int32_t _mapped_pages_peak; -//! Number of currently unused spans -static atomic32_t _reserved_spans; +//! Number of mapped master spans +static atomic32_t _master_spans; +//! Number of unmapped dangling master spans +static atomic32_t _unmapped_master_spans; //! Running counter of total number of mapped memory pages since start static atomic32_t _mapped_total; //! Running counter of total number of unmapped memory pages since start @@ -485,15 +676,25 @@ static atomic32_t _huge_pages_current; static int32_t _huge_pages_peak; #endif +//////////// +/// +/// Thread local heap and ID +/// +////// + //! Current thread heap -#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD +#if ((defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD) || defined(__TINYC__) static pthread_key_t _memory_thread_heap; #else # ifdef _MSC_VER # define _Thread_local __declspec(thread) # define TLS_MODEL # else -# define TLS_MODEL __attribute__((tls_model("initial-exec"))) +# ifndef __HAIKU__ +# define TLS_MODEL __attribute__((tls_model("initial-exec"))) +# else +# define TLS_MODEL +# endif # if !defined(__clang__) && defined(__GNUC__) # define _Thread_local __thread # endif @@ -524,93 +725,355 @@ get_thread_heap(void) { #endif } +//! 
Fast thread ID +static inline uintptr_t +get_thread_id(void) { +#if defined(_WIN32) + return (uintptr_t)((void*)NtCurrentTeb()); +#elif (defined(__GNUC__) || defined(__clang__)) && !defined(__CYGWIN__) + uintptr_t tid; +# if defined(__i386__) + __asm__("movl %%gs:0, %0" : "=r" (tid) : : ); +# elif defined(__x86_64__) +# if defined(__MACH__) + __asm__("movq %%gs:0, %0" : "=r" (tid) : : ); +# else + __asm__("movq %%fs:0, %0" : "=r" (tid) : : ); +# endif +# elif defined(__arm__) + __asm__ volatile ("mrc p15, 0, %0, c13, c0, 3" : "=r" (tid)); +# elif defined(__aarch64__) +# if defined(__MACH__) + // tpidr_el0 likely unused, always return 0 on iOS + __asm__ volatile ("mrs %0, tpidrro_el0" : "=r" (tid)); +# else + __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tid)); +# endif +# else + tid = (uintptr_t)((void*)get_thread_heap_raw()); +# endif + return tid; +#else + return (uintptr_t)((void*)get_thread_heap_raw()); +#endif +} + //! Set the current thread heap static void set_thread_heap(heap_t* heap) { -#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD +#if ((defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD) || defined(__TINYC__) pthread_setspecific(_memory_thread_heap, heap); #else _memory_thread_heap = heap; #endif + if (heap) + heap->owner_thread = get_thread_id(); } -//! Default implementation to map more virtual memory -static void* -_memory_map_os(size_t size, size_t* offset); +//! Set main thread ID +extern void +rpmalloc_set_main_thread(void); + +void +rpmalloc_set_main_thread(void) { + _rpmalloc_main_thread_id = get_thread_id(); +} -//! Default implementation to unmap virtual memory static void -_memory_unmap_os(void* address, size_t size, size_t offset, size_t release); - -//! Lookup a memory heap from heap ID -static heap_t* -_memory_heap_lookup(int32_t id) { - uint32_t list_idx = id % HEAP_ARRAY_SIZE; - heap_t* heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); - while (heap && (heap->id != id)) - heap = heap->next_heap; - return heap; +_rpmalloc_spin(void) { +#if defined(_MSC_VER) + _mm_pause(); +#elif defined(__x86_64__) || defined(__i386__) + __asm__ volatile("pause" ::: "memory"); +#elif defined(__aarch64__) || (defined(__arm__) && __ARM_ARCH >= 7) + __asm__ volatile("yield" ::: "memory"); +#elif defined(__powerpc__) || defined(__powerpc64__) + // No idea if ever been compiled in such archs but ... 
as precaution + __asm__ volatile("or 27,27,27"); +#elif defined(__sparc__) + __asm__ volatile("rd %ccr, %g0 \n\trd %ccr, %g0 \n\trd %ccr, %g0"); +#else + struct timespec ts = {0}; + nanosleep(&ts, 0); +#endif } -#if ENABLE_STATISTICS -# define _memory_statistics_inc(counter, value) counter += value -# define _memory_statistics_dec(counter, value) counter -= value -# define _memory_statistics_add(atomic_counter, value) atomic_add32(atomic_counter, (int32_t)(value)) -# define _memory_statistics_add_peak(atomic_counter, value, peak) do { int32_t _cur_count = atomic_add32(atomic_counter, (int32_t)(value)); if (_cur_count > (peak)) peak = _cur_count; } while (0) -# define _memory_statistics_sub(atomic_counter, value) atomic_add32(atomic_counter, -(int32_t)(value)) -# define _memory_statistics_inc_alloc(heap, class_idx) do { \ - int32_t alloc_current = atomic_incr32(&heap->size_class_use[class_idx].alloc_current); \ - if (alloc_current > heap->size_class_use[class_idx].alloc_peak) \ - heap->size_class_use[class_idx].alloc_peak = alloc_current; \ - heap->size_class_use[class_idx].alloc_total++; \ -} while(0) -# define _memory_statistics_inc_free(heap, class_idx) do { \ - atomic_decr32(&heap->size_class_use[class_idx].alloc_current); \ - atomic_incr32(&heap->size_class_use[class_idx].free_total); \ -} while(0) -#else -# define _memory_statistics_inc(counter, value) do {} while(0) -# define _memory_statistics_dec(counter, value) do {} while(0) -# define _memory_statistics_add(atomic_counter, value) do {} while(0) -# define _memory_statistics_add_peak(atomic_counter, value, peak) do {} while (0) -# define _memory_statistics_sub(atomic_counter, value) do {} while(0) -# define _memory_statistics_inc_alloc(heap, class_idx) do {} while(0) -# define _memory_statistics_inc_free(heap, class_idx) do {} while(0) +#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) +static void NTAPI +_rpmalloc_thread_destructor(void* value) { +#if ENABLE_OVERRIDE + // If this is called on main thread it means rpmalloc_finalize + // has not been called and shutdown is forced (through _exit) or unclean + if (get_thread_id() == _rpmalloc_main_thread_id) + return; +#endif + if (value) + rpmalloc_thread_finalize(1); +} #endif + +//////////// +/// +/// Low level memory map/unmap +/// +////// + static void -_memory_heap_cache_insert(heap_t* heap, span_t* span); +_rpmalloc_set_name(void* address, size_t size) { +#if defined(__linux__) || defined(__ANDROID__) + const char *name = _memory_huge_pages ? _memory_config.huge_page_name : _memory_config.page_name; + if (address == MAP_FAILED || !name) + return; + // If the kernel does not support CONFIG_ANON_VMA_NAME or if the call fails + // (e.g. invalid name) it is a no-op basically. + (void)prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, (uintptr_t)address, size, (uintptr_t)name); +#else + (void)sizeof(size); + (void)sizeof(address); +#endif +} + //! 
Map more virtual memory +// size is number of bytes to map +// offset receives the offset in bytes from start of mapped region +// returns address to start of mapped region to use static void* -_memory_map(size_t size, size_t* offset) { - assert(!(size % _memory_page_size)); - assert(size >= _memory_page_size); - _memory_statistics_add_peak(&_mapped_pages, (size >> _memory_page_size_shift), _mapped_pages_peak); - _memory_statistics_add(&_mapped_total, (size >> _memory_page_size_shift)); - return _memory_config.memory_map(size, offset); +_rpmalloc_mmap(size_t size, size_t* offset) { + rpmalloc_assert(!(size % _memory_page_size), "Invalid mmap size"); + rpmalloc_assert(size >= _memory_page_size, "Invalid mmap size"); + void* address = _memory_config.memory_map(size, offset); + if (EXPECTED(address != 0)) { + _rpmalloc_stat_add_peak(&_mapped_pages, (size >> _memory_page_size_shift), _mapped_pages_peak); + _rpmalloc_stat_add(&_mapped_total, (size >> _memory_page_size_shift)); + } + return address; } //! Unmap virtual memory +// address is the memory address to unmap, as returned from _memory_map +// size is the number of bytes to unmap, which might be less than full region for a partial unmap +// offset is the offset in bytes to the actual mapped region, as set by _memory_map +// release is set to 0 for partial unmap, or size of entire range for a full unmap static void -_memory_unmap(void* address, size_t size, size_t offset, size_t release) { - assert(!release || (release >= size)); - assert(!release || (release >= _memory_page_size)); +_rpmalloc_unmap(void* address, size_t size, size_t offset, size_t release) { + rpmalloc_assert(!release || (release >= size), "Invalid unmap size"); + rpmalloc_assert(!release || (release >= _memory_page_size), "Invalid unmap size"); if (release) { - assert(!(release % _memory_page_size)); - _memory_statistics_sub(&_mapped_pages, (release >> _memory_page_size_shift)); - _memory_statistics_add(&_unmapped_total, (release >> _memory_page_size_shift)); + rpmalloc_assert(!(release % _memory_page_size), "Invalid unmap size"); + _rpmalloc_stat_sub(&_mapped_pages, (release >> _memory_page_size_shift)); + _rpmalloc_stat_add(&_unmapped_total, (release >> _memory_page_size_shift)); } _memory_config.memory_unmap(address, size, offset, release); } +//! Default implementation to map new pages to virtual memory +static void* +_rpmalloc_mmap_os(size_t size, size_t* offset) { + //Either size is a heap (a single page) or a (multiple) span - we only need to align spans, and only if larger than map granularity + size_t padding = ((size >= _memory_span_size) && (_memory_span_size > _memory_map_granularity)) ? _memory_span_size : 0; + rpmalloc_assert(size >= _memory_page_size, "Invalid mmap size"); +#if PLATFORM_WINDOWS + //Ok to MEM_COMMIT - according to MSDN, "actual physical pages are not allocated unless/until the virtual addresses are actually accessed" + void* ptr = VirtualAlloc(0, size + padding, (_memory_huge_pages ? 
MEM_LARGE_PAGES : 0) | MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); + if (!ptr) { + if (_memory_config.map_fail_callback) { + if (_memory_config.map_fail_callback(size + padding)) + return _rpmalloc_mmap_os(size, offset); + } else { + rpmalloc_assert(ptr, "Failed to map virtual memory block"); + } + return 0; + } +#else + int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED; +# if defined(__APPLE__) && !TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR + int fd = (int)VM_MAKE_TAG(240U); + if (_memory_huge_pages) + fd |= VM_FLAGS_SUPERPAGE_SIZE_2MB; + void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, fd, 0); +# elif defined(MAP_HUGETLB) + void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE | PROT_MAX(PROT_READ | PROT_WRITE), (_memory_huge_pages ? MAP_HUGETLB : 0) | flags, -1, 0); +# if defined(MADV_HUGEPAGE) + // In some configurations, huge pages allocations might fail thus + // we fallback to normal allocations and promote the region as transparent huge page + if ((ptr == MAP_FAILED || !ptr) && _memory_huge_pages) { + ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, -1, 0); + if (ptr && ptr != MAP_FAILED) { + int prm = madvise(ptr, size + padding, MADV_HUGEPAGE); + (void)prm; + rpmalloc_assert((prm == 0), "Failed to promote the page to THP"); + } + } +# endif + _rpmalloc_set_name(ptr, size + padding); +# elif defined(MAP_ALIGNED) + const size_t align = (sizeof(size_t) * 8) - (size_t)(__builtin_clzl(size - 1)); + void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, (_memory_huge_pages ? MAP_ALIGNED(align) : 0) | flags, -1, 0); +# elif defined(MAP_ALIGN) + caddr_t base = (_memory_huge_pages ? (caddr_t)(4 << 20) : 0); + void* ptr = mmap(base, size + padding, PROT_READ | PROT_WRITE, (_memory_huge_pages ? MAP_ALIGN : 0) | flags, -1, 0); +# else + void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, -1, 0); +# endif + if ((ptr == MAP_FAILED) || !ptr) { + if (_memory_config.map_fail_callback) { + if (_memory_config.map_fail_callback(size + padding)) + return _rpmalloc_mmap_os(size, offset); + } else if (errno != ENOMEM) { + rpmalloc_assert((ptr != MAP_FAILED) && ptr, "Failed to map virtual memory block"); + } + return 0; + } +#endif + _rpmalloc_stat_add(&_mapped_pages_os, (int32_t)((size + padding) >> _memory_page_size_shift)); + if (padding) { + size_t final_padding = padding - ((uintptr_t)ptr & ~_memory_span_mask); + rpmalloc_assert(final_padding <= _memory_span_size, "Internal failure in padding"); + rpmalloc_assert(final_padding <= padding, "Internal failure in padding"); + rpmalloc_assert(!(final_padding % 8), "Internal failure in padding"); + ptr = pointer_offset(ptr, final_padding); + *offset = final_padding >> 3; + } + rpmalloc_assert((size < _memory_span_size) || !((uintptr_t)ptr & ~_memory_span_mask), "Internal failure in padding"); + return ptr; +} + +//! 
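Editor's note on the padding logic in _rpmalloc_mmap_os above: when a span is larger than the OS mapping granularity, one extra span of padding is requested so the returned pointer can be rounded up to a span boundary, and the number of skipped bytes is reported back through *offset in 8-byte units so _rpmalloc_unmap_os can undo it later. A simplified sketch of the rounding (illustrative only; the real code always advances into the padded region, even when the raw pointer is already aligned):

    // span_size is a power of two, as asserted earlier in this file.
    static void* round_up_to_span(void* raw, size_t span_size, size_t* offset_units) {
        uintptr_t p = (uintptr_t)raw;
        uintptr_t aligned = (p + (span_size - 1)) & ~(uintptr_t)(span_size - 1);
        *offset_units = (aligned - p) >> 3;  // what ends up in span->align_offset
        return (void*)aligned;
    }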
Default implementation to unmap pages from virtual memory +static void +_rpmalloc_unmap_os(void* address, size_t size, size_t offset, size_t release) { + rpmalloc_assert(release || (offset == 0), "Invalid unmap size"); + rpmalloc_assert(!release || (release >= _memory_page_size), "Invalid unmap size"); + rpmalloc_assert(size >= _memory_page_size, "Invalid unmap size"); + if (release && offset) { + offset <<= 3; + address = pointer_offset(address, -(int32_t)offset); + if ((release >= _memory_span_size) && (_memory_span_size > _memory_map_granularity)) { + //Padding is always one span size + release += _memory_span_size; + } + } +#if !DISABLE_UNMAP +#if PLATFORM_WINDOWS + if (!VirtualFree(address, release ? 0 : size, release ? MEM_RELEASE : MEM_DECOMMIT)) { + rpmalloc_assert(0, "Failed to unmap virtual memory block"); + } +#else + if (release) { + if (munmap(address, release)) { + rpmalloc_assert(0, "Failed to unmap virtual memory block"); + } + } else { +#if defined(MADV_FREE_REUSABLE) + int ret; + while ((ret = madvise(address, size, MADV_FREE_REUSABLE)) == -1 && (errno == EAGAIN)) + errno = 0; + if ((ret == -1) && (errno != 0)) { +#elif defined(MADV_DONTNEED) + if (madvise(address, size, MADV_DONTNEED)) { +#elif defined(MADV_PAGEOUT) + if (madvise(address, size, MADV_PAGEOUT)) { +#elif defined(MADV_FREE) + if (madvise(address, size, MADV_FREE)) { +#else + if (posix_madvise(address, size, POSIX_MADV_DONTNEED)) { +#endif + rpmalloc_assert(0, "Failed to madvise virtual memory block as free"); + } + } +#endif +#endif + if (release) + _rpmalloc_stat_sub(&_mapped_pages_os, release >> _memory_page_size_shift); +} + +static void +_rpmalloc_span_mark_as_subspan_unless_master(span_t* master, span_t* subspan, size_t span_count); + +//! Use global reserved spans to fulfill a memory map request (reserve size must be checked by caller) +static span_t* +_rpmalloc_global_get_reserved_spans(size_t span_count) { + span_t* span = _memory_global_reserve; + _rpmalloc_span_mark_as_subspan_unless_master(_memory_global_reserve_master, span, span_count); + _memory_global_reserve_count -= span_count; + if (_memory_global_reserve_count) + _memory_global_reserve = (span_t*)pointer_offset(span, span_count << _memory_span_size_shift); + else + _memory_global_reserve = 0; + return span; +} + +//! Store the given spans as global reserve (must only be called from within new heap allocation, not thread safe) +static void +_rpmalloc_global_set_reserved_spans(span_t* master, span_t* reserve, size_t reserve_span_count) { + _memory_global_reserve_master = master; + _memory_global_reserve_count = reserve_span_count; + _memory_global_reserve = reserve; +} + + +//////////// +/// +/// Span linked list management +/// +////// + +//! Add a span to double linked list at the head +static void +_rpmalloc_span_double_link_list_add(span_t** head, span_t* span) { + if (*head) + (*head)->prev = span; + span->next = *head; + *head = span; +} + +//! Pop head span from double linked list +static void +_rpmalloc_span_double_link_list_pop_head(span_t** head, span_t* span) { + rpmalloc_assert(*head == span, "Linked list corrupted"); + span = *head; + *head = span->next; +} + +//! 
Remove a span from double linked list +static void +_rpmalloc_span_double_link_list_remove(span_t** head, span_t* span) { + rpmalloc_assert(*head, "Linked list corrupted"); + if (*head == span) { + *head = span->next; + } else { + span_t* next_span = span->next; + span_t* prev_span = span->prev; + prev_span->next = next_span; + if (EXPECTED(next_span != 0)) + next_span->prev = prev_span; + } +} + + +//////////// +/// +/// Span control +/// +////// + +static void +_rpmalloc_heap_cache_insert(heap_t* heap, span_t* span); + +static void +_rpmalloc_heap_finalize(heap_t* heap); + +static void +_rpmalloc_heap_set_reserved_spans(heap_t* heap, span_t* master, span_t* reserve, size_t reserve_span_count); + //! Declare the span to be a subspan and store distance from master span and span count static void -_memory_span_mark_as_subspan_unless_master(span_t* master, span_t* subspan, size_t span_count) { - assert((subspan != master) || (subspan->flags & SPAN_FLAG_MASTER)); +_rpmalloc_span_mark_as_subspan_unless_master(span_t* master, span_t* subspan, size_t span_count) { + rpmalloc_assert((subspan != master) || (subspan->flags & SPAN_FLAG_MASTER), "Span master pointer and/or flag mismatch"); if (subspan != master) { subspan->flags = SPAN_FLAG_SUBSPAN; - subspan->total_spans_or_distance = (uint32_t)((uintptr_t)pointer_diff(subspan, master) >> _memory_span_size_shift); + subspan->offset_from_master = (uint32_t)((uintptr_t)pointer_diff(subspan, master) >> _memory_span_size_shift); subspan->align_offset = 0; } subspan->span_count = (uint32_t)span_count; @@ -618,496 +1081,178 @@ _memory_span_mark_as_subspan_unless_master(span_t* master, span_t* subspan, size //! Use reserved spans to fulfill a memory map request (reserve size must be checked by caller) static span_t* -_memory_map_from_reserve(heap_t* heap, size_t span_count) { +_rpmalloc_span_map_from_reserve(heap_t* heap, size_t span_count) { //Update the heap span reserve span_t* span = heap->span_reserve; heap->span_reserve = (span_t*)pointer_offset(span, span_count * _memory_span_size); - heap->spans_reserved -= span_count; + heap->spans_reserved -= (uint32_t)span_count; - _memory_span_mark_as_subspan_unless_master(heap->span_reserve_master, span, span_count); + _rpmalloc_span_mark_as_subspan_unless_master(heap->span_reserve_master, span, span_count); if (span_count <= LARGE_CLASS_COUNT) - _memory_statistics_inc(heap->span_use[span_count - 1].spans_from_reserved, 1); + _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_from_reserved); return span; } //! Get the aligned number of spans to map in based on wanted count, configured mapping granularity and the page size static size_t -_memory_map_align_span_count(size_t span_count) { +_rpmalloc_span_align_count(size_t span_count) { size_t request_count = (span_count > _memory_span_map_count) ? span_count : _memory_span_map_count; if ((_memory_page_size > _memory_span_size) && ((request_count * _memory_span_size) % _memory_page_size)) - request_count += _memory_span_map_count - (request_count % _memory_span_map_count); + request_count += _memory_span_map_count - (request_count % _memory_span_map_count); return request_count; } -//! Store the given spans as reserve in the given heap -static void -_memory_heap_set_reserved_spans(heap_t* heap, span_t* master, span_t* reserve, size_t reserve_span_count) { - heap->span_reserve_master = master; - heap->span_reserve = reserve; - heap->spans_reserved = reserve_span_count; -} - //! 
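Editor's sketch of how the span list helpers above are typically used (hypothetical call sites; heap, span and idx are placeholders, not code from this patch):

    // Publish a partially used span for its size class, and later take it out again.
    _rpmalloc_span_double_link_list_add(&heap->size_class[idx].partial_span, span);
    // ...
    _rpmalloc_span_double_link_list_remove(&heap->size_class[idx].partial_span, span);
    // _rpmalloc_span_double_link_list_pop_head() additionally asserts that the span
    // being popped really is the current head, catching list corruption early.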
Setup a newly mapped span static void -_memory_span_initialize(span_t* span, size_t total_span_count, size_t span_count, size_t align_offset) { - span->total_spans_or_distance = (uint32_t)total_span_count; +_rpmalloc_span_initialize(span_t* span, size_t total_span_count, size_t span_count, size_t align_offset) { + span->total_spans = (uint32_t)total_span_count; span->span_count = (uint32_t)span_count; span->align_offset = (uint32_t)align_offset; span->flags = SPAN_FLAG_MASTER; - atomic_store32(&span->remaining_spans, (int32_t)total_span_count); + atomic_store32(&span->remaining_spans, (int32_t)total_span_count); } -//! Map a akigned set of spans, taking configured mapping granularity and the page size into account +static void +_rpmalloc_span_unmap(span_t* span); + +//! Map an aligned set of spans, taking configured mapping granularity and the page size into account static span_t* -_memory_map_aligned_span_count(heap_t* heap, size_t span_count) { +_rpmalloc_span_map_aligned_count(heap_t* heap, size_t span_count) { //If we already have some, but not enough, reserved spans, release those to heap cache and map a new //full set of spans. Otherwise we would waste memory if page size > span size (huge pages) - size_t aligned_span_count = _memory_map_align_span_count(span_count); + size_t aligned_span_count = _rpmalloc_span_align_count(span_count); size_t align_offset = 0; - span_t* span = (span_t*)_memory_map(aligned_span_count * _memory_span_size, &align_offset); + span_t* span = (span_t*)_rpmalloc_mmap(aligned_span_count * _memory_span_size, &align_offset); if (!span) return 0; - _memory_span_initialize(span, aligned_span_count, span_count, align_offset); - _memory_statistics_add(&_reserved_spans, aligned_span_count); + _rpmalloc_span_initialize(span, aligned_span_count, span_count, align_offset); + _rpmalloc_stat_inc(&_master_spans); if (span_count <= LARGE_CLASS_COUNT) - _memory_statistics_inc(heap->span_use[span_count - 1].spans_map_calls, 1); + _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_map_calls); if (aligned_span_count > span_count) { + span_t* reserved_spans = (span_t*)pointer_offset(span, span_count * _memory_span_size); + size_t reserved_count = aligned_span_count - span_count; if (heap->spans_reserved) { - _memory_span_mark_as_subspan_unless_master(heap->span_reserve_master, heap->span_reserve, heap->spans_reserved); - _memory_heap_cache_insert(heap, heap->span_reserve); + _rpmalloc_span_mark_as_subspan_unless_master(heap->span_reserve_master, heap->span_reserve, heap->spans_reserved); + _rpmalloc_heap_cache_insert(heap, heap->span_reserve); } - _memory_heap_set_reserved_spans(heap, span, (span_t*)pointer_offset(span, span_count * _memory_span_size), aligned_span_count - span_count); + if (reserved_count > _memory_heap_reserve_count) { + // If huge pages or eager spam map count, the global reserve spin lock is held by caller, _rpmalloc_span_map + rpmalloc_assert(atomic_load32(&_memory_global_lock) == 1, "Global spin lock not held as expected"); + size_t remain_count = reserved_count - _memory_heap_reserve_count; + reserved_count = _memory_heap_reserve_count; + span_t* remain_span = (span_t*)pointer_offset(reserved_spans, reserved_count * _memory_span_size); + if (_memory_global_reserve) { + _rpmalloc_span_mark_as_subspan_unless_master(_memory_global_reserve_master, _memory_global_reserve, _memory_global_reserve_count); + _rpmalloc_span_unmap(_memory_global_reserve); + } + _rpmalloc_global_set_reserved_spans(span, remain_span, remain_count); + } + 
_rpmalloc_heap_set_reserved_spans(heap, span, reserved_spans, reserved_count); } return span; } //! Map in memory pages for the given number of spans (or use previously reserved pages) static span_t* -_memory_map_spans(heap_t* heap, size_t span_count) { +_rpmalloc_span_map(heap_t* heap, size_t span_count) { if (span_count <= heap->spans_reserved) - return _memory_map_from_reserve(heap, span_count); - return _memory_map_aligned_span_count(heap, span_count); + return _rpmalloc_span_map_from_reserve(heap, span_count); + span_t* span = 0; + int use_global_reserve = (_memory_page_size > _memory_span_size) || (_memory_span_map_count > _memory_heap_reserve_count); + if (use_global_reserve) { + // If huge pages, make sure only one thread maps more memory to avoid bloat + while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0)) + _rpmalloc_spin(); + if (_memory_global_reserve_count >= span_count) { + size_t reserve_count = (!heap->spans_reserved ? _memory_heap_reserve_count : span_count); + if (_memory_global_reserve_count < reserve_count) + reserve_count = _memory_global_reserve_count; + span = _rpmalloc_global_get_reserved_spans(reserve_count); + if (span) { + if (reserve_count > span_count) { + span_t* reserved_span = (span_t*)pointer_offset(span, span_count << _memory_span_size_shift); + _rpmalloc_heap_set_reserved_spans(heap, _memory_global_reserve_master, reserved_span, reserve_count - span_count); + } + // Already marked as subspan in _rpmalloc_global_get_reserved_spans + span->span_count = (uint32_t)span_count; + } + } + } + if (!span) + span = _rpmalloc_span_map_aligned_count(heap, span_count); + if (use_global_reserve) + atomic_store32_release(&_memory_global_lock, 0); + return span; } //! Unmap memory pages for the given number of spans (or mark as unused if no partial unmappings) static void -_memory_unmap_span(span_t* span) { - assert((span->flags & SPAN_FLAG_MASTER) || (span->flags & SPAN_FLAG_SUBSPAN)); - assert(!(span->flags & SPAN_FLAG_MASTER) || !(span->flags & SPAN_FLAG_SUBSPAN)); +_rpmalloc_span_unmap(span_t* span) { + rpmalloc_assert((span->flags & SPAN_FLAG_MASTER) || (span->flags & SPAN_FLAG_SUBSPAN), "Span flag corrupted"); + rpmalloc_assert(!(span->flags & SPAN_FLAG_MASTER) || !(span->flags & SPAN_FLAG_SUBSPAN), "Span flag corrupted"); int is_master = !!(span->flags & SPAN_FLAG_MASTER); - span_t* master = is_master ? span : (span_t*)(pointer_offset(span, -(int32_t)(span->total_spans_or_distance * _memory_span_size))); - assert(is_master || (span->flags & SPAN_FLAG_SUBSPAN)); - assert(master->flags & SPAN_FLAG_MASTER); + span_t* master = is_master ? 
span : ((span_t*)pointer_offset(span, -(intptr_t)((uintptr_t)span->offset_from_master * _memory_span_size))); + rpmalloc_assert(is_master || (span->flags & SPAN_FLAG_SUBSPAN), "Span flag corrupted"); + rpmalloc_assert(master->flags & SPAN_FLAG_MASTER, "Span flag corrupted"); size_t span_count = span->span_count; if (!is_master) { //Directly unmap subspans (unless huge pages, in which case we defer and unmap entire page range with master) - assert(span->align_offset == 0); - if (_memory_span_size >= _memory_page_size) { - _memory_unmap(span, span_count * _memory_span_size, 0, 0); - _memory_statistics_sub(&_reserved_spans, span_count); - } + rpmalloc_assert(span->align_offset == 0, "Span align offset corrupted"); + if (_memory_span_size >= _memory_page_size) + _rpmalloc_unmap(span, span_count * _memory_span_size, 0, 0); } else { //Special double flag to denote an unmapped master //It must be kept in memory since span header must be used - span->flags |= SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN; + span->flags |= SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN | SPAN_FLAG_UNMAPPED_MASTER; + _rpmalloc_stat_add(&_unmapped_master_spans, 1); } if (atomic_add32(&master->remaining_spans, -(int32_t)span_count) <= 0) { //Everything unmapped, unmap the master span with release flag to unmap the entire range of the super span - assert(!!(master->flags & SPAN_FLAG_MASTER) && !!(master->flags & SPAN_FLAG_SUBSPAN)); + rpmalloc_assert(!!(master->flags & SPAN_FLAG_MASTER) && !!(master->flags & SPAN_FLAG_SUBSPAN), "Span flag corrupted"); size_t unmap_count = master->span_count; if (_memory_span_size < _memory_page_size) - unmap_count = master->total_spans_or_distance; - _memory_statistics_sub(&_reserved_spans, unmap_count); - _memory_unmap(master, unmap_count * _memory_span_size, master->align_offset, master->total_spans_or_distance * _memory_span_size); + unmap_count = master->total_spans; + _rpmalloc_stat_sub(&_master_spans, 1); + _rpmalloc_stat_sub(&_unmapped_master_spans, 1); + _rpmalloc_unmap(master, unmap_count * _memory_span_size, master->align_offset, (size_t)master->total_spans * _memory_span_size); } } -#if ENABLE_THREAD_CACHE - -//! Unmap a single linked list of spans -static void -_memory_unmap_span_list(span_t* span) { - size_t list_size = span->list_size; - for (size_t ispan = 0; ispan < list_size; ++ispan) { - span_t* next_span = span->next; - _memory_unmap_span(span); - span = next_span; - } - assert(!span); -} - -//! Add span to head of single linked span list -static size_t -_memory_span_list_push(span_t** head, span_t* span) { - span->next = *head; - if (*head) - span->list_size = (*head)->list_size + 1; - else - span->list_size = 1; - *head = span; - return span->list_size; -} - -//! Remove span from head of single linked span list, returns the new list head -static span_t* -_memory_span_list_pop(span_t** head) { - span_t* span = *head; - span_t* next_span = 0; - if (span->list_size > 1) { - assert(span->next); - next_span = span->next; - assert(next_span); - next_span->list_size = span->list_size - 1; - } - *head = next_span; - return span; -} - -//! 
Split a single linked span list -static span_t* -_memory_span_list_split(span_t* span, size_t limit) { - span_t* next = 0; - if (limit < 2) - limit = 2; - if (span->list_size > limit) { - uint32_t list_size = 1; - span_t* last = span; - next = span->next; - while (list_size < limit) { - last = next; - next = next->next; - ++list_size; - } - last->next = 0; - assert(next); - next->list_size = span->list_size - list_size; - span->list_size = list_size; - span->prev = 0; - } - return next; -} - -#endif - -//! Add a span to partial span double linked list at the head -static void -_memory_span_partial_list_add(span_t** head, span_t* span) { - if (*head) { - span->next = *head; - //Maintain pointer to tail span - span->prev = (*head)->prev; - (*head)->prev = span; - } else { - span->next = 0; - span->prev = span; - } - *head = span; -} - -//! Add a span to partial span double linked list at the tail -static void -_memory_span_partial_list_add_tail(span_t** head, span_t* span) { - span->next = 0; - if (*head) { - span_t* tail = (*head)->prev; - tail->next = span; - span->prev = tail; - //Maintain pointer to tail span - (*head)->prev = span; - } else { - span->prev = span; - *head = span; - } -} - -//! Pop head span from partial span double linked list -static void -_memory_span_partial_list_pop_head(span_t** head) { - span_t* span = *head; - *head = span->next; - if (*head) { - //Maintain pointer to tail span - (*head)->prev = span->prev; - } -} - -//! Remove a span from partial span double linked list -static void -_memory_span_partial_list_remove(span_t** head, span_t* span) { - if (UNEXPECTED(*head == span)) { - _memory_span_partial_list_pop_head(head); - } else { - span_t* next_span = span->next; - span_t* prev_span = span->prev; - prev_span->next = next_span; - if (EXPECTED(next_span != 0)) { - next_span->prev = prev_span; - } else { - //Update pointer to tail span - (*head)->prev = prev_span; - } - } -} - -#if ENABLE_GLOBAL_CACHE - -//! Insert the given list of memory page spans in the global cache -static void -_memory_cache_insert(global_cache_t* cache, span_t* span, size_t cache_limit) { - assert((span->list_size == 1) || (span->next != 0)); - int32_t list_size = (int32_t)span->list_size; - //Unmap if cache has reached the limit - if (atomic_add32(&cache->size, list_size) > (int32_t)cache_limit) { -#if !ENABLE_UNLIMITED_GLOBAL_CACHE - _memory_unmap_span_list(span); - atomic_add32(&cache->size, -list_size); - return; -#endif - } - void* current_cache, *new_cache; - do { - current_cache = atomic_load_ptr(&cache->cache); - span->prev = (span_t*)((uintptr_t)current_cache & _memory_span_mask); - new_cache = (void*)((uintptr_t)span | ((uintptr_t)atomic_incr32(&cache->counter) & ~_memory_span_mask)); - } while (!atomic_cas_ptr(&cache->cache, new_cache, current_cache)); -} - -//! 
Extract a number of memory page spans from the global cache -static span_t* -_memory_cache_extract(global_cache_t* cache) { - uintptr_t span_ptr; - do { - void* global_span = atomic_load_ptr(&cache->cache); - span_ptr = (uintptr_t)global_span & _memory_span_mask; - if (span_ptr) { - span_t* span = (span_t*)span_ptr; - //By accessing the span ptr before it is swapped out of list we assume that a contending thread - //does not manage to traverse the span to being unmapped before we access it - void* new_cache = (void*)((uintptr_t)span->prev | ((uintptr_t)atomic_incr32(&cache->counter) & ~_memory_span_mask)); - if (atomic_cas_ptr(&cache->cache, new_cache, global_span)) { - atomic_add32(&cache->size, -(int32_t)span->list_size); - return span; - } - } - } while (span_ptr); - return 0; -} - -//! Finalize a global cache, only valid from allocator finalization (not thread safe) -static void -_memory_cache_finalize(global_cache_t* cache) { - void* current_cache = atomic_load_ptr(&cache->cache); - span_t* span = (span_t*)((uintptr_t)current_cache & _memory_span_mask); - while (span) { - span_t* skip_span = (span_t*)((uintptr_t)span->prev & _memory_span_mask); - atomic_add32(&cache->size, -(int32_t)span->list_size); - _memory_unmap_span_list(span); - span = skip_span; - } - assert(!atomic_load32(&cache->size)); - atomic_store_ptr(&cache->cache, 0); - atomic_store32(&cache->size, 0); -} - -//! Insert the given list of memory page spans in the global cache -static void -_memory_global_cache_insert(span_t* span) { - size_t span_count = span->span_count; -#if ENABLE_UNLIMITED_GLOBAL_CACHE - _memory_cache_insert(&_memory_span_cache[span_count - 1], span, 0); -#else - const size_t cache_limit = (GLOBAL_CACHE_MULTIPLIER * ((span_count == 1) ? _memory_span_release_count : _memory_span_release_count_large)); - _memory_cache_insert(&_memory_span_cache[span_count - 1], span, cache_limit); -#endif -} - -//! Extract a number of memory page spans from the global cache for large blocks -static span_t* -_memory_global_cache_extract(size_t span_count) { - span_t* span = _memory_cache_extract(&_memory_span_cache[span_count - 1]); - assert(!span || (span->span_count == span_count)); - return span; -} - -#endif - -#if ENABLE_THREAD_CACHE -//! Adopt the deferred span cache list -static void -_memory_heap_cache_adopt_deferred(heap_t* heap) { - atomic_thread_fence_acquire(); - span_t* span = (span_t*)atomic_load_ptr(&heap->span_cache_deferred); - if (!span) - return; - do { - span = (span_t*)atomic_load_ptr(&heap->span_cache_deferred); - } while (!atomic_cas_ptr(&heap->span_cache_deferred, 0, span)); - while (span) { - span_t* next_span = span->next; - _memory_span_list_push(&heap->span_cache[0], span); -#if ENABLE_STATISTICS - atomic_decr32(&heap->span_use[span->span_count - 1].current); - ++heap->size_class_use[span->size_class].spans_to_cache; - --heap->size_class_use[span->size_class].spans_current; -#endif - span = next_span; - } -} -#endif - -//! Insert a single span into thread heap cache, releasing to global cache if overflow -static void -_memory_heap_cache_insert(heap_t* heap, span_t* span) { -#if ENABLE_THREAD_CACHE - size_t span_count = span->span_count; - size_t idx = span_count - 1; - _memory_statistics_inc(heap->span_use[idx].spans_to_cache, 1); - if (!idx) - _memory_heap_cache_adopt_deferred(heap); -#if ENABLE_UNLIMITED_THREAD_CACHE - _memory_span_list_push(&heap->span_cache[idx], span); -#else - const size_t release_count = (!idx ? 
_memory_span_release_count : _memory_span_release_count_large); - size_t current_cache_size = _memory_span_list_push(&heap->span_cache[idx], span); - if (current_cache_size <= release_count) - return; - const size_t hard_limit = release_count * THREAD_CACHE_MULTIPLIER; - if (current_cache_size <= hard_limit) { -#if ENABLE_ADAPTIVE_THREAD_CACHE - //Require 25% of high water mark to remain in cache (and at least 1, if use is 0) - const size_t high_mark = heap->span_use[idx].high; - const size_t min_limit = (high_mark >> 2) + release_count + 1; - if (current_cache_size < min_limit) - return; -#else - return; -#endif - } - heap->span_cache[idx] = _memory_span_list_split(span, release_count); - assert(span->list_size == release_count); -#if ENABLE_STATISTICS - heap->thread_to_global += (size_t)span->list_size * span_count * _memory_span_size; - heap->span_use[idx].spans_to_global += span->list_size; -#endif -#if ENABLE_GLOBAL_CACHE - _memory_global_cache_insert(span); -#else - _memory_unmap_span_list(span); -#endif -#endif -#else - (void)sizeof(heap); - _memory_unmap_span(span); -#endif -} - -//! Extract the given number of spans from the different cache levels -static span_t* -_memory_heap_thread_cache_extract(heap_t* heap, size_t span_count) { -#if ENABLE_THREAD_CACHE - size_t idx = span_count - 1; - if (!idx) - _memory_heap_cache_adopt_deferred(heap); - if (heap->span_cache[idx]) { -#if ENABLE_STATISTICS - heap->span_use[idx].spans_from_cache++; -#endif - return _memory_span_list_pop(&heap->span_cache[idx]); - } -#endif - return 0; -} - -static span_t* -_memory_heap_reserved_extract(heap_t* heap, size_t span_count) { - if (heap->spans_reserved >= span_count) - return _memory_map_spans(heap, span_count); - return 0; -} - -//! Extract a span from the global cache -static span_t* -_memory_heap_global_cache_extract(heap_t* heap, size_t span_count) { -#if ENABLE_GLOBAL_CACHE - size_t idx = span_count - 1; - heap->span_cache[idx] = _memory_global_cache_extract(span_count); - if (heap->span_cache[idx]) { -#if ENABLE_STATISTICS - heap->global_to_thread += (size_t)heap->span_cache[idx]->list_size * span_count * _memory_span_size; - heap->span_use[idx].spans_from_global += heap->span_cache[idx]->list_size; -#endif - return _memory_span_list_pop(&heap->span_cache[idx]); - } -#endif - return 0; -} - -//! 
Get a span from one of the cache levels (thread cache, reserved, global cache) or fallback to mapping more memory -static span_t* -_memory_heap_extract_new_span(heap_t* heap, size_t span_count, uint32_t class_idx) { - (void)sizeof(class_idx); -#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS - uint32_t idx = (uint32_t)span_count - 1; - uint32_t current_count = (uint32_t)atomic_incr32(&heap->span_use[idx].current); - if (current_count > heap->span_use[idx].high) - heap->span_use[idx].high = current_count; -#if ENABLE_STATISTICS - uint32_t spans_current = ++heap->size_class_use[class_idx].spans_current; - if (spans_current > heap->size_class_use[class_idx].spans_peak) - heap->size_class_use[class_idx].spans_peak = spans_current; -#endif -#endif - span_t* span = _memory_heap_thread_cache_extract(heap, span_count); - if (EXPECTED(span != 0)) { - _memory_statistics_inc(heap->size_class_use[class_idx].spans_from_cache, 1); - return span; - } - span = _memory_heap_reserved_extract(heap, span_count); - if (EXPECTED(span != 0)) { - _memory_statistics_inc(heap->size_class_use[class_idx].spans_from_reserved, 1); - return span; - } - span = _memory_heap_global_cache_extract(heap, span_count); - if (EXPECTED(span != 0)) { - _memory_statistics_inc(heap->size_class_use[class_idx].spans_from_cache, 1); - return span; - } - //Final fallback, map in more virtual memory - span = _memory_map_spans(heap, span_count); - _memory_statistics_inc(heap->size_class_use[class_idx].spans_map_calls, 1); - return span; -} - //! Move the span (used for small or medium allocations) to the heap thread cache static void -_memory_span_release_to_cache(heap_t* heap, span_t* span) { - heap_class_t* heap_class = heap->span_class + span->size_class; - assert(heap_class->partial_span != span); - if (span->state == SPAN_STATE_PARTIAL) - _memory_span_partial_list_remove(&heap_class->partial_span, span); +_rpmalloc_span_release_to_cache(heap_t* heap, span_t* span) { + rpmalloc_assert(heap == span->heap, "Span heap pointer corrupted"); + rpmalloc_assert(span->size_class < SIZE_CLASS_COUNT, "Invalid span size class"); + rpmalloc_assert(span->span_count == 1, "Invalid span count"); #if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS atomic_decr32(&heap->span_use[0].current); #endif - _memory_statistics_inc(heap->span_use[0].spans_to_cache, 1); - _memory_statistics_inc(heap->size_class_use[span->size_class].spans_to_cache, 1); - _memory_statistics_dec(heap->size_class_use[span->size_class].spans_current, 1); - _memory_heap_cache_insert(heap, span); + _rpmalloc_stat_dec(&heap->size_class_use[span->size_class].spans_current); + if (!heap->finalize) { + _rpmalloc_stat_inc(&heap->span_use[0].spans_to_cache); + _rpmalloc_stat_inc(&heap->size_class_use[span->size_class].spans_to_cache); + if (heap->size_class[span->size_class].cache) + _rpmalloc_heap_cache_insert(heap, heap->size_class[span->size_class].cache); + heap->size_class[span->size_class].cache = span; + } else { + _rpmalloc_span_unmap(span); + } } //! Initialize a (partial) free list up to next system memory page, while reserving the first block //! 
as allocated, returning number of blocks in list static uint32_t -free_list_partial_init(void** list, void** first_block, void* page_start, void* block_start, - uint32_t block_count, uint32_t block_size) { - assert(block_count); +free_list_partial_init(void** list, void** first_block, void* page_start, void* block_start, uint32_t block_count, uint32_t block_size) { + rpmalloc_assert(block_count, "Internal failure"); *first_block = block_start; if (block_count > 1) { void* free_block = pointer_offset(block_start, block_size); - void* block_end = pointer_offset(block_start, block_size * block_count); + void* block_end = pointer_offset(block_start, (size_t)block_size * block_count); //If block size is less than half a memory page, bound init to next memory page boundary if (block_size < (_memory_page_size >> 1)) { void* page_end = pointer_offset(page_start, _memory_page_size); @@ -1130,75 +1275,802 @@ free_list_partial_init(void** list, void** first_block, void* page_start, void* return block_count; } -//! Initialize an unused span (from cache or mapped) to be new active span +//! Initialize an unused span (from cache or mapped) to be new active span, putting the initial free list in heap class free list static void* -_memory_span_set_new_active(heap_t* heap, heap_class_t* heap_class, span_t* span, uint32_t class_idx) { - assert(span->span_count == 1); +_rpmalloc_span_initialize_new(heap_t* heap, heap_size_class_t* heap_size_class, span_t* span, uint32_t class_idx) { + rpmalloc_assert(span->span_count == 1, "Internal failure"); size_class_t* size_class = _memory_size_class + class_idx; span->size_class = class_idx; span->heap = heap; span->flags &= ~SPAN_FLAG_ALIGNED_BLOCKS; - span->block_count = size_class->block_count; span->block_size = size_class->block_size; - span->state = SPAN_STATE_ACTIVE; + span->block_count = size_class->block_count; span->free_list = 0; + span->list_size = 0; + atomic_store_ptr_release(&span->free_list_deferred, 0); //Setup free list. Only initialize one system page worth of free blocks in list void* block; - span->free_list_limit = free_list_partial_init(&heap_class->free_list, &block, + span->free_list_limit = free_list_partial_init(&heap_size_class->free_list, &block, span, pointer_offset(span, SPAN_HEADER_SIZE), size_class->block_count, size_class->block_size); - atomic_store_ptr(&span->free_list_deferred, 0); - span->list_size = 0; - atomic_thread_fence_release(); - - _memory_span_partial_list_add(&heap_class->partial_span, span); + //Link span as partial if there remains blocks to be initialized as free list, or full if fully initialized + if (span->free_list_limit < span->block_count) { + _rpmalloc_span_double_link_list_add(&heap_size_class->partial_span, span); + span->used_count = span->free_list_limit; + } else { +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_add(&heap->full_span[class_idx], span); +#endif + ++heap->full_span_count; + span->used_count = span->block_count; + } return block; } -//! Promote a partially used span (from heap used list) to be new active span static void -_memory_span_set_partial_active(heap_class_t* heap_class, span_t* span) { - assert(span->state == SPAN_STATE_PARTIAL); - assert(span->block_count == _memory_size_class[span->size_class].block_count); - //Move data to heap size class and set span as active - heap_class->free_list = span->free_list; - span->state = SPAN_STATE_ACTIVE; - span->free_list = 0; - assert(heap_class->free_list); -} - -//! 
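For context, `free_list_partial_init` threads an intrusive singly linked free list through the raw span memory but deliberately stops after one OS page worth of blocks, so that handing out a fresh span only touches a single page. A simplified sketch of that carving step, with made-up names and a plain block-count limit standing in for the real page-boundary check:

```c
/* Simplified sketch of carving an intrusive free list through raw span memory.
 * The "limit" parameter is a stand-in for the page-boundary bound used by the
 * real free_list_partial_init; assumes at least one block. */
#include <stdint.h>

static uint32_t carve_free_list(void** list, void* start,
                                uint32_t block_size, uint32_t block_count,
                                uint32_t limit) {
    uint32_t count = (block_count < limit) ? block_count : limit;
    char* block = (char*)start;
    *list = block;                              /* head of the new free list */
    for (uint32_t i = 1; i < count; ++i) {
        *(void**)block = block + block_size;    /* first word of a free block = next */
        block += block_size;
    }
    *(void**)block = 0;                         /* terminate; remaining blocks stay untouched */
    return count;                               /* caller records how many were initialized */
}
```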
Mark span as full (from active) -static void -_memory_span_set_active_full(heap_class_t* heap_class, span_t* span) { - assert(span->state == SPAN_STATE_ACTIVE); - assert(span == heap_class->partial_span); - _memory_span_partial_list_pop_head(&heap_class->partial_span); - span->used_count = span->block_count; - span->state = SPAN_STATE_FULL; - span->free_list = 0; -} - -//! Move span from full to partial state -static void -_memory_span_set_full_partial(heap_t* heap, span_t* span) { - assert(span->state == SPAN_STATE_FULL); - heap_class_t* heap_class = &heap->span_class[span->size_class]; - span->state = SPAN_STATE_PARTIAL; - _memory_span_partial_list_add_tail(&heap_class->partial_span, span); -} - -static void* -_memory_span_extract_deferred(span_t* span) { - void* free_list; +_rpmalloc_span_extract_free_list_deferred(span_t* span) { + // We need acquire semantics on the CAS operation since we are interested in the list size + // Refer to _rpmalloc_deallocate_defer_small_or_medium for further comments on this dependency do { - free_list = atomic_load_ptr(&span->free_list_deferred); - } while ((free_list == INVALID_POINTER) || !atomic_cas_ptr(&span->free_list_deferred, INVALID_POINTER, free_list)); + span->free_list = atomic_exchange_ptr_acquire(&span->free_list_deferred, INVALID_POINTER); + } while (span->free_list == INVALID_POINTER); + span->used_count -= span->list_size; span->list_size = 0; - atomic_store_ptr(&span->free_list_deferred, 0); - atomic_thread_fence_release(); - return free_list; + atomic_store_ptr_release(&span->free_list_deferred, 0); } +static int +_rpmalloc_span_is_fully_utilized(span_t* span) { + rpmalloc_assert(span->free_list_limit <= span->block_count, "Span free list corrupted"); + return !span->free_list && (span->free_list_limit >= span->block_count); +} + +static int +_rpmalloc_span_finalize(heap_t* heap, size_t iclass, span_t* span, span_t** list_head) { + void* free_list = heap->size_class[iclass].free_list; + span_t* class_span = (span_t*)((uintptr_t)free_list & _memory_span_mask); + if (span == class_span) { + // Adopt the heap class free list back into the span free list + void* block = span->free_list; + void* last_block = 0; + while (block) { + last_block = block; + block = *((void**)block); + } + uint32_t free_count = 0; + block = free_list; + while (block) { + ++free_count; + block = *((void**)block); + } + if (last_block) { + *((void**)last_block) = free_list; + } else { + span->free_list = free_list; + } + heap->size_class[iclass].free_list = 0; + span->used_count -= free_count; + } + //If this assert triggers you have memory leaks + rpmalloc_assert(span->list_size == span->used_count, "Memory leak detected"); + if (span->list_size == span->used_count) { + _rpmalloc_stat_dec(&heap->span_use[0].current); + _rpmalloc_stat_dec(&heap->size_class_use[iclass].spans_current); + // This function only used for spans in double linked lists + if (list_head) + _rpmalloc_span_double_link_list_remove(list_head, span); + _rpmalloc_span_unmap(span); + return 1; + } + return 0; +} + + +//////////// +/// +/// Global cache +/// +////// + +#if ENABLE_GLOBAL_CACHE + +//! 
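The rewritten global cache that follows replaces the old lock-free list with flat arrays guarded by a small spin lock built from `atomic_cas32_acquire` and `_rpmalloc_spin`. A minimal C11 sketch of that locking pattern (the type and function names here are illustrative, not from the patch):

```c
/* Sketch of the CAS spin lock pattern used by the new global cache code. */
#include <stdatomic.h>

typedef struct {
    atomic_int lock;      /* 0 = free, 1 = held */
} spinlock_t;

static void spin_acquire(spinlock_t* l) {
    int expected = 0;
    /* try to flip 0 -> 1 with acquire ordering; on failure reset and retry */
    while (!atomic_compare_exchange_weak_explicit(&l->lock, &expected, 1,
                                                  memory_order_acquire,
                                                  memory_order_relaxed)) {
        expected = 0;     /* compare_exchange wrote the observed value here */
        /* a real implementation yields or pauses here, like _rpmalloc_spin() */
    }
}

static void spin_release(spinlock_t* l) {
    atomic_store_explicit(&l->lock, 0, memory_order_release);
}
```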
Finalize a global cache +static void +_rpmalloc_global_cache_finalize(global_cache_t* cache) { + while (!atomic_cas32_acquire(&cache->lock, 1, 0)) + _rpmalloc_spin(); + + for (size_t ispan = 0; ispan < cache->count; ++ispan) + _rpmalloc_span_unmap(cache->span[ispan]); + cache->count = 0; + + while (cache->overflow) { + span_t* span = cache->overflow; + cache->overflow = span->next; + _rpmalloc_span_unmap(span); + } + + atomic_store32_release(&cache->lock, 0); +} + +static void +_rpmalloc_global_cache_insert_spans(span_t** span, size_t span_count, size_t count) { + const size_t cache_limit = (span_count == 1) ? + GLOBAL_CACHE_MULTIPLIER * MAX_THREAD_SPAN_CACHE : + GLOBAL_CACHE_MULTIPLIER * (MAX_THREAD_SPAN_LARGE_CACHE - (span_count >> 1)); + + global_cache_t* cache = &_memory_span_cache[span_count - 1]; + + size_t insert_count = count; + while (!atomic_cas32_acquire(&cache->lock, 1, 0)) + _rpmalloc_spin(); + +#if ENABLE_STATISTICS + cache->insert_count += count; +#endif + if ((cache->count + insert_count) > cache_limit) + insert_count = cache_limit - cache->count; + + memcpy(cache->span + cache->count, span, sizeof(span_t*) * insert_count); + cache->count += (uint32_t)insert_count; + +#if ENABLE_UNLIMITED_CACHE + while (insert_count < count) { +#else + // Enable unlimited cache if huge pages, or we will leak since it is unlikely that an entire huge page + // will be unmapped, and we're unable to partially decommit a huge page + while ((_memory_page_size > _memory_span_size) && (insert_count < count)) { +#endif + span_t* current_span = span[insert_count++]; + current_span->next = cache->overflow; + cache->overflow = current_span; + } + atomic_store32_release(&cache->lock, 0); + + span_t* keep = 0; + for (size_t ispan = insert_count; ispan < count; ++ispan) { + span_t* current_span = span[ispan]; + // Keep master spans that has remaining subspans to avoid dangling them + if ((current_span->flags & SPAN_FLAG_MASTER) && + (atomic_load32(¤t_span->remaining_spans) > (int32_t)current_span->span_count)) { + current_span->next = keep; + keep = current_span; + } else { + _rpmalloc_span_unmap(current_span); + } + } + + if (keep) { + while (!atomic_cas32_acquire(&cache->lock, 1, 0)) + _rpmalloc_spin(); + + size_t islot = 0; + while (keep) { + for (; islot < cache->count; ++islot) { + span_t* current_span = cache->span[islot]; + if (!(current_span->flags & SPAN_FLAG_MASTER) || ((current_span->flags & SPAN_FLAG_MASTER) && + (atomic_load32(¤t_span->remaining_spans) <= (int32_t)current_span->span_count))) { + _rpmalloc_span_unmap(current_span); + cache->span[islot] = keep; + break; + } + } + if (islot == cache->count) + break; + keep = keep->next; + } + + if (keep) { + span_t* tail = keep; + while (tail->next) + tail = tail->next; + tail->next = cache->overflow; + cache->overflow = keep; + } + + atomic_store32_release(&cache->lock, 0); + } +} + +static size_t +_rpmalloc_global_cache_extract_spans(span_t** span, size_t span_count, size_t count) { + global_cache_t* cache = &_memory_span_cache[span_count - 1]; + + size_t extract_count = 0; + while (!atomic_cas32_acquire(&cache->lock, 1, 0)) + _rpmalloc_spin(); + +#if ENABLE_STATISTICS + cache->extract_count += count; +#endif + size_t want = count - extract_count; + if (want > cache->count) + want = cache->count; + + memcpy(span + extract_count, cache->span + (cache->count - want), sizeof(span_t*) * want); + cache->count -= (uint32_t)want; + extract_count += want; + + while ((extract_count < count) && cache->overflow) { + span_t* current_span = 
cache->overflow; + span[extract_count++] = current_span; + cache->overflow = current_span->next; + } + +#if ENABLE_ASSERTS + for (size_t ispan = 0; ispan < extract_count; ++ispan) { + assert(span[ispan]->span_count == span_count); + } +#endif + + atomic_store32_release(&cache->lock, 0); + + return extract_count; +} + +#endif + +//////////// +/// +/// Heap control +/// +////// + +static void _rpmalloc_deallocate_huge(span_t*); + +//! Store the given spans as reserve in the given heap +static void +_rpmalloc_heap_set_reserved_spans(heap_t* heap, span_t* master, span_t* reserve, size_t reserve_span_count) { + heap->span_reserve_master = master; + heap->span_reserve = reserve; + heap->spans_reserved = (uint32_t)reserve_span_count; +} + +//! Adopt the deferred span cache list, optionally extracting the first single span for immediate re-use +static void +_rpmalloc_heap_cache_adopt_deferred(heap_t* heap, span_t** single_span) { + span_t* span = (span_t*)((void*)atomic_exchange_ptr_acquire(&heap->span_free_deferred, 0)); + while (span) { + span_t* next_span = (span_t*)span->free_list; + rpmalloc_assert(span->heap == heap, "Span heap pointer corrupted"); + if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) { + rpmalloc_assert(heap->full_span_count, "Heap span counter corrupted"); + --heap->full_span_count; + _rpmalloc_stat_dec(&heap->span_use[0].spans_deferred); +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_remove(&heap->full_span[span->size_class], span); +#endif + _rpmalloc_stat_dec(&heap->span_use[0].current); + _rpmalloc_stat_dec(&heap->size_class_use[span->size_class].spans_current); + if (single_span && !*single_span) + *single_span = span; + else + _rpmalloc_heap_cache_insert(heap, span); + } else { + if (span->size_class == SIZE_CLASS_HUGE) { + _rpmalloc_deallocate_huge(span); + } else { + rpmalloc_assert(span->size_class == SIZE_CLASS_LARGE, "Span size class invalid"); + rpmalloc_assert(heap->full_span_count, "Heap span counter corrupted"); + --heap->full_span_count; +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_remove(&heap->large_huge_span, span); +#endif + uint32_t idx = span->span_count - 1; + _rpmalloc_stat_dec(&heap->span_use[idx].spans_deferred); + _rpmalloc_stat_dec(&heap->span_use[idx].current); + if (!idx && single_span && !*single_span) + *single_span = span; + else + _rpmalloc_heap_cache_insert(heap, span); + } + } + span = next_span; + } +} + +static void +_rpmalloc_heap_unmap(heap_t* heap) { + if (!heap->master_heap) { + if ((heap->finalize > 1) && !atomic_load32(&heap->child_count)) { + span_t* span = (span_t*)((uintptr_t)heap & _memory_span_mask); + _rpmalloc_span_unmap(span); + } + } else { + if (atomic_decr32(&heap->master_heap->child_count) == 0) { + _rpmalloc_heap_unmap(heap->master_heap); + } + } +} + +static void +_rpmalloc_heap_global_finalize(heap_t* heap) { + if (heap->finalize++ > 1) { + --heap->finalize; + return; + } + + _rpmalloc_heap_finalize(heap); + +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + span_cache_t* span_cache; + if (!iclass) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t*)(heap->span_large_cache + (iclass - 1)); + for (size_t ispan = 0; ispan < span_cache->count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[ispan]); + span_cache->count = 0; + } +#endif + + if (heap->full_span_count) { + --heap->finalize; + return; + } + + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + if (heap->size_class[iclass].free_list || 
heap->size_class[iclass].partial_span) { + --heap->finalize; + return; + } + } + //Heap is now completely free, unmap and remove from heap list + size_t list_idx = (size_t)heap->id % HEAP_ARRAY_SIZE; + heap_t* list_heap = _memory_heaps[list_idx]; + if (list_heap == heap) { + _memory_heaps[list_idx] = heap->next_heap; + } else { + while (list_heap->next_heap != heap) + list_heap = list_heap->next_heap; + list_heap->next_heap = heap->next_heap; + } + + _rpmalloc_heap_unmap(heap); +} + +//! Insert a single span into thread heap cache, releasing to global cache if overflow +static void +_rpmalloc_heap_cache_insert(heap_t* heap, span_t* span) { + if (UNEXPECTED(heap->finalize != 0)) { + _rpmalloc_span_unmap(span); + _rpmalloc_heap_global_finalize(heap); + return; + } +#if ENABLE_THREAD_CACHE + size_t span_count = span->span_count; + _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_to_cache); + if (span_count == 1) { + span_cache_t* span_cache = &heap->span_cache; + span_cache->span[span_cache->count++] = span; + if (span_cache->count == MAX_THREAD_SPAN_CACHE) { + const size_t remain_count = MAX_THREAD_SPAN_CACHE - THREAD_SPAN_CACHE_TRANSFER; +#if ENABLE_GLOBAL_CACHE + _rpmalloc_stat_add64(&heap->thread_to_global, THREAD_SPAN_CACHE_TRANSFER * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_to_global, THREAD_SPAN_CACHE_TRANSFER); + _rpmalloc_global_cache_insert_spans(span_cache->span + remain_count, span_count, THREAD_SPAN_CACHE_TRANSFER); +#else + for (size_t ispan = 0; ispan < THREAD_SPAN_CACHE_TRANSFER; ++ispan) + _rpmalloc_span_unmap(span_cache->span[remain_count + ispan]); +#endif + span_cache->count = remain_count; + } + } else { + size_t cache_idx = span_count - 2; + span_large_cache_t* span_cache = heap->span_large_cache + cache_idx; + span_cache->span[span_cache->count++] = span; + const size_t cache_limit = (MAX_THREAD_SPAN_LARGE_CACHE - (span_count >> 1)); + if (span_cache->count == cache_limit) { + const size_t transfer_limit = 2 + (cache_limit >> 2); + const size_t transfer_count = (THREAD_SPAN_LARGE_CACHE_TRANSFER <= transfer_limit ? THREAD_SPAN_LARGE_CACHE_TRANSFER : transfer_limit); + const size_t remain_count = cache_limit - transfer_count; +#if ENABLE_GLOBAL_CACHE + _rpmalloc_stat_add64(&heap->thread_to_global, transfer_count * span_count * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_to_global, transfer_count); + _rpmalloc_global_cache_insert_spans(span_cache->span + remain_count, span_count, transfer_count); +#else + for (size_t ispan = 0; ispan < transfer_count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[remain_count + ispan]); +#endif + span_cache->count = remain_count; + } + } +#else + (void)sizeof(heap); + _rpmalloc_span_unmap(span); +#endif +} + +//! 
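The new `_rpmalloc_heap_cache_insert` keeps spans in a fixed-size per-thread array and, when that array fills, hands a batch of entries to the global cache in a single call instead of releasing them one at a time. A stripped-down sketch of that overflow policy; the capacity and transfer constants and the `flush_to_global` stub are placeholders, not the real values:

```c
/* Stripped-down sketch of the thread cache overflow policy: a fixed-size array
 * per thread, flushed in a batch when full. Constants are placeholders for
 * MAX_THREAD_SPAN_CACHE / THREAD_SPAN_CACHE_TRANSFER. */
#include <stddef.h>

#define CACHE_CAPACITY 128
#define CACHE_TRANSFER 32

typedef struct {
    void*  slot[CACHE_CAPACITY];
    size_t count;
} thread_cache_t;

/* stand-in for _rpmalloc_global_cache_insert_spans() */
static void flush_to_global(void** batch, size_t n) { (void)batch; (void)n; }

static void cache_push(thread_cache_t* c, void* span) {
    c->slot[c->count++] = span;
    if (c->count == CACHE_CAPACITY) {
        size_t keep = CACHE_CAPACITY - CACHE_TRANSFER;
        /* hand the newest TRANSFER entries to the shared cache in one call,
         * keeping the older ones hot in the thread-local array */
        flush_to_global(c->slot + keep, CACHE_TRANSFER);
        c->count = keep;
    }
}
```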
Extract the given number of spans from the different cache levels +static span_t* +_rpmalloc_heap_thread_cache_extract(heap_t* heap, size_t span_count) { + span_t* span = 0; +#if ENABLE_THREAD_CACHE + span_cache_t* span_cache; + if (span_count == 1) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t*)(heap->span_large_cache + (span_count - 2)); + if (span_cache->count) { + _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_from_cache); + return span_cache->span[--span_cache->count]; + } +#endif + return span; +} + +static span_t* +_rpmalloc_heap_thread_cache_deferred_extract(heap_t* heap, size_t span_count) { + span_t* span = 0; + if (span_count == 1) { + _rpmalloc_heap_cache_adopt_deferred(heap, &span); + } else { + _rpmalloc_heap_cache_adopt_deferred(heap, 0); + span = _rpmalloc_heap_thread_cache_extract(heap, span_count); + } + return span; +} + +static span_t* +_rpmalloc_heap_reserved_extract(heap_t* heap, size_t span_count) { + if (heap->spans_reserved >= span_count) + return _rpmalloc_span_map(heap, span_count); + return 0; +} + +//! Extract a span from the global cache +static span_t* +_rpmalloc_heap_global_cache_extract(heap_t* heap, size_t span_count) { +#if ENABLE_GLOBAL_CACHE +#if ENABLE_THREAD_CACHE + span_cache_t* span_cache; + size_t wanted_count; + if (span_count == 1) { + span_cache = &heap->span_cache; + wanted_count = THREAD_SPAN_CACHE_TRANSFER; + } else { + span_cache = (span_cache_t*)(heap->span_large_cache + (span_count - 2)); + wanted_count = THREAD_SPAN_LARGE_CACHE_TRANSFER; + } + span_cache->count = _rpmalloc_global_cache_extract_spans(span_cache->span, span_count, wanted_count); + if (span_cache->count) { + _rpmalloc_stat_add64(&heap->global_to_thread, span_count * span_cache->count * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_from_global, span_cache->count); + return span_cache->span[--span_cache->count]; + } +#else + span_t* span = 0; + size_t count = _rpmalloc_global_cache_extract_spans(&span, span_count, 1); + if (count) { + _rpmalloc_stat_add64(&heap->global_to_thread, span_count * count * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_from_global, count); + return span; + } +#endif +#endif + (void)sizeof(heap); + (void)sizeof(span_count); + return 0; +} + +static void +_rpmalloc_inc_span_statistics(heap_t* heap, size_t span_count, uint32_t class_idx) { + (void)sizeof(heap); + (void)sizeof(span_count); + (void)sizeof(class_idx); +#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS + uint32_t idx = (uint32_t)span_count - 1; + uint32_t current_count = (uint32_t)atomic_incr32(&heap->span_use[idx].current); + if (current_count > (uint32_t)atomic_load32(&heap->span_use[idx].high)) + atomic_store32(&heap->span_use[idx].high, (int32_t)current_count); + _rpmalloc_stat_add_peak(&heap->size_class_use[class_idx].spans_current, 1, heap->size_class_use[class_idx].spans_peak); +#endif +} + +//! Get a span from one of the cache levels (thread cache, reserved, global cache) or fallback to mapping more memory +static span_t* +_rpmalloc_heap_extract_new_span(heap_t* heap, heap_size_class_t* heap_size_class, size_t span_count, uint32_t class_idx) { + span_t* span; +#if ENABLE_THREAD_CACHE + if (heap_size_class && heap_size_class->cache) { + span = heap_size_class->cache; + heap_size_class->cache = (heap->span_cache.count ? 
heap->span_cache.span[--heap->span_cache.count] : 0); + _rpmalloc_inc_span_statistics(heap, span_count, class_idx); + return span; + } +#endif + (void)sizeof(class_idx); + // Allow 50% overhead to increase cache hits + size_t base_span_count = span_count; + size_t limit_span_count = (span_count > 2) ? (span_count + (span_count >> 1)) : span_count; + if (limit_span_count > LARGE_CLASS_COUNT) + limit_span_count = LARGE_CLASS_COUNT; + do { + span = _rpmalloc_heap_thread_cache_extract(heap, span_count); + if (EXPECTED(span != 0)) { + _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache); + _rpmalloc_inc_span_statistics(heap, span_count, class_idx); + return span; + } + span = _rpmalloc_heap_thread_cache_deferred_extract(heap, span_count); + if (EXPECTED(span != 0)) { + _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache); + _rpmalloc_inc_span_statistics(heap, span_count, class_idx); + return span; + } + span = _rpmalloc_heap_reserved_extract(heap, span_count); + if (EXPECTED(span != 0)) { + _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_reserved); + _rpmalloc_inc_span_statistics(heap, span_count, class_idx); + return span; + } + span = _rpmalloc_heap_global_cache_extract(heap, span_count); + if (EXPECTED(span != 0)) { + _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache); + _rpmalloc_inc_span_statistics(heap, span_count, class_idx); + return span; + } + ++span_count; + } while (span_count <= limit_span_count); + //Final fallback, map in more virtual memory + span = _rpmalloc_span_map(heap, base_span_count); + _rpmalloc_inc_span_statistics(heap, base_span_count, class_idx); + _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_map_calls); + return span; +} + +static void +_rpmalloc_heap_initialize(heap_t* heap) { + memset((void*)heap, 0, sizeof(heap_t)); + //Get a new heap ID + heap->id = 1 + atomic_incr32(&_memory_heap_id); + + //Link in heap in heap ID map + size_t list_idx = (size_t)heap->id % HEAP_ARRAY_SIZE; + heap->next_heap = _memory_heaps[list_idx]; + _memory_heaps[list_idx] = heap; +} + +static void +_rpmalloc_heap_orphan(heap_t* heap, int first_class) { + heap->owner_thread = (uintptr_t)-1; +#if RPMALLOC_FIRST_CLASS_HEAPS + heap_t** heap_list = (first_class ? &_memory_first_class_orphan_heaps : &_memory_orphan_heaps); +#else + (void)sizeof(first_class); + heap_t** heap_list = &_memory_orphan_heaps; +#endif + heap->next_orphan = *heap_list; + *heap_list = heap; +} + +//! Allocate a new heap from newly mapped memory pages +static heap_t* +_rpmalloc_heap_allocate_new(void) { + // Map in pages for a 16 heaps. If page size is greater than required size for this, map a page and + // use first part for heaps and remaining part for spans for allocations. 
Adds a lot of complexity, + // but saves a lot of memory on systems where page size > 64 spans (4MiB) + size_t heap_size = sizeof(heap_t); + size_t aligned_heap_size = 16 * ((heap_size + 15) / 16); + size_t request_heap_count = 16; + size_t heap_span_count = ((aligned_heap_size * request_heap_count) + sizeof(span_t) + _memory_span_size - 1) / _memory_span_size; + size_t block_size = _memory_span_size * heap_span_count; + size_t span_count = heap_span_count; + span_t* span = 0; + // If there are global reserved spans, use these first + if (_memory_global_reserve_count >= heap_span_count) { + span = _rpmalloc_global_get_reserved_spans(heap_span_count); + } + if (!span) { + if (_memory_page_size > block_size) { + span_count = _memory_page_size / _memory_span_size; + block_size = _memory_page_size; + // If using huge pages, make sure to grab enough heaps to avoid reallocating a huge page just to serve new heaps + size_t possible_heap_count = (block_size - sizeof(span_t)) / aligned_heap_size; + if (possible_heap_count >= (request_heap_count * 16)) + request_heap_count *= 16; + else if (possible_heap_count < request_heap_count) + request_heap_count = possible_heap_count; + heap_span_count = ((aligned_heap_size * request_heap_count) + sizeof(span_t) + _memory_span_size - 1) / _memory_span_size; + } + + size_t align_offset = 0; + span = (span_t*)_rpmalloc_mmap(block_size, &align_offset); + if (!span) + return 0; + + // Master span will contain the heaps + _rpmalloc_stat_inc(&_master_spans); + _rpmalloc_span_initialize(span, span_count, heap_span_count, align_offset); + } + + size_t remain_size = _memory_span_size - sizeof(span_t); + heap_t* heap = (heap_t*)pointer_offset(span, sizeof(span_t)); + _rpmalloc_heap_initialize(heap); + + // Put extra heaps as orphans + size_t num_heaps = remain_size / aligned_heap_size; + if (num_heaps < request_heap_count) + num_heaps = request_heap_count; + atomic_store32(&heap->child_count, (int32_t)num_heaps - 1); + heap_t* extra_heap = (heap_t*)pointer_offset(heap, aligned_heap_size); + while (num_heaps > 1) { + _rpmalloc_heap_initialize(extra_heap); + extra_heap->master_heap = heap; + _rpmalloc_heap_orphan(extra_heap, 1); + extra_heap = (heap_t*)pointer_offset(extra_heap, aligned_heap_size); + --num_heaps; + } + + if (span_count > heap_span_count) { + // Cap reserved spans + size_t remain_count = span_count - heap_span_count; + size_t reserve_count = (remain_count > _memory_heap_reserve_count ? _memory_heap_reserve_count : remain_count); + span_t* remain_span = (span_t*)pointer_offset(span, heap_span_count * _memory_span_size); + _rpmalloc_heap_set_reserved_spans(heap, span, remain_span, reserve_count); + + if (remain_count > reserve_count) { + // Set to global reserved spans + remain_span = (span_t*)pointer_offset(remain_span, reserve_count * _memory_span_size); + reserve_count = remain_count - reserve_count; + _rpmalloc_global_set_reserved_spans(span, remain_span, reserve_count); + } + } + + return heap; +} + +static heap_t* +_rpmalloc_heap_extract_orphan(heap_t** heap_list) { + heap_t* heap = *heap_list; + *heap_list = (heap ? heap->next_orphan : 0); + return heap; +} + +//! 
Allocate a new heap, potentially reusing a previously orphaned heap +static heap_t* +_rpmalloc_heap_allocate(int first_class) { + heap_t* heap = 0; + while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0)) + _rpmalloc_spin(); + if (first_class == 0) + heap = _rpmalloc_heap_extract_orphan(&_memory_orphan_heaps); +#if RPMALLOC_FIRST_CLASS_HEAPS + if (!heap) + heap = _rpmalloc_heap_extract_orphan(&_memory_first_class_orphan_heaps); +#endif + if (!heap) + heap = _rpmalloc_heap_allocate_new(); + atomic_store32_release(&_memory_global_lock, 0); + _rpmalloc_heap_cache_adopt_deferred(heap, 0); + return heap; +} + +extern thread_local bool RpThreadShutdown; + +static void +_rpmalloc_heap_release(void* heapptr, int first_class, int release_cache) { + heap_t* heap = (heap_t*)heapptr; + if (!heap) + return; + RpThreadShutdown = true; + //Release thread cache spans back to global cache + _rpmalloc_heap_cache_adopt_deferred(heap, 0); + if (release_cache || heap->finalize) { +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + span_cache_t* span_cache; + if (!iclass) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t*)(heap->span_large_cache + (iclass - 1)); + if (!span_cache->count) + continue; +#if ENABLE_GLOBAL_CACHE + if (heap->finalize) { + for (size_t ispan = 0; ispan < span_cache->count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[ispan]); + } else { + _rpmalloc_stat_add64(&heap->thread_to_global, span_cache->count * (iclass + 1) * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[iclass].spans_to_global, span_cache->count); + _rpmalloc_global_cache_insert_spans(span_cache->span, iclass + 1, span_cache->count); + } +#else + for (size_t ispan = 0; ispan < span_cache->count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[ispan]); +#endif + span_cache->count = 0; + } +#endif + } + + if (get_thread_heap_raw() == heap) + set_thread_heap(0); + +#if ENABLE_STATISTICS + atomic_decr32(&_memory_active_heaps); + rpmalloc_assert(atomic_load32(&_memory_active_heaps) >= 0, "Still active heaps during finalization"); +#endif + + // If we are forcibly terminating with _exit the state of the + // lock atomic is unknown and it's best to just go ahead and exit + if (get_thread_id() != _rpmalloc_main_thread_id) { + while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0)) + _rpmalloc_spin(); + } + _rpmalloc_heap_orphan(heap, first_class); + atomic_store32_release(&_memory_global_lock, 0); +} + +static void +_rpmalloc_heap_release_raw(void* heapptr, int release_cache) { + _rpmalloc_heap_release(heapptr, 0, release_cache); +} + +static void +_rpmalloc_heap_release_raw_fc(void* heapptr) { + _rpmalloc_heap_release_raw(heapptr, 1); +} + +static void +_rpmalloc_heap_finalize(heap_t* heap) { + if (heap->spans_reserved) { + span_t* span = _rpmalloc_span_map(heap, heap->spans_reserved); + _rpmalloc_span_unmap(span); + heap->spans_reserved = 0; + } + + _rpmalloc_heap_cache_adopt_deferred(heap, 0); + + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + if (heap->size_class[iclass].cache) + _rpmalloc_span_unmap(heap->size_class[iclass].cache); + heap->size_class[iclass].cache = 0; + span_t* span = heap->size_class[iclass].partial_span; + while (span) { + span_t* next = span->next; + _rpmalloc_span_finalize(heap, iclass, span, &heap->size_class[iclass].partial_span); + span = next; + } + // If class still has a free list it must be a full span + if (heap->size_class[iclass].free_list) { + span_t* class_span = 
(span_t*)((uintptr_t)heap->size_class[iclass].free_list & _memory_span_mask); + span_t** list = 0; +#if RPMALLOC_FIRST_CLASS_HEAPS + list = &heap->full_span[iclass]; +#endif + --heap->full_span_count; + if (!_rpmalloc_span_finalize(heap, iclass, class_span, list)) { + if (list) + _rpmalloc_span_double_link_list_remove(list, class_span); + _rpmalloc_span_double_link_list_add(&heap->size_class[iclass].partial_span, class_span); + } + } + } + +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + span_cache_t* span_cache; + if (!iclass) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t*)(heap->span_large_cache + (iclass - 1)); + for (size_t ispan = 0; ispan < span_cache->count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[ispan]); + span_cache->count = 0; + } +#endif + rpmalloc_assert(!atomic_load_ptr(&heap->span_free_deferred), "Heaps still active during finalization"); +} + + +//////////// +/// +/// Allocation entry points +/// +////// + //! Pop first block from a free list static void* free_list_pop(void** list) { @@ -1209,84 +2081,85 @@ free_list_pop(void** list) { //! Allocate a small/medium sized memory block from the given heap static void* -_memory_allocate_from_heap_fallback(heap_t* heap, uint32_t class_idx) { - heap_class_t* heap_class = &heap->span_class[class_idx]; - void* block; - - span_t* active_span = heap_class->partial_span; - if (EXPECTED(active_span != 0)) { - assert(active_span->state == SPAN_STATE_ACTIVE); - assert(active_span->block_count == _memory_size_class[active_span->size_class].block_count); - //Swap in free list if not empty - if (active_span->free_list) { - heap_class->free_list = active_span->free_list; - active_span->free_list = 0; - return free_list_pop(&heap_class->free_list); - } - //If the span did not fully initialize free list, link up another page worth of blocks - if (active_span->free_list_limit < active_span->block_count) { - void* block_start = pointer_offset(active_span, SPAN_HEADER_SIZE + (active_span->free_list_limit * active_span->block_size)); - active_span->free_list_limit += free_list_partial_init(&heap_class->free_list, &block, +_rpmalloc_allocate_from_heap_fallback(heap_t* heap, heap_size_class_t* heap_size_class, uint32_t class_idx) { + span_t* span = heap_size_class->partial_span; + if (EXPECTED(span != 0)) { + rpmalloc_assert(span->block_count == _memory_size_class[span->size_class].block_count, "Span block count corrupted"); + rpmalloc_assert(!_rpmalloc_span_is_fully_utilized(span), "Internal failure"); + void* block; + if (span->free_list) { + //Span local free list is not empty, swap to size class free list + block = free_list_pop(&span->free_list); + heap_size_class->free_list = span->free_list; + span->free_list = 0; + } else { + //If the span did not fully initialize free list, link up another page worth of blocks + void* block_start = pointer_offset(span, SPAN_HEADER_SIZE + ((size_t)span->free_list_limit * span->block_size)); + span->free_list_limit += free_list_partial_init(&heap_size_class->free_list, &block, (void*)((uintptr_t)block_start & ~(_memory_page_size - 1)), block_start, - active_span->block_count - active_span->free_list_limit, active_span->block_size); + span->block_count - span->free_list_limit, span->block_size); + } + rpmalloc_assert(span->free_list_limit <= span->block_count, "Span block count corrupted"); + span->used_count = span->free_list_limit; + + //Swap in deferred free list if present + if (atomic_load_ptr(&span->free_list_deferred)) + 
_rpmalloc_span_extract_free_list_deferred(span); + + //If span is still not fully utilized keep it in partial list and early return block + if (!_rpmalloc_span_is_fully_utilized(span)) return block; - } - //Swap in deferred free list - atomic_thread_fence_acquire(); - if (atomic_load_ptr(&active_span->free_list_deferred)) { - heap_class->free_list = _memory_span_extract_deferred(active_span); - return free_list_pop(&heap_class->free_list); - } - //If the active span is fully allocated, mark span as free floating (fully allocated and not part of any list) - assert(!heap_class->free_list); - assert(active_span->free_list_limit >= active_span->block_count); - _memory_span_set_active_full(heap_class, active_span); + //The span is fully utilized, unlink from partial list and add to fully utilized list + _rpmalloc_span_double_link_list_pop_head(&heap_size_class->partial_span, span); +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_add(&heap->full_span[class_idx], span); +#endif + ++heap->full_span_count; + return block; } - assert(!heap_class->free_list); - - //Try promoting a semi-used span to active - active_span = heap_class->partial_span; - if (EXPECTED(active_span != 0)) { - _memory_span_set_partial_active(heap_class, active_span); - return free_list_pop(&heap_class->free_list); - } - assert(!heap_class->free_list); - assert(!heap_class->partial_span); //Find a span in one of the cache levels - active_span = _memory_heap_extract_new_span(heap, 1, class_idx); + span = _rpmalloc_heap_extract_new_span(heap, heap_size_class, 1, class_idx); + if (EXPECTED(span != 0)) { + //Mark span as owned by this heap and set base data, return first block + return _rpmalloc_span_initialize_new(heap, heap_size_class, span, class_idx); + } - //Mark span as owned by this heap and set base data, return first block - return _memory_span_set_new_active(heap, heap_class, active_span, class_idx); + return 0; } //! Allocate a small sized memory block from the given heap static void* -_memory_allocate_small(heap_t* heap, size_t size) { +_rpmalloc_allocate_small(heap_t* heap, size_t size) { + rpmalloc_assert(heap, "No thread heap"); //Small sizes have unique size classes const uint32_t class_idx = (uint32_t)((size + (SMALL_GRANULARITY - 1)) >> SMALL_GRANULARITY_SHIFT); - _memory_statistics_inc_alloc(heap, class_idx); - if (EXPECTED(heap->span_class[class_idx].free_list != 0)) - return free_list_pop(&heap->span_class[class_idx].free_list); - return _memory_allocate_from_heap_fallback(heap, class_idx); + heap_size_class_t* heap_size_class = heap->size_class + class_idx; + _rpmalloc_stat_inc_alloc(heap, class_idx); + if (EXPECTED(heap_size_class->free_list != 0)) + return free_list_pop(&heap_size_class->free_list); + return _rpmalloc_allocate_from_heap_fallback(heap, heap_size_class, class_idx); } //! 
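For reference, the small-size path above maps a request size directly to a size-class index by rounding up to the small granularity. Assuming the default 16-byte small granularity (shift of 4), the arithmetic works out as in this worked example:

```c
/* Worked example of the small size-class lookup, assuming the default
 * 16-byte small granularity. */
#include <stdint.h>
#include <stdio.h>

#define SMALL_GRANULARITY        16
#define SMALL_GRANULARITY_SHIFT  4

static uint32_t small_class_index(size_t size) {
    /* round up to the next multiple of 16, then divide by 16 */
    return (uint32_t)((size + (SMALL_GRANULARITY - 1)) >> SMALL_GRANULARITY_SHIFT);
}

int main(void) {
    /* 1..16 bytes -> class 1 (16-byte blocks), 17..32 bytes -> class 2 (32-byte blocks) */
    printf("%u %u %u\n", small_class_index(8), small_class_index(17), small_class_index(32));
    return 0;   /* prints: 1 2 2 */
}
```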
Allocate a medium sized memory block from the given heap static void* -_memory_allocate_medium(heap_t* heap, size_t size) { +_rpmalloc_allocate_medium(heap_t* heap, size_t size) { + rpmalloc_assert(heap, "No thread heap"); //Calculate the size class index and do a dependent lookup of the final class index (in case of merged classes) const uint32_t base_idx = (uint32_t)(SMALL_CLASS_COUNT + ((size - (SMALL_SIZE_LIMIT + 1)) >> MEDIUM_GRANULARITY_SHIFT)); const uint32_t class_idx = _memory_size_class[base_idx].class_idx; - _memory_statistics_inc_alloc(heap, class_idx); - if (EXPECTED(heap->span_class[class_idx].free_list != 0)) - return free_list_pop(&heap->span_class[class_idx].free_list); - return _memory_allocate_from_heap_fallback(heap, class_idx); + heap_size_class_t* heap_size_class = heap->size_class + class_idx; + _rpmalloc_stat_inc_alloc(heap, class_idx); + if (EXPECTED(heap_size_class->free_list != 0)) + return free_list_pop(&heap_size_class->free_list); + return _rpmalloc_allocate_from_heap_fallback(heap, heap_size_class, class_idx); } //! Allocate a large sized memory block from the given heap static void* -_memory_allocate_large(heap_t* heap, size_t size) { +_rpmalloc_allocate_large(heap_t* heap, size_t size) { + rpmalloc_assert(heap, "No thread heap"); //Calculate number of needed max sized spans (including header) //Since this function is never called if size > LARGE_SIZE_LIMIT //the span_count is guaranteed to be <= LARGE_CLASS_COUNT @@ -1294,925 +2167,71 @@ _memory_allocate_large(heap_t* heap, size_t size) { size_t span_count = size >> _memory_span_size_shift; if (size & (_memory_span_size - 1)) ++span_count; - size_t idx = span_count - 1; //Find a span in one of the cache levels - span_t* span = _memory_heap_extract_new_span(heap, span_count, SIZE_CLASS_COUNT); + span_t* span = _rpmalloc_heap_extract_new_span(heap, 0, span_count, SIZE_CLASS_LARGE); + if (!span) + return span; //Mark span as owned by this heap and set base data - assert(span->span_count == span_count); - span->size_class = (uint32_t)(SIZE_CLASS_COUNT + idx); + rpmalloc_assert(span->span_count >= span_count, "Internal failure"); + span->size_class = SIZE_CLASS_LARGE; span->heap = heap; - atomic_thread_fence_release(); + +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_add(&heap->large_huge_span, span); +#endif + ++heap->full_span_count; return pointer_offset(span, SPAN_HEADER_SIZE); } //! 
Allocate a huge block by mapping memory pages directly static void* -_memory_allocate_huge(size_t size) { +_rpmalloc_allocate_huge(heap_t* heap, size_t size) { + rpmalloc_assert(heap, "No thread heap"); + _rpmalloc_heap_cache_adopt_deferred(heap, 0); size += SPAN_HEADER_SIZE; size_t num_pages = size >> _memory_page_size_shift; if (size & (_memory_page_size - 1)) ++num_pages; size_t align_offset = 0; - span_t* span = (span_t*)_memory_map(num_pages * _memory_page_size, &align_offset); + span_t* span = (span_t*)_rpmalloc_mmap(num_pages * _memory_page_size, &align_offset); if (!span) return span; + //Store page count in span_count - span->size_class = (uint32_t)-1; + span->size_class = SIZE_CLASS_HUGE; span->span_count = (uint32_t)num_pages; span->align_offset = (uint32_t)align_offset; - _memory_statistics_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak); + span->heap = heap; + _rpmalloc_stat_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak); + +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_add(&heap->large_huge_span, span); +#endif + ++heap->full_span_count; return pointer_offset(span, SPAN_HEADER_SIZE); } -//! Allocate a block larger than medium size -static void* -_memory_allocate_oversized(heap_t* heap, size_t size) { - if (size <= LARGE_SIZE_LIMIT) - return _memory_allocate_large(heap, size); - return _memory_allocate_huge(size); -} - //! Allocate a block of the given size static void* -_memory_allocate(heap_t* heap, size_t size) { +_rpmalloc_allocate(heap_t* heap, size_t size) { + _rpmalloc_stat_add64(&_allocation_counter, 1); if (EXPECTED(size <= SMALL_SIZE_LIMIT)) - return _memory_allocate_small(heap, size); + return _rpmalloc_allocate_small(heap, size); else if (size <= _memory_medium_size_limit) - return _memory_allocate_medium(heap, size); - return _memory_allocate_oversized(heap, size); + return _rpmalloc_allocate_medium(heap, size); + else if (size <= LARGE_SIZE_LIMIT) + return _rpmalloc_allocate_large(heap, size); + return _rpmalloc_allocate_huge(heap, size); } -//! Allocate a new heap -static heap_t* -_memory_allocate_heap(void) { - void* raw_heap; - void* next_raw_heap; - uintptr_t orphan_counter; - heap_t* heap; - heap_t* next_heap; - //Try getting an orphaned heap - atomic_thread_fence_acquire(); - do { - raw_heap = atomic_load_ptr(&_memory_orphan_heaps); - heap = (heap_t*)((uintptr_t)raw_heap & ~(uintptr_t)0x1FF); - if (!heap) - break; - next_heap = heap->next_orphan; - orphan_counter = (uintptr_t)atomic_incr32(&_memory_orphan_counter); - next_raw_heap = (void*)((uintptr_t)next_heap | (orphan_counter & (uintptr_t)0x1FF)); - } while (!atomic_cas_ptr(&_memory_orphan_heaps, next_raw_heap, raw_heap)); - - if (!heap) { - //Map in pages for a new heap - size_t align_offset = 0; - heap = (heap_t*)_memory_map((1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size, &align_offset); - if (!heap) - return heap; - memset((char*)heap, 0, sizeof(heap_t)); - heap->align_offset = align_offset; - - //Get a new heap ID - do { - heap->id = atomic_incr32(&_memory_heap_id); - if (_memory_heap_lookup(heap->id)) - heap->id = 0; - } while (!heap->id); - - //Link in heap in heap ID map - size_t list_idx = heap->id % HEAP_ARRAY_SIZE; - do { - next_heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); - heap->next_heap = next_heap; - } while (!atomic_cas_ptr(&_memory_heaps[list_idx], heap, next_heap)); - } - - return heap; -} - -//! 
Deallocate the given small/medium memory block in the current thread local heap -static void -_memory_deallocate_direct(span_t* span, void* block) { - assert(span->heap == get_thread_heap_raw()); - uint32_t state = span->state; - //Add block to free list - *((void**)block) = span->free_list; - span->free_list = block; - if (UNEXPECTED(state == SPAN_STATE_ACTIVE)) - return; - uint32_t used = --span->used_count; - uint32_t free = span->list_size; - if (UNEXPECTED(used == free)) - _memory_span_release_to_cache(span->heap, span); - else if (UNEXPECTED(state == SPAN_STATE_FULL)) - _memory_span_set_full_partial(span->heap, span); -} - -//! Put the block in the deferred free list of the owning span -static void -_memory_deallocate_defer(span_t* span, void* block) { - atomic_thread_fence_acquire(); - if (span->state == SPAN_STATE_FULL) { - if ((span->list_size + 1) == span->block_count) { - //Span will be completely freed by deferred deallocations, no other thread can - //currently touch it. Safe to move to owner heap deferred cache - span_t* last_head; - heap_t* heap = span->heap; - do { - last_head = (span_t*)atomic_load_ptr(&heap->span_cache_deferred); - span->next = last_head; - } while (!atomic_cas_ptr(&heap->span_cache_deferred, span, last_head)); - return; - } - } - - void* free_list; - do { - atomic_thread_fence_acquire(); - free_list = atomic_load_ptr(&span->free_list_deferred); - *((void**)block) = free_list; - } while ((free_list == INVALID_POINTER) || !atomic_cas_ptr(&span->free_list_deferred, INVALID_POINTER, free_list)); - ++span->list_size; - atomic_store_ptr(&span->free_list_deferred, block); -} - -static void -_memory_deallocate_small_or_medium(span_t* span, void* p) { - _memory_statistics_inc_free(span->heap, span->size_class); - if (span->flags & SPAN_FLAG_ALIGNED_BLOCKS) { - //Realign pointer to block start - void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); - uint32_t block_offset = (uint32_t)pointer_diff(p, blocks_start); - p = pointer_offset(p, -(int32_t)(block_offset % span->block_size)); - } - //Check if block belongs to this heap or if deallocation should be deferred - if (span->heap == get_thread_heap_raw()) - _memory_deallocate_direct(span, p); - else - _memory_deallocate_defer(span, p); -} - -//! 
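When a span carries `SPAN_FLAG_ALIGNED_BLOCKS`, the deallocation path above first snaps the incoming pointer back to the start of its block using a modulo on the offset from the first block. A small worked example; the header size and block size here are illustrative numbers only:

```c
/* Worked example of realigning an offset pointer back to its block start,
 * as done for spans flagged with SPAN_FLAG_ALIGNED_BLOCKS. Numbers are
 * illustrative, not the real span header or class sizes. */
#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint32_t block_size   = 48;          /* hypothetical size class */
    uint32_t span_header  = 128;         /* blocks start after the span header */
    uint32_t p_offset     = span_header + 2 * block_size + 16;  /* pointer inside block 2 */
    uint32_t block_offset = p_offset - span_header;             /* offset from first block */
    uint32_t block_start  = p_offset - (block_offset % block_size);
    printf("%u\n", block_start);         /* prints: 224 = 128 + 2*48, the block's real start */
    return 0;
}
```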
Deallocate the given large memory block to the current heap -static void -_memory_deallocate_large(span_t* span) { - //Decrease counter - assert(span->span_count == ((size_t)span->size_class - SIZE_CLASS_COUNT + 1)); - assert(span->size_class >= SIZE_CLASS_COUNT); - assert(span->size_class - SIZE_CLASS_COUNT < LARGE_CLASS_COUNT); - assert(!(span->flags & SPAN_FLAG_MASTER) || !(span->flags & SPAN_FLAG_SUBSPAN)); - assert((span->flags & SPAN_FLAG_MASTER) || (span->flags & SPAN_FLAG_SUBSPAN)); - //Large blocks can always be deallocated and transferred between heaps - //Investigate if it is better to defer large spans as well through span_cache_deferred, - //possibly with some heuristics to pick either scheme at runtime per deallocation - heap_t* heap = get_thread_heap(); - if (!heap) return; -#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS - size_t idx = span->span_count - 1; - atomic_decr32(&span->heap->span_use[idx].current); -#endif - if ((span->span_count > 1) && !heap->spans_reserved) { - heap->span_reserve = span; - heap->spans_reserved = span->span_count; - if (span->flags & SPAN_FLAG_MASTER) { - heap->span_reserve_master = span; - } else { //SPAN_FLAG_SUBSPAN - uint32_t distance = span->total_spans_or_distance; - span_t* master = (span_t*)pointer_offset(span, -(int32_t)(distance * _memory_span_size)); - heap->span_reserve_master = master; - assert(master->flags & SPAN_FLAG_MASTER); - assert(atomic_load32(&master->remaining_spans) >= (int32_t)span->span_count); - } - _memory_statistics_inc(heap->span_use[idx].spans_to_reserved, 1); - } else { - //Insert into cache list - _memory_heap_cache_insert(heap, span); - } -} - -//! Deallocate the given huge span -static void -_memory_deallocate_huge(span_t* span) { - //Oversized allocation, page count is stored in span_count - size_t num_pages = span->span_count; - _memory_unmap(span, num_pages * _memory_page_size, span->align_offset, num_pages * _memory_page_size); - _memory_statistics_sub(&_huge_pages_current, num_pages); -} - -//! Deallocate the given block -static void -_memory_deallocate(void* p) { - //Grab the span (always at start of span, using span alignment) - span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask); - if (UNEXPECTED(!span)) - return; - if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) - _memory_deallocate_small_or_medium(span, p); - else if (span->size_class != (uint32_t)-1) - _memory_deallocate_large(span); - else - _memory_deallocate_huge(span); -} - -//! 
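The deallocation entry point above finds the owning span purely by pointer arithmetic: spans are mapped at span-size alignment, so masking off the low bits of any block pointer yields the span header. A tiny illustration, assuming the default 64 KiB span size:

```c
/* Tiny illustration of span lookup by alignment, assuming 64 KiB spans. */
#include <stdint.h>
#include <stdio.h>

#define SPAN_SIZE ((uintptr_t)0x10000)          /* 64 KiB */
#define SPAN_MASK (~(SPAN_SIZE - 1))

int main(void) {
    uintptr_t span_base = 0x40000000;           /* hypothetical span, 64 KiB aligned */
    uintptr_t block     = span_base + 0x1a40;   /* some block inside that span */
    uintptr_t recovered = block & SPAN_MASK;    /* clear the low 16 bits */
    printf("%d\n", recovered == span_base);     /* prints: 1 */
    return 0;
}
```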
Reallocate the given block to the given size static void* -_memory_reallocate(void* p, size_t size, size_t oldsize, unsigned int flags) { - if (p) { - //Grab the span using guaranteed span alignment - span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask); - if (span->heap) { - if (span->size_class < SIZE_CLASS_COUNT) { - //Small/medium sized block - assert(span->span_count == 1); - void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); - uint32_t block_offset = (uint32_t)pointer_diff(p, blocks_start); - uint32_t block_idx = block_offset / span->block_size; - void* block = pointer_offset(blocks_start, block_idx * span->block_size); - if (!oldsize) - oldsize = span->block_size - (uint32_t)pointer_diff(p, block); - if ((size_t)span->block_size >= size) { - //Still fits in block, never mind trying to save memory, but preserve data if alignment changed - if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE)) - memmove(block, p, oldsize); - return block; - } - } else { - //Large block - size_t total_size = size + SPAN_HEADER_SIZE; - size_t num_spans = total_size >> _memory_span_size_shift; - if (total_size & (_memory_span_mask - 1)) - ++num_spans; - size_t current_spans = span->span_count; - assert(current_spans == ((span->size_class - SIZE_CLASS_COUNT) + 1)); - void* block = pointer_offset(span, SPAN_HEADER_SIZE); - if (!oldsize) - oldsize = (current_spans * _memory_span_size) - (size_t)pointer_diff(p, block) - SPAN_HEADER_SIZE; - if ((current_spans >= num_spans) && (num_spans >= (current_spans / 2))) { - //Still fits in block, never mind trying to save memory, but preserve data if alignment changed - if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE)) - memmove(block, p, oldsize); - return block; - } - } - } else { - //Oversized block - size_t total_size = size + SPAN_HEADER_SIZE; - size_t num_pages = total_size >> _memory_page_size_shift; - if (total_size & (_memory_page_size - 1)) - ++num_pages; - //Page count is stored in span_count - size_t current_pages = span->span_count; - void* block = pointer_offset(span, SPAN_HEADER_SIZE); - if (!oldsize) - oldsize = (current_pages * _memory_page_size) - (size_t)pointer_diff(p, block) - SPAN_HEADER_SIZE; - if ((current_pages >= num_pages) && (num_pages >= (current_pages / 2))) { - //Still fits in block, never mind trying to save memory, but preserve data if alignment changed - if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE)) - memmove(block, p, oldsize); - return block; - } - } - } else { - oldsize = 0; - } - - //Size is greater than block size, need to allocate a new block and deallocate the old - heap_t* heap = get_thread_heap(); - //Avoid hysteresis by overallocating if increase is small (below 37%) - size_t lower_bound = oldsize + (oldsize >> 2) + (oldsize >> 3); - size_t new_size = (size > lower_bound) ? size : ((size > oldsize) ? lower_bound : size); - void* block = _memory_allocate(heap, new_size); - if (p && block) { - if (!(flags & RPMALLOC_NO_PRESERVE)) - memcpy(block, p, oldsize < new_size ? oldsize : new_size); - _memory_deallocate(p); - } - - return block; -} - -//! 
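When `_memory_reallocate` above does have to move the block, it avoids reallocation hysteresis by growing to at least `oldsize + oldsize/4 + oldsize/8`, i.e. a 37.5% growth floor (the in-code comment rounds this to "37%"). A worked example of that policy:

```c
/* Worked example of the 37.5% growth floor applied when a block must move. */
#include <stdio.h>
#include <stddef.h>

static size_t grow_size(size_t oldsize, size_t size) {
    size_t lower_bound = oldsize + (oldsize >> 2) + (oldsize >> 3);   /* oldsize * 1.375 */
    return (size > lower_bound) ? size : ((size > oldsize) ? lower_bound : size);
}

int main(void) {
    /* growing 1000 -> 1100 actually reserves 1375, so the next few small grows
     * (up to 1375 bytes) can be served without another move */
    printf("%zu %zu\n", grow_size(1000, 1100), grow_size(1000, 2000));
    return 0;   /* prints: 1375 2000 */
}
```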
Get the usable size of the given block -static size_t -_memory_usable_size(void* p) { - //Grab the span using guaranteed span alignment - span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask); - if (span->heap) { - //Small/medium block - if (span->size_class < SIZE_CLASS_COUNT) { - void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); - return span->block_size - ((size_t)pointer_diff(p, blocks_start) % span->block_size); - } - - //Large block - size_t current_spans = (span->size_class - SIZE_CLASS_COUNT) + 1; - return (current_spans * _memory_span_size) - (size_t)pointer_diff(p, span); - } - - //Oversized block, page count is stored in span_count - size_t current_pages = span->span_count; - return (current_pages * _memory_page_size) - (size_t)pointer_diff(p, span); -} - -//! Adjust and optimize the size class properties for the given class -static void -_memory_adjust_size_class(size_t iclass) { - size_t block_size = _memory_size_class[iclass].block_size; - size_t block_count = (_memory_span_size - SPAN_HEADER_SIZE) / block_size; - - _memory_size_class[iclass].block_count = (uint16_t)block_count; - _memory_size_class[iclass].class_idx = (uint16_t)iclass; - - //Check if previous size classes can be merged - size_t prevclass = iclass; - while (prevclass > 0) { - --prevclass; - //A class can be merged if number of pages and number of blocks are equal - if (_memory_size_class[prevclass].block_count == _memory_size_class[iclass].block_count) - memcpy(_memory_size_class + prevclass, _memory_size_class + iclass, sizeof(_memory_size_class[iclass])); - else - break; - } -} - -static void -_memory_heap_finalize(void* heapptr) { - heap_t* heap = (heap_t*)heapptr; - if (!heap) - return; - //Release thread cache spans back to global cache -#if ENABLE_THREAD_CACHE - _memory_heap_cache_adopt_deferred(heap); - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - span_t* span = heap->span_cache[iclass]; -#if ENABLE_GLOBAL_CACHE - while (span) { - assert(span->span_count == (iclass + 1)); - size_t release_count = (!iclass ? 
_memory_span_release_count : _memory_span_release_count_large); - span_t* next = _memory_span_list_split(span, (uint32_t)release_count); -#if ENABLE_STATISTICS - heap->thread_to_global += (size_t)span->list_size * span->span_count * _memory_span_size; - heap->span_use[iclass].spans_to_global += span->list_size; -#endif - _memory_global_cache_insert(span); - span = next; - } -#else - if (span) - _memory_unmap_span_list(span); -#endif - heap->span_cache[iclass] = 0; - } -#endif - - //Orphan the heap - void* raw_heap; - uintptr_t orphan_counter; - heap_t* last_heap; - do { - last_heap = (heap_t*)atomic_load_ptr(&_memory_orphan_heaps); - heap->next_orphan = (heap_t*)((uintptr_t)last_heap & ~(uintptr_t)0x1FF); - orphan_counter = (uintptr_t)atomic_incr32(&_memory_orphan_counter); - raw_heap = (void*)((uintptr_t)heap | (orphan_counter & (uintptr_t)0x1FF)); - } while (!atomic_cas_ptr(&_memory_orphan_heaps, raw_heap, last_heap)); - - set_thread_heap(0); - -#if ENABLE_STATISTICS - atomic_decr32(&_memory_active_heaps); - assert(atomic_load32(&_memory_active_heaps) >= 0); -#endif -} - -#if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) -#include -static DWORD fls_key; -static void NTAPI -rp_thread_destructor(void* value) { - if (value) - rpmalloc_thread_finalize(); -} -#endif - -#if PLATFORM_POSIX -# include -# include -# ifdef __FreeBSD__ -# include -# define MAP_HUGETLB MAP_ALIGNED_SUPER -# endif -# ifndef MAP_UNINITIALIZED -# define MAP_UNINITIALIZED 0 -# endif -#endif -#include - -//! Initialize the allocator and setup global data -TRACY_API int -rpmalloc_initialize(void) { - if (_rpmalloc_initialized) { - rpmalloc_thread_initialize(); - return 0; - } - memset(&_memory_config, 0, sizeof(rpmalloc_config_t)); - return rpmalloc_initialize_config(0); -} - -int -rpmalloc_initialize_config(const rpmalloc_config_t* config) { - if (_rpmalloc_initialized) { - rpmalloc_thread_initialize(); - return 0; - } - _rpmalloc_initialized = 1; - - if (config) - memcpy(&_memory_config, config, sizeof(rpmalloc_config_t)); - - if (!_memory_config.memory_map || !_memory_config.memory_unmap) { - _memory_config.memory_map = _memory_map_os; - _memory_config.memory_unmap = _memory_unmap_os; - } - -#if RPMALLOC_CONFIGURABLE - _memory_page_size = _memory_config.page_size; -#else - _memory_page_size = 0; -#endif - _memory_huge_pages = 0; - _memory_map_granularity = _memory_page_size; - if (!_memory_page_size) { -#if PLATFORM_WINDOWS - SYSTEM_INFO system_info; - memset(&system_info, 0, sizeof(system_info)); - GetSystemInfo(&system_info); - _memory_page_size = system_info.dwPageSize; - _memory_map_granularity = system_info.dwAllocationGranularity; - if (config && config->enable_huge_pages) { - HANDLE token = 0; - size_t large_page_minimum = GetLargePageMinimum(); - if (large_page_minimum) - OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &token); - if (token) { - LUID luid; - if (LookupPrivilegeValue(0, SE_LOCK_MEMORY_NAME, &luid)) { - TOKEN_PRIVILEGES token_privileges; - memset(&token_privileges, 0, sizeof(token_privileges)); - token_privileges.PrivilegeCount = 1; - token_privileges.Privileges[0].Luid = luid; - token_privileges.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; - if (AdjustTokenPrivileges(token, FALSE, &token_privileges, 0, 0, 0)) { - DWORD err = GetLastError(); - if (err == ERROR_SUCCESS) { - _memory_huge_pages = 1; - _memory_page_size = large_page_minimum; - _memory_map_granularity = large_page_minimum; - } - } - } - 
CloseHandle(token); - } - } -#else - _memory_page_size = (size_t)sysconf(_SC_PAGESIZE); - _memory_map_granularity = _memory_page_size; - if (config && config->enable_huge_pages) { -#if defined(__linux__) - size_t huge_page_size = 0; - FILE* meminfo = fopen("/proc/meminfo", "r"); - if (meminfo) { - char line[128]; - while (!huge_page_size && fgets(line, sizeof(line) - 1, meminfo)) { - line[sizeof(line) - 1] = 0; - if (strstr(line, "Hugepagesize:")) - huge_page_size = (size_t)strtol(line + 13, 0, 10) * 1024; - } - fclose(meminfo); - } - if (huge_page_size) { - _memory_huge_pages = 1; - _memory_page_size = huge_page_size; - _memory_map_granularity = huge_page_size; - } -#elif defined(__FreeBSD__) - int rc; - size_t sz = sizeof(rc); - - if (sysctlbyname("vm.pmap.pg_ps_enabled", &rc, &sz, NULL, 0) == 0 && rc == 1) { - _memory_huge_pages = 1; - _memory_page_size = 2 * 1024 * 1024; - _memory_map_granularity = _memory_page_size; - } -#elif defined(__APPLE__) - _memory_huge_pages = 1; - _memory_page_size = 2 * 1024 * 1024; - _memory_map_granularity = _memory_page_size; -#endif - } -#endif - } else { - if (config && config->enable_huge_pages) - _memory_huge_pages = 1; - } - - //The ABA counter in heap orphan list is tied to using 512 (bitmask 0x1FF) - if (_memory_page_size < 512) - _memory_page_size = 512; - if (_memory_page_size > (64 * 1024 * 1024)) - _memory_page_size = (64 * 1024 * 1024); - _memory_page_size_shift = 0; - size_t page_size_bit = _memory_page_size; - while (page_size_bit != 1) { - ++_memory_page_size_shift; - page_size_bit >>= 1; - } - _memory_page_size = ((size_t)1 << _memory_page_size_shift); - -#if RPMALLOC_CONFIGURABLE - size_t span_size = _memory_config.span_size; - if (!span_size) - span_size = (64 * 1024); - if (span_size > (256 * 1024)) - span_size = (256 * 1024); - _memory_span_size = 4096; - _memory_span_size_shift = 12; - while (_memory_span_size < span_size) { - _memory_span_size <<= 1; - ++_memory_span_size_shift; - } - _memory_span_mask = ~(uintptr_t)(_memory_span_size - 1); -#endif - - _memory_span_map_count = ( _memory_config.span_map_count ? _memory_config.span_map_count : DEFAULT_SPAN_MAP_COUNT); - if ((_memory_span_size * _memory_span_map_count) < _memory_page_size) - _memory_span_map_count = (_memory_page_size / _memory_span_size); - if ((_memory_page_size >= _memory_span_size) && ((_memory_span_map_count * _memory_span_size) % _memory_page_size)) - _memory_span_map_count = (_memory_page_size / _memory_span_size); - - _memory_config.page_size = _memory_page_size; - _memory_config.span_size = _memory_span_size; - _memory_config.span_map_count = _memory_span_map_count; - _memory_config.enable_huge_pages = _memory_huge_pages; - - _memory_span_release_count = (_memory_span_map_count > 4 ? ((_memory_span_map_count < 64) ? _memory_span_map_count : 64) : 4); - _memory_span_release_count_large = (_memory_span_release_count > 8 ? 
(_memory_span_release_count / 4) : 2); - -#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD - if (pthread_key_create(&_memory_thread_heap, _memory_heap_finalize)) - return -1; -#endif -#if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) - fls_key = FlsAlloc(&rp_thread_destructor); -#endif - - atomic_store32(&_memory_heap_id, 0); - atomic_store32(&_memory_orphan_counter, 0); -#if ENABLE_STATISTICS - atomic_store32(&_memory_active_heaps, 0); - atomic_store32(&_reserved_spans, 0); - atomic_store32(&_mapped_pages, 0); - _mapped_pages_peak = 0; - atomic_store32(&_mapped_total, 0); - atomic_store32(&_unmapped_total, 0); - atomic_store32(&_mapped_pages_os, 0); - atomic_store32(&_huge_pages_current, 0); - _huge_pages_peak = 0; -#endif - - //Setup all small and medium size classes - size_t iclass = 0; - _memory_size_class[iclass].block_size = SMALL_GRANULARITY; - _memory_adjust_size_class(iclass); - for (iclass = 1; iclass < SMALL_CLASS_COUNT; ++iclass) { - size_t size = iclass * SMALL_GRANULARITY; - _memory_size_class[iclass].block_size = (uint32_t)size; - _memory_adjust_size_class(iclass); - } - //At least two blocks per span, then fall back to large allocations - _memory_medium_size_limit = (_memory_span_size - SPAN_HEADER_SIZE) >> 1; - if (_memory_medium_size_limit > MEDIUM_SIZE_LIMIT) - _memory_medium_size_limit = MEDIUM_SIZE_LIMIT; - for (iclass = 0; iclass < MEDIUM_CLASS_COUNT; ++iclass) { - size_t size = SMALL_SIZE_LIMIT + ((iclass + 1) * MEDIUM_GRANULARITY); - if (size > _memory_medium_size_limit) - break; - _memory_size_class[SMALL_CLASS_COUNT + iclass].block_size = (uint32_t)size; - _memory_adjust_size_class(SMALL_CLASS_COUNT + iclass); - } - - for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) - atomic_store_ptr(&_memory_heaps[list_idx], 0); - - //Initialize this thread - rpmalloc_thread_initialize(); - return 0; -} - -//! 
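The page-size handling in the removed rpmalloc_initialize_config() above (and in its rewritten counterpart later in this diff) clamps the page size and then forces it to a power of two by counting right-shifts down to 1 and rebuilding the value as 1 << shift, which rounds any odd input down. A minimal sketch of that rounding, with hypothetical names:

    #include <stddef.h>
    #include <assert.h>

    /* Count how many times the page size can be shifted right until it
     * reaches 1, then rebuild it as 1 << shift. Any non-power-of-two input
     * is effectively rounded down to the previous power of two. */
    static size_t round_page_size(size_t page_size, size_t* shift_out) {
        size_t shift = 0;
        size_t bit = page_size;
        while (bit != 1) {
            ++shift;
            bit >>= 1;
        }
        *shift_out = shift;
        return (size_t)1 << shift;
    }

    int main(void) {
        size_t shift;
        assert(round_page_size(4096, &shift) == 4096 && shift == 12);
        assert(round_page_size(5000, &shift) == 4096);  /* rounded down */
        return 0;
    }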
Finalize the allocator -TRACY_API void -rpmalloc_finalize(void) { - atomic_thread_fence_acquire(); - - rpmalloc_thread_finalize(); - //rpmalloc_dump_statistics(stderr); - - //Free all thread caches - for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { - heap_t* heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); - while (heap) { - if (heap->spans_reserved) { - span_t* span = _memory_map_spans(heap, heap->spans_reserved); - _memory_unmap_span(span); - } - - for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { - heap_class_t* heap_class = heap->span_class + iclass; - span_t* span = heap_class->partial_span; - while (span) { - span_t* next = span->next; - if (span->state == SPAN_STATE_ACTIVE) { - uint32_t used_blocks = span->block_count; - if (span->free_list_limit < span->block_count) - used_blocks = span->free_list_limit; - uint32_t free_blocks = 0; - void* block = heap_class->free_list; - while (block) { - ++free_blocks; - block = *((void**)block); - } - block = span->free_list; - while (block) { - ++free_blocks; - block = *((void**)block); - } - if (used_blocks == (free_blocks + span->list_size)) - _memory_heap_cache_insert(heap, span); - } else { - if (span->used_count == span->list_size) - _memory_heap_cache_insert(heap, span); - } - span = next; - } - } - -#if ENABLE_THREAD_CACHE - //Free span caches (other thread might have deferred after the thread using this heap finalized) - _memory_heap_cache_adopt_deferred(heap); - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - if (heap->span_cache[iclass]) - _memory_unmap_span_list(heap->span_cache[iclass]); - } -#endif - heap_t* next_heap = heap->next_heap; - size_t heap_size = (1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size; - _memory_unmap(heap, heap_size, heap->align_offset, heap_size); - heap = next_heap; - } - } - -#if ENABLE_GLOBAL_CACHE - //Free global caches - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) - _memory_cache_finalize(&_memory_span_cache[iclass]); -#endif - - atomic_store_ptr(&_memory_orphan_heaps, 0); - atomic_thread_fence_release(); - -#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD - pthread_key_delete(_memory_thread_heap); -#endif -#if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) - FlsFree(fls_key); -#endif - -#if ENABLE_STATISTICS - //If you hit these asserts you probably have memory leaks or double frees in your code - assert(!atomic_load32(&_mapped_pages)); - assert(!atomic_load32(&_reserved_spans)); - assert(!atomic_load32(&_mapped_pages_os)); -#endif - - _rpmalloc_initialized = 0; -} - -//! Initialize thread, assign heap -TRACY_API void -rpmalloc_thread_initialize(void) { - if (!get_thread_heap_raw()) { - heap_t* heap = _memory_allocate_heap(); - if (heap) { - atomic_thread_fence_acquire(); -#if ENABLE_STATISTICS - atomic_incr32(&_memory_active_heaps); -#endif - set_thread_heap(heap); -#if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) - FlsSetValue(fls_key, heap); -#endif - } - } -} - -//! Finalize thread, orphan heap -TRACY_API void -rpmalloc_thread_finalize(void) { - heap_t* heap = get_thread_heap_raw(); - if (heap) - _memory_heap_finalize(heap); -} - -int -rpmalloc_is_thread_initialized(void) { - return (get_thread_heap_raw() != 0) ? 1 : 0; -} - -const rpmalloc_config_t* -rpmalloc_config(void) { - return &_memory_config; -} - -//! 
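One detail worth calling out from the removed _memory_heap_finalize() above: the orphan-heap list head is updated with a CAS on a tagged pointer whose low 9 bits (mask 0x1FF) carry a wrapping counter; that is why the removed code clamps the page size to at least 512 bytes, and it is what protects the lock-free orphan list against ABA. A simplified, hypothetical illustration of the tagging scheme (not the patch's code):

    #include <stdint.h>
    #include <stdio.h>

    #define TAG_MASK ((uintptr_t)0x1FF)  /* low 9 bits; needs >= 512-byte alignment */

    /* Pack a pointer and a wrapping counter into one word for a tagged CAS. */
    static void* tag_ptr(void* heap, uintptr_t counter) {
        return (void*)(((uintptr_t)heap & ~TAG_MASK) | (counter & TAG_MASK));
    }

    static void* untag_ptr(void* tagged) {
        return (void*)((uintptr_t)tagged & ~TAG_MASK);
    }

    int main(void) {
        /* A heap placed on a 4096-byte boundary leaves the low bits zero. */
        static _Alignas(4096) unsigned char heap_storage[4096];
        void* tagged = tag_ptr(heap_storage, 123);
        printf("tag=%lu same_ptr=%d\n",
               (unsigned long)((uintptr_t)tagged & TAG_MASK),
               untag_ptr(tagged) == (void*)heap_storage);
        return 0;
    }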
Map new pages to virtual memory -static void* -_memory_map_os(size_t size, size_t* offset) { - //Either size is a heap (a single page) or a (multiple) span - we only need to align spans, and only if larger than map granularity - size_t padding = ((size >= _memory_span_size) && (_memory_span_size > _memory_map_granularity)) ? _memory_span_size : 0; - assert(size >= _memory_page_size); -#if PLATFORM_WINDOWS - //Ok to MEM_COMMIT - according to MSDN, "actual physical pages are not allocated unless/until the virtual addresses are actually accessed" - void* ptr = VirtualAlloc(0, size + padding, (_memory_huge_pages ? MEM_LARGE_PAGES : 0) | MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); - if (!ptr) { - assert(!"Failed to map virtual memory block"); - return 0; - } -#else - int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED; -# if defined(__APPLE__) - int fd = (int)VM_MAKE_TAG(240U); - if (_memory_huge_pages) - fd |= VM_FLAGS_SUPERPAGE_SIZE_2MB; - void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, fd, 0); -# elif defined(MAP_HUGETLB) - void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, (_memory_huge_pages ? MAP_HUGETLB : 0) | flags, -1, 0); -# else - void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, -1, 0); -# endif - if ((ptr == MAP_FAILED) || !ptr) { - assert("Failed to map virtual memory block" == 0); - return 0; - } -#endif -#if ENABLE_STATISTICS - atomic_add32(&_mapped_pages_os, (int32_t)((size + padding) >> _memory_page_size_shift)); -#endif - if (padding) { - size_t final_padding = padding - ((uintptr_t)ptr & ~_memory_span_mask); - assert(final_padding <= _memory_span_size); - assert(final_padding <= padding); - assert(!(final_padding % 8)); - ptr = pointer_offset(ptr, final_padding); - *offset = final_padding >> 3; - } - assert((size < _memory_span_size) || !((uintptr_t)ptr & ~_memory_span_mask)); - return ptr; -} - -//! Unmap pages from virtual memory -static void -_memory_unmap_os(void* address, size_t size, size_t offset, size_t release) { - assert(release || (offset == 0)); - assert(!release || (release >= _memory_page_size)); - assert(size >= _memory_page_size); - if (release && offset) { - offset <<= 3; - address = pointer_offset(address, -(int32_t)offset); -#if PLATFORM_POSIX - //Padding is always one span size - release += _memory_span_size; -#endif - } -#if !DISABLE_UNMAP -#if PLATFORM_WINDOWS - if (!VirtualFree(address, release ? 0 : size, release ? 
MEM_RELEASE : MEM_DECOMMIT)) { - assert(!"Failed to unmap virtual memory block"); - } -#else - if (release) { - if (munmap(address, release)) { - assert("Failed to unmap virtual memory block" == 0); - } - } - else { -#if defined(POSIX_MADV_FREE) - if (posix_madvise(address, size, POSIX_MADV_FREE)) -#endif -#if defined(POSIX_MADV_DONTNEED) - if (posix_madvise(address, size, POSIX_MADV_DONTNEED)) { - assert("Failed to madvise virtual memory block as free" == 0); - } -#endif - } -#endif -#endif -#if ENABLE_STATISTICS - if (release) - atomic_add32(&_mapped_pages_os, -(int32_t)(release >> _memory_page_size_shift)); -#endif -} - -// Extern interface - -TRACY_API RPMALLOC_ALLOCATOR void* -rpmalloc(size_t size) { -#if ENABLE_VALIDATE_ARGS - if (size >= MAX_ALLOC_SIZE) { - errno = EINVAL; - return 0; - } -#endif - heap_t* heap = get_thread_heap(); - return _memory_allocate(heap, size); -} - -TRACY_API void -rpfree(void* ptr) { - _memory_deallocate(ptr); -} - -extern inline RPMALLOC_ALLOCATOR void* -rpcalloc(size_t num, size_t size) { - size_t total; -#if ENABLE_VALIDATE_ARGS -#if PLATFORM_WINDOWS - int err = SizeTMult(num, size, &total); - if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { - errno = EINVAL; - return 0; - } -#else - int err = __builtin_umull_overflow(num, size, &total); - if (err || (total >= MAX_ALLOC_SIZE)) { - errno = EINVAL; - return 0; - } -#endif -#else - total = num * size; -#endif - heap_t* heap = get_thread_heap(); - void* block = _memory_allocate(heap, total); - memset(block, 0, total); - return block; -} - -TRACY_API RPMALLOC_ALLOCATOR void* -rprealloc(void* ptr, size_t size) { -#if ENABLE_VALIDATE_ARGS - if (size >= MAX_ALLOC_SIZE) { - errno = EINVAL; - return ptr; - } -#endif - return _memory_reallocate(ptr, size, 0, 0); -} - -extern RPMALLOC_ALLOCATOR void* -rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize, - unsigned int flags) { -#if ENABLE_VALIDATE_ARGS - if ((size + alignment < size) || (alignment > _memory_page_size)) { - errno = EINVAL; - return 0; - } -#endif - void* block; - if (alignment > 32) { - size_t usablesize = _memory_usable_size(ptr); - if ((usablesize >= size) && (size >= (usablesize / 2)) && !((uintptr_t)ptr & (alignment - 1))) - return ptr; - - block = rpaligned_alloc(alignment, size); - if (ptr) { - if (!oldsize) - oldsize = usablesize; - if (!(flags & RPMALLOC_NO_PRESERVE)) - memcpy(block, ptr, oldsize < size ? oldsize : size); - rpfree(ptr); - } - //Mark as having aligned blocks - span_t* span = (span_t*)((uintptr_t)block & _memory_span_mask); - span->flags |= SPAN_FLAG_ALIGNED_BLOCKS; - } else { - block = _memory_reallocate(ptr, size, oldsize, flags); - } - return block; -} - -extern RPMALLOC_ALLOCATOR void* -rpaligned_alloc(size_t alignment, size_t size) { - if (alignment <= 16) - return rpmalloc(size); +_rpmalloc_aligned_allocate(heap_t* heap, size_t alignment, size_t size) { + if (alignment <= SMALL_GRANULARITY) + return _rpmalloc_allocate(heap, size); #if ENABLE_VALIDATE_ARGS if ((size + alignment) < size) { @@ -2225,15 +2244,26 @@ rpaligned_alloc(size_t alignment, size_t size) { } #endif + if ((alignment <= SPAN_HEADER_SIZE) && (size < _memory_medium_size_limit)) { + // If alignment is less or equal to span header size (which is power of two), + // and size aligned to span header size multiples is less than size + alignment, + // then use natural alignment of blocks to provide alignment + size_t multiple_size = size ? 
(size + (SPAN_HEADER_SIZE - 1)) & ~(uintptr_t)(SPAN_HEADER_SIZE - 1) : SPAN_HEADER_SIZE; + rpmalloc_assert(!(multiple_size % SPAN_HEADER_SIZE), "Failed alignment calculation"); + if (multiple_size <= (size + alignment)) + return _rpmalloc_allocate(heap, multiple_size); + } + void* ptr = 0; size_t align_mask = alignment - 1; - if (alignment < _memory_page_size) { - ptr = rpmalloc(size + alignment); - if ((uintptr_t)ptr & align_mask) + if (alignment <= _memory_page_size) { + ptr = _rpmalloc_allocate(heap, size + alignment); + if ((uintptr_t)ptr & align_mask) { ptr = (void*)(((uintptr_t)ptr & ~(uintptr_t)align_mask) + alignment); - //Mark as having aligned blocks - span_t* span = (span_t*)((uintptr_t)ptr & _memory_span_mask); - span->flags |= SPAN_FLAG_ALIGNED_BLOCKS; + //Mark as having aligned blocks + span_t* span = (span_t*)((uintptr_t)ptr & _memory_span_mask); + span->flags |= SPAN_FLAG_ALIGNED_BLOCKS; + } return ptr; } @@ -2277,7 +2307,7 @@ retry: align_offset = 0; mapped_size = num_pages * _memory_page_size; - span = (span_t*)_memory_map(mapped_size, &align_offset); + span = (span_t*)_rpmalloc_mmap(mapped_size, &align_offset); if (!span) { errno = ENOMEM; return 0; @@ -2290,7 +2320,7 @@ retry: if (((size_t)pointer_diff(ptr, span) >= _memory_span_size) || (pointer_offset(ptr, size) > pointer_offset(span, mapped_size)) || (((uintptr_t)ptr & _memory_span_mask) != (uintptr_t)span)) { - _memory_unmap(span, mapped_size, align_offset, mapped_size); + _rpmalloc_unmap(span, mapped_size, align_offset, mapped_size); ++num_pages; if (num_pages > limit_pages) { errno = EINVAL; @@ -2300,14 +2330,774 @@ retry: } //Store page count in span_count - span->size_class = (uint32_t)-1; + span->size_class = SIZE_CLASS_HUGE; span->span_count = (uint32_t)num_pages; span->align_offset = (uint32_t)align_offset; - _memory_statistics_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak); + span->heap = heap; + _rpmalloc_stat_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak); + +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_add(&heap->large_huge_span, span); +#endif + ++heap->full_span_count; + + _rpmalloc_stat_add64(&_allocation_counter, 1); return ptr; } + +//////////// +/// +/// Deallocation entry points +/// +////// + +//! 
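All of the deallocation entry points that follow share the same first step: mask the low bits off the user pointer to recover the owning span header (spans are always mapped on _memory_span_size boundaries) and then branch on span->size_class to choose the small/medium, large or huge path. A compact sketch of that dispatch; the constants and types are illustrative stand-ins, and the demo uses a 4 KiB span so a statically aligned buffer is enough (the allocator's default span is 64 KiB):

    #include <stdint.h>

    #define SPAN_SIZE        ((uintptr_t)4096)
    #define SPAN_MASK        (~(SPAN_SIZE - 1))
    #define SIZE_CLASS_COUNT 119u   /* small + medium classes (illustrative) */
    #define SIZE_CLASS_LARGE 120u   /* sentinel for large spans (illustrative) */

    typedef struct { uint32_t size_class; } span_hdr_t;
    typedef enum { FREE_SMALL_MEDIUM, FREE_LARGE, FREE_HUGE } free_path_t;

    /* Mask the pointer down to the span start and route on its size class,
     * mirroring the shape of _rpmalloc_deallocate(). */
    static free_path_t classify(void* p) {
        span_hdr_t* span = (span_hdr_t*)((uintptr_t)p & SPAN_MASK);
        if (span->size_class < SIZE_CLASS_COUNT)
            return FREE_SMALL_MEDIUM;
        if (span->size_class == SIZE_CLASS_LARGE)
            return FREE_LARGE;
        return FREE_HUGE;   /* everything else is an oversized, page-backed span */
    }

    int main(void) {
        static _Alignas(4096) unsigned char span[4096];
        ((span_hdr_t*)span)->size_class = 3;     /* pretend: a small-block span */
        void* user_ptr = span + 512;             /* some block inside the span */
        return classify(user_ptr) == FREE_SMALL_MEDIUM ? 0 : 1;
    }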
Deallocate the given small/medium memory block in the current thread local heap +static void +_rpmalloc_deallocate_direct_small_or_medium(span_t* span, void* block) { + heap_t* heap = span->heap; + rpmalloc_assert(heap->owner_thread == get_thread_id() || !heap->owner_thread || heap->finalize, "Internal failure"); + //Add block to free list + if (UNEXPECTED(_rpmalloc_span_is_fully_utilized(span))) { + span->used_count = span->block_count; +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_remove(&heap->full_span[span->size_class], span); +#endif + _rpmalloc_span_double_link_list_add(&heap->size_class[span->size_class].partial_span, span); + --heap->full_span_count; + } + *((void**)block) = span->free_list; + --span->used_count; + span->free_list = block; + if (UNEXPECTED(span->used_count == span->list_size)) { + // If there are no used blocks it is guaranteed that no other external thread is accessing the span + if (span->used_count) { + // Make sure we have synchronized the deferred list and list size by using acquire semantics + // and guarantee that no external thread is accessing span concurrently + void* free_list; + do { + free_list = atomic_exchange_ptr_acquire(&span->free_list_deferred, INVALID_POINTER); + } while (free_list == INVALID_POINTER); + atomic_store_ptr_release(&span->free_list_deferred, free_list); + } + _rpmalloc_span_double_link_list_remove(&heap->size_class[span->size_class].partial_span, span); + _rpmalloc_span_release_to_cache(heap, span); + } +} + +static void +_rpmalloc_deallocate_defer_free_span(heap_t* heap, span_t* span) { + if (span->size_class != SIZE_CLASS_HUGE) + _rpmalloc_stat_inc(&heap->span_use[span->span_count - 1].spans_deferred); + //This list does not need ABA protection, no mutable side state + do { + span->free_list = (void*)atomic_load_ptr(&heap->span_free_deferred); + } while (!atomic_cas_ptr(&heap->span_free_deferred, span, span->free_list)); +} + +//! Put the block in the deferred free list of the owning span +static void +_rpmalloc_deallocate_defer_small_or_medium(span_t* span, void* block) { + // The memory ordering here is a bit tricky, to avoid having to ABA protect + // the deferred free list to avoid desynchronization of list and list size + // we need to have acquire semantics on successful CAS of the pointer to + // guarantee the list_size variable validity + release semantics on pointer store + void* free_list; + do { + free_list = atomic_exchange_ptr_acquire(&span->free_list_deferred, INVALID_POINTER); + } while (free_list == INVALID_POINTER); + *((void**)block) = free_list; + uint32_t free_count = ++span->list_size; + int all_deferred_free = (free_count == span->block_count); + atomic_store_ptr_release(&span->free_list_deferred, block); + if (all_deferred_free) { + // Span was completely freed by this block. Due to the INVALID_POINTER spin lock + // no other thread can reach this state simultaneously on this span. 
+ // Safe to move to owner heap deferred cache + _rpmalloc_deallocate_defer_free_span(span->heap, span); + } +} + +static void +_rpmalloc_deallocate_small_or_medium(span_t* span, void* p) { + _rpmalloc_stat_inc_free(span->heap, span->size_class); + if (span->flags & SPAN_FLAG_ALIGNED_BLOCKS) { + //Realign pointer to block start + void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); + uint32_t block_offset = (uint32_t)pointer_diff(p, blocks_start); + p = pointer_offset(p, -(int32_t)(block_offset % span->block_size)); + } + //Check if block belongs to this heap or if deallocation should be deferred +#if RPMALLOC_FIRST_CLASS_HEAPS + int defer = (span->heap->owner_thread && (span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); +#else + int defer = ((span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); +#endif + if (!defer) + _rpmalloc_deallocate_direct_small_or_medium(span, p); + else + _rpmalloc_deallocate_defer_small_or_medium(span, p); +} + +//! Deallocate the given large memory block to the current heap +static void +_rpmalloc_deallocate_large(span_t* span) { + rpmalloc_assert(span->size_class == SIZE_CLASS_LARGE, "Bad span size class"); + rpmalloc_assert(!(span->flags & SPAN_FLAG_MASTER) || !(span->flags & SPAN_FLAG_SUBSPAN), "Span flag corrupted"); + rpmalloc_assert((span->flags & SPAN_FLAG_MASTER) || (span->flags & SPAN_FLAG_SUBSPAN), "Span flag corrupted"); + //We must always defer (unless finalizing) if from another heap since we cannot touch the list or counters of another heap +#if RPMALLOC_FIRST_CLASS_HEAPS + int defer = (span->heap->owner_thread && (span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); +#else + int defer = ((span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); +#endif + if (defer) { + _rpmalloc_deallocate_defer_free_span(span->heap, span); + return; + } + rpmalloc_assert(span->heap->full_span_count, "Heap span counter corrupted"); + --span->heap->full_span_count; +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_remove(&span->heap->large_huge_span, span); +#endif +#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS + //Decrease counter + size_t idx = span->span_count - 1; + atomic_decr32(&span->heap->span_use[idx].current); +#endif + heap_t* heap = span->heap; + rpmalloc_assert(heap, "No thread heap"); +#if ENABLE_THREAD_CACHE + const int set_as_reserved = ((span->span_count > 1) && (heap->span_cache.count == 0) && !heap->finalize && !heap->spans_reserved); +#else + const int set_as_reserved = ((span->span_count > 1) && !heap->finalize && !heap->spans_reserved); +#endif + if (set_as_reserved) { + heap->span_reserve = span; + heap->spans_reserved = span->span_count; + if (span->flags & SPAN_FLAG_MASTER) { + heap->span_reserve_master = span; + } else { //SPAN_FLAG_SUBSPAN + span_t* master = (span_t*)pointer_offset(span, -(intptr_t)((size_t)span->offset_from_master * _memory_span_size)); + heap->span_reserve_master = master; + rpmalloc_assert(master->flags & SPAN_FLAG_MASTER, "Span flag corrupted"); + rpmalloc_assert(atomic_load32(&master->remaining_spans) >= (int32_t)span->span_count, "Master span count corrupted"); + } + _rpmalloc_stat_inc(&heap->span_use[idx].spans_to_reserved); + } else { + //Insert into cache list + _rpmalloc_heap_cache_insert(heap, span); + } +} + +//! 
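The deferred-free path above leans on a small trick: atomically exchanging the sentinel INVALID_POINTER into span->free_list_deferred both fetches the current list head and locks the list, so the head pointer and list_size can be updated together and then published with a release store, with no ABA protection needed. A hedged C11 sketch of the same pattern, with hypothetical names and no yield/back-off:

    #include <stdatomic.h>
    #include <stddef.h>
    #include <stdint.h>

    #define LIST_LOCKED ((void*)((uintptr_t)-1))   /* stand-in for INVALID_POINTER */

    typedef struct block { struct block* next; } block_t;

    typedef struct {
        _Atomic(void*) head;     /* deferred free list head, or LIST_LOCKED */
        uint32_t       count;    /* only touched while the head is "locked" */
    } deferred_list_t;

    static void deferred_push(deferred_list_t* list, block_t* block) {
        void* head;
        /* Acquire: take the current head and leave the sentinel behind. */
        do {
            head = atomic_exchange_explicit(&list->head, LIST_LOCKED, memory_order_acquire);
        } while (head == LIST_LOCKED);
        block->next = (block_t*)head;
        ++list->count;
        /* Release: publish the new head, unlocking the list. */
        atomic_store_explicit(&list->head, (void*)block, memory_order_release);
    }

    int main(void) {
        deferred_list_t list = { NULL, 0 };
        block_t a, b;
        deferred_push(&list, &a);
        deferred_push(&list, &b);
        return list.count == 2 ? 0 : 1;
    }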
Deallocate the given huge span +static void +_rpmalloc_deallocate_huge(span_t* span) { + rpmalloc_assert(span->heap, "No span heap"); +#if RPMALLOC_FIRST_CLASS_HEAPS + int defer = (span->heap->owner_thread && (span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); +#else + int defer = ((span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); +#endif + if (defer) { + _rpmalloc_deallocate_defer_free_span(span->heap, span); + return; + } + rpmalloc_assert(span->heap->full_span_count, "Heap span counter corrupted"); + --span->heap->full_span_count; +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_remove(&span->heap->large_huge_span, span); +#endif + + //Oversized allocation, page count is stored in span_count + size_t num_pages = span->span_count; + _rpmalloc_unmap(span, num_pages * _memory_page_size, span->align_offset, num_pages * _memory_page_size); + _rpmalloc_stat_sub(&_huge_pages_current, num_pages); +} + +//! Deallocate the given block +static void +_rpmalloc_deallocate(void* p) { + _rpmalloc_stat_add64(&_deallocation_counter, 1); + //Grab the span (always at start of span, using span alignment) + span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask); + if (UNEXPECTED(!span)) + return; + if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) + _rpmalloc_deallocate_small_or_medium(span, p); + else if (span->size_class == SIZE_CLASS_LARGE) + _rpmalloc_deallocate_large(span); + else + _rpmalloc_deallocate_huge(span); +} + +//////////// +/// +/// Reallocation entry points +/// +////// + +static size_t +_rpmalloc_usable_size(void* p); + +//! Reallocate the given block to the given size +static void* +_rpmalloc_reallocate(heap_t* heap, void* p, size_t size, size_t oldsize, unsigned int flags) { + if (p) { + //Grab the span using guaranteed span alignment + span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask); + if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) { + //Small/medium sized block + rpmalloc_assert(span->span_count == 1, "Span counter corrupted"); + void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); + uint32_t block_offset = (uint32_t)pointer_diff(p, blocks_start); + uint32_t block_idx = block_offset / span->block_size; + void* block = pointer_offset(blocks_start, (size_t)block_idx * span->block_size); + if (!oldsize) + oldsize = (size_t)((ptrdiff_t)span->block_size - pointer_diff(p, block)); + if ((size_t)span->block_size >= size) { + //Still fits in block, never mind trying to save memory, but preserve data if alignment changed + if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE)) + memmove(block, p, oldsize); + return block; + } + } else if (span->size_class == SIZE_CLASS_LARGE) { + //Large block + size_t total_size = size + SPAN_HEADER_SIZE; + size_t num_spans = total_size >> _memory_span_size_shift; + if (total_size & (_memory_span_mask - 1)) + ++num_spans; + size_t current_spans = span->span_count; + void* block = pointer_offset(span, SPAN_HEADER_SIZE); + if (!oldsize) + oldsize = (current_spans * _memory_span_size) - (size_t)pointer_diff(p, block) - SPAN_HEADER_SIZE; + if ((current_spans >= num_spans) && (total_size >= (oldsize / 2))) { + //Still fits in block, never mind trying to save memory, but preserve data if alignment changed + if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE)) + memmove(block, p, oldsize); + return block; + } + } else { + //Oversized block + size_t total_size = size + SPAN_HEADER_SIZE; + size_t num_pages = total_size >> _memory_page_size_shift; + if (total_size & (_memory_page_size 
- 1)) + ++num_pages; + //Page count is stored in span_count + size_t current_pages = span->span_count; + void* block = pointer_offset(span, SPAN_HEADER_SIZE); + if (!oldsize) + oldsize = (current_pages * _memory_page_size) - (size_t)pointer_diff(p, block) - SPAN_HEADER_SIZE; + if ((current_pages >= num_pages) && (num_pages >= (current_pages / 2))) { + //Still fits in block, never mind trying to save memory, but preserve data if alignment changed + if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE)) + memmove(block, p, oldsize); + return block; + } + } + } else { + oldsize = 0; + } + + if (!!(flags & RPMALLOC_GROW_OR_FAIL)) + return 0; + + //Size is greater than block size, need to allocate a new block and deallocate the old + //Avoid hysteresis by overallocating if increase is small (below 37%) + size_t lower_bound = oldsize + (oldsize >> 2) + (oldsize >> 3); + size_t new_size = (size > lower_bound) ? size : ((size > oldsize) ? lower_bound : size); + void* block = _rpmalloc_allocate(heap, new_size); + if (p && block) { + if (!(flags & RPMALLOC_NO_PRESERVE)) + memcpy(block, p, oldsize < new_size ? oldsize : new_size); + _rpmalloc_deallocate(p); + } + + return block; +} + +static void* +_rpmalloc_aligned_reallocate(heap_t* heap, void* ptr, size_t alignment, size_t size, size_t oldsize, + unsigned int flags) { + if (alignment <= SMALL_GRANULARITY) + return _rpmalloc_reallocate(heap, ptr, size, oldsize, flags); + + int no_alloc = !!(flags & RPMALLOC_GROW_OR_FAIL); + size_t usablesize = (ptr ? _rpmalloc_usable_size(ptr) : 0); + if ((usablesize >= size) && !((uintptr_t)ptr & (alignment - 1))) { + if (no_alloc || (size >= (usablesize / 2))) + return ptr; + } + // Aligned alloc marks span as having aligned blocks + void* block = (!no_alloc ? _rpmalloc_aligned_allocate(heap, alignment, size) : 0); + if (EXPECTED(block != 0)) { + if (!(flags & RPMALLOC_NO_PRESERVE) && ptr) { + if (!oldsize) + oldsize = usablesize; + memcpy(block, ptr, oldsize < size ? oldsize : size); + } + _rpmalloc_deallocate(ptr); + } + return block; +} + + +//////////// +/// +/// Initialization, finalization and utility +/// +////// + +//! Get the usable size of the given block +static size_t +_rpmalloc_usable_size(void* p) { + //Grab the span using guaranteed span alignment + span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask); + if (span->size_class < SIZE_CLASS_COUNT) { + //Small/medium block + void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); + return span->block_size - ((size_t)pointer_diff(p, blocks_start) % span->block_size); + } + if (span->size_class == SIZE_CLASS_LARGE) { + //Large block + size_t current_spans = span->span_count; + return (current_spans * _memory_span_size) - (size_t)pointer_diff(p, span); + } + //Oversized block, page count is stored in span_count + size_t current_pages = span->span_count; + return (current_pages * _memory_page_size) - (size_t)pointer_diff(p, span); +} + +//! 
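The flags threaded through these reallocation paths match the public ones in upstream rpmalloc.h: RPMALLOC_NO_PRESERVE skips copying the old contents, and RPMALLOC_GROW_OR_FAIL (the `if (!!(flags & RPMALLOC_GROW_OR_FAIL)) return 0;` early-out above) makes the call fail instead of moving the block when it cannot grow in place. A usage sketch under that assumption (Tracy's bundled header is tracy_rpmalloc.hpp; the names below follow upstream):

    #include <string.h>
    #include "rpmalloc.h"   /* upstream header; Tracy bundles it as tracy_rpmalloc.hpp */

    /* Try to grow a buffer in place; fall back to an explicit alloc+copy when
     * the allocator would have to move it anyway. Sketch only - assumes the
     * upstream flag names RPMALLOC_GROW_OR_FAIL / RPMALLOC_NO_PRESERVE. */
    void* grow_in_place_or_copy(void* ptr, size_t old_size, size_t new_size) {
        void* grown = rpaligned_realloc(ptr, 0, new_size, old_size, RPMALLOC_GROW_OR_FAIL);
        if (grown)
            return grown;                       /* block still fits in its span(s) */
        void* fresh = rpmalloc(new_size);       /* would have moved: copy manually */
        if (!fresh)
            return 0;
        memcpy(fresh, ptr, old_size < new_size ? old_size : new_size);
        rpfree(ptr);
        return fresh;
    }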
Adjust and optimize the size class properties for the given class +static void +_rpmalloc_adjust_size_class(size_t iclass) { + size_t block_size = _memory_size_class[iclass].block_size; + size_t block_count = (_memory_span_size - SPAN_HEADER_SIZE) / block_size; + + _memory_size_class[iclass].block_count = (uint16_t)block_count; + _memory_size_class[iclass].class_idx = (uint16_t)iclass; + + //Check if previous size classes can be merged + if (iclass >= SMALL_CLASS_COUNT) { + size_t prevclass = iclass; + while (prevclass > 0) { + --prevclass; + //A class can be merged if number of pages and number of blocks are equal + if (_memory_size_class[prevclass].block_count == _memory_size_class[iclass].block_count) + memcpy(_memory_size_class + prevclass, _memory_size_class + iclass, sizeof(_memory_size_class[iclass])); + else + break; + } + } +} + +//! Initialize the allocator and setup global data +TRACY_API int +rpmalloc_initialize(void) { + if (_rpmalloc_initialized) { + rpmalloc_thread_initialize(); + return 0; + } + return rpmalloc_initialize_config(0); +} + +int +rpmalloc_initialize_config(const rpmalloc_config_t* config) { + if (_rpmalloc_initialized) { + rpmalloc_thread_initialize(); + return 0; + } + _rpmalloc_initialized = 1; + + if (config) + memcpy(&_memory_config, config, sizeof(rpmalloc_config_t)); + else + memset(&_memory_config, 0, sizeof(rpmalloc_config_t)); + + if (!_memory_config.memory_map || !_memory_config.memory_unmap) { + _memory_config.memory_map = _rpmalloc_mmap_os; + _memory_config.memory_unmap = _rpmalloc_unmap_os; + } + +#if PLATFORM_WINDOWS + SYSTEM_INFO system_info; + memset(&system_info, 0, sizeof(system_info)); + GetSystemInfo(&system_info); + _memory_map_granularity = system_info.dwAllocationGranularity; +#else + _memory_map_granularity = (size_t)sysconf(_SC_PAGESIZE); +#endif + +#if RPMALLOC_CONFIGURABLE + _memory_page_size = _memory_config.page_size; +#else + _memory_page_size = 0; +#endif + _memory_huge_pages = 0; + if (!_memory_page_size) { +#if PLATFORM_WINDOWS + _memory_page_size = system_info.dwPageSize; +#else + _memory_page_size = _memory_map_granularity; + if (_memory_config.enable_huge_pages) { +#if defined(__linux__) + size_t huge_page_size = 0; + FILE* meminfo = fopen("/proc/meminfo", "r"); + if (meminfo) { + char line[128]; + while (!huge_page_size && fgets(line, sizeof(line) - 1, meminfo)) { + line[sizeof(line) - 1] = 0; + if (strstr(line, "Hugepagesize:")) + huge_page_size = (size_t)strtol(line + 13, 0, 10) * 1024; + } + fclose(meminfo); + } + if (huge_page_size) { + _memory_huge_pages = 1; + _memory_page_size = huge_page_size; + _memory_map_granularity = huge_page_size; + } +#elif defined(__FreeBSD__) + int rc; + size_t sz = sizeof(rc); + + if (sysctlbyname("vm.pmap.pg_ps_enabled", &rc, &sz, NULL, 0) == 0 && rc == 1) { + _memory_huge_pages = 1; + _memory_page_size = 2 * 1024 * 1024; + _memory_map_granularity = _memory_page_size; + } +#elif defined(__APPLE__) || defined(__NetBSD__) + _memory_huge_pages = 1; + _memory_page_size = 2 * 1024 * 1024; + _memory_map_granularity = _memory_page_size; +#endif + } +#endif + } else { + if (_memory_config.enable_huge_pages) + _memory_huge_pages = 1; + } + +#if PLATFORM_WINDOWS + if (_memory_config.enable_huge_pages) { + HANDLE token = 0; + size_t large_page_minimum = GetLargePageMinimum(); + if (large_page_minimum) + OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &token); + if (token) { + LUID luid; + if (LookupPrivilegeValue(0, SE_LOCK_MEMORY_NAME, &luid)) { + TOKEN_PRIVILEGES 
token_privileges; + memset(&token_privileges, 0, sizeof(token_privileges)); + token_privileges.PrivilegeCount = 1; + token_privileges.Privileges[0].Luid = luid; + token_privileges.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + if (AdjustTokenPrivileges(token, FALSE, &token_privileges, 0, 0, 0)) { + if (GetLastError() == ERROR_SUCCESS) + _memory_huge_pages = 1; + } + } + CloseHandle(token); + } + if (_memory_huge_pages) { + if (large_page_minimum > _memory_page_size) + _memory_page_size = large_page_minimum; + if (large_page_minimum > _memory_map_granularity) + _memory_map_granularity = large_page_minimum; + } + } +#endif + + size_t min_span_size = 256; + size_t max_page_size; +#if UINTPTR_MAX > 0xFFFFFFFF + max_page_size = 4096ULL * 1024ULL * 1024ULL; +#else + max_page_size = 4 * 1024 * 1024; +#endif + if (_memory_page_size < min_span_size) + _memory_page_size = min_span_size; + if (_memory_page_size > max_page_size) + _memory_page_size = max_page_size; + _memory_page_size_shift = 0; + size_t page_size_bit = _memory_page_size; + while (page_size_bit != 1) { + ++_memory_page_size_shift; + page_size_bit >>= 1; + } + _memory_page_size = ((size_t)1 << _memory_page_size_shift); + +#if RPMALLOC_CONFIGURABLE + if (!_memory_config.span_size) { + _memory_span_size = _memory_default_span_size; + _memory_span_size_shift = _memory_default_span_size_shift; + _memory_span_mask = _memory_default_span_mask; + } else { + size_t span_size = _memory_config.span_size; + if (span_size > (256 * 1024)) + span_size = (256 * 1024); + _memory_span_size = 4096; + _memory_span_size_shift = 12; + while (_memory_span_size < span_size) { + _memory_span_size <<= 1; + ++_memory_span_size_shift; + } + _memory_span_mask = ~(uintptr_t)(_memory_span_size - 1); + } +#endif + + _memory_span_map_count = ( _memory_config.span_map_count ? _memory_config.span_map_count : DEFAULT_SPAN_MAP_COUNT); + if ((_memory_span_size * _memory_span_map_count) < _memory_page_size) + _memory_span_map_count = (_memory_page_size / _memory_span_size); + if ((_memory_page_size >= _memory_span_size) && ((_memory_span_map_count * _memory_span_size) % _memory_page_size)) + _memory_span_map_count = (_memory_page_size / _memory_span_size); + _memory_heap_reserve_count = (_memory_span_map_count > DEFAULT_SPAN_MAP_COUNT) ? 
DEFAULT_SPAN_MAP_COUNT : _memory_span_map_count; + + _memory_config.page_size = _memory_page_size; + _memory_config.span_size = _memory_span_size; + _memory_config.span_map_count = _memory_span_map_count; + _memory_config.enable_huge_pages = _memory_huge_pages; + +#if ((defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD) || defined(__TINYC__) + if (pthread_key_create(&_memory_thread_heap, _rpmalloc_heap_release_raw_fc)) + return -1; +#endif +#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) + fls_key = FlsAlloc(&_rpmalloc_thread_destructor); +#endif + + //Setup all small and medium size classes + size_t iclass = 0; + _memory_size_class[iclass].block_size = SMALL_GRANULARITY; + _rpmalloc_adjust_size_class(iclass); + for (iclass = 1; iclass < SMALL_CLASS_COUNT; ++iclass) { + size_t size = iclass * SMALL_GRANULARITY; + _memory_size_class[iclass].block_size = (uint32_t)size; + _rpmalloc_adjust_size_class(iclass); + } + //At least two blocks per span, then fall back to large allocations + _memory_medium_size_limit = (_memory_span_size - SPAN_HEADER_SIZE) >> 1; + if (_memory_medium_size_limit > MEDIUM_SIZE_LIMIT) + _memory_medium_size_limit = MEDIUM_SIZE_LIMIT; + for (iclass = 0; iclass < MEDIUM_CLASS_COUNT; ++iclass) { + size_t size = SMALL_SIZE_LIMIT + ((iclass + 1) * MEDIUM_GRANULARITY); + if (size > _memory_medium_size_limit) + break; + _memory_size_class[SMALL_CLASS_COUNT + iclass].block_size = (uint32_t)size; + _rpmalloc_adjust_size_class(SMALL_CLASS_COUNT + iclass); + } + + _memory_orphan_heaps = 0; +#if RPMALLOC_FIRST_CLASS_HEAPS + _memory_first_class_orphan_heaps = 0; +#endif +#if ENABLE_STATISTICS + atomic_store32(&_memory_active_heaps, 0); + atomic_store32(&_mapped_pages, 0); + _mapped_pages_peak = 0; + atomic_store32(&_master_spans, 0); + atomic_store32(&_mapped_total, 0); + atomic_store32(&_unmapped_total, 0); + atomic_store32(&_mapped_pages_os, 0); + atomic_store32(&_huge_pages_current, 0); + _huge_pages_peak = 0; +#endif + memset(_memory_heaps, 0, sizeof(_memory_heaps)); + atomic_store32_release(&_memory_global_lock, 0); + + //Initialize this thread + rpmalloc_thread_initialize(); + return 0; +} + +//! 
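The _rpmalloc_adjust_size_class() helper above merges neighbouring size classes: after computing a class's blocks-per-span, any earlier class that ends up with the same block count is overwritten with this one, so requests for the smaller size are simply served from the shared, larger block size. A self-contained model of that merge walk (constants are illustrative, and the real code only merges medium classes):

    #include <stdint.h>
    #include <stdio.h>

    #define SPAN_SIZE   65536u
    #define HEADER_SIZE 128u
    #define CLASS_COUNT 8u

    typedef struct { uint32_t block_size, block_count, class_idx; } size_class_t;

    static size_class_t classes[CLASS_COUNT];

    static void adjust(uint32_t iclass) {
        classes[iclass].block_count = (SPAN_SIZE - HEADER_SIZE) / classes[iclass].block_size;
        classes[iclass].class_idx = iclass;
        uint32_t prev = iclass;
        while (prev > 0) {
            --prev;
            if (classes[prev].block_count == classes[iclass].block_count)
                classes[prev] = classes[iclass];   /* merge: reuse the larger class */
            else
                break;
        }
    }

    int main(void) {
        for (uint32_t i = 0; i < CLASS_COUNT; ++i) {
            classes[i].block_size = (i + 1) * 4096;  /* 4 KiB granularity, for the demo */
            adjust(i);
        }
        for (uint32_t i = 0; i < CLASS_COUNT; ++i)
            printf("request class %u -> block_size %u (idx %u)\n",
                   i, classes[i].block_size, classes[i].class_idx);
        return 0;
    }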
Finalize the allocator +TRACY_API void +rpmalloc_finalize(void) { + rpmalloc_thread_finalize(1); + //rpmalloc_dump_statistics(stdout); + + if (_memory_global_reserve) { + atomic_add32(&_memory_global_reserve_master->remaining_spans, -(int32_t)_memory_global_reserve_count); + _memory_global_reserve_master = 0; + _memory_global_reserve_count = 0; + _memory_global_reserve = 0; + } + atomic_store32_release(&_memory_global_lock, 0); + + //Free all thread caches and fully free spans + for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { + heap_t* heap = _memory_heaps[list_idx]; + while (heap) { + heap_t* next_heap = heap->next_heap; + heap->finalize = 1; + _rpmalloc_heap_global_finalize(heap); + heap = next_heap; + } + } + +#if ENABLE_GLOBAL_CACHE + //Free global caches + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) + _rpmalloc_global_cache_finalize(&_memory_span_cache[iclass]); +#endif + +#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD + pthread_key_delete(_memory_thread_heap); +#endif +#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) + FlsFree(fls_key); + fls_key = 0; +#endif +#if ENABLE_STATISTICS + //If you hit these asserts you probably have memory leaks (perhaps global scope data doing dynamic allocations) or double frees in your code + rpmalloc_assert(atomic_load32(&_mapped_pages) == 0, "Memory leak detected"); + rpmalloc_assert(atomic_load32(&_mapped_pages_os) == 0, "Memory leak detected"); +#endif + + _rpmalloc_initialized = 0; +} + +//! Initialize thread, assign heap +TRACY_API void +rpmalloc_thread_initialize(void) { + if (!get_thread_heap_raw()) { + heap_t* heap = _rpmalloc_heap_allocate(0); + if (heap) { + _rpmalloc_stat_inc(&_memory_active_heaps); + set_thread_heap(heap); +#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) + FlsSetValue(fls_key, heap); +#endif + } + } +} + +//! Finalize thread, orphan heap +TRACY_API void +rpmalloc_thread_finalize(int release_caches) { + heap_t* heap = get_thread_heap_raw(); + if (heap) + _rpmalloc_heap_release_raw(heap, release_caches); + set_thread_heap(0); +#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) + FlsSetValue(fls_key, 0); +#endif +} + +int +rpmalloc_is_thread_initialized(void) { + return (get_thread_heap_raw() != 0) ? 
1 : 0; +} + +const rpmalloc_config_t* +rpmalloc_config(void) { + return &_memory_config; +} + +// Extern interface + +TRACY_API RPMALLOC_ALLOCATOR void* +rpmalloc(size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return 0; + } +#endif + heap_t* heap = get_thread_heap(); + return _rpmalloc_allocate(heap, size); +} + +TRACY_API void +rpfree(void* ptr) { + _rpmalloc_deallocate(ptr); +} + +extern inline RPMALLOC_ALLOCATOR void* +rpcalloc(size_t num, size_t size) { + size_t total; +#if ENABLE_VALIDATE_ARGS +#if PLATFORM_WINDOWS + int err = SizeTMult(num, size, &total); + if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#else + int err = __builtin_umull_overflow(num, size, &total); + if (err || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#endif +#else + total = num * size; +#endif + heap_t* heap = get_thread_heap(); + void* block = _rpmalloc_allocate(heap, total); + if (block) + memset(block, 0, total); + return block; +} + +TRACY_API RPMALLOC_ALLOCATOR void* +rprealloc(void* ptr, size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return ptr; + } +#endif + heap_t* heap = get_thread_heap(); + return _rpmalloc_reallocate(heap, ptr, size, 0, 0); +} + +extern RPMALLOC_ALLOCATOR void* +rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize, + unsigned int flags) { +#if ENABLE_VALIDATE_ARGS + if ((size + alignment < size) || (alignment > _memory_page_size)) { + errno = EINVAL; + return 0; + } +#endif + heap_t* heap = get_thread_heap(); + return _rpmalloc_aligned_reallocate(heap, ptr, alignment, size, oldsize, flags); +} + +extern RPMALLOC_ALLOCATOR void* +rpaligned_alloc(size_t alignment, size_t size) { + heap_t* heap = get_thread_heap(); + return _rpmalloc_aligned_allocate(heap, alignment, size); +} + +extern inline RPMALLOC_ALLOCATOR void* +rpaligned_calloc(size_t alignment, size_t num, size_t size) { + size_t total; +#if ENABLE_VALIDATE_ARGS +#if PLATFORM_WINDOWS + int err = SizeTMult(num, size, &total); + if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#else + int err = __builtin_umull_overflow(num, size, &total); + if (err || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#endif +#else + total = num * size; +#endif + void* block = rpaligned_alloc(alignment, total); + if (block) + memset(block, 0, total); + return block; +} + extern inline RPMALLOC_ALLOCATOR void* rpmemalign(size_t alignment, size_t size) { return rpaligned_alloc(alignment, size); @@ -2324,7 +3114,7 @@ rpposix_memalign(void **memptr, size_t alignment, size_t size) { extern inline size_t rpmalloc_usable_size(void* ptr) { - return (ptr ? _memory_usable_size(ptr) : 0); + return (ptr ? 
_rpmalloc_usable_size(ptr) : 0); } extern inline void @@ -2340,13 +3130,13 @@ rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats) { for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { size_class_t* size_class = _memory_size_class + iclass; - heap_class_t* heap_class = heap->span_class + iclass; - span_t* span = heap_class->partial_span; + span_t* span = heap->size_class[iclass].partial_span; while (span) { - atomic_thread_fence_acquire(); size_t free_count = span->list_size; - if (span->state == SPAN_STATE_PARTIAL) - free_count += (size_class->block_count - span->used_count); + size_t block_count = size_class->block_count; + if (span->free_list_limit < block_count) + block_count = span->free_list_limit; + free_count += (block_count - span->used_count); stats->sizecache = free_count * size_class->block_size; span = span->next; } @@ -2354,38 +3144,46 @@ rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats) { #if ENABLE_THREAD_CACHE for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - if (heap->span_cache[iclass]) - stats->spancache = (size_t)heap->span_cache[iclass]->list_size * (iclass + 1) * _memory_span_size; - span_t* deferred_list = !iclass ? (span_t*)atomic_load_ptr(&heap->span_cache_deferred) : 0; - //TODO: Incorrect, for deferred lists the size is NOT stored in list_size - if (deferred_list) - stats->spancache = (size_t)deferred_list->list_size * (iclass + 1) * _memory_span_size; + span_cache_t* span_cache; + if (!iclass) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t*)(heap->span_large_cache + (iclass - 1)); + stats->spancache = span_cache->count * (iclass + 1) * _memory_span_size; } #endif + + span_t* deferred = (span_t*)atomic_load_ptr(&heap->span_free_deferred); + while (deferred) { + if (deferred->size_class != SIZE_CLASS_HUGE) + stats->spancache = (size_t)deferred->span_count * _memory_span_size; + deferred = (span_t*)deferred->free_list; + } + #if ENABLE_STATISTICS - stats->thread_to_global = heap->thread_to_global; - stats->global_to_thread = heap->global_to_thread; + stats->thread_to_global = (size_t)atomic_load64(&heap->thread_to_global); + stats->global_to_thread = (size_t)atomic_load64(&heap->global_to_thread); for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { stats->span_use[iclass].current = (size_t)atomic_load32(&heap->span_use[iclass].current); - stats->span_use[iclass].peak = (size_t)heap->span_use[iclass].high; - stats->span_use[iclass].to_global = (size_t)heap->span_use[iclass].spans_to_global; - stats->span_use[iclass].from_global = (size_t)heap->span_use[iclass].spans_from_global; - stats->span_use[iclass].to_cache = (size_t)heap->span_use[iclass].spans_to_cache; - stats->span_use[iclass].from_cache = (size_t)heap->span_use[iclass].spans_from_cache; - stats->span_use[iclass].to_reserved = (size_t)heap->span_use[iclass].spans_to_reserved; - stats->span_use[iclass].from_reserved = (size_t)heap->span_use[iclass].spans_from_reserved; - stats->span_use[iclass].map_calls = (size_t)heap->span_use[iclass].spans_map_calls; + stats->span_use[iclass].peak = (size_t)atomic_load32(&heap->span_use[iclass].high); + stats->span_use[iclass].to_global = (size_t)atomic_load32(&heap->span_use[iclass].spans_to_global); + stats->span_use[iclass].from_global = (size_t)atomic_load32(&heap->span_use[iclass].spans_from_global); + stats->span_use[iclass].to_cache = (size_t)atomic_load32(&heap->span_use[iclass].spans_to_cache); + stats->span_use[iclass].from_cache = 
(size_t)atomic_load32(&heap->span_use[iclass].spans_from_cache); + stats->span_use[iclass].to_reserved = (size_t)atomic_load32(&heap->span_use[iclass].spans_to_reserved); + stats->span_use[iclass].from_reserved = (size_t)atomic_load32(&heap->span_use[iclass].spans_from_reserved); + stats->span_use[iclass].map_calls = (size_t)atomic_load32(&heap->span_use[iclass].spans_map_calls); } for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { stats->size_use[iclass].alloc_current = (size_t)atomic_load32(&heap->size_class_use[iclass].alloc_current); stats->size_use[iclass].alloc_peak = (size_t)heap->size_class_use[iclass].alloc_peak; - stats->size_use[iclass].alloc_total = (size_t)heap->size_class_use[iclass].alloc_total; + stats->size_use[iclass].alloc_total = (size_t)atomic_load32(&heap->size_class_use[iclass].alloc_total); stats->size_use[iclass].free_total = (size_t)atomic_load32(&heap->size_class_use[iclass].free_total); - stats->size_use[iclass].spans_to_cache = (size_t)heap->size_class_use[iclass].spans_to_cache; - stats->size_use[iclass].spans_from_cache = (size_t)heap->size_class_use[iclass].spans_from_cache; - stats->size_use[iclass].spans_from_reserved = (size_t)heap->size_class_use[iclass].spans_from_reserved; - stats->size_use[iclass].map_calls = (size_t)heap->size_class_use[iclass].spans_map_calls; + stats->size_use[iclass].spans_to_cache = (size_t)atomic_load32(&heap->size_class_use[iclass].spans_to_cache); + stats->size_use[iclass].spans_from_cache = (size_t)atomic_load32(&heap->size_class_use[iclass].spans_from_cache); + stats->size_use[iclass].spans_from_reserved = (size_t)atomic_load32(&heap->size_class_use[iclass].spans_from_reserved); + stats->size_use[iclass].map_calls = (size_t)atomic_load32(&heap->size_class_use[iclass].spans_map_calls); } #endif } @@ -2402,94 +3200,319 @@ rpmalloc_global_statistics(rpmalloc_global_statistics_t* stats) { stats->huge_alloc_peak = (size_t)_huge_pages_peak * _memory_page_size; #endif #if ENABLE_GLOBAL_CACHE - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - stats->cached += (size_t)atomic_load32(&_memory_span_cache[iclass].size) * (iclass + 1) * _memory_span_size; - } + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) + stats->cached += _memory_span_cache[iclass].count * (iclass + 1) * _memory_span_size; #endif } +#if ENABLE_STATISTICS + +static void +_memory_heap_dump_statistics(heap_t* heap, void* file) { + fprintf(file, "Heap %d stats:\n", heap->id); + fprintf(file, "Class CurAlloc PeakAlloc TotAlloc TotFree BlkSize BlkCount SpansCur SpansPeak PeakAllocMiB ToCacheMiB FromCacheMiB FromReserveMiB MmapCalls\n"); + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + if (!atomic_load32(&heap->size_class_use[iclass].alloc_total)) + continue; + fprintf(file, "%3u: %10u %10u %10u %10u %8u %8u %8d %9d %13zu %11zu %12zu %14zu %9u\n", (uint32_t)iclass, + atomic_load32(&heap->size_class_use[iclass].alloc_current), + heap->size_class_use[iclass].alloc_peak, + atomic_load32(&heap->size_class_use[iclass].alloc_total), + atomic_load32(&heap->size_class_use[iclass].free_total), + _memory_size_class[iclass].block_size, + _memory_size_class[iclass].block_count, + atomic_load32(&heap->size_class_use[iclass].spans_current), + heap->size_class_use[iclass].spans_peak, + ((size_t)heap->size_class_use[iclass].alloc_peak * (size_t)_memory_size_class[iclass].block_size) / (size_t)(1024 * 1024), + ((size_t)atomic_load32(&heap->size_class_use[iclass].spans_to_cache) * _memory_span_size) / (size_t)(1024 * 1024), + 
((size_t)atomic_load32(&heap->size_class_use[iclass].spans_from_cache) * _memory_span_size) / (size_t)(1024 * 1024), + ((size_t)atomic_load32(&heap->size_class_use[iclass].spans_from_reserved) * _memory_span_size) / (size_t)(1024 * 1024), + atomic_load32(&heap->size_class_use[iclass].spans_map_calls)); + } + fprintf(file, "Spans Current Peak Deferred PeakMiB Cached ToCacheMiB FromCacheMiB ToReserveMiB FromReserveMiB ToGlobalMiB FromGlobalMiB MmapCalls\n"); + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + if (!atomic_load32(&heap->span_use[iclass].high) && !atomic_load32(&heap->span_use[iclass].spans_map_calls)) + continue; + fprintf(file, "%4u: %8d %8u %8u %8zu %7u %11zu %12zu %12zu %14zu %11zu %13zu %10u\n", (uint32_t)(iclass + 1), + atomic_load32(&heap->span_use[iclass].current), + atomic_load32(&heap->span_use[iclass].high), + atomic_load32(&heap->span_use[iclass].spans_deferred), + ((size_t)atomic_load32(&heap->span_use[iclass].high) * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024), +#if ENABLE_THREAD_CACHE + (unsigned int)(!iclass ? heap->span_cache.count : heap->span_large_cache[iclass - 1].count), + ((size_t)atomic_load32(&heap->span_use[iclass].spans_to_cache) * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), + ((size_t)atomic_load32(&heap->span_use[iclass].spans_from_cache) * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), +#else + 0, (size_t)0, (size_t)0, +#endif + ((size_t)atomic_load32(&heap->span_use[iclass].spans_to_reserved) * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), + ((size_t)atomic_load32(&heap->span_use[iclass].spans_from_reserved) * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), + ((size_t)atomic_load32(&heap->span_use[iclass].spans_to_global) * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024), + ((size_t)atomic_load32(&heap->span_use[iclass].spans_from_global) * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024), + atomic_load32(&heap->span_use[iclass].spans_map_calls)); + } + fprintf(file, "Full spans: %zu\n", heap->full_span_count); + fprintf(file, "ThreadToGlobalMiB GlobalToThreadMiB\n"); + fprintf(file, "%17zu %17zu\n", (size_t)atomic_load64(&heap->thread_to_global) / (size_t)(1024 * 1024), (size_t)atomic_load64(&heap->global_to_thread) / (size_t)(1024 * 1024)); +} + +#endif + void rpmalloc_dump_statistics(void* file) { #if ENABLE_STATISTICS - //If you hit this assert, you still have active threads or forgot to finalize some thread(s) - assert(atomic_load32(&_memory_active_heaps) == 0); - for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { - heap_t* heap = atomic_load_ptr(&_memory_heaps[list_idx]); + heap_t* heap = _memory_heaps[list_idx]; while (heap) { - fprintf(file, "Heap %d stats:\n", heap->id); - fprintf(file, "Class CurAlloc PeakAlloc TotAlloc TotFree BlkSize BlkCount SpansCur SpansPeak PeakAllocMiB ToCacheMiB FromCacheMiB FromReserveMiB MmapCalls\n"); - for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { - if (!heap->size_class_use[iclass].alloc_total) { - assert(!atomic_load32(&heap->size_class_use[iclass].free_total)); - assert(!heap->size_class_use[iclass].spans_map_calls); + int need_dump = 0; + for (size_t iclass = 0; !need_dump && (iclass < SIZE_CLASS_COUNT); ++iclass) { + if (!atomic_load32(&heap->size_class_use[iclass].alloc_total)) { + rpmalloc_assert(!atomic_load32(&heap->size_class_use[iclass].free_total), "Heap statistics counter mismatch"); + 
rpmalloc_assert(!atomic_load32(&heap->size_class_use[iclass].spans_map_calls), "Heap statistics counter mismatch"); continue; } - fprintf(file, "%3u: %10u %10u %10u %10u %8u %8u %8d %9d %13zu %11zu %12zu %14zu %9u\n", (uint32_t)iclass, - atomic_load32(&heap->size_class_use[iclass].alloc_current), - heap->size_class_use[iclass].alloc_peak, - heap->size_class_use[iclass].alloc_total, - atomic_load32(&heap->size_class_use[iclass].free_total), - _memory_size_class[iclass].block_size, - _memory_size_class[iclass].block_count, - heap->size_class_use[iclass].spans_current, - heap->size_class_use[iclass].spans_peak, - ((size_t)heap->size_class_use[iclass].alloc_peak * (size_t)_memory_size_class[iclass].block_size) / (size_t)(1024 * 1024), - ((size_t)heap->size_class_use[iclass].spans_to_cache * _memory_span_size) / (size_t)(1024 * 1024), - ((size_t)heap->size_class_use[iclass].spans_from_cache * _memory_span_size) / (size_t)(1024 * 1024), - ((size_t)heap->size_class_use[iclass].spans_from_reserved * _memory_span_size) / (size_t)(1024 * 1024), - heap->size_class_use[iclass].spans_map_calls); + need_dump = 1; } - fprintf(file, "Spans Current Peak PeakMiB Cached ToCacheMiB FromCacheMiB ToReserveMiB FromReserveMiB ToGlobalMiB FromGlobalMiB MmapCalls\n"); - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - if (!heap->span_use[iclass].high && !heap->span_use[iclass].spans_map_calls) + for (size_t iclass = 0; !need_dump && (iclass < LARGE_CLASS_COUNT); ++iclass) { + if (!atomic_load32(&heap->span_use[iclass].high) && !atomic_load32(&heap->span_use[iclass].spans_map_calls)) continue; - fprintf(file, "%4u: %8d %8u %8zu %7u %11zu %12zu %12zu %14zu %11zu %13zu %10u\n", (uint32_t)(iclass + 1), - atomic_load32(&heap->span_use[iclass].current), - heap->span_use[iclass].high, - ((size_t)heap->span_use[iclass].high * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024), - heap->span_cache[iclass] ? 
heap->span_cache[iclass]->list_size : 0, - ((size_t)heap->span_use[iclass].spans_to_cache * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), - ((size_t)heap->span_use[iclass].spans_from_cache * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), - ((size_t)heap->span_use[iclass].spans_to_reserved * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), - ((size_t)heap->span_use[iclass].spans_from_reserved * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), - ((size_t)heap->span_use[iclass].spans_to_global * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024), - ((size_t)heap->span_use[iclass].spans_from_global * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024), - heap->span_use[iclass].spans_map_calls); + need_dump = 1; } - fprintf(file, "ThreadToGlobalMiB GlobalToThreadMiB\n"); - fprintf(file, "%17zu %17zu\n", (size_t)heap->thread_to_global / (size_t)(1024 * 1024), (size_t)heap->global_to_thread / (size_t)(1024 * 1024)); + if (need_dump) + _memory_heap_dump_statistics(heap, file); heap = heap->next_heap; } } - fprintf(file, "Global stats:\n"); size_t huge_current = (size_t)atomic_load32(&_huge_pages_current) * _memory_page_size; size_t huge_peak = (size_t)_huge_pages_peak * _memory_page_size; fprintf(file, "HugeCurrentMiB HugePeakMiB\n"); fprintf(file, "%14zu %11zu\n", huge_current / (size_t)(1024 * 1024), huge_peak / (size_t)(1024 * 1024)); + fprintf(file, "GlobalCacheMiB\n"); + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + global_cache_t* cache = _memory_span_cache + iclass; + size_t global_cache = (size_t)cache->count * iclass * _memory_span_size; + + size_t global_overflow_cache = 0; + span_t* span = cache->overflow; + while (span) { + global_overflow_cache += iclass * _memory_span_size; + span = span->next; + } + if (global_cache || global_overflow_cache || cache->insert_count || cache->extract_count) + fprintf(file, "%4zu: %8zuMiB (%8zuMiB overflow) %14zu insert %14zu extract\n", iclass + 1, global_cache / (size_t)(1024 * 1024), global_overflow_cache / (size_t)(1024 * 1024), cache->insert_count, cache->extract_count); + } + size_t mapped = (size_t)atomic_load32(&_mapped_pages) * _memory_page_size; size_t mapped_os = (size_t)atomic_load32(&_mapped_pages_os) * _memory_page_size; size_t mapped_peak = (size_t)_mapped_pages_peak * _memory_page_size; size_t mapped_total = (size_t)atomic_load32(&_mapped_total) * _memory_page_size; size_t unmapped_total = (size_t)atomic_load32(&_unmapped_total) * _memory_page_size; - size_t reserved_total = (size_t)atomic_load32(&_reserved_spans) * _memory_span_size; - fprintf(file, "MappedMiB MappedOSMiB MappedPeakMiB MappedTotalMiB UnmappedTotalMiB ReservedTotalMiB\n"); - fprintf(file, "%9zu %11zu %13zu %14zu %16zu %16zu\n", + fprintf(file, "MappedMiB MappedOSMiB MappedPeakMiB MappedTotalMiB UnmappedTotalMiB\n"); + fprintf(file, "%9zu %11zu %13zu %14zu %16zu\n", mapped / (size_t)(1024 * 1024), mapped_os / (size_t)(1024 * 1024), mapped_peak / (size_t)(1024 * 1024), mapped_total / (size_t)(1024 * 1024), - unmapped_total / (size_t)(1024 * 1024), - reserved_total / (size_t)(1024 * 1024)); + unmapped_total / (size_t)(1024 * 1024)); fprintf(file, "\n"); -#else +#if 0 + int64_t allocated = atomic_load64(&_allocation_counter); + int64_t deallocated = atomic_load64(&_deallocation_counter); + fprintf(file, "Allocation count: %lli\n", allocated); + fprintf(file, "Deallocation count: %lli\n", deallocated); + fprintf(file, "Current allocations: %lli\n", (allocated - deallocated)); + 
fprintf(file, "Master spans: %d\n", atomic_load32(&_master_spans)); + fprintf(file, "Dangling master spans: %d\n", atomic_load32(&_unmapped_master_spans)); +#endif +#endif (void)sizeof(file); +} + +#if RPMALLOC_FIRST_CLASS_HEAPS + +extern inline rpmalloc_heap_t* +rpmalloc_heap_acquire(void) { + // Must be a pristine heap from newly mapped memory pages, or else memory blocks + // could already be allocated from the heap which would (wrongly) be released when + // heap is cleared with rpmalloc_heap_free_all(). Also heaps guaranteed to be + // pristine from the dedicated orphan list can be used. + heap_t* heap = _rpmalloc_heap_allocate(1); + heap->owner_thread = 0; + _rpmalloc_stat_inc(&_memory_active_heaps); + return heap; +} + +extern inline void +rpmalloc_heap_release(rpmalloc_heap_t* heap) { + if (heap) + _rpmalloc_heap_release(heap, 1, 1); +} + +extern inline RPMALLOC_ALLOCATOR void* +rpmalloc_heap_alloc(rpmalloc_heap_t* heap, size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return 0; + } +#endif + return _rpmalloc_allocate(heap, size); +} + +extern inline RPMALLOC_ALLOCATOR void* +rpmalloc_heap_aligned_alloc(rpmalloc_heap_t* heap, size_t alignment, size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return 0; + } +#endif + return _rpmalloc_aligned_allocate(heap, alignment, size); +} + +extern inline RPMALLOC_ALLOCATOR void* +rpmalloc_heap_calloc(rpmalloc_heap_t* heap, size_t num, size_t size) { + return rpmalloc_heap_aligned_calloc(heap, 0, num, size); +} + +extern inline RPMALLOC_ALLOCATOR void* +rpmalloc_heap_aligned_calloc(rpmalloc_heap_t* heap, size_t alignment, size_t num, size_t size) { + size_t total; +#if ENABLE_VALIDATE_ARGS +#if PLATFORM_WINDOWS + int err = SizeTMult(num, size, &total); + if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#else + int err = __builtin_umull_overflow(num, size, &total); + if (err || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#endif +#else + total = num * size; +#endif + void* block = _rpmalloc_aligned_allocate(heap, alignment, total); + if (block) + memset(block, 0, total); + return block; +} + +extern inline RPMALLOC_ALLOCATOR void* +rpmalloc_heap_realloc(rpmalloc_heap_t* heap, void* ptr, size_t size, unsigned int flags) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return ptr; + } +#endif + return _rpmalloc_reallocate(heap, ptr, size, 0, flags); +} + +extern inline RPMALLOC_ALLOCATOR void* +rpmalloc_heap_aligned_realloc(rpmalloc_heap_t* heap, void* ptr, size_t alignment, size_t size, unsigned int flags) { +#if ENABLE_VALIDATE_ARGS + if ((size + alignment < size) || (alignment > _memory_page_size)) { + errno = EINVAL; + return 0; + } +#endif + return _rpmalloc_aligned_reallocate(heap, ptr, alignment, size, 0, flags); +} + +extern inline void +rpmalloc_heap_free(rpmalloc_heap_t* heap, void* ptr) { + (void)sizeof(heap); + _rpmalloc_deallocate(ptr); +} + +extern inline void +rpmalloc_heap_free_all(rpmalloc_heap_t* heap) { + span_t* span; + span_t* next_span; + + _rpmalloc_heap_cache_adopt_deferred(heap, 0); + + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + span = heap->size_class[iclass].partial_span; + while (span) { + next_span = span->next; + _rpmalloc_heap_cache_insert(heap, span); + span = next_span; + } + heap->size_class[iclass].partial_span = 0; + span = heap->full_span[iclass]; + while (span) { + next_span = span->next; + 
_rpmalloc_heap_cache_insert(heap, span); + span = next_span; + } + } + memset(heap->size_class, 0, sizeof(heap->size_class)); + memset(heap->full_span, 0, sizeof(heap->full_span)); + + span = heap->large_huge_span; + while (span) { + next_span = span->next; + if (UNEXPECTED(span->size_class == SIZE_CLASS_HUGE)) + _rpmalloc_deallocate_huge(span); + else + _rpmalloc_heap_cache_insert(heap, span); + span = next_span; + } + heap->large_huge_span = 0; + heap->full_span_count = 0; + +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + span_cache_t* span_cache; + if (!iclass) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t*)(heap->span_large_cache + (iclass - 1)); + if (!span_cache->count) + continue; +#if ENABLE_GLOBAL_CACHE + _rpmalloc_stat_add64(&heap->thread_to_global, span_cache->count * (iclass + 1) * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[iclass].spans_to_global, span_cache->count); + _rpmalloc_global_cache_insert_spans(span_cache->span, iclass + 1, span_cache->count); +#else + for (size_t ispan = 0; ispan < span_cache->count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[ispan]); +#endif + span_cache->count = 0; + } +#endif + +#if ENABLE_STATISTICS + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + atomic_store32(&heap->size_class_use[iclass].alloc_current, 0); + atomic_store32(&heap->size_class_use[iclass].spans_current, 0); + } + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + atomic_store32(&heap->span_use[iclass].current, 0); + } #endif } +extern inline void +rpmalloc_heap_thread_set_current(rpmalloc_heap_t* heap) { + heap_t* prev_heap = get_thread_heap_raw(); + if (prev_heap != heap) { + set_thread_heap(heap); + if (prev_heap) + rpmalloc_heap_release(prev_heap); + } +} + +#endif + } #endif diff --git a/Source/ThirdParty/tracy/client/tracy_rpmalloc.hpp b/Source/ThirdParty/tracy/client/tracy_rpmalloc.hpp index 3e8c4f1b5..2a743d709 100644 --- a/Source/ThirdParty/tracy/client/tracy_rpmalloc.hpp +++ b/Source/ThirdParty/tracy/client/tracy_rpmalloc.hpp @@ -20,11 +20,12 @@ namespace tracy #if defined(__clang__) || defined(__GNUC__) # define RPMALLOC_EXPORT __attribute__((visibility("default"))) # define RPMALLOC_ALLOCATOR -# define RPMALLOC_ATTRIB_MALLOC __attribute__((__malloc__)) -# if defined(__clang_major__) && (__clang_major__ < 4) +# if (defined(__clang_major__) && (__clang_major__ < 4)) || (defined(__GNUC__) && defined(ENABLE_PRELOAD) && ENABLE_PRELOAD) +# define RPMALLOC_ATTRIB_MALLOC # define RPMALLOC_ATTRIB_ALLOC_SIZE(size) # define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size) # else +# define RPMALLOC_ATTRIB_MALLOC __attribute__((__malloc__)) # define RPMALLOC_ATTRIB_ALLOC_SIZE(size) __attribute__((alloc_size(size))) # define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size) __attribute__((alloc_size(count, size))) # endif @@ -45,13 +46,24 @@ namespace tracy # define RPMALLOC_CDECL #endif -//! Define RPMALLOC_CONFIGURABLE to enable configuring sizes +//! Define RPMALLOC_CONFIGURABLE to enable configuring sizes. Will introduce +// a very small overhead due to some size calculations not being compile time constants #ifndef RPMALLOC_CONFIGURABLE #define RPMALLOC_CONFIGURABLE 0 #endif +//! Define RPMALLOC_FIRST_CLASS_HEAPS to enable heap based API (rpmalloc_heap_* functions). +// Will introduce a very small overhead to track fully allocated spans in heaps +#ifndef RPMALLOC_FIRST_CLASS_HEAPS +#define RPMALLOC_FIRST_CLASS_HEAPS 0 +#endif + //! 
Flag to rpaligned_realloc to not preserve content in reallocation #define RPMALLOC_NO_PRESERVE 1 +//! Flag to rpaligned_realloc to fail and return null pointer if grow cannot be done in-place, +// in which case the original pointer is still valid (just like a call to realloc which failes to allocate +// a new block). +#define RPMALLOC_GROW_OR_FAIL 2 typedef struct rpmalloc_global_statistics_t { //! Current amount of virtual memory mapped, all of which might not have been committed (only if ENABLE_STATISTICS=1) @@ -99,7 +111,7 @@ typedef struct rpmalloc_thread_statistics_t { size_t from_reserved; //! Number of raw memory map calls (not hitting the reserve spans but resulting in actual OS mmap calls) size_t map_calls; - } span_use[32]; + } span_use[64]; //! Per size class statistics (only if ENABLE_STATISTICS=1) struct { //! Current number of allocations @@ -131,7 +143,8 @@ typedef struct rpmalloc_config_t { // larger than 65535 (storable in an uint16_t), if it is you must use natural // alignment to shift it into 16 bits. If you set a memory_map function, you // must also set a memory_unmap function or else the default implementation will - // be used for both. + // be used for both. This function must be thread safe, it can be called by + // multiple threads simultaneously. void* (*memory_map)(size_t size, size_t* offset); //! Unmap the memory pages starting at address and spanning the given number of bytes. // If release is set to non-zero, the unmap is for an entire span range as returned by @@ -139,8 +152,18 @@ typedef struct rpmalloc_config_t { // release argument holds the size of the entire span range. If release is set to 0, // the unmap is a partial decommit of a subset of the mapped memory range. // If you set a memory_unmap function, you must also set a memory_map function or - // else the default implementation will be used for both. + // else the default implementation will be used for both. This function must be thread + // safe, it can be called by multiple threads simultaneously. void (*memory_unmap)(void* address, size_t size, size_t offset, size_t release); + //! Called when an assert fails, if asserts are enabled. Will use the standard assert() + // if this is not set. + void (*error_callback)(const char* message); + //! Called when a call to map memory pages fails (out of memory). If this callback is + // not set or returns zero the library will return a null pointer in the allocation + // call. If this callback returns non-zero the map call will be retried. The argument + // passed is the number of bytes that was requested in the map call. Only used if + // the default system memory map function is used (memory_map callback is not set). + int (*map_fail_callback)(size_t size); //! Size of memory pages. The page size MUST be a power of two. All memory mapping // requests to memory_map will be made with size set to a multiple of the page size. // Used if RPMALLOC_CONFIGURABLE is defined to 1, otherwise system page size is used. @@ -163,6 +186,10 @@ typedef struct rpmalloc_config_t { // For Windows, see https://docs.microsoft.com/en-us/windows/desktop/memory/large-page-support // For Linux, see https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt int enable_huge_pages; + //! Respectively allocated pages and huge allocated pages names for systems + // supporting it to be able to distinguish among anonymous regions. + const char *page_name; + const char *huge_page_name; } rpmalloc_config_t; //! 
Initialize allocator with default configuration @@ -187,7 +214,7 @@ rpmalloc_thread_initialize(void); //! Finalize allocator for calling thread TRACY_API void -rpmalloc_thread_finalize(void); +rpmalloc_thread_finalize(int release_caches); //! Perform deferred deallocations pending for the calling thread heap RPMALLOC_EXPORT void @@ -240,6 +267,13 @@ rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize, unsi RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* rpaligned_alloc(size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(2); +//! Allocate a memory block of at least the given size and alignment, and zero initialize it. +// Alignment must be a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. A caveat of rpmalloc +// internals is that this must also be strictly less than the span size (default 64KiB) +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* +rpaligned_calloc(size_t alignment, size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3); + //! Allocate a memory block of at least the given size and alignment. // Alignment must be a power of two and a multiple of sizeof(void*), // and should ideally be less than memory page size. A caveat of rpmalloc @@ -252,10 +286,78 @@ rpmemalign(size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB // and should ideally be less than memory page size. A caveat of rpmalloc // internals is that this must also be strictly less than the span size (default 64KiB) RPMALLOC_EXPORT int -rpposix_memalign(void **memptr, size_t alignment, size_t size); +rpposix_memalign(void** memptr, size_t alignment, size_t size); //! Query the usable size of the given memory block (from given pointer to the end of block) RPMALLOC_EXPORT size_t rpmalloc_usable_size(void* ptr); +#if RPMALLOC_FIRST_CLASS_HEAPS + +//! Heap type +typedef struct heap_t rpmalloc_heap_t; + +//! Acquire a new heap. Will reuse existing released heaps or allocate memory for a new heap +// if none available. Heap API is implemented with the strict assumption that only one single +// thread will call heap functions for a given heap at any given time, no functions are thread safe. +RPMALLOC_EXPORT rpmalloc_heap_t* +rpmalloc_heap_acquire(void); + +//! Release a heap (does NOT free the memory allocated by the heap, use rpmalloc_heap_free_all before destroying the heap). +// Releasing a heap will enable it to be reused by other threads. Safe to pass a null pointer. +RPMALLOC_EXPORT void +rpmalloc_heap_release(rpmalloc_heap_t* heap); + +//! Allocate a memory block of at least the given size using the given heap. +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* +rpmalloc_heap_alloc(rpmalloc_heap_t* heap, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(2); + +//! Allocate a memory block of at least the given size using the given heap. The returned +// block will have the requested alignment. Alignment must be a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. A caveat of rpmalloc +// internals is that this must also be strictly less than the span size (default 64KiB). +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* +rpmalloc_heap_aligned_alloc(rpmalloc_heap_t* heap, size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(3); + +//! Allocate a memory block of at least the given size using the given heap and zero initialize it. 
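(Illustrative usage sketch, not part of the patch itself.) The global, non-heap aligned entry points introduced above — rpaligned_calloc, plus the new RPMALLOC_GROW_OR_FAIL flag for rpaligned_realloc — share one set of constraints: the alignment must be a power of two, a multiple of sizeof(void*), and strictly less than the span size. A minimal caller could look like the following; it assumes the process-wide allocator has already been initialized and that these functions live in the tracy namespace of this vendored copy.

    #include "tracy_rpmalloc.hpp"

    void example_thread()
    {
        using namespace tracy;
        rpmalloc_thread_initialize();

        // 64-byte aligned, zero-initialized block of 16 elements of 32 bytes each.
        void* block = rpaligned_calloc( 64, 16, 32 );

        // Grow in place only; on failure null is returned and the original pointer stays valid.
        void* grown = rpaligned_realloc( block, 64, 32 * 32, 16 * 32, RPMALLOC_GROW_OR_FAIL );
        rpfree( grown ? grown : block );

        // New signature: a non-zero argument also releases the thread caches back to the global cache.
        rpmalloc_thread_finalize( 1 );
    }
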
+RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* +rpmalloc_heap_calloc(rpmalloc_heap_t* heap, size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3); + +//! Allocate a memory block of at least the given size using the given heap and zero initialize it. The returned +// block will have the requested alignment. Alignment must either be zero, or a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. A caveat of rpmalloc +// internals is that this must also be strictly less than the span size (default 64KiB). +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* +rpmalloc_heap_aligned_calloc(rpmalloc_heap_t* heap, size_t alignment, size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3); + +//! Reallocate the given block to at least the given size. The memory block MUST be allocated +// by the same heap given to this function. +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* +rpmalloc_heap_realloc(rpmalloc_heap_t* heap, void* ptr, size_t size, unsigned int flags) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(3); + +//! Reallocate the given block to at least the given size. The memory block MUST be allocated +// by the same heap given to this function. The returned block will have the requested alignment. +// Alignment must be either zero, or a power of two and a multiple of sizeof(void*), and should ideally be +// less than memory page size. A caveat of rpmalloc internals is that this must also be strictly less than +// the span size (default 64KiB). +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* +rpmalloc_heap_aligned_realloc(rpmalloc_heap_t* heap, void* ptr, size_t alignment, size_t size, unsigned int flags) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(4); + +//! Free the given memory block from the given heap. The memory block MUST be allocated +// by the same heap given to this function. +RPMALLOC_EXPORT void +rpmalloc_heap_free(rpmalloc_heap_t* heap, void* ptr); + +//! Free all memory allocated by the heap +RPMALLOC_EXPORT void +rpmalloc_heap_free_all(rpmalloc_heap_t* heap); + +//! Set the given heap as the current heap for the calling thread. A heap MUST only be current heap +// for a single thread, a heap can never be shared between multiple threads. The previous +// current heap for the calling thread is released to be reused by other threads. 
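(Illustrative usage sketch, not part of the patch itself.) Taken together, the heap functions above form an arena-style workflow: acquire a pristine heap, allocate from it on a single thread only, drop every block at once with rpmalloc_heap_free_all, then hand the heap back with rpmalloc_heap_release so another thread can reuse it. The sketch assumes RPMALLOC_FIRST_CLASS_HEAPS is compiled in and the allocator is initialized.

    #include "tracy_rpmalloc.hpp"

    void build_and_discard_scratch_data()
    {
        using namespace tracy;

        rpmalloc_heap_t* heap = rpmalloc_heap_acquire();            // pristine heap, used by this thread only

        void* a = rpmalloc_heap_alloc( heap, 4096 );
        void* b = rpmalloc_heap_aligned_calloc( heap, 64, 128, sizeof( double ) );

        a = rpmalloc_heap_realloc( heap, a, 8192, 0 );              // the block must originate from the same heap
        rpmalloc_heap_free( heap, b );                              // individual frees are allowed but optional here

        rpmalloc_heap_free_all( heap );                             // drops everything still owned by the heap
        rpmalloc_heap_release( heap );                              // heap may now be reused by another thread
    }
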
+RPMALLOC_EXPORT void +rpmalloc_heap_thread_set_current(rpmalloc_heap_t* heap); + +#endif + } diff --git a/Source/ThirdParty/tracy/common/TracyAlloc.hpp b/Source/ThirdParty/tracy/common/TracyAlloc.hpp index a3cbec057..4e49df84d 100644 --- a/Source/ThirdParty/tracy/common/TracyAlloc.hpp +++ b/Source/ThirdParty/tracy/common/TracyAlloc.hpp @@ -3,16 +3,33 @@ #include -#ifdef TRACY_ENABLE +#if defined TRACY_ENABLE && !defined __EMSCRIPTEN__ # include "../client/tracy_rpmalloc.hpp" +# define TRACY_USE_RPMALLOC #endif namespace tracy { +#ifdef TRACY_USE_RPMALLOC +TRACY_API void InitRpmalloc(); +#else +static inline void InitRpmalloc() {} +#endif + static inline void* tracy_malloc( size_t size ) { -#ifdef TRACY_ENABLE +#ifdef TRACY_USE_RPMALLOC + InitRpmalloc(); + return rpmalloc( size ); +#else + return malloc( size ); +#endif +} + +static inline void* tracy_malloc_fast( size_t size ) +{ +#ifdef TRACY_USE_RPMALLOC return rpmalloc( size ); #else return malloc( size ); @@ -21,7 +38,17 @@ static inline void* tracy_malloc( size_t size ) static inline void tracy_free( void* ptr ) { -#ifdef TRACY_ENABLE +#ifdef TRACY_USE_RPMALLOC + InitRpmalloc(); + rpfree( ptr ); +#else + free( ptr ); +#endif +} + +static inline void tracy_free_fast( void* ptr ) +{ +#ifdef TRACY_USE_RPMALLOC rpfree( ptr ); #else free( ptr ); @@ -30,7 +57,8 @@ static inline void tracy_free( void* ptr ) static inline void* tracy_realloc( void* ptr, size_t size ) { -#ifdef TRACY_ENABLE +#ifdef TRACY_USE_RPMALLOC + InitRpmalloc(); return rprealloc( ptr, size ); #else return realloc( ptr, size ); diff --git a/Source/ThirdParty/tracy/common/TracyProtocol.hpp b/Source/ThirdParty/tracy/common/TracyProtocol.hpp index 2326a7f32..dd30e5391 100644 --- a/Source/ThirdParty/tracy/common/TracyProtocol.hpp +++ b/Source/ThirdParty/tracy/common/TracyProtocol.hpp @@ -9,8 +9,8 @@ namespace tracy constexpr unsigned Lz4CompressBound( unsigned isize ) { return isize + ( isize / 255 ) + 16; } -enum : uint32_t { ProtocolVersion = 46 }; -enum : uint16_t { BroadcastVersion = 2 }; +enum : uint32_t { ProtocolVersion = 63 }; +enum : uint16_t { BroadcastVersion = 3 }; using lz4sz_t = uint32_t; @@ -34,7 +34,7 @@ enum HandshakeStatus : uint8_t enum { WelcomeMessageProgramNameSize = 64 }; enum { WelcomeMessageHostInfoSize = 1024 }; -#pragma pack( 1 ) +#pragma pack( push, 1 ) // Must increase left query space after handling! enum ServerQuery : uint8_t @@ -44,14 +44,15 @@ enum ServerQuery : uint8_t ServerQueryThreadString, ServerQuerySourceLocation, ServerQueryPlotName, - ServerQueryCallstackFrame, ServerQueryFrameName, - ServerQueryDisconnect, - ServerQueryExternalName, ServerQueryParameter, + ServerQueryFiberName, + // Items above are high priority. Split order must be preserved. See IsQueryPrio(). 
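+ // Sketch only, not part of this patch: IsQueryPrio() itself is not shown here, but given the
+ // ordering documented above it can presumably be implemented as a plain value comparison, e.g.
+ //   static inline bool IsQueryPrio( ServerQuery type ) { return type < ServerQueryDisconnect; }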
+ ServerQueryDisconnect, + ServerQueryCallstackFrame, + ServerQueryExternalName, ServerQuerySymbol, ServerQuerySymbolCode, - ServerQueryCodeLocation, ServerQuerySourceCode, ServerQueryDataTransfer, ServerQueryDataTransferPart @@ -77,6 +78,18 @@ enum CpuArchitecture : uint8_t }; +struct WelcomeFlag +{ + enum _t : uint8_t + { + OnDemand = 1 << 0, + IsApple = 1 << 1, + CodeTransfer = 1 << 2, + CombineSamples = 1 << 3, + IdentifySamples = 1 << 4, + }; +}; + struct WelcomeMessage { double timerMul; @@ -88,10 +101,8 @@ struct WelcomeMessage uint64_t exectime; uint64_t pid; int64_t samplingPeriod; - uint8_t onDemand; - uint8_t isApple; + uint8_t flags; uint8_t cpuArch; - uint8_t codeTransfer; char cpuManufacturer[12]; uint32_t cpuId; char programName[WelcomeMessageProgramNameSize]; @@ -115,13 +126,43 @@ struct BroadcastMessage uint16_t broadcastVersion; uint16_t listenPort; uint32_t protocolVersion; + uint64_t pid; int32_t activeTime; // in seconds char programName[WelcomeMessageProgramNameSize]; }; -enum { BroadcastMessageSize = sizeof( BroadcastMessage ) }; +struct BroadcastMessage_v2 +{ + uint16_t broadcastVersion; + uint16_t listenPort; + uint32_t protocolVersion; + int32_t activeTime; + char programName[WelcomeMessageProgramNameSize]; +}; -#pragma pack() +struct BroadcastMessage_v1 +{ + uint32_t broadcastVersion; + uint32_t protocolVersion; + uint32_t listenPort; + uint32_t activeTime; + char programName[WelcomeMessageProgramNameSize]; +}; + +struct BroadcastMessage_v0 +{ + uint32_t broadcastVersion; + uint32_t protocolVersion; + uint32_t activeTime; + char programName[WelcomeMessageProgramNameSize]; +}; + +enum { BroadcastMessageSize = sizeof( BroadcastMessage ) }; +enum { BroadcastMessageSize_v2 = sizeof( BroadcastMessage_v2 ) }; +enum { BroadcastMessageSize_v1 = sizeof( BroadcastMessage_v1 ) }; +enum { BroadcastMessageSize_v0 = sizeof( BroadcastMessage_v0 ) }; + +#pragma pack( pop ) } diff --git a/Source/ThirdParty/tracy/common/TracyQueue.hpp b/Source/ThirdParty/tracy/common/TracyQueue.hpp index d99945013..092d26969 100644 --- a/Source/ThirdParty/tracy/common/TracyQueue.hpp +++ b/Source/ThirdParty/tracy/common/TracyQueue.hpp @@ -21,6 +21,7 @@ enum class QueueType : uint8_t Callstack, CallstackAlloc, CallstackSample, + CallstackSampleContextSwitch, FrameImage, ZoneBegin, ZoneBeginCallstack, @@ -50,11 +51,20 @@ enum class QueueType : uint8_t GpuZoneBeginAllocSrcLocSerial, GpuZoneBeginAllocSrcLocCallstackSerial, GpuZoneEndSerial, - PlotData, + PlotDataInt, + PlotDataFloat, + PlotDataDouble, ContextSwitch, ThreadWakeup, GpuTime, GpuContextName, + CallstackFrameSize, + SymbolInformation, + ExternalNameMetadata, + SymbolCodeMetadata, + SourceCodeMetadata, + FiberEnter, + FiberLeave, Terminate, KeepAlive, ThreadContext, @@ -67,6 +77,7 @@ enum class QueueType : uint8_t FrameMarkMsg, FrameMarkMsgStart, FrameMarkMsgEnd, + FrameVsync, SourceLocation, LockAnnounce, LockTerminate, @@ -76,16 +87,20 @@ enum class QueueType : uint8_t MessageLiteralCallstack, MessageLiteralColorCallstack, GpuNewContext, - CallstackFrameSize, CallstackFrame, - SymbolInformation, - CodeInformation, SysTimeReport, TidToPid, + HwSampleCpuCycle, + HwSampleInstructionRetired, + HwSampleCacheReference, + HwSampleCacheMiss, + HwSampleBranchRetired, + HwSampleBranchMiss, PlotConfig, ParamSetup, AckServerQueryNoop, AckSourceCodeNotAvailable, + AckSymbolCodeNotAvailable, CpuTopology, SingleStringData, SecondStringData, @@ -102,14 +117,15 @@ enum class QueueType : uint8_t ExternalThreadName, SymbolCode, SourceCode, + FiberName, 
NUM_TYPES }; -#pragma pack( 1 ) +#pragma pack( push, 1 ) struct QueueThreadContext { - uint64_t thread; + uint32_t thread; }; struct QueueZoneBeginLean @@ -122,16 +138,31 @@ struct QueueZoneBegin : public QueueZoneBeginLean uint64_t srcloc; // ptr }; +struct QueueZoneBeginThread : public QueueZoneBegin +{ + uint32_t thread; +}; + struct QueueZoneEnd { int64_t time; }; +struct QueueZoneEndThread : public QueueZoneEnd +{ + uint32_t thread; +}; + struct QueueZoneValidation { uint32_t id; }; +struct QueueZoneValidationThread : public QueueZoneValidation +{ + uint32_t thread; +}; + struct QueueZoneColor { uint8_t r; @@ -139,11 +170,21 @@ struct QueueZoneColor uint8_t b; }; +struct QueueZoneColorThread : public QueueZoneColor +{ + uint32_t thread; +}; + struct QueueZoneValue { uint64_t value; }; +struct QueueZoneValueThread : public QueueZoneValue +{ + uint32_t thread; +}; + struct QueueStringTransfer { uint64_t ptr; @@ -155,6 +196,12 @@ struct QueueFrameMark uint64_t name; // ptr }; +struct QueueFrameVsync +{ + int64_t time; + uint32_t id; +}; + struct QueueFrameImage { uint32_t frame; @@ -185,6 +232,11 @@ struct QueueZoneTextFat uint16_t size; }; +struct QueueZoneTextFatThread : public QueueZoneTextFat +{ + uint32_t thread; +}; + enum class LockType : uint8_t { Lockable, @@ -199,6 +251,19 @@ struct QueueLockAnnounce LockType type; }; +struct QueueFiberEnter +{ + int64_t time; + uint64_t fiber; // ptr + uint32_t thread; +}; + +struct QueueFiberLeave +{ + int64_t time; + uint32_t thread; +}; + struct QueueLockTerminate { uint32_t id; @@ -207,28 +272,32 @@ struct QueueLockTerminate struct QueueLockWait { - uint64_t thread; + uint32_t thread; uint32_t id; int64_t time; }; struct QueueLockObtain { - uint64_t thread; + uint32_t thread; uint32_t id; int64_t time; }; struct QueueLockRelease { - uint64_t thread; uint32_t id; int64_t time; }; +struct QueueLockReleaseShared : public QueueLockRelease +{ + uint32_t thread; +}; + struct QueueLockMark { - uint64_t thread; + uint32_t thread; uint32_t id; uint64_t srcloc; // ptr }; @@ -244,24 +313,25 @@ struct QueueLockNameFat : public QueueLockName uint16_t size; }; -enum class PlotDataType : uint8_t -{ - Float, - Double, - Int -}; - -struct QueuePlotData +struct QueuePlotDataBase { uint64_t name; // ptr int64_t time; - PlotDataType type; - union - { - double d; - float f; - int64_t i; - } data; +}; + +struct QueuePlotDataInt : public QueuePlotDataBase +{ + int64_t val; +}; + +struct QueuePlotDataFloat : public QueuePlotDataBase +{ + float val; +}; + +struct QueuePlotDataDouble : public QueuePlotDataBase +{ + double val; }; struct QueueMessage @@ -281,23 +351,43 @@ struct QueueMessageLiteral : public QueueMessage uint64_t text; // ptr }; +struct QueueMessageLiteralThread : public QueueMessageLiteral +{ + uint32_t thread; +}; + struct QueueMessageColorLiteral : public QueueMessageColor { uint64_t text; // ptr }; +struct QueueMessageColorLiteralThread : public QueueMessageColorLiteral +{ + uint32_t thread; +}; + struct QueueMessageFat : public QueueMessage { uint64_t text; // ptr uint16_t size; }; +struct QueueMessageFatThread : public QueueMessageFat +{ + uint32_t thread; +}; + struct QueueMessageColorFat : public QueueMessageColor { uint64_t text; // ptr uint16_t size; }; +struct QueueMessageColorFatThread : public QueueMessageColorFat +{ + uint32_t thread; +}; + // Don't change order, only add new entries at the end, this is also used on trace dumps! 
enum class GpuContextType : uint8_t { @@ -305,7 +395,8 @@ enum class GpuContextType : uint8_t OpenGl, Vulkan, OpenCL, - Direct3D12 + Direct3D12, + Direct3D11 }; enum GpuContextFlags : uint8_t @@ -317,7 +408,7 @@ struct QueueGpuNewContext { int64_t cpuTime; int64_t gpuTime; - uint64_t thread; + uint32_t thread; float period; uint8_t context; GpuContextFlags flags; @@ -327,7 +418,7 @@ struct QueueGpuNewContext struct QueueGpuZoneBeginLean { int64_t cpuTime; - uint64_t thread; + uint32_t thread; uint16_t queryId; uint8_t context; }; @@ -340,7 +431,7 @@ struct QueueGpuZoneBegin : public QueueGpuZoneBeginLean struct QueueGpuZoneEnd { int64_t cpuTime; - uint64_t thread; + uint32_t thread; uint16_t queryId; uint8_t context; }; @@ -379,7 +470,7 @@ struct QueueMemNamePayload struct QueueMemAlloc { int64_t time; - uint64_t thread; + uint32_t thread; uint64_t ptr; char size[6]; }; @@ -387,7 +478,7 @@ struct QueueMemAlloc struct QueueMemFree { int64_t time; - uint64_t thread; + uint32_t thread; uint64_t ptr; }; @@ -396,16 +487,26 @@ struct QueueCallstackFat uint64_t ptr; }; +struct QueueCallstackFatThread : public QueueCallstackFat +{ + uint32_t thread; +}; + struct QueueCallstackAllocFat { uint64_t ptr; uint64_t nativePtr; }; +struct QueueCallstackAllocFatThread : public QueueCallstackAllocFat +{ + uint32_t thread; +}; + struct QueueCallstackSample { int64_t time; - uint64_t thread; + uint32_t thread; }; struct QueueCallstackSampleFat : public QueueCallstackSample @@ -419,6 +520,12 @@ struct QueueCallstackFrameSize uint8_t size; }; +struct QueueCallstackFrameSizeFat : public QueueCallstackFrameSize +{ + uint64_t data; + uint64_t imageName; +}; + struct QueueCallstackFrame { uint32_t line; @@ -432,10 +539,10 @@ struct QueueSymbolInformation uint64_t symAddr; }; -struct QueueCodeInformation +struct QueueSymbolInformationFat : public QueueSymbolInformation { - uint64_t ptr; - uint32_t line; + uint64_t fileString; + uint8_t needFree; }; struct QueueCrashReport @@ -444,6 +551,11 @@ struct QueueCrashReport uint64_t text; // ptr }; +struct QueueCrashReportThread +{ + uint32_t thread; +}; + struct QueueSysTime { int64_t time; @@ -453,8 +565,8 @@ struct QueueSysTime struct QueueContextSwitch { int64_t time; - uint64_t oldThread; - uint64_t newThread; + uint32_t oldThread; + uint32_t newThread; uint8_t cpu; uint8_t reason; uint8_t state; @@ -463,7 +575,7 @@ struct QueueContextSwitch struct QueueThreadWakeup { int64_t time; - uint64_t thread; + uint32_t thread; }; struct QueueTidToPid @@ -472,10 +584,19 @@ struct QueueTidToPid uint64_t pid; }; +struct QueueHwSample +{ + uint64_t ip; + int64_t time; +}; + struct QueuePlotConfig { uint64_t name; // ptr uint8_t type; + uint8_t step; + uint8_t fill; + uint32_t color; }; struct QueueParamSetup @@ -486,6 +607,11 @@ struct QueueParamSetup int32_t val; }; +struct QueueSourceCodeNotAvailable +{ + uint32_t id; +}; + struct QueueCpuTopology { uint32_t package; @@ -493,6 +619,27 @@ struct QueueCpuTopology uint32_t thread; }; +struct QueueExternalNameMetadata +{ + uint64_t thread; + uint64_t name; + uint64_t threadName; +}; + +struct QueueSymbolCodeMetadata +{ + uint64_t symbol; + uint64_t ptr; + uint32_t size; +}; + +struct QueueSourceCodeMetadata +{ + uint64_t ptr; + uint32_t size; + uint32_t id; +}; + struct QueueHeader { union @@ -510,31 +657,45 @@ struct QueueItem QueueThreadContext threadCtx; QueueZoneBegin zoneBegin; QueueZoneBeginLean zoneBeginLean; + QueueZoneBeginThread zoneBeginThread; QueueZoneEnd zoneEnd; + QueueZoneEndThread zoneEndThread; QueueZoneValidation 
zoneValidation; + QueueZoneValidationThread zoneValidationThread; QueueZoneColor zoneColor; + QueueZoneColorThread zoneColorThread; QueueZoneValue zoneValue; + QueueZoneValueThread zoneValueThread; QueueStringTransfer stringTransfer; QueueFrameMark frameMark; + QueueFrameVsync frameVsync; QueueFrameImage frameImage; QueueFrameImageFat frameImageFat; QueueSourceLocation srcloc; QueueZoneTextFat zoneTextFat; + QueueZoneTextFatThread zoneTextFatThread; QueueLockAnnounce lockAnnounce; QueueLockTerminate lockTerminate; QueueLockWait lockWait; QueueLockObtain lockObtain; QueueLockRelease lockRelease; + QueueLockReleaseShared lockReleaseShared; QueueLockMark lockMark; QueueLockName lockName; QueueLockNameFat lockNameFat; - QueuePlotData plotData; + QueuePlotDataInt plotDataInt; + QueuePlotDataFloat plotDataFloat; + QueuePlotDataDouble plotDataDouble; QueueMessage message; QueueMessageColor messageColor; QueueMessageLiteral messageLiteral; + QueueMessageLiteralThread messageLiteralThread; QueueMessageColorLiteral messageColorLiteral; + QueueMessageColorLiteralThread messageColorLiteralThread; QueueMessageFat messageFat; + QueueMessageFatThread messageFatThread; QueueMessageColorFat messageColorFat; + QueueMessageColorFatThread messageColorFatThread; QueueGpuNewContext gpuNewContext; QueueGpuZoneBegin gpuZoneBegin; QueueGpuZoneBeginLean gpuZoneBeginLean; @@ -547,24 +708,35 @@ struct QueueItem QueueMemFree memFree; QueueMemNamePayload memName; QueueCallstackFat callstackFat; + QueueCallstackFatThread callstackFatThread; QueueCallstackAllocFat callstackAllocFat; + QueueCallstackAllocFatThread callstackAllocFatThread; QueueCallstackSample callstackSample; QueueCallstackSampleFat callstackSampleFat; QueueCallstackFrameSize callstackFrameSize; + QueueCallstackFrameSizeFat callstackFrameSizeFat; QueueCallstackFrame callstackFrame; QueueSymbolInformation symbolInformation; - QueueCodeInformation codeInformation; + QueueSymbolInformationFat symbolInformationFat; QueueCrashReport crashReport; + QueueCrashReportThread crashReportThread; QueueSysTime sysTime; QueueContextSwitch contextSwitch; QueueThreadWakeup threadWakeup; QueueTidToPid tidToPid; + QueueHwSample hwSample; QueuePlotConfig plotConfig; QueueParamSetup paramSetup; QueueCpuTopology cpuTopology; + QueueExternalNameMetadata externalNameMetadata; + QueueSymbolCodeMetadata symbolCodeMetadata; + QueueSourceCodeMetadata sourceCodeMetadata; + QueueSourceCodeNotAvailable sourceCodeNotAvailable; + QueueFiberEnter fiberEnter; + QueueFiberLeave fiberLeave; }; }; -#pragma pack() +#pragma pack( pop ) enum { QueueItemSize = sizeof( QueueItem ) }; @@ -583,6 +755,7 @@ static constexpr size_t QueueDataSize[] = { sizeof( QueueHeader ), // callstack sizeof( QueueHeader ), // callstack alloc sizeof( QueueHeader ) + sizeof( QueueCallstackSample ), + sizeof( QueueHeader ) + sizeof( QueueCallstackSample ), // context switch sizeof( QueueHeader ) + sizeof( QueueFrameImage ), sizeof( QueueHeader ) + sizeof( QueueZoneBegin ), sizeof( QueueHeader ) + sizeof( QueueZoneBegin ), // callstack @@ -592,7 +765,7 @@ static constexpr size_t QueueDataSize[] = { sizeof( QueueHeader ) + sizeof( QueueLockRelease ), sizeof( QueueHeader ) + sizeof( QueueLockWait ), // shared sizeof( QueueHeader ) + sizeof( QueueLockObtain ), // shared - sizeof( QueueHeader ) + sizeof( QueueLockRelease ), // shared + sizeof( QueueHeader ) + sizeof( QueueLockReleaseShared ), sizeof( QueueHeader ) + sizeof( QueueLockName ), sizeof( QueueHeader ) + sizeof( QueueMemAlloc ), sizeof( QueueHeader ) + sizeof( 
QueueMemAlloc ), // named @@ -612,11 +785,20 @@ static constexpr size_t QueueDataSize[] = { sizeof( QueueHeader ) + sizeof( QueueGpuZoneBeginLean ),// serial, allocated source location sizeof( QueueHeader ) + sizeof( QueueGpuZoneBeginLean ),// serial, allocated source location, callstack sizeof( QueueHeader ) + sizeof( QueueGpuZoneEnd ), // serial - sizeof( QueueHeader ) + sizeof( QueuePlotData ), + sizeof( QueueHeader ) + sizeof( QueuePlotDataInt ), + sizeof( QueueHeader ) + sizeof( QueuePlotDataFloat ), + sizeof( QueueHeader ) + sizeof( QueuePlotDataDouble ), sizeof( QueueHeader ) + sizeof( QueueContextSwitch ), sizeof( QueueHeader ) + sizeof( QueueThreadWakeup ), sizeof( QueueHeader ) + sizeof( QueueGpuTime ), sizeof( QueueHeader ) + sizeof( QueueGpuContextName ), + sizeof( QueueHeader ) + sizeof( QueueCallstackFrameSize ), + sizeof( QueueHeader ) + sizeof( QueueSymbolInformation ), + sizeof( QueueHeader ), // ExternalNameMetadata - not for wire transfer + sizeof( QueueHeader ), // SymbolCodeMetadata - not for wire transfer + sizeof( QueueHeader ), // SourceCodeMetadata - not for wire transfer + sizeof( QueueHeader ) + sizeof( QueueFiberEnter ), + sizeof( QueueHeader ) + sizeof( QueueFiberLeave ), // above items must be first sizeof( QueueHeader ), // terminate sizeof( QueueHeader ), // keep alive @@ -630,6 +812,7 @@ static constexpr size_t QueueDataSize[] = { sizeof( QueueHeader ) + sizeof( QueueFrameMark ), // continuous frames sizeof( QueueHeader ) + sizeof( QueueFrameMark ), // start sizeof( QueueHeader ) + sizeof( QueueFrameMark ), // end + sizeof( QueueHeader ) + sizeof( QueueFrameVsync ), sizeof( QueueHeader ) + sizeof( QueueSourceLocation ), sizeof( QueueHeader ) + sizeof( QueueLockAnnounce ), sizeof( QueueHeader ) + sizeof( QueueLockTerminate ), @@ -639,16 +822,20 @@ static constexpr size_t QueueDataSize[] = { sizeof( QueueHeader ) + sizeof( QueueMessageLiteral ), // callstack sizeof( QueueHeader ) + sizeof( QueueMessageColorLiteral ), // callstack sizeof( QueueHeader ) + sizeof( QueueGpuNewContext ), - sizeof( QueueHeader ) + sizeof( QueueCallstackFrameSize ), sizeof( QueueHeader ) + sizeof( QueueCallstackFrame ), - sizeof( QueueHeader ) + sizeof( QueueSymbolInformation ), - sizeof( QueueHeader ) + sizeof( QueueCodeInformation ), sizeof( QueueHeader ) + sizeof( QueueSysTime ), sizeof( QueueHeader ) + sizeof( QueueTidToPid ), + sizeof( QueueHeader ) + sizeof( QueueHwSample ), // cpu cycle + sizeof( QueueHeader ) + sizeof( QueueHwSample ), // instruction retired + sizeof( QueueHeader ) + sizeof( QueueHwSample ), // cache reference + sizeof( QueueHeader ) + sizeof( QueueHwSample ), // cache miss + sizeof( QueueHeader ) + sizeof( QueueHwSample ), // branch retired + sizeof( QueueHeader ) + sizeof( QueueHwSample ), // branch miss sizeof( QueueHeader ) + sizeof( QueuePlotConfig ), sizeof( QueueHeader ) + sizeof( QueueParamSetup ), sizeof( QueueHeader ), // server query acknowledgement - sizeof( QueueHeader ), // source code not available + sizeof( QueueHeader ) + sizeof( QueueSourceCodeNotAvailable ), + sizeof( QueueHeader ), // symbol code not available sizeof( QueueHeader ) + sizeof( QueueCpuTopology ), sizeof( QueueHeader ), // single string data sizeof( QueueHeader ), // second string data @@ -666,6 +853,7 @@ static constexpr size_t QueueDataSize[] = { sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // external thread name sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // symbol code sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // source code + sizeof( 
QueueHeader ) + sizeof( QueueStringTransfer ), // fiber name }; static_assert( QueueItemSize == 32, "Queue item size not 32 bytes" ); diff --git a/Source/ThirdParty/tracy/common/TracySocket.cpp b/Source/ThirdParty/tracy/common/TracySocket.cpp index f16569b06..176bbc7aa 100644 --- a/Source/ThirdParty/tracy/common/TracySocket.cpp +++ b/Source/ThirdParty/tracy/common/TracySocket.cpp @@ -8,6 +8,7 @@ #include "TracyAlloc.hpp" #include "TracySocket.hpp" +#include "TracySystem.hpp" #ifdef _WIN32 # ifndef NOMINMAX @@ -454,7 +455,7 @@ static int addrinfo_and_socket_for_family( uint16_t port, int ai_family, struct hints.ai_family = ai_family; hints.ai_socktype = SOCK_STREAM; #ifndef TRACY_ONLY_LOCALHOST - const char* onlyLocalhost = getenv( "TRACY_ONLY_LOCALHOST" ); + const char* onlyLocalhost = GetEnvVar( "TRACY_ONLY_LOCALHOST" ); if( !onlyLocalhost || onlyLocalhost[0] != '1' ) { hints.ai_flags = AI_PASSIVE; @@ -475,7 +476,7 @@ bool ListenSocket::Listen( uint16_t port, int backlog ) struct addrinfo* res = nullptr; #if !defined TRACY_ONLY_IPV4 && !defined TRACY_ONLY_LOCALHOST - const char* onlyIPv4 = getenv( "TRACY_ONLY_IPV4" ); + const char* onlyIPv4 = GetEnvVar( "TRACY_ONLY_IPV4" ); if( !onlyIPv4 || onlyIPv4[0] != '1' ) { m_sock = addrinfo_and_socket_for_family( port, AF_INET6, &res ); @@ -488,7 +489,7 @@ bool ListenSocket::Listen( uint16_t port, int backlog ) m_sock = addrinfo_and_socket_for_family( port, AF_INET, &res ); if( m_sock == -1 ) return false; } -#if defined _WIN32 || defined __CYGWIN__ +#if defined _WIN32 unsigned long val = 0; setsockopt( m_sock, IPPROTO_IPV6, IPV6_V6ONLY, (const char*)&val, sizeof( val ) ); #elif defined BSD diff --git a/Source/ThirdParty/tracy/common/TracySocket.hpp b/Source/ThirdParty/tracy/common/TracySocket.hpp index 4fbb3278a..4b3075e29 100644 --- a/Source/ThirdParty/tracy/common/TracySocket.hpp +++ b/Source/ThirdParty/tracy/common/TracySocket.hpp @@ -2,6 +2,7 @@ #define __TRACYSOCKET_HPP__ #include +#include #include struct addrinfo; diff --git a/Source/ThirdParty/tracy/common/TracyStackFrames.cpp b/Source/ThirdParty/tracy/common/TracyStackFrames.cpp new file mode 100644 index 000000000..7b0abace3 --- /dev/null +++ b/Source/ThirdParty/tracy/common/TracyStackFrames.cpp @@ -0,0 +1,122 @@ +#include "TracyStackFrames.hpp" + +namespace tracy +{ + +const char* s_tracyStackFrames_[] = { + "tracy::Callstack", + "tracy::Callstack(int)", + "tracy::GpuCtxScope::{ctor}", + "tracy::Profiler::SendCallstack", + "tracy::Profiler::SendCallstack(int)", + "tracy::Profiler::SendCallstack(int, unsigned long)", + "tracy::Profiler::MemAllocCallstack", + "tracy::Profiler::MemAllocCallstack(void const*, unsigned long, int)", + "tracy::Profiler::MemFreeCallstack", + "tracy::Profiler::MemFreeCallstack(void const*, int)", + "tracy::ScopedZone::{ctor}", + "tracy::ScopedZone::ScopedZone(tracy::SourceLocationData const*, int, bool)", + "tracy::Profiler::Message", + nullptr +}; + +const char** s_tracyStackFrames = s_tracyStackFrames_; + +const StringMatch s_tracySkipSubframes_[] = { + { "/include/arm_neon.h", 19 }, + { "/include/adxintrin.h", 20 }, + { "/include/ammintrin.h", 20 }, + { "/include/amxbf16intrin.h", 24 }, + { "/include/amxint8intrin.h", 24 }, + { "/include/amxtileintrin.h", 24 }, + { "/include/avx2intrin.h", 21 }, + { "/include/avx5124fmapsintrin.h", 29 }, + { "/include/avx5124vnniwintrin.h", 29 }, + { "/include/avx512bf16intrin.h", 27 }, + { "/include/avx512bf16vlintrin.h", 29 }, + { "/include/avx512bitalgintrin.h", 29 }, + { "/include/avx512bwintrin.h", 25 }, + { 
"/include/avx512cdintrin.h", 25 }, + { "/include/avx512dqintrin.h", 25 }, + { "/include/avx512erintrin.h", 25 }, + { "/include/avx512fintrin.h", 24 }, + { "/include/avx512ifmaintrin.h", 27 }, + { "/include/avx512ifmavlintrin.h", 29 }, + { "/include/avx512pfintrin.h", 25 }, + { "/include/avx512vbmi2intrin.h", 28 }, + { "/include/avx512vbmi2vlintrin.h", 30 }, + { "/include/avx512vbmiintrin.h", 27 }, + { "/include/avx512vbmivlintrin.h", 29 }, + { "/include/avx512vlbwintrin.h", 27 }, + { "/include/avx512vldqintrin.h", 27 }, + { "/include/avx512vlintrin.h", 25 }, + { "/include/avx512vnniintrin.h", 27 }, + { "/include/avx512vnnivlintrin.h", 29 }, + { "/include/avx512vp2intersectintrin.h", 35 }, + { "/include/avx512vp2intersectvlintrin.h", 37 }, + { "/include/avx512vpopcntdqintrin.h", 32 }, + { "/include/avx512vpopcntdqvlintrin.h", 34 }, + { "/include/avxintrin.h", 20 }, + { "/include/avxvnniintrin.h", 24 }, + { "/include/bmi2intrin.h", 21 }, + { "/include/bmiintrin.h", 20 }, + { "/include/bmmintrin.h", 20 }, + { "/include/cetintrin.h", 20 }, + { "/include/cldemoteintrin.h", 25 }, + { "/include/clflushoptintrin.h", 27 }, + { "/include/clwbintrin.h", 21 }, + { "/include/clzerointrin.h", 23 }, + { "/include/emmintrin.h", 20 }, + { "/include/enqcmdintrin.h", 23 }, + { "/include/f16cintrin.h", 21 }, + { "/include/fma4intrin.h", 21 }, + { "/include/fmaintrin.h", 20 }, + { "/include/fxsrintrin.h", 21 }, + { "/include/gfniintrin.h", 21 }, + { "/include/hresetintrin.h", 23 }, + { "/include/ia32intrin.h", 21 }, + { "/include/immintrin.h", 20 }, + { "/include/keylockerintrin.h", 26 }, + { "/include/lwpintrin.h", 20 }, + { "/include/lzcntintrin.h", 22 }, + { "/include/mmintrin.h", 19 }, + { "/include/movdirintrin.h", 23 }, + { "/include/mwaitxintrin.h", 23 }, + { "/include/nmmintrin.h", 20 }, + { "/include/pconfigintrin.h", 24 }, + { "/include/pkuintrin.h", 20 }, + { "/include/pmmintrin.h", 20 }, + { "/include/popcntintrin.h", 23 }, + { "/include/prfchwintrin.h", 23 }, + { "/include/rdseedintrin.h", 23 }, + { "/include/rtmintrin.h", 20 }, + { "/include/serializeintrin.h", 26 }, + { "/include/sgxintrin.h", 20 }, + { "/include/shaintrin.h", 20 }, + { "/include/smmintrin.h", 20 }, + { "/include/tbmintrin.h", 20 }, + { "/include/tmmintrin.h", 20 }, + { "/include/tsxldtrkintrin.h", 25 }, + { "/include/uintrintrin.h", 22 }, + { "/include/vaesintrin.h", 21 }, + { "/include/vpclmulqdqintrin.h", 27 }, + { "/include/waitpkgintrin.h", 24 }, + { "/include/wbnoinvdintrin.h", 25 }, + { "/include/wmmintrin.h", 20 }, + { "/include/x86gprintrin.h", 23 }, + { "/include/x86intrin.h", 20 }, + { "/include/xmmintrin.h", 20 }, + { "/include/xopintrin.h", 20 }, + { "/include/xsavecintrin.h", 23 }, + { "/include/xsaveintrin.h", 22 }, + { "/include/xsaveoptintrin.h", 25 }, + { "/include/xsavesintrin.h", 23 }, + { "/include/xtestintrin.h", 22 }, + { "/bits/atomic_base.h", 19 }, + { "/atomic", 7 }, + {} +}; + +const StringMatch* s_tracySkipSubframes = s_tracySkipSubframes_; + +} diff --git a/Source/ThirdParty/tracy/common/TracyStackFrames.hpp b/Source/ThirdParty/tracy/common/TracyStackFrames.hpp new file mode 100644 index 000000000..9d4262c00 --- /dev/null +++ b/Source/ThirdParty/tracy/common/TracyStackFrames.hpp @@ -0,0 +1,22 @@ +#ifndef __TRACYSTACKFRAMES_HPP__ +#define __TRACYSTACKFRAMES_HPP__ + +#include + +namespace tracy +{ + +struct StringMatch +{ + const char* str; + size_t len; +}; + +extern const char** s_tracyStackFrames; +extern const StringMatch* s_tracySkipSubframes; + +static constexpr int s_tracySkipSubframesMinLen = 
7; + +} + +#endif diff --git a/Source/ThirdParty/tracy/common/TracySystem.cpp b/Source/ThirdParty/tracy/common/TracySystem.cpp index 25ccf9f8a..5ca8e1f45 100644 --- a/Source/ThirdParty/tracy/common/TracySystem.cpp +++ b/Source/ThirdParty/tracy/common/TracySystem.cpp @@ -1,16 +1,16 @@ -#if defined _MSC_VER || defined __CYGWIN__ || defined _WIN32 -# ifndef WIN32_LEAN_AND_MEAN -# define WIN32_LEAN_AND_MEAN -# endif -# ifndef NOMINMAX -# define NOMINMAX -# endif -#endif #ifdef _MSC_VER # pragma warning(disable:4996) #endif -#if defined _WIN32 || defined __CYGWIN__ +#if defined _WIN32 +# ifndef WIN32_LEAN_AND_MEAN +# define WIN32_LEAN_AND_MEAN +# endif +# ifndef NOMINMAX +# define NOMINMAX +# endif # include +# include +# include "TracyUwp.hpp" #else # include # include @@ -39,7 +39,7 @@ #include "TracySystem.hpp" -#if defined _WIN32 || defined __CYGWIN__ +#if defined _WIN32 extern "C" typedef HRESULT (WINAPI *t_SetThreadDescription)( HANDLE, PCWSTR ); extern "C" typedef HRESULT (WINAPI *t_GetThreadDescription)( HANDLE, PWSTR* ); #endif @@ -55,19 +55,19 @@ namespace tracy namespace detail { -TRACY_API uint64_t GetThreadHandleImpl() +TRACY_API uint32_t GetThreadHandleImpl() { -#if defined _WIN32 || defined __CYGWIN__ - static_assert( sizeof( decltype( GetCurrentThreadId() ) ) <= sizeof( uint64_t ), "Thread handle too big to fit in protocol" ); - return uint64_t( GetCurrentThreadId() ); +#if defined _WIN32 + static_assert( sizeof( decltype( GetCurrentThreadId() ) ) <= sizeof( uint32_t ), "Thread handle too big to fit in protocol" ); + return uint32_t( GetCurrentThreadId() ); #elif defined __APPLE__ uint64_t id; pthread_threadid_np( pthread_self(), &id ); - return id; + return uint32_t( id ); #elif defined __ANDROID__ - return (uint64_t)gettid(); + return (uint32_t)gettid(); #elif defined __linux__ - return (uint64_t)syscall( SYS_gettid ); + return (uint32_t)syscall( SYS_gettid ); #elif defined __FreeBSD__ long id; thr_self( &id ); @@ -78,9 +78,17 @@ TRACY_API uint64_t GetThreadHandleImpl() return lwp_gettid(); #elif defined __OpenBSD__ return getthrid(); +#elif defined __EMSCRIPTEN__ + // Not supported, but let it compile. + return 0; #else - static_assert( sizeof( decltype( pthread_self() ) ) <= sizeof( uint64_t ), "Thread handle too big to fit in protocol" ); - return uint64_t( pthread_self() ); + // To add support for a platform, retrieve and return the kernel thread identifier here. + // + // Note that pthread_t (as for example returned by pthread_self()) is *not* a kernel + // thread identifier. It is a pointer to a library-allocated data structure instead. + // Such pointers will be reused heavily, making the pthread_t non-unique. Additionally + // a 64-bit pointer cannot be reliably truncated to 32 bits. + #error "Unsupported platform!" 
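+ // Sketch only, not part of this patch: a port would obtain the kernel-level thread id through the
+ // platform's native call and narrow it to 32 bits, mirroring the Linux branch above, e.g.
+ //   return (uint32_t)platform_gettid();   // hypothetical platform-specific call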
#endif } @@ -90,18 +98,44 @@ TRACY_API uint64_t GetThreadHandleImpl() #ifdef TRACY_ENABLE struct ThreadNameData { - uint64_t id; + uint32_t id; const char* name; ThreadNameData* next; }; std::atomic& GetThreadNameData(); -TRACY_API void InitRPMallocThread(); +#endif + +#ifdef _MSC_VER +# pragma pack( push, 8 ) +struct THREADNAME_INFO +{ + DWORD dwType; + LPCSTR szName; + DWORD dwThreadID; + DWORD dwFlags; +}; +# pragma pack( pop ) + +void ThreadNameMsvcMagic( const THREADNAME_INFO& info ) +{ + __try + { + RaiseException( 0x406D1388, 0, sizeof(info)/sizeof(ULONG_PTR), (ULONG_PTR*)&info ); + } + __except(EXCEPTION_EXECUTE_HANDLER) + { + } +} #endif TRACY_API void SetThreadName( const char* name ) { -#if defined _WIN32 || defined __CYGWIN__ +#if defined _WIN32 +# ifdef TRACY_UWP + static auto _SetThreadDescription = &::SetThreadDescription; +# else static auto _SetThreadDescription = (t_SetThreadDescription)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "SetThreadDescription" ); +# endif if( _SetThreadDescription ) { wchar_t buf[256]; @@ -111,57 +145,45 @@ TRACY_API void SetThreadName( const char* name ) else { # if defined _MSC_VER - const DWORD MS_VC_EXCEPTION=0x406D1388; -# pragma pack( push, 8 ) - struct THREADNAME_INFO - { - DWORD dwType; - LPCSTR szName; - DWORD dwThreadID; - DWORD dwFlags; - }; -# pragma pack(pop) - - DWORD ThreadId = GetCurrentThreadId(); THREADNAME_INFO info; info.dwType = 0x1000; info.szName = name; - info.dwThreadID = ThreadId; + info.dwThreadID = GetCurrentThreadId(); info.dwFlags = 0; - - __try - { - RaiseException( MS_VC_EXCEPTION, 0, sizeof(info)/sizeof(ULONG_PTR), (ULONG_PTR*)&info ); - } - __except(EXCEPTION_EXECUTE_HANDLER) - { - } + ThreadNameMsvcMagic( info ); # endif } -#elif defined _GNU_SOURCE && !defined __EMSCRIPTEN__ && !defined __CYGWIN__ +#elif defined _GNU_SOURCE && !defined __EMSCRIPTEN__ { const auto sz = strlen( name ); if( sz <= 15 ) { +#if defined __APPLE__ + pthread_setname_np( name ); +#else pthread_setname_np( pthread_self(), name ); +#endif } else { char buf[16]; memcpy( buf, name, 15 ); buf[15] = '\0'; +#if defined __APPLE__ + pthread_setname_np( buf ); +#else pthread_setname_np( pthread_self(), buf ); +#endif } } #endif #ifdef TRACY_ENABLE { - InitRPMallocThread(); const auto sz = strlen( name ); char* buf = (char*)tracy_malloc( sz+1 ); memcpy( buf, name, sz ); buf[sz] = '\0'; - auto data = (ThreadNameData*)tracy_malloc( sizeof( ThreadNameData ) ); + auto data = (ThreadNameData*)tracy_malloc_fast( sizeof( ThreadNameData ) ); data->id = detail::GetThreadHandleImpl(); data->name = buf; data->next = GetThreadNameData().load( std::memory_order_relaxed ); @@ -170,7 +192,7 @@ TRACY_API void SetThreadName( const char* name ) #endif } -TRACY_API const char* GetThreadName( uint64_t id ) +TRACY_API const char* GetThreadName( uint32_t id ) { static char buf[256]; #ifdef TRACY_ENABLE @@ -184,8 +206,12 @@ TRACY_API const char* GetThreadName( uint64_t id ) ptr = ptr->next; } #else -# if defined _WIN32 || defined __CYGWIN__ +# if defined _WIN32 +# ifdef TRACY_UWP + static auto _GetThreadDescription = &::GetThreadDescription; +# else static auto _GetThreadDescription = (t_GetThreadDescription)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "GetThreadDescription" ); +# endif if( _GetThreadDescription ) { auto hnd = OpenThread( THREAD_QUERY_LIMITED_INFORMATION, FALSE, (DWORD)id ); @@ -210,7 +236,7 @@ TRACY_API const char* GetThreadName( uint64_t id ) int tid = (int) syscall( SYS_gettid ); # endif snprintf( path, sizeof( path ), 
"/proc/self/task/%d/comm", tid ); - sprintf( buf, "%" PRIu64, id ); + sprintf( buf, "%" PRIu32, id ); # ifndef __ANDROID__ pthread_setcancelstate( PTHREAD_CANCEL_DISABLE, &cs ); # endif @@ -232,8 +258,40 @@ TRACY_API const char* GetThreadName( uint64_t id ) return buf; # endif #endif - sprintf( buf, "%" PRIu64, id ); + sprintf( buf, "%" PRIu32, id ); return buf; } +TRACY_API const char* GetEnvVar( const char* name ) +{ +#if defined _WIN32 + // unfortunately getenv() on Windows is just fundamentally broken. It caches the entire + // environment block once on startup, then never refreshes it again. If any environment + // strings are added or modified after startup of the CRT, those changes will not be + // seen by getenv(). This removes the possibility of an app using this SDK from + // programmatically setting any of the behaviour controlling envvars here. + // + // To work around this, we'll instead go directly to the Win32 environment strings APIs + // to get the current value. + static char buffer[1024]; + DWORD const kBufferSize = DWORD(sizeof(buffer) / sizeof(buffer[0])); + DWORD count = GetEnvironmentVariableA(name, buffer, kBufferSize); + + if( count == 0 ) + return nullptr; + + if( count >= kBufferSize ) + { + char* buf = reinterpret_cast(_alloca(count + 1)); + count = GetEnvironmentVariableA(name, buf, count + 1); + memcpy(buffer, buf, kBufferSize); + buffer[kBufferSize - 1] = 0; + } + + return buffer; +#else + return getenv(name); +#endif +} + } diff --git a/Source/ThirdParty/tracy/common/TracySystem.hpp b/Source/ThirdParty/tracy/common/TracySystem.hpp index 8a699886b..edcc5cf31 100644 --- a/Source/ThirdParty/tracy/common/TracySystem.hpp +++ b/Source/ThirdParty/tracy/common/TracySystem.hpp @@ -32,7 +32,7 @@ enum class PlotFormatType : uint8_t Percentage }; -typedef void(*ParameterCallback)( uint32_t idx, int32_t val ); +typedef void(*ParameterCallback)( void* data, uint32_t idx, int32_t val ); struct TRACY_API SourceLocationData { @@ -80,20 +80,22 @@ private: namespace detail { -TRACY_API uint64_t GetThreadHandleImpl(); +TRACY_API uint32_t GetThreadHandleImpl(); } #ifdef TRACY_ENABLE -TRACY_API uint64_t GetThreadHandle(); +TRACY_API uint32_t GetThreadHandle(); #else -static inline uint64_t GetThreadHandle() +static inline uint32_t GetThreadHandle() { return detail::GetThreadHandleImpl(); } #endif TRACY_API void SetThreadName( const char* name ); -TRACY_API const char* GetThreadName( uint64_t id ); +TRACY_API const char* GetThreadName( uint32_t id ); + +TRACY_API const char* GetEnvVar(const char* name); } diff --git a/Source/ThirdParty/tracy/common/TracyUwp.hpp b/Source/ThirdParty/tracy/common/TracyUwp.hpp new file mode 100644 index 000000000..7dce96b96 --- /dev/null +++ b/Source/ThirdParty/tracy/common/TracyUwp.hpp @@ -0,0 +1,11 @@ +#ifndef __TRACYUWP_HPP__ +#define __TRACYUWP_HPP__ + +#ifdef _WIN32 +# include +# if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) && !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) +# define TRACY_UWP +# endif +#endif + +#endif diff --git a/Source/ThirdParty/tracy/common/TracyVersion.hpp b/Source/ThirdParty/tracy/common/TracyVersion.hpp new file mode 100644 index 000000000..983d1c51f --- /dev/null +++ b/Source/ThirdParty/tracy/common/TracyVersion.hpp @@ -0,0 +1,14 @@ +#ifndef __TRACYVERSION_HPP__ +#define __TRACYVERSION_HPP__ + +namespace tracy +{ +namespace Version +{ +enum { Major = 0 }; +enum { Minor = 9 }; +enum { Patch = 0 }; +} +} + +#endif diff --git a/Source/ThirdParty/tracy/common/TracyYield.hpp 
b/Source/ThirdParty/tracy/common/TracyYield.hpp new file mode 100644 index 000000000..703970dbb --- /dev/null +++ b/Source/ThirdParty/tracy/common/TracyYield.hpp @@ -0,0 +1,26 @@ +#ifndef __TRACYYIELD_HPP__ +#define __TRACYYIELD_HPP__ + +#if defined __SSE2__ || defined _M_AMD64 || (defined _M_IX86_FP && _M_IX86_FP == 2) +# include <emmintrin.h> +#else +# include <thread> +#endif + +namespace tracy +{ + +static tracy_force_inline void YieldThread() +{ +#if defined __SSE2__ || defined _M_AMD64 || (defined _M_IX86_FP && _M_IX86_FP == 2) + _mm_pause(); +#elif defined __aarch64__ + asm volatile( "isb" : : ); +#else + std::this_thread::yield(); +#endif +} + +} + +#endif diff --git a/Source/ThirdParty/tracy/libbacktrace/backtrace.hpp b/Source/ThirdParty/tracy/libbacktrace/backtrace.hpp index d999803c8..e4be297a9 100644 --- a/Source/ThirdParty/tracy/libbacktrace/backtrace.hpp +++ b/Source/ThirdParty/tracy/libbacktrace/backtrace.hpp @@ -53,13 +53,14 @@ struct backtrace_state; invalid after this function returns. As a special case, the ERRNUM argument will be passed as -1 if no - debug info can be found for the executable, but the function - requires debug info (e.g., backtrace_full, backtrace_pcinfo). The - MSG in this case will be something along the lines of "no debug - info". Similarly, ERRNUM will be passed as -1 if there is no - symbol table, but the function requires a symbol table (e.g., - backtrace_syminfo). This may be used as a signal that some other - approach should be tried. */ + debug info can be found for the executable, or if the debug info + exists but has an unsupported version, but the function requires + debug info (e.g., backtrace_full, backtrace_pcinfo). The MSG in + this case will be something along the lines of "no debug info". + Similarly, ERRNUM will be passed as -1 if there is no symbol table, + but the function requires a symbol table (e.g., backtrace_syminfo). + This may be used as a signal that some other approach should be + tried. */ typedef void (*backtrace_error_callback) (void *data, const char *msg, int errnum); diff --git a/Source/ThirdParty/tracy/libbacktrace/dwarf.cpp b/Source/ThirdParty/tracy/libbacktrace/dwarf.cpp index f76e03cfb..246cb9f36 100644 --- a/Source/ThirdParty/tracy/libbacktrace/dwarf.cpp +++ b/Source/ThirdParty/tracy/libbacktrace/dwarf.cpp @@ -52,6 +52,7 @@ enum dwarf_tag { DW_TAG_compile_unit = 0x11, DW_TAG_inlined_subroutine = 0x1d, DW_TAG_subprogram = 0x2e, + DW_TAG_skeleton_unit = 0x4a, }; enum dwarf_form { @@ -746,13 +747,13 @@ struct dwarf_data /* Report an error for a DWARF buffer. */ static void -dwarf_buf_error (struct dwarf_buf *buf, const char *msg) +dwarf_buf_error (struct dwarf_buf *buf, const char *msg, int errnum) { char b[200]; snprintf (b, sizeof b, "%s in %s at %d", msg, buf->name, (int) (buf->buf - buf->start)); - buf->error_callback (buf->data, b, 0); + buf->error_callback (buf->data, b, errnum); } /* Require at least COUNT bytes in BUF.
Return 1 if all is well, 0 on @@ -766,7 +767,7 @@ require (struct dwarf_buf *buf, size_t count) if (!buf->reported_underflow) { - dwarf_buf_error (buf, "DWARF underflow"); + dwarf_buf_error (buf, "DWARF underflow", 0); buf->reported_underflow = 1; } @@ -928,7 +929,7 @@ read_address (struct dwarf_buf *buf, int addrsize) case 8: return read_uint64 (buf); default: - dwarf_buf_error (buf, "unrecognized address size"); + dwarf_buf_error (buf, "unrecognized address size", 0); return 0; } } @@ -979,7 +980,7 @@ read_uleb128 (struct dwarf_buf *buf) ret |= ((uint64_t) (b & 0x7f)) << shift; else if (!overflow) { - dwarf_buf_error (buf, "LEB128 overflows uint64_t"); + dwarf_buf_error (buf, "LEB128 overflows uint64_t", 0); overflow = 1; } shift += 7; @@ -1014,7 +1015,7 @@ read_sleb128 (struct dwarf_buf *buf) val |= ((uint64_t) (b & 0x7f)) << shift; else if (!overflow) { - dwarf_buf_error (buf, "signed LEB128 overflows uint64_t"); + dwarf_buf_error (buf, "signed LEB128 overflows uint64_t", 0); overflow = 1; } shift += 7; @@ -1154,7 +1155,7 @@ read_attribute (enum dwarf_form form, uint64_t implicit_val, offset = read_offset (buf, is_dwarf64); if (offset >= dwarf_sections->size[DEBUG_STR]) { - dwarf_buf_error (buf, "DW_FORM_strp out of range"); + dwarf_buf_error (buf, "DW_FORM_strp out of range", 0); return 0; } val->encoding = ATTR_VAL_STRING; @@ -1169,7 +1170,7 @@ read_attribute (enum dwarf_form form, uint64_t implicit_val, offset = read_offset (buf, is_dwarf64); if (offset >= dwarf_sections->size[DEBUG_LINE_STR]) { - dwarf_buf_error (buf, "DW_FORM_line_strp out of range"); + dwarf_buf_error (buf, "DW_FORM_line_strp out of range", 0); return 0; } val->encoding = ATTR_VAL_STRING; @@ -1216,7 +1217,8 @@ read_attribute (enum dwarf_form form, uint64_t implicit_val, if (form == DW_FORM_implicit_const) { dwarf_buf_error (buf, - "DW_FORM_indirect to DW_FORM_implicit_const"); + "DW_FORM_indirect to DW_FORM_implicit_const", + 0); return 0; } return read_attribute ((enum dwarf_form) form, 0, buf, is_dwarf64, @@ -1349,7 +1351,7 @@ read_attribute (enum dwarf_form form, uint64_t implicit_val, } if (offset >= altlink->dwarf_sections.size[DEBUG_STR]) { - dwarf_buf_error (buf, "DW_FORM_strp_sup out of range"); + dwarf_buf_error (buf, "DW_FORM_strp_sup out of range", 0); return 0; } val->encoding = ATTR_VAL_STRING; @@ -1358,7 +1360,7 @@ read_attribute (enum dwarf_form form, uint64_t implicit_val, return 1; } default: - dwarf_buf_error (buf, "unrecognized DWARF form"); + dwarf_buf_error (buf, "unrecognized DWARF form", -1); return 0; } } @@ -1407,7 +1409,9 @@ resolve_string (const struct dwarf_sections *dwarf_sections, int is_dwarf64, offset = read_offset (&offset_buf, is_dwarf64); if (offset >= dwarf_sections->size[DEBUG_STR]) { - dwarf_buf_error (&offset_buf, "DW_FORM_strx offset out of range"); + dwarf_buf_error (&offset_buf, + "DW_FORM_strx offset out of range", + 0); return 0; } *string = (const char *) dwarf_sections->data[DEBUG_STR] + offset; @@ -2215,7 +2219,7 @@ add_ranges_from_rnglists ( break; default: - dwarf_buf_error (&rnglists_buf, "unrecognized DW_RLE value"); + dwarf_buf_error (&rnglists_buf, "unrecognized DW_RLE value", -1); return 0; } } @@ -2322,14 +2326,16 @@ find_address_ranges (struct backtrace_state *state, uintptr_t base_address, break; case DW_AT_stmt_list: - if (abbrev->tag == DW_TAG_compile_unit + if ((abbrev->tag == DW_TAG_compile_unit + || abbrev->tag == DW_TAG_skeleton_unit) && (val.encoding == ATTR_VAL_UINT || val.encoding == ATTR_VAL_REF_SECTION)) u->lineoff = val.u.uint; break; case 
DW_AT_name: - if (abbrev->tag == DW_TAG_compile_unit) + if (abbrev->tag == DW_TAG_compile_unit + || abbrev->tag == DW_TAG_skeleton_unit) { name_val = val; have_name_val = 1; @@ -2337,7 +2343,8 @@ find_address_ranges (struct backtrace_state *state, uintptr_t base_address, break; case DW_AT_comp_dir: - if (abbrev->tag == DW_TAG_compile_unit) + if (abbrev->tag == DW_TAG_compile_unit + || abbrev->tag == DW_TAG_skeleton_unit) { comp_dir_val = val; have_comp_dir_val = 1; @@ -2345,19 +2352,22 @@ find_address_ranges (struct backtrace_state *state, uintptr_t base_address, break; case DW_AT_str_offsets_base: - if (abbrev->tag == DW_TAG_compile_unit + if ((abbrev->tag == DW_TAG_compile_unit + || abbrev->tag == DW_TAG_skeleton_unit) && val.encoding == ATTR_VAL_REF_SECTION) u->str_offsets_base = val.u.uint; break; case DW_AT_addr_base: - if (abbrev->tag == DW_TAG_compile_unit + if ((abbrev->tag == DW_TAG_compile_unit + || abbrev->tag == DW_TAG_skeleton_unit) && val.encoding == ATTR_VAL_REF_SECTION) u->addr_base = val.u.uint; break; case DW_AT_rnglists_base: - if (abbrev->tag == DW_TAG_compile_unit + if ((abbrev->tag == DW_TAG_compile_unit + || abbrev->tag == DW_TAG_skeleton_unit) && val.encoding == ATTR_VAL_REF_SECTION) u->rnglists_base = val.u.uint; break; @@ -2385,7 +2395,8 @@ find_address_ranges (struct backtrace_state *state, uintptr_t base_address, } if (abbrev->tag == DW_TAG_compile_unit - || abbrev->tag == DW_TAG_subprogram) + || abbrev->tag == DW_TAG_subprogram + || abbrev->tag == DW_TAG_skeleton_unit) { if (!add_ranges (state, dwarf_sections, base_address, is_bigendian, u, pcrange.lowpc, &pcrange, @@ -2393,9 +2404,10 @@ find_address_ranges (struct backtrace_state *state, uintptr_t base_address, (void *) addrs)) return 0; - /* If we found the PC range in the DW_TAG_compile_unit, we - can stop now. */ - if (abbrev->tag == DW_TAG_compile_unit + /* If we found the PC range in the DW_TAG_compile_unit or + DW_TAG_skeleton_unit, we can stop now. */ + if ((abbrev->tag == DW_TAG_compile_unit + || abbrev->tag == DW_TAG_skeleton_unit) && (pcrange.have_ranges || (pcrange.have_lowpc && pcrange.have_highpc))) return 1; @@ -2482,7 +2494,7 @@ build_address_map (struct backtrace_state *state, uintptr_t base_address, version = read_uint16 (&unit_buf); if (version < 2 || version > 5) { - dwarf_buf_error (&unit_buf, "unrecognized DWARF version"); + dwarf_buf_error (&unit_buf, "unrecognized DWARF version", -1); goto fail; } @@ -2554,6 +2566,9 @@ build_address_map (struct backtrace_state *state, uintptr_t base_address, u->comp_dir = NULL; u->abs_filename = NULL; u->lineoff = 0; + u->str_offsets_base = 0; + u->addr_base = 0; + u->rnglists_base = 0; /* The actual line number mappings will be read as needed. 
*/ u->lines = NULL; @@ -2761,7 +2776,8 @@ read_v2_paths (struct backtrace_state *state, struct unit *u, { dwarf_buf_error (hdr_buf, ("invalid directory index in " - "line number program header")); + "line number program header"), + 0); return 0; } dir_len = strlen (dir); @@ -2830,7 +2846,8 @@ read_lnct (struct backtrace_state *state, struct dwarf_data *ddata, { dwarf_buf_error (hdr_buf, ("invalid directory index in " - "line number program header")); + "line number program header"), + 0); return 0; } dir = hdr->dirs[val.u.uint]; @@ -2845,7 +2862,8 @@ read_lnct (struct backtrace_state *state, struct dwarf_data *ddata, if (path == NULL) { dwarf_buf_error (hdr_buf, - "missing file name in line number program header"); + "missing file name in line number program header", + 0); return 0; } @@ -2972,7 +2990,7 @@ read_line_header (struct backtrace_state *state, struct dwarf_data *ddata, hdr->version = read_uint16 (line_buf); if (hdr->version < 2 || hdr->version > 5) { - dwarf_buf_error (line_buf, "unsupported line number version"); + dwarf_buf_error (line_buf, "unsupported line number version", -1); return 0; } @@ -2986,7 +3004,8 @@ read_line_header (struct backtrace_state *state, struct dwarf_data *ddata, if (read_byte (line_buf) != 0) { dwarf_buf_error (line_buf, - "non-zero segment_selector_size not supported"); + "non-zero segment_selector_size not supported", + -1); return 0; } } @@ -3127,7 +3146,8 @@ read_line_program (struct backtrace_state *state, struct dwarf_data *ddata, { dwarf_buf_error (line_buf, ("invalid directory index " - "in line number program")); + "in line number program"), + 0); return 0; } dir_len = strlen (dir); @@ -3185,19 +3205,15 @@ read_line_program (struct backtrace_state *state, struct dwarf_data *ddata, uint64_t fileno; fileno = read_uleb128 (line_buf); - if (fileno == 0) - filename = ""; - else + if (fileno >= hdr->filenames_count) { - if (fileno >= hdr->filenames_count) - { - dwarf_buf_error (line_buf, - ("invalid file number in " - "line number program")); - return 0; - } - filename = hdr->filenames[fileno]; + dwarf_buf_error (line_buf, + ("invalid file number in " + "line number program"), + 0); + return 0; } + filename = hdr->filenames[fileno]; } break; case DW_LNS_set_column: @@ -3428,7 +3444,9 @@ read_referenced_name (struct dwarf_data *ddata, struct unit *u, code = read_uleb128 (&unit_buf); if (code == 0) { - dwarf_buf_error (&unit_buf, "invalid abstract origin or specification"); + dwarf_buf_error (&unit_buf, + "invalid abstract origin or specification", + 0); return NULL; } @@ -3601,7 +3619,8 @@ read_function_entry (struct backtrace_state *state, struct dwarf_data *ddata, /* The compile unit sets the base address for any address ranges in the function entries. 
*/ - if (abbrev->tag == DW_TAG_compile_unit + if ((abbrev->tag == DW_TAG_compile_unit + || abbrev->tag == DW_TAG_skeleton_unit) && abbrev->attrs[i].name == DW_AT_low_pc) { if (val.encoding == ATTR_VAL_ADDRESS) @@ -3623,20 +3642,15 @@ read_function_entry (struct backtrace_state *state, struct dwarf_data *ddata, case DW_AT_call_file: if (val.encoding == ATTR_VAL_UINT) { - if (val.u.uint == 0) - function->caller_filename = ""; - else + if (val.u.uint >= lhdr->filenames_count) { - if (val.u.uint >= lhdr->filenames_count) - { - dwarf_buf_error (unit_buf, - ("invalid file number in " - "DW_AT_call_file attribute")); - return 0; - } - function->caller_filename = - lhdr->filenames[val.u.uint]; + dwarf_buf_error (unit_buf, - ("invalid file number in " - "DW_AT_call_file attribute"), - 0); + return 0; } + function->caller_filename = lhdr->filenames[val.u.uint]; } break; @@ -3884,7 +3898,7 @@ read_function_info (struct backtrace_state *state, struct dwarf_data *ddata, Returns whatever CALLBACK returns, or 0 to keep going. */ static int -report_inlined_functions (uintptr_t pc, struct function *function, +report_inlined_functions (uintptr_t pc, struct function *function, const char* comp_dir, backtrace_full_callback callback, void *data, const char **filename, int *lineno) { @@ -3938,13 +3952,22 @@ report_inlined_functions (uintptr_t pc, struct function *function, inlined = match->function; /* Report any calls inlined into this one. */ - ret = report_inlined_functions (pc, inlined, callback, data, + ret = report_inlined_functions (pc, inlined, comp_dir, callback, data, filename, lineno); if (ret != 0) return ret; /* Report this inlined call. */ - ret = callback (data, pc, match->low, *filename, *lineno, inlined->name); + if (*filename[0] != '/' && comp_dir) + { + char buf[1024]; + snprintf (buf, 1024, "%s/%s", comp_dir, *filename); + ret = callback (data, pc, match->low, buf, *lineno, inlined->name); + } + else + { + ret = callback (data, pc, match->low, *filename, *lineno, inlined->name); + } if (ret != 0) return ret; @@ -4211,12 +4234,21 @@ dwarf_lookup_pc (struct backtrace_state *state, struct dwarf_data *ddata, filename = ln->filename; lineno = ln->lineno; - ret = report_inlined_functions (pc, function, callback, data, + ret = report_inlined_functions (pc, function, entry->u->comp_dir, callback, data, &filename, &lineno); if (ret != 0) return ret; - return callback (data, pc, fmatch->low, filename, lineno, function->name); + if (filename[0] != '/' && entry->u->comp_dir) + { + char buf[1024]; + snprintf (buf, 1024, "%s/%s", entry->u->comp_dir, filename); + return callback (data, pc, fmatch->low, buf, lineno, function->name); + } + else + { + return callback (data, pc, fmatch->low, filename, lineno, function->name); + } } diff --git a/Source/ThirdParty/tracy/libbacktrace/elf.cpp b/Source/ThirdParty/tracy/libbacktrace/elf.cpp index 50715bf95..9e62f090d 100644 --- a/Source/ThirdParty/tracy/libbacktrace/elf.cpp +++ b/Source/ThirdParty/tracy/libbacktrace/elf.cpp @@ -46,6 +46,9 @@ POSSIBILITY OF SUCH DAMAGE. */ #include "backtrace.hpp" #include "internal.hpp" +#include "../client/TracyFastVector.hpp" +#include "../common/TracyAlloc.hpp" + #ifndef S_ISLNK #ifndef S_IFLNK #define S_IFLNK 0120000 @@ -70,6 +73,10 @@ POSSIBILITY OF SUCH DAMAGE. */ namespace tracy { +#ifdef TRACY_DEBUGINFOD +int GetDebugInfoDescriptor( const char* buildid_data, size_t buildid_size, const char* filename ); +#endif + #if !defined(HAVE_DECL_STRNLEN) || !HAVE_DECL_STRNLEN /* If strnlen is not declared, provide our own version.
*/ @@ -867,6 +874,7 @@ elf_readlink (struct backtrace_state *state, const char *filename, static int elf_open_debugfile_by_buildid (struct backtrace_state *state, const char *buildid_data, size_t buildid_size, + const char *filename, backtrace_error_callback error_callback, void *data) { @@ -913,7 +921,14 @@ elf_open_debugfile_by_buildid (struct backtrace_state *state, That seems kind of pointless to me--why would it have the right name but not the right build ID?--so skipping the check. */ +#ifdef TRACY_DEBUGINFOD + if (ret == -1) + return GetDebugInfoDescriptor( buildid_data, buildid_size, filename ); + else + return ret; +#else return ret; +#endif } /* Try to open a file whose name is PREFIX (length PREFIX_LEN) @@ -1803,7 +1818,7 @@ elf_zlib_inflate (const unsigned char *pin, size_t sin, uint16_t *zdebug_table, /* An uncompressed block. */ /* If we've read ahead more than a byte, back up. */ - while (bits > 8) + while (bits >= 8) { --pin; bits -= 8; @@ -4429,7 +4444,7 @@ elf_add (struct backtrace_state *state, const char *filename, int descriptor, int d; d = elf_open_debugfile_by_buildid (state, buildid_data, buildid_size, - error_callback, data); + filename, error_callback, data); if (d >= 0) { int ret; @@ -4812,12 +4827,34 @@ struct phdr_data /* Callback passed to dl_iterate_phdr. Load debug info from shared libraries. */ +struct PhdrIterate +{ + char* dlpi_name; + ElfW(Addr) dlpi_addr; +}; +FastVector<PhdrIterate> s_phdrData(16); + +static int +phdr_callback_mock (struct dl_phdr_info *info, size_t size ATTRIBUTE_UNUSED, + void *pdata) +{ + auto ptr = s_phdrData.push_next(); + if (info->dlpi_name) + { + size_t sz = strlen (info->dlpi_name) + 1; + ptr->dlpi_name = (char*)tracy_malloc (sz); + memcpy (ptr->dlpi_name, info->dlpi_name, sz); + } + else ptr->dlpi_name = nullptr; + ptr->dlpi_addr = info->dlpi_addr; + return 0; +} + static int #ifdef __i386__ __attribute__ ((__force_align_arg_pointer__)) #endif -phdr_callback (struct dl_phdr_info *info, size_t size ATTRIBUTE_UNUSED, - void *pdata) +phdr_callback (struct PhdrIterate *info, void *pdata) { struct phdr_data *pd = (struct phdr_data *) pdata; const char *filename; @@ -4896,7 +4933,14 @@ backtrace_initialize (struct backtrace_state *state, const char *filename, pd.exe_filename = filename; pd.exe_descriptor = ret < 0 ?
descriptor : -1; - dl_iterate_phdr (phdr_callback, (void *) &pd); + assert (s_phdrData.empty()); + dl_iterate_phdr (phdr_callback_mock, nullptr); + for (auto& v : s_phdrData) + { + phdr_callback (&v, (void *) &pd); + tracy_free (v.dlpi_name); + } + s_phdrData.clear(); if (!state->threaded) { diff --git a/Source/ThirdParty/tracy/libbacktrace/macho.cpp b/Source/ThirdParty/tracy/libbacktrace/macho.cpp index cb50dc5f7..6cccdabaa 100644 --- a/Source/ThirdParty/tracy/libbacktrace/macho.cpp +++ b/Source/ThirdParty/tracy/libbacktrace/macho.cpp @@ -1271,7 +1271,7 @@ backtrace_initialize (struct backtrace_state *state, const char *filename, mff = macho_nodebug; if (!macho_add (state, name, d, 0, NULL, base_address, 0, error_callback, data, &mff, &mfs)) - return 0; + continue; if (mff != macho_nodebug) macho_fileline_fn = mff; @@ -1292,7 +1292,7 @@ backtrace_initialize (struct backtrace_state *state, const char *filename, else { if (found_sym) - backtrace_atomic_store_pointer (&state->syminfo_fn, macho_syminfo); + backtrace_atomic_store_pointer (&state->syminfo_fn, &macho_syminfo); else (void) __sync_bool_compare_and_swap (&state->syminfo_fn, NULL, macho_nosyms); @@ -1338,7 +1338,7 @@ backtrace_initialize (struct backtrace_state *state, const char *filename, else { if (found_sym) - backtrace_atomic_store_pointer (&state->syminfo_fn, macho_syminfo); + backtrace_atomic_store_pointer (&state->syminfo_fn, &macho_syminfo); else (void) __sync_bool_compare_and_swap (&state->syminfo_fn, NULL, macho_nosyms); diff --git a/Source/ThirdParty/tracy/tracy.Build.cs b/Source/ThirdParty/tracy/tracy.Build.cs index 76d8854f6..d6119a5ca 100644 --- a/Source/ThirdParty/tracy/tracy.Build.cs +++ b/Source/ThirdParty/tracy/tracy.Build.cs @@ -34,11 +34,12 @@ public class tracy : ThirdPartyModule options.SourcePaths.Clear(); options.SourceFiles.Clear(); - options.SourceFiles.Add(Path.Combine(FolderPath, "Tracy.h")); + options.SourceFiles.Add(Path.Combine(FolderPath, "tracy", "Tracy.hpp")); options.SourceFiles.Add(Path.Combine(FolderPath, "TracyClient.cpp")); options.PublicDefinitions.Add("TRACY_ENABLE"); options.PrivateDefinitions.Add("TRACY_NO_INVARIANT_CHECK"); + options.PrivateDefinitions.Add("TRACY_NO_FRAME_IMAGE"); if (options.Platform.Target == TargetPlatform.Windows) { options.PrivateDefinitions.Add("TRACY_DBGHELP_LOCK=DbgHelp"); @@ -54,7 +55,7 @@ public class tracy : ThirdPartyModule { base.GetFilesToDeploy(files); - files.Add(Path.Combine(FolderPath, "Tracy.h")); + files.Add(Path.Combine(FolderPath, "tracy", "Tracy.hpp")); files.Add(Path.Combine(FolderPath, "common", "TracySystem.hpp")); files.Add(Path.Combine(FolderPath, "client", "TracyCallstack.h")); } diff --git a/Source/ThirdParty/tracy/Tracy.h b/Source/ThirdParty/tracy/tracy/Tracy.hpp similarity index 69% rename from Source/ThirdParty/tracy/Tracy.h rename to Source/ThirdParty/tracy/tracy/Tracy.hpp index 0b3469cd5..8ef26d59e 100644 --- a/Source/ThirdParty/tracy/Tracy.h +++ b/Source/ThirdParty/tracy/tracy/Tracy.hpp @@ -24,12 +24,14 @@ #define ZoneColorV(x,y) #define ZoneValue(x) #define ZoneValueV(x,y) +#define ZoneIsActive false +#define ZoneIsActiveV(x) false #define FrameMark #define FrameMarkNamed(x) #define TracyPlot(x,y) -#define TracyPlotConfig(x,y) +#define TracyPlotConfig(x,y,z,w,a) #define TracyMessage(x,y) #define TracyMessageL(x) @@ -75,15 +77,19 @@ #define TracyMessageCS(x,y,z,w) #define TracyMessageLCS(x,y,z) -#define TracyParameterRegister(x) +#define TracySourceCallbackRegister(x,y) +#define TracyParameterRegister(x,y) #define 
TracyParameterSetup(x,y,z,w) +#define TracyFiberEnter(x) +#define TracyFiberLeave + #else #include -#include "common/TracySystem.hpp" -#include "client/TracyCallstack.h" +#include "../common/TracyQueue.hpp" +#include "../common/TracySystem.hpp" namespace tracy { @@ -91,10 +97,12 @@ class TRACY_API Profiler { public: static void SendFrameMark( const char* name ); + static void SendFrameMark( const char* name, QueueType type ); + static void SendFrameImage( const void* image, uint16_t w, uint16_t h, uint8_t offset, bool flip ); static void PlotData( const char* name, int64_t val ); static void PlotData( const char* name, float val ); static void PlotData( const char* name, double val ); - static void ConfigurePlot( const char* name, PlotFormatType type ); + static void ConfigurePlot( const char* name, PlotFormatType type, bool step, bool fill, uint32_t color ); static void Message( const char* txt, size_t size, int callstack ); static void Message( const char* txt, int callstack ); static void MessageColor( const char* txt, size_t size, uint32_t color, int callstack ); @@ -108,27 +116,29 @@ public: static void MemFreeNamed( const void* ptr, bool secure, const char* name ); static void MemAllocCallstackNamed( const void* ptr, size_t size, int depth, bool secure, const char* name ); static void MemFreeCallstackNamed( const void* ptr, int depth, bool secure, const char* name ); + static void SendCallstack( int depth ); static void ParameterRegister( ParameterCallback cb ); + static void ParameterRegister( ParameterCallback cb, void* data ); static void ParameterSetup( uint32_t idx, const char* name, bool isBool, int32_t val ); }; } #if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK -# define ZoneNamed( varname, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active ); -# define ZoneNamedN( varname, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active ); -# define ZoneNamedC( varname, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active ); -# define ZoneNamedNC( varname, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active ); +# define ZoneNamed( varname, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active ) +# define ZoneNamedN( varname, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active ) +# define ZoneNamedC( varname, color, active ) static constexpr 
tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active ) +# define ZoneNamedNC( varname, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active ) -# define ZoneTransient( varname, active ) tracy::ScopedZone varname( __LINE__, __FILE__, strlen( __FILE__ ), __FUNCTION__, strlen( __FUNCTION__ ), nullptr, 0, TRACY_CALLSTACK, active ); -# define ZoneTransientN( varname, name, active ) tracy::ScopedZone varname( __LINE__, __FILE__, strlen( __FILE__ ), __FUNCTION__, strlen( __FUNCTION__ ), name, strlen( name ), TRACY_CALLSTACK, active ); +# define ZoneTransient( varname, active ) tracy::ScopedZone varname( __LINE__, __FILE__, strlen( __FILE__ ), __FUNCTION__, strlen( __FUNCTION__ ), nullptr, 0, TRACY_CALLSTACK, active ) +# define ZoneTransientN( varname, name, active ) tracy::ScopedZone varname( __LINE__, __FILE__, strlen( __FILE__ ), __FUNCTION__, strlen( __FUNCTION__ ), name, strlen( name ), TRACY_CALLSTACK, active ) #else -# define ZoneNamed( varname, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), active ); -# define ZoneNamedN( varname, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), active ); -# define ZoneNamedC( varname, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), active ); -# define ZoneNamedNC( varname, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), active ); +# define ZoneNamed( varname, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), active ) +# define ZoneNamedN( varname, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), active ) +# define ZoneNamedC( varname, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), active ) +# define ZoneNamedNC( varname, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( 
&TracyConcat(__tracy_source_location,__LINE__), active ) -# define ZoneTransient( varname, active ) tracy::ScopedZone varname( __LINE__, __FILE__, strlen( __FILE__ ), __FUNCTION__, strlen( __FUNCTION__ ), nullptr, 0, active ); -# define ZoneTransientN( varname, name, active ) tracy::ScopedZone varname( __LINE__, __FILE__, strlen( __FILE__ ), __FUNCTION__, strlen( __FUNCTION__ ), name, strlen( name ), active ); +# define ZoneTransient( varname, active ) tracy::ScopedZone varname( __LINE__, __FILE__, strlen( __FILE__ ), __FUNCTION__, strlen( __FUNCTION__ ), nullptr, 0, active ) +# define ZoneTransientN( varname, name, active ) tracy::ScopedZone varname( __LINE__, __FILE__, strlen( __FILE__ ), __FUNCTION__, strlen( __FUNCTION__ ), name, strlen( name ), active ) #endif #define ZoneScoped ZoneNamed( ___tracy_scoped_zone, true ) @@ -136,83 +146,98 @@ public: #define ZoneScopedC( color ) ZoneNamedC( ___tracy_scoped_zone, color, true ) #define ZoneScopedNC( name, color ) ZoneNamedNC( ___tracy_scoped_zone, name, color, true ) -#define ZoneText( txt, size ) ___tracy_scoped_zone.Text( txt, size ); -#define ZoneTextV( varname, txt, size ) varname.Text( txt, size ); -#define ZoneName( txt, size ) ___tracy_scoped_zone.Name( txt, size ); -#define ZoneNameV( varname, txt, size ) varname.Name( txt, size ); -#define ZoneColor( color ) ___tracy_scoped_zone.Color( color ); -#define ZoneColorV( varname, color ) varname.Color( color ); -#define ZoneValue( value ) ___tracy_scoped_zone.Value( value ); -#define ZoneValueV( varname, value ) varname.Value( value ); +#define ZoneText( txt, size ) ___tracy_scoped_zone.Text( txt, size ) +#define ZoneTextV( varname, txt, size ) varname.Text( txt, size ) +#define ZoneName( txt, size ) ___tracy_scoped_zone.Name( txt, size ) +#define ZoneNameV( varname, txt, size ) varname.Name( txt, size ) +#define ZoneColor( color ) ___tracy_scoped_zone.Color( color ) +#define ZoneColorV( varname, color ) varname.Color( color ) +#define ZoneValue( value ) ___tracy_scoped_zone.Value( value ) +#define ZoneValueV( varname, value ) varname.Value( value ) +#define ZoneIsActive ___tracy_scoped_zone.IsActive() +#define ZoneIsActiveV( varname ) varname.IsActive() -#define FrameMark tracy::Profiler::SendFrameMark( nullptr ); -#define FrameMarkNamed( name ) tracy::Profiler::SendFrameMark( name ); +#define FrameMark tracy::Profiler::SendFrameMark( nullptr ) +#define FrameMarkNamed( name ) tracy::Profiler::SendFrameMark( name ) +#define FrameMarkStart( name ) tracy::Profiler::SendFrameMark( name, tracy::QueueType::FrameMarkMsgStart ) +#define FrameMarkEnd( name ) tracy::Profiler::SendFrameMark( name, tracy::QueueType::FrameMarkMsgEnd ) -#define TracyPlot( name, val ) tracy::Profiler::PlotData( name, val ); -#define TracyPlotConfig( name, type ) tracy::Profiler::ConfigurePlot( name, type ); +#define FrameImage( image, width, height, offset, flip ) tracy::Profiler::SendFrameImage( image, width, height, offset, flip ) -#define TracyAppInfo( txt, size ) tracy::Profiler::MessageAppInfo( txt, size ); +#define TracyLockable( type, varname ) tracy::Lockable<type> varname { [] () -> const tracy::SourceLocationData* { static constexpr tracy::SourceLocationData srcloc { nullptr, #type " " #varname, __FILE__, __LINE__, 0 }; return &srcloc; }() } +#define TracyLockableN( type, varname, desc ) tracy::Lockable<type> varname { [] () -> const tracy::SourceLocationData* { static constexpr tracy::SourceLocationData srcloc { nullptr, desc, __FILE__, __LINE__, 0 }; return &srcloc; }() } +#define TracySharedLockable( type, varname )
tracy::SharedLockable<type> varname { [] () -> const tracy::SourceLocationData* { static constexpr tracy::SourceLocationData srcloc { nullptr, #type " " #varname, __FILE__, __LINE__, 0 }; return &srcloc; }() } +#define TracySharedLockableN( type, varname, desc ) tracy::SharedLockable<type> varname { [] () -> const tracy::SourceLocationData* { static constexpr tracy::SourceLocationData srcloc { nullptr, desc, __FILE__, __LINE__, 0 }; return &srcloc; }() } +#define LockableBase( type ) tracy::Lockable<type> +#define SharedLockableBase( type ) tracy::SharedLockable<type> +#define LockMark( varname ) static constexpr tracy::SourceLocationData __tracy_lock_location_##varname { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; varname.Mark( &__tracy_lock_location_##varname ) +#define LockableName( varname, txt, size ) varname.CustomName( txt, size ) + +#define TracyPlot( name, val ) tracy::Profiler::PlotData( name, val ) +#define TracyPlotConfig( name, type, step, fill, color ) tracy::Profiler::ConfigurePlot( name, type, step, fill, color ) + +#define TracyAppInfo( txt, size ) tracy::Profiler::MessageAppInfo( txt, size ) #if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK -# define TracyMessage( txt, size ) tracy::Profiler::Message( txt, size, TRACY_CALLSTACK ); -# define TracyMessageL( txt ) tracy::Profiler::Message( txt, TRACY_CALLSTACK ); -# define TracyMessageC( txt, size, color ) tracy::Profiler::MessageColor( txt, size, color, TRACY_CALLSTACK ); -# define TracyMessageLC( txt, color ) tracy::Profiler::MessageColor( txt, color, TRACY_CALLSTACK ); +# define TracyMessage( txt, size ) tracy::Profiler::Message( txt, size, TRACY_CALLSTACK ) +# define TracyMessageL( txt ) tracy::Profiler::Message( txt, TRACY_CALLSTACK ) +# define TracyMessageC( txt, size, color ) tracy::Profiler::MessageColor( txt, size, color, TRACY_CALLSTACK ) +# define TracyMessageLC( txt, color ) tracy::Profiler::MessageColor( txt, color, TRACY_CALLSTACK ) -# define TracyAlloc( ptr, size ) tracy::Profiler::MemAllocCallstack( ptr, size, TRACY_CALLSTACK, false ); -# define TracyFree( ptr ) tracy::Profiler::MemFreeCallstack( ptr, TRACY_CALLSTACK, false ); -# define TracySecureAlloc( ptr, size ) tracy::Profiler::MemAllocCallstack( ptr, size, TRACY_CALLSTACK, true ); -# define TracySecureFree( ptr ) tracy::Profiler::MemFreeCallstack( ptr, TRACY_CALLSTACK, true ); +# define TracyAlloc( ptr, size ) tracy::Profiler::MemAllocCallstack( ptr, size, TRACY_CALLSTACK, false ) +# define TracyFree( ptr ) tracy::Profiler::MemFreeCallstack( ptr, TRACY_CALLSTACK, false ) +# define TracySecureAlloc( ptr, size ) tracy::Profiler::MemAllocCallstack( ptr, size, TRACY_CALLSTACK, true ) +# define TracySecureFree( ptr ) tracy::Profiler::MemFreeCallstack( ptr, TRACY_CALLSTACK, true ) -# define TracyAllocN( ptr, size, name ) tracy::Profiler::MemAllocCallstackNamed( ptr, size, TRACY_CALLSTACK, false, name ); -# define TracyFreeN( ptr, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, TRACY_CALLSTACK, false, name ); -# define TracySecureAllocN( ptr, size, name ) tracy::Profiler::MemAllocCallstackNamed( ptr, size, TRACY_CALLSTACK, true, name ); -# define TracySecureFreeN( ptr, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, TRACY_CALLSTACK, true, name ); +# define TracyAllocN( ptr, size, name ) tracy::Profiler::MemAllocCallstackNamed( ptr, size, TRACY_CALLSTACK, false, name ) +# define TracyFreeN( ptr, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, TRACY_CALLSTACK, false, name ) +# define TracySecureAllocN( ptr, size, name )
tracy::Profiler::MemAllocCallstackNamed( ptr, size, TRACY_CALLSTACK, true, name ) +# define TracySecureFreeN( ptr, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, TRACY_CALLSTACK, true, name ) #else -# define TracyMessage( txt, size ) tracy::Profiler::Message( txt, size, 0 ); -# define TracyMessageL( txt ) tracy::Profiler::Message( txt, 0 ); -# define TracyMessageC( txt, size, color ) tracy::Profiler::MessageColor( txt, size, color, 0 ); -# define TracyMessageLC( txt, color ) tracy::Profiler::MessageColor( txt, color, 0 ); +# define TracyMessage( txt, size ) tracy::Profiler::Message( txt, size, 0 ) +# define TracyMessageL( txt ) tracy::Profiler::Message( txt, 0 ) +# define TracyMessageC( txt, size, color ) tracy::Profiler::MessageColor( txt, size, color, 0 ) +# define TracyMessageLC( txt, color ) tracy::Profiler::MessageColor( txt, color, 0 ) -# define TracyAlloc( ptr, size ) tracy::Profiler::MemAlloc( ptr, size, false ); -# define TracyFree( ptr ) tracy::Profiler::MemFree( ptr, false ); -# define TracySecureAlloc( ptr, size ) tracy::Profiler::MemAlloc( ptr, size, true ); -# define TracySecureFree( ptr ) tracy::Profiler::MemFree( ptr, true ); +# define TracyAlloc( ptr, size ) tracy::Profiler::MemAlloc( ptr, size, false ) +# define TracyFree( ptr ) tracy::Profiler::MemFree( ptr, false ) +# define TracySecureAlloc( ptr, size ) tracy::Profiler::MemAlloc( ptr, size, true ) +# define TracySecureFree( ptr ) tracy::Profiler::MemFree( ptr, true ) -# define TracyAllocN( ptr, size, name ) tracy::Profiler::MemAllocNamed( ptr, size, false, name ); -# define TracyFreeN( ptr, name ) tracy::Profiler::MemFreeNamed( ptr, false, name ); -# define TracySecureAllocN( ptr, size, name ) tracy::Profiler::MemAllocNamed( ptr, size, true, name ); -# define TracySecureFreeN( ptr, name ) tracy::Profiler::MemFreeNamed( ptr, true, name ); +# define TracyAllocN( ptr, size, name ) tracy::Profiler::MemAllocNamed( ptr, size, false, name ) +# define TracyFreeN( ptr, name ) tracy::Profiler::MemFreeNamed( ptr, false, name ) +# define TracySecureAllocN( ptr, size, name ) tracy::Profiler::MemAllocNamed( ptr, size, true, name ) +# define TracySecureFreeN( ptr, name ) tracy::Profiler::MemFreeNamed( ptr, true, name ) #endif #ifdef TRACY_HAS_CALLSTACK -# define ZoneNamedS( varname, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), depth, active ); -# define ZoneNamedNS( varname, name, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), depth, active ); -# define ZoneNamedCS( varname, color, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), depth, active ); -# define ZoneNamedNCS( varname, name, color, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), depth, active ); +# define ZoneNamedS( varname, depth, active ) static constexpr tracy::SourceLocationData 
TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), depth, active ) +# define ZoneNamedNS( varname, name, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), depth, active ) +# define ZoneNamedCS( varname, color, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), depth, active ) +# define ZoneNamedNCS( varname, name, color, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), depth, active ) -# define ZoneTransientS( varname, depth, active ) tracy::ScopedZone varname( __LINE__, __FILE__, strlen( __FILE__ ), __FUNCTION__, strlen( __FUNCTION__ ), nullptr, 0, depth, active ); -# define ZoneTransientNS( varname, name, depth, active ) tracy::ScopedZone varname( __LINE__, __FILE__, strlen( __FILE__ ), __FUNCTION__, strlen( __FUNCTION__ ), name, strlen( name ), depth, active ); +# define ZoneTransientS( varname, depth, active ) tracy::ScopedZone varname( __LINE__, __FILE__, strlen( __FILE__ ), __FUNCTION__, strlen( __FUNCTION__ ), nullptr, 0, depth, active ) +# define ZoneTransientNS( varname, name, depth, active ) tracy::ScopedZone varname( __LINE__, __FILE__, strlen( __FILE__ ), __FUNCTION__, strlen( __FUNCTION__ ), name, strlen( name ), depth, active ) # define ZoneScopedS( depth ) ZoneNamedS( ___tracy_scoped_zone, depth, true ) # define ZoneScopedNS( name, depth ) ZoneNamedNS( ___tracy_scoped_zone, name, depth, true ) # define ZoneScopedCS( color, depth ) ZoneNamedCS( ___tracy_scoped_zone, color, depth, true ) # define ZoneScopedNCS( name, color, depth ) ZoneNamedNCS( ___tracy_scoped_zone, name, color, depth, true ) -# define TracyAllocS( ptr, size, depth ) tracy::Profiler::MemAllocCallstack( ptr, size, depth, false ); -# define TracyFreeS( ptr, depth ) tracy::Profiler::MemFreeCallstack( ptr, depth, false ); -# define TracySecureAllocS( ptr, size, depth ) tracy::Profiler::MemAllocCallstack( ptr, size, depth, true ); -# define TracySecureFreeS( ptr, depth ) tracy::Profiler::MemFreeCallstack( ptr, depth, true ); +# define TracyAllocS( ptr, size, depth ) tracy::Profiler::MemAllocCallstack( ptr, size, depth, false ) +# define TracyFreeS( ptr, depth ) tracy::Profiler::MemFreeCallstack( ptr, depth, false ) +# define TracySecureAllocS( ptr, size, depth ) tracy::Profiler::MemAllocCallstack( ptr, size, depth, true ) +# define TracySecureFreeS( ptr, depth ) tracy::Profiler::MemFreeCallstack( ptr, depth, true ) -# define TracyAllocNS( ptr, size, depth, name ) tracy::Profiler::MemAllocCallstackNamed( ptr, size, depth, false, name ); -# define TracyFreeNS( ptr, depth, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, depth, false, name ); -# define TracySecureAllocNS( ptr, size, depth, name ) tracy::Profiler::MemAllocCallstackNamed( ptr, size, depth, true, name ); -# define TracySecureFreeNS( ptr, depth, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, depth, true, name ); +# define TracyAllocNS( ptr, size, depth, name 
) tracy::Profiler::MemAllocCallstackNamed( ptr, size, depth, false, name ) +# define TracyFreeNS( ptr, depth, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, depth, false, name ) +# define TracySecureAllocNS( ptr, size, depth, name ) tracy::Profiler::MemAllocCallstackNamed( ptr, size, depth, true, name ) +# define TracySecureFreeNS( ptr, depth, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, depth, true, name ) -# define TracyMessageS( txt, size, depth ) tracy::Profiler::Message( txt, size, depth ); -# define TracyMessageLS( txt, depth ) tracy::Profiler::Message( txt, depth ); -# define TracyMessageCS( txt, size, color, depth ) tracy::Profiler::MessageColor( txt, size, color, depth ); -# define TracyMessageLCS( txt, color, depth ) tracy::Profiler::MessageColor( txt, color, depth ); +# define TracyMessageS( txt, size, depth ) tracy::Profiler::Message( txt, size, depth ) +# define TracyMessageLS( txt, depth ) tracy::Profiler::Message( txt, depth ) +# define TracyMessageCS( txt, size, color, depth ) tracy::Profiler::MessageColor( txt, size, color, depth ) +# define TracyMessageLCS( txt, color, depth ) tracy::Profiler::MessageColor( txt, color, depth ) #else # define ZoneNamedS( varname, depth, active ) ZoneNamed( varname, active ) # define ZoneNamedNS( varname, name, depth, active ) ZoneNamedN( varname, name, active ) @@ -232,10 +257,10 @@ public: # define TracySecureAllocS( ptr, size, depth ) TracySecureAlloc( ptr, size ) # define TracySecureFreeS( ptr, depth ) TracySecureFree( ptr ) -# define TracyAllocNS( ptr, size, depth, name ) TracyAlloc( ptr, size, name ) -# define TracyFreeNS( ptr, depth, name ) TracyFree( ptr, name ) -# define TracySecureAllocNS( ptr, size, depth, name ) TracySecureAlloc( ptr, size, name ) -# define TracySecureFreeNS( ptr, depth, name ) TracySecureFree( ptr, name ) +# define TracyAllocNS( ptr, size, depth, name ) TracyAllocN( ptr, size, name ) +# define TracyFreeNS( ptr, depth, name ) TracyFreeN( ptr, name ) +# define TracySecureAllocNS( ptr, size, depth, name ) TracySecureAllocN( ptr, size, name ) +# define TracySecureFreeNS( ptr, depth, name ) TracySecureFreeN( ptr, name ) # define TracyMessageS( txt, size, depth ) TracyMessage( txt, size ) # define TracyMessageLS( txt, depth ) TracyMessageL( txt ) @@ -243,8 +268,15 @@ public: # define TracyMessageLCS( txt, color, depth ) TracyMessageLC( txt, color ) #endif -#define TracyParameterRegister( cb ) tracy::Profiler::ParameterRegister( cb ); -#define TracyParameterSetup( idx, name, isBool, val ) tracy::Profiler::ParameterSetup( idx, name, isBool, val ); +#define TracySourceCallbackRegister( cb, data ) tracy::Profiler::SourceCallbackRegister( cb, data ) +#define TracyParameterRegister( cb, data ) tracy::Profiler::ParameterRegister( cb, data ) +#define TracyParameterSetup( idx, name, isBool, val ) tracy::Profiler::ParameterSetup( idx, name, isBool, val ) +#define TracyIsConnected tracy::GetProfiler().IsConnected() + +#ifdef TRACY_FIBERS +# define TracyFiberEnter( fiber ) tracy::Profiler::EnterFiber( fiber ) +# define TracyFiberLeave tracy::Profiler::LeaveFiber() +#endif #endif
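For reference, a minimal usage sketch of the client API after this patch (not part of the diff). It assumes TRACY_ENABLE is defined by the build and that the renamed tracy/Tracy.hpp header is on the include path; the zone, plot, and frame names below are illustrative only.

#include "tracy/Tracy.hpp"

void InitProfiling()
{
    // New five-argument plot configuration: format, step, fill, color.
    TracyPlotConfig( "Particles", tracy::PlotFormatType::Number, false, true, 0 );
}

void UpdateParticles()
{
    ZoneScopedN( "UpdateParticles" );   // named CPU zone; the macros no longer supply a trailing semicolon
    if( ZoneIsActive )                  // new: query whether this zone is actually being recorded
        ZoneValue( 1024 );
    TracyPlot( "Particles", 1024.0 );   // report a per-frame value
}

void RenderFrame()
{
    FrameMarkStart( "Render" );         // new: paired frame-region marks
    UpdateParticles();
    FrameMarkEnd( "Render" );
    FrameMark;                          // end-of-frame mark, unchanged
}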