diff --git a/Source/Editor/Windows/AboutDialog.cs b/Source/Editor/Windows/AboutDialog.cs
index bb320fb87..d50908f28 100644
--- a/Source/Editor/Windows/AboutDialog.cs
+++ b/Source/Editor/Windows/AboutDialog.cs
@@ -131,6 +131,7 @@ namespace FlaxEditor.Windows
                 "LZ4 Library - Copyright (c) Yann Collet. All rights reserved.",
                 "fmt - www.fmtlib.net",
                 "minimp3 - www.github.com/lieff/minimp3",
+                "Tracy Profiler - www.github.com/wolfpld/tracy",
                 "Ogg and Vorbis - Xiph.org Foundation",
                 "OpenAL Soft - www.github.com/kcat/openal-soft",
                 "OpenFBX - www.github.com/nem0/OpenFBX",
diff --git a/Source/Engine/Content/Asset.cpp b/Source/Engine/Content/Asset.cpp
index 11b1bed39..7fcd2f405 100644
--- a/Source/Engine/Content/Asset.cpp
+++ b/Source/Engine/Content/Asset.cpp
@@ -357,7 +357,15 @@ bool Asset::onLoad(LoadAssetTask* task)
     Locker.Lock();
 
     // Load asset
-    const LoadResult result = loadAsset();
+    LoadResult result;
+    {
+#if TRACY_ENABLE
+        ZoneScoped;
+        const StringView name(GetPath());
+        ZoneName(*name, name.Length());
+#endif
+        result = loadAsset();
+    }
     const bool isLoaded = result == LoadResult::Ok;
     const bool failed = !isLoaded;
     _loadFailed = failed;
diff --git a/Source/Engine/Engine/Engine.cpp b/Source/Engine/Engine/Engine.cpp
index 012b3290c..24d691c56 100644
--- a/Source/Engine/Engine/Engine.cpp
+++ b/Source/Engine/Engine/Engine.cpp
@@ -166,6 +166,7 @@ int32 Engine::Main(const Char* cmdLine)
         }
     }
 #endif
 
+    // App paused logic
     if (Platform::GetIsPaused())
     {
@@ -202,6 +203,7 @@ int32 Engine::Main(const Char* cmdLine)
         {
             OnDraw();
             Time::OnEndDraw();
+            FrameMark;
             canDraw = false;
         }
diff --git a/Source/Engine/Platform/Base/PlatformBase.cpp b/Source/Engine/Platform/Base/PlatformBase.cpp
index 751f63dab..f6aa0e2d6 100644
--- a/Source/Engine/Platform/Base/PlatformBase.cpp
+++ b/Source/Engine/Platform/Base/PlatformBase.cpp
@@ -14,7 +14,7 @@
 #include "Engine/Core/Math/Rectangle.h"
 #include "Engine/Core/Utilities.h"
 #if COMPILE_WITH_PROFILER
-#include "Engine/Profiler/ProfilerMemory.h"
+#include "Engine/Profiler/ProfilerCPU.h"
 #endif
 #include "Engine/Threading/Threading.h"
 #include "Engine/Engine/CommandLine.h"
@@ -165,6 +165,44 @@ void PlatformBase::Exit()
 {
 }
 
+#if COMPILE_WITH_PROFILER
+
+void PlatformBase::OnMemoryAlloc(void* ptr, uint64 size)
+{
+    if (!ptr)
+        return;
+
+#if TRACY_ENABLE
+    // Track memory allocation in Tracy
+    //tracy::Profiler::MemAlloc(ptr, size, false);
+    tracy::Profiler::MemAllocCallstack(ptr, size, 12, false);
+#endif
+
+    // Register allocation during the current CPU event
+    auto thread = ProfilerCPU::GetCurrentThread();
+    if (thread != nullptr && thread->Buffer.GetCount() != 0)
+    {
+        auto& activeEvent = thread->Buffer.Last().Event();
+        if (activeEvent.End < ZeroTolerance)
+        {
+            activeEvent.NativeMemoryAllocation += (int32)size;
+        }
+    }
+}
+
+void PlatformBase::OnMemoryFree(void* ptr)
+{
+    if (!ptr)
+        return;
+
+#if TRACY_ENABLE
+    // Track memory free in Tracy
+    tracy::Profiler::MemFree(ptr, false);
+#endif
+}
+
+#endif
+
 void* PlatformBase::AllocatePages(uint64 numPages, uint64 pageSize)
 {
     // Fallback to the default memory allocation
@@ -460,15 +498,6 @@ Vector2 PlatformBase::GetVirtualDesktopSize()
     return Platform::GetVirtualDesktopBounds().Size;
 }
 
-#if COMPILE_WITH_PROFILER
-
-void PlatformBase::TrackAllocation(uint64 size)
-{
-    ProfilerMemory::OnAllocation((uint32)size, false);
-}
-
-#endif
-
 void PlatformBase::GetEnvironmentVariables(Dictionary<String, String>& result)
 {
     // Not supported
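The hooks above replace the old size-only TrackAllocation() (removed further down in the same file): Tracy needs the actual pointer so it can pair each MemAlloc with the later MemFree of the same address. A minimal sketch of the contract every platform allocator now has to follow (hypothetical backend for illustration only; Win32Platform.cpp below is the real instance):

    void* MyPlatform::Allocate(uint64 size, uint64 alignment)
    {
        void* ptr = ::malloc((size_t)size); // stand-in for the platform call; alignment ignored in this sketch
    #if COMPILE_WITH_PROFILER
        OnMemoryAlloc(ptr, size); // report the returned pointer, after allocating
    #endif
        return ptr;
    }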
diff --git a/Source/Engine/Platform/Base/PlatformBase.h b/Source/Engine/Platform/Base/PlatformBase.h
index 0d972f412..a1a857451 100644
--- a/Source/Engine/Platform/Base/PlatformBase.h
+++ b/Source/Engine/Platform/Base/PlatformBase.h
@@ -299,7 +299,8 @@ public:
     static void Prefetch(void const* ptr) = delete;
 
 #if COMPILE_WITH_PROFILER
-    static void TrackAllocation(uint64 size);
+    static void OnMemoryAlloc(void* ptr, uint64 size);
+    static void OnMemoryFree(void* ptr);
 #endif
 
     /// <summary>
diff --git a/Source/Engine/Platform/Base/ThreadBase.cpp b/Source/Engine/Platform/Base/ThreadBase.cpp
index 41f5f0a57..ddcba4f7a 100644
--- a/Source/Engine/Platform/Base/ThreadBase.cpp
+++ b/Source/Engine/Platform/Base/ThreadBase.cpp
@@ -4,6 +4,10 @@
 #include "Engine/Threading/IRunnable.h"
 #include "Engine/Threading/ThreadRegistry.h"
 #include "Engine/Core/Log.h"
+#if TRACY_ENABLE
+#include "Engine/Core/Math/Math.h"
+#include <ThirdParty/tracy/common/TracySystem.hpp>
+#endif
 
 Delegate<Thread*> ThreadBase::ThreadStarting;
 Delegate<Thread*> ThreadBase::ThreadExiting;
@@ -70,6 +74,13 @@ int32 ThreadBase::Run()
     ASSERT(_runnable);
     const auto thread = static_cast<Thread*>(this);
     _id = Platform::GetCurrentThreadID();
+#if TRACY_ENABLE
+    char threadName[100];
+    const int32 threadNameLength = Math::Min(ARRAY_COUNT(threadName) - 1, _name.Length());
+    StringUtils::ConvertUTF162ANSI(*_name, threadName, threadNameLength);
+    threadName[threadNameLength] = 0;
+    tracy::SetThreadName(threadName);
+#endif
     ThreadRegistry::Add(thread);
     ThreadStarting(thread);
     int32 exitCode = 1;
diff --git a/Source/Engine/Platform/Win32/Win32Platform.cpp b/Source/Engine/Platform/Win32/Win32Platform.cpp
index a342b3c2c..3bf261fa6 100644
--- a/Source/Engine/Platform/Win32/Win32Platform.cpp
+++ b/Source/Engine/Platform/Win32/Win32Platform.cpp
@@ -305,14 +305,18 @@ void Win32Platform::Prefetch(void const* ptr)
 
 void* Win32Platform::Allocate(uint64 size, uint64 alignment)
 {
+    void* ptr = _aligned_malloc((size_t)size, (size_t)alignment);
 #if COMPILE_WITH_PROFILER
-    TrackAllocation(size);
+    OnMemoryAlloc(ptr, size);
 #endif
-    return _aligned_malloc((size_t)size, (size_t)alignment);
+    return ptr;
 }
 
 void Win32Platform::Free(void* ptr)
 {
+#if COMPILE_WITH_PROFILER
+    OnMemoryFree(ptr);
+#endif
     _aligned_free(ptr);
 }
 
diff --git a/Source/Engine/Platform/Windows/WindowsPlatform.cpp b/Source/Engine/Platform/Windows/WindowsPlatform.cpp
index baf724421..00a5c0d0f 100644
--- a/Source/Engine/Platform/Windows/WindowsPlatform.cpp
+++ b/Source/Engine/Platform/Windows/WindowsPlatform.cpp
@@ -37,9 +37,30 @@ namespace
     int32 SystemDpi = 96;
 #if CRASH_LOG_ENABLE
     CriticalSection SymLocker;
+#if TRACY_ENABLE
+    bool SymInitialized = true;
+#else
     bool SymInitialized = false;
-    bool SymModulesDirty = true;
+#endif
     Array<String> SymbolsPath;
+
+    void OnSymbolsPathModified()
+    {
+        if (!SymInitialized)
+            return;
+        HANDLE process = GetCurrentProcess();
+        SymCleanup(process);
+        String symbolSearchPath;
+        for (auto& path : SymbolsPath)
+        {
+            symbolSearchPath += path;
+            symbolSearchPath += ";";
+        }
+        symbolSearchPath += Platform::GetWorkingDirectory();
+        SymInitializeW(process, *symbolSearchPath, TRUE);
+        //SymSetSearchPathW(process, *symbolSearchPath);
+        //SymRefreshModuleList(process);
+    }
 #endif
 }
 
@@ -378,6 +399,20 @@ void WindowsPlatform::PreInit(void* hInstance)
         Error(TEXT("OLE initalization failed!"));
         exit(-1);
     }
+
+#if CRASH_LOG_ENABLE
+    TCHAR buffer[MAX_PATH] = { 0 };
+    SymLocker.Lock();
+    if (::GetModuleFileNameW(::GetModuleHandleW(nullptr), buffer, MAX_PATH))
+        SymbolsPath.Add(StringUtils::GetDirectoryName(buffer));
+    if (::GetEnvironmentVariableW(TEXT("_NT_SYMBOL_PATH"), buffer, MAX_PATH))
+        SymbolsPath.Add(StringUtils::GetDirectoryName(buffer));
+    DWORD options = SymGetOptions();
+    options |= SYMOPT_LOAD_LINES | SYMOPT_FAIL_CRITICAL_ERRORS | SYMOPT_DEFERRED_LOADS | SYMOPT_EXACT_SYMBOLS;
+    SymSetOptions(options);
+    OnSymbolsPathModified();
+    SymLocker.Unlock();
+#endif
 }
 
 bool WindowsPlatform::IsWindows10()
@@ -604,11 +639,13 @@ void WindowsPlatform::Exit()
 {
 #if CRASH_LOG_ENABLE
     SymLocker.Lock();
+#if !TRACY_ENABLE
     if (SymInitialized)
     {
         SymInitialized = false;
         SymCleanup(GetCurrentProcess());
     }
+#endif
     SymbolsPath.Resize(0);
     SymLocker.Unlock();
 #endif
@@ -650,25 +687,20 @@ void WindowsPlatform::SetHighDpiAwarenessEnabled(bool enable)
     const HMODULE shCoreDll = LoadLibraryW(L"Shcore.dll");
     if (!shCoreDll)
         return;
-
     typedef enum _PROCESS_DPI_AWARENESS
     {
         PROCESS_DPI_UNAWARE = 0,
         PROCESS_SYSTEM_DPI_AWARE = 1,
         PROCESS_PER_MONITOR_DPI_AWARE = 2
     } PROCESS_DPI_AWARENESS;
-
     typedef HRESULT (STDAPICALLTYPE *SetProcessDpiAwarenessProc)(PROCESS_DPI_AWARENESS Value);
     const SetProcessDpiAwarenessProc setProcessDpiAwareness = (SetProcessDpiAwarenessProc)GetProcAddress(shCoreDll, "SetProcessDpiAwareness");
-
     if (setProcessDpiAwareness)
     {
         setProcessDpiAwareness(enable ? PROCESS_PER_MONITOR_DPI_AWARE : PROCESS_DPI_UNAWARE);
     }
-
     SystemDpi = CalculateDpi(shCoreDll);
-
-    FreeLibrary(shCoreDll);
+    ::FreeLibrary(shCoreDll);
 }
 
 BatteryInfo WindowsPlatform::GetBatteryInfo()
@@ -1108,10 +1140,9 @@ void* WindowsPlatform::LoadLibrary(const Char* filename)
     SymLocker.Lock();
     const auto folder = StringUtils::GetDirectoryName(filename);
     if (!SymbolsPath.Contains(folder))
-        SymbolsPath.Add(folder);
-    if (SymInitialized)
     {
-        SymModulesDirty = true;
+        SymbolsPath.Add(folder);
+        OnSymbolsPathModified();
     }
     SymLocker.Unlock();
 #endif
@@ -1131,46 +1162,16 @@ Array<PlatformBase::StackFrame> WindowsPlatform::GetStackFrames(int32 skipCount,
     if (!SymInitialized)
     {
         SymInitialized = true;
-
-        // Build search path
         String symbolSearchPath;
-        TCHAR ModulePath[MAX_PATH] = { 0 };
-        if (::GetModuleFileName(::GetModuleHandle(nullptr), ModulePath, MAX_PATH))
-        {
-            symbolSearchPath += StringUtils::GetDirectoryName(ModulePath);
-            symbolSearchPath += ";";
-        }
         for (auto& path : SymbolsPath)
         {
             symbolSearchPath += path;
             symbolSearchPath += ";";
         }
-        String _NT_SYMBOL_PATH;
-        if (!Platform::GetEnvironmentVariable(TEXT("_NT_SYMBOL_PATH"), _NT_SYMBOL_PATH))
-        {
-            symbolSearchPath += _NT_SYMBOL_PATH;
-            symbolSearchPath += ";";
-        }
         symbolSearchPath += Platform::GetWorkingDirectory();
-        symbolSearchPath += ";";
-
-        DWORD options = SymGetOptions();
-        options |= SYMOPT_LOAD_LINES;
-        options |= SYMOPT_FAIL_CRITICAL_ERRORS;
-        options |= SYMOPT_DEFERRED_LOADS;
-        options |= SYMOPT_EXACT_SYMBOLS;
-        SymSetOptions(options);
-
         SymInitializeW(process, *symbolSearchPath, TRUE);
     }
 
-    // Refresh modules if needed
-    if (SymModulesDirty)
-    {
-        SymModulesDirty = false;
-        SymRefreshModuleList(process);
-    }
-
     // Capture the context if missing
     /*EXCEPTION_POINTERS exceptionPointers;
     CONTEXT contextData;
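Two details in the WindowsPlatform.cpp change are easy to miss: with TRACY_ENABLE, SymInitialized starts out true because Tracy's own InitCallstack() (in TracyCallstack.cpp below) calls SymInitialize for the process, so the engine must neither initialize dbghelp a second time nor SymCleanup it at exit; and because dbghelp is not thread-safe, every symbol-path update stays serialized behind SymLocker. The locking pattern, as a sketch mirroring the LoadLibrary hunk above:

    SymLocker.Lock();
    SymbolsPath.Add(folder);
    OnSymbolsPathModified(); // SymCleanup + SymInitializeW with the rebuilt search path
    SymLocker.Unlock();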
diff --git a/Source/Engine/Profiler/Profiler.Build.cs b/Source/Engine/Profiler/Profiler.Build.cs
index aeda888e5..289781271 100644
--- a/Source/Engine/Profiler/Profiler.Build.cs
+++ b/Source/Engine/Profiler/Profiler.Build.cs
@@ -27,5 +27,13 @@ public class Profiler : EngineModule
         options.PrivateDependencies.Clear();
 
         options.PublicDefinitions.Add("COMPILE_WITH_PROFILER");
+
+        // Tracy profiling tools
+        switch (options.Platform.Target)
+        {
+            case TargetPlatform.Windows:
+                options.PublicDependencies.Add("tracy");
+                break;
+        }
     }
 }
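For now the tracy module is linked on Windows only; other targets keep the plain Flax profiler. The ProfilerCPU.h change below then pairs every existing Flax CPU scope with a Tracy zone, so current call sites light up in Tracy without edits. Usage sketch (hypothetical function, illustrating the macros as defined below):

    void DrawScene()
    {
        PROFILE_CPU(); // one Flax ScopeProfileBlockCPU plus one Tracy zone
        {
            PROFILE_CPU_NAMED("Culling"); // named sub-zone, mirrored into Tracy too
            // ... culling work ...
        }
    }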
diff --git a/Source/Engine/Profiler/ProfilerCPU.h b/Source/Engine/Profiler/ProfilerCPU.h
index 88ec738bf..d5cd55f07 100644
--- a/Source/Engine/Profiler/ProfilerCPU.h
+++ b/Source/Engine/Profiler/ProfilerCPU.h
@@ -8,6 +8,7 @@
 #include "Engine/Core/Collections/Array.h"
 #include "Engine/Core/Math/Math.h"
 #include "Engine/Scripting/ScriptingType.h"
+#include <ThirdParty/tracy/Tracy.h>
 
 #if COMPILE_WITH_PROFILER
 
@@ -393,12 +394,22 @@ struct TIsPODType
 };
 
 // Shortcut macros for profiling a single code block execution on CPU
-#define PROFILE_CPU_NAMED(name) ScopeProfileBlockCPU ProfileBlockCPU(TEXT(name))
+// Use ZoneTransient for Tracy for code that can be hot-reloaded (e.g. in Editor)
+
+#if USE_EDITOR
+#define PROFILE_CPU_NAMED(name) ZoneTransientN(___tracy_scoped_zone, name, true); ScopeProfileBlockCPU ProfileBlockCPU(TEXT(name))
+#else
+#define PROFILE_CPU_NAMED(name) ZoneNamedN(___tracy_scoped_zone, name, true); ScopeProfileBlockCPU ProfileBlockCPU(TEXT(name))
+#endif
+
 #if defined(_MSC_VER)
-#define PROFILE_CPU() ScopeProfileBlockCPU ProfileBlockCPU(TEXT(__FUNCTION__))
+#if USE_EDITOR
+#define PROFILE_CPU() ZoneTransient(___tracy_scoped_zone, true); ScopeProfileBlockCPU ProfileBlockCPU(TEXT(__FUNCTION__))
 #else
-#define PROFILE_CPU() \
+#define PROFILE_CPU() ZoneNamed(___tracy_scoped_zone, true); ScopeProfileBlockCPU ProfileBlockCPU(TEXT(__FUNCTION__))
+#endif
+#else
+#define PROFILE_CPU() ZoneTransient(___tracy_scoped_zone, true); \
     const char* _functionName = __FUNCTION__; \
     const int32 _functionNameLength = ARRAY_COUNT(__FUNCTION__); \
     Char _functionNameBuffer[_functionNameLength + 1]; \
diff --git a/Source/Engine/Profiler/ProfilerMemory.cpp b/Source/Engine/Profiler/ProfilerMemory.cpp
deleted file mode 100644
index ae8bff067..000000000
--- a/Source/Engine/Profiler/ProfilerMemory.cpp
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (c) 2012-2021 Wojciech Figat. All rights reserved.
-
-#if COMPILE_WITH_PROFILER
-
-#include "ProfilerMemory.h"
-#include "ProfilerCPU.h"
-
-void ProfilerMemory::OnAllocation(int32 bytes, bool isGC)
-{
-    // Register allocation during the current CPU event
-    auto thread = ProfilerCPU::GetCurrentThread();
-    if (thread != nullptr && thread->Buffer.GetCount() != 0)
-    {
-        auto& activeEvent = thread->Buffer.Last().Event();
-        if (activeEvent.End < ZeroTolerance)
-        {
-            if (isGC)
-                activeEvent.ManagedMemoryAllocation += bytes;
-            else
-                activeEvent.NativeMemoryAllocation += bytes;
-        }
-    }
-}
-
-#endif
diff --git a/Source/Engine/Profiler/ProfilerMemory.h b/Source/Engine/Profiler/ProfilerMemory.h
deleted file mode 100644
index a3d0098f7..000000000
--- a/Source/Engine/Profiler/ProfilerMemory.h
+++ /dev/null
@@ -1,24 +0,0 @@
-// Copyright (c) 2012-2021 Wojciech Figat. All rights reserved.
-
-#pragma once
-
-#include "Engine/Core/Types/BaseTypes.h"
-
-#if COMPILE_WITH_PROFILER
-
-/// <summary>
-/// Provides memory allocations measuring methods.
-/// </summary>
-class FLAXENGINE_API ProfilerMemory
-{
-public:
-
-    /// <summary>
-    /// Called on memory allocation.
-    /// </summary>
-    /// <param name="bytes">The allocated bytes count.</param>
-    /// <param name="isGC">True if allocation comes from the Garbage Collector, otherwise false.</param>
-    static void OnAllocation(int32 bytes, bool isGC);
-};
-
-#endif
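ProfilerMemory was only a thin forwarder, so its body now lives inline at both call sites: native allocations in PlatformBase::OnMemoryAlloc above, managed ones in MCore.Mono.cpp below. Both copies follow the same pattern — charge the bytes to the innermost CPU event that is still open (End < ZeroTolerance):

    auto thread = ProfilerCPU::GetCurrentThread();
    if (thread != nullptr && thread->Buffer.GetCount() != 0)
    {
        auto& activeEvent = thread->Buffer.Last().Event();
        if (activeEvent.End < ZeroTolerance)
            activeEvent.NativeMemoryAllocation += (int32)size; // ManagedMemoryAllocation for GC allocations
    }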
diff --git a/Source/Engine/Scripting/ManagedCLR/MCore.Mono.cpp b/Source/Engine/Scripting/ManagedCLR/MCore.Mono.cpp
index 289378ffa..95d66706f 100644
--- a/Source/Engine/Scripting/ManagedCLR/MCore.Mono.cpp
+++ b/Source/Engine/Scripting/ManagedCLR/MCore.Mono.cpp
@@ -15,7 +15,6 @@
 #include "Engine/Threading/Threading.h"
 #include "Engine/Platform/Thread.h"
 #include "Engine/Scripting/MException.h"
-#include "Engine/Profiler/ProfilerMemory.h"
 #include "Engine/Profiler/ProfilerCPU.h"
 #include
 #include
@@ -182,7 +181,16 @@ void OnGCAllocation(MonoProfiler* profiler, MonoObject* obj)
 #endif
 
 #if COMPILE_WITH_PROFILER
-    ProfilerMemory::OnAllocation(size, true);
+    // Register allocation during the current CPU event
+    auto thread = ProfilerCPU::GetCurrentThread();
+    if (thread != nullptr && thread->Buffer.GetCount() != 0)
+    {
+        auto& activeEvent = thread->Buffer.Last().Event();
+        if (activeEvent.End < ZeroTolerance)
+        {
+            activeEvent.ManagedMemoryAllocation += size;
+        }
+    }
 #endif
 }
diff --git a/Source/ThirdParty/tracy/LICENSE b/Source/ThirdParty/tracy/LICENSE
new file mode 100644
index 000000000..c2a76e56c
--- /dev/null
+++ b/Source/ThirdParty/tracy/LICENSE
@@ -0,0 +1,27 @@
+Tracy Profiler (https://github.com/wolfpld/tracy) is licensed under the
+3-clause BSD license.
+
+Copyright (c) 2017-2021, Bartosz Taudul
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+  * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the distribution.
+  * Neither the name of the <organization> nor the
+    names of its contributors may be used to endorse or promote products
+    derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
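The files that follow vendor the Tracy client itself. Tracy.h is the public macro surface: with TRACY_ENABLE undefined, every macro compiles away to nothing, so call sites carry no cost in non-profiling builds. Typical client-side usage (a sketch assuming TRACY_ENABLE and a hypothetical game loop):

    #include <ThirdParty/tracy/Tracy.h>

    void UpdateGame()
    {
        ZoneScopedN("UpdateGame");            // named zone on the Tracy timeline
        TracyPlot("Entities", (int64_t)1024); // plot a value over time
        TracyMessageL("update done");         // literal string message
    }
    // and once per frame, at the end of the main loop (as Engine.cpp does above):
    // FrameMark;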
diff --git a/Source/ThirdParty/tracy/Tracy.h b/Source/ThirdParty/tracy/Tracy.h
new file mode 100644
index 000000000..cb115740f
--- /dev/null
+++ b/Source/ThirdParty/tracy/Tracy.h
@@ -0,0 +1,252 @@
+#ifndef __TRACY_HPP__
+#define __TRACY_HPP__
+
+#include "common/TracySystem.hpp"
+
+#ifndef TRACY_ENABLE
+
+#define ZoneNamed(x,y)
+#define ZoneNamedN(x,y,z)
+#define ZoneNamedC(x,y,z)
+#define ZoneNamedNC(x,y,z,w)
+
+#define ZoneTransient(x,y)
+#define ZoneTransientN(x,y,z)
+
+#define ZoneScoped
+#define ZoneScopedN(x)
+#define ZoneScopedC(x)
+#define ZoneScopedNC(x,y)
+
+#define ZoneText(x,y)
+#define ZoneTextV(x,y,z)
+#define ZoneName(x,y)
+#define ZoneNameV(x,y,z)
+#define ZoneColor(x)
+#define ZoneColorV(x,y)
+#define ZoneValue(x)
+#define ZoneValueV(x,y)
+
+#define FrameMark
+#define FrameMarkNamed(x)
+
+#define TracyPlot(x,y)
+#define TracyPlotConfig(x,y)
+
+#define TracyMessage(x,y)
+#define TracyMessageL(x)
+#define TracyMessageC(x,y,z)
+#define TracyMessageLC(x,y)
+#define TracyAppInfo(x,y)
+
+#define TracyAlloc(x,y)
+#define TracyFree(x)
+#define TracySecureAlloc(x,y)
+#define TracySecureFree(x)
+
+#define TracyAllocN(x,y,z)
+#define TracyFreeN(x,y)
+#define TracySecureAllocN(x,y,z)
+#define TracySecureFreeN(x,y)
+
+#define ZoneNamedS(x,y,z)
+#define ZoneNamedNS(x,y,z,w)
+#define ZoneNamedCS(x,y,z,w)
+#define ZoneNamedNCS(x,y,z,w,a)
+
+#define ZoneTransientS(x,y,z)
+#define ZoneTransientNS(x,y,z,w)
+
+#define ZoneScopedS(x)
+#define ZoneScopedNS(x,y)
+#define ZoneScopedCS(x,y)
+#define ZoneScopedNCS(x,y,z)
+
+#define TracyAllocS(x,y,z)
+#define TracyFreeS(x,y)
+#define TracySecureAllocS(x,y,z)
+#define TracySecureFreeS(x,y)
+
+#define TracyAllocNS(x,y,z,w)
+#define TracyFreeNS(x,y,z)
+#define TracySecureAllocNS(x,y,z,w)
+#define TracySecureFreeNS(x,y,z)
+
+#define TracyMessageS(x,y,z)
+#define TracyMessageLS(x,y)
+#define TracyMessageCS(x,y,z,w)
+#define TracyMessageLCS(x,y,z)
+
+#define TracyParameterRegister(x)
+#define TracyParameterSetup(x,y,z,w)
+
+#else
+
+#include <string.h>
+
+#include "client/TracyCallstack.h"
+
+namespace tracy
+{
+class TRACY_API Profiler
+{
+public:
+    static void SendFrameMark( const char* name );
+    static void PlotData( const char* name, int64_t val );
+    static void PlotData( const char* name, float val );
+    static void PlotData( const char* name, double val );
+    static void ConfigurePlot( const char* name, PlotFormatType type );
+    static void Message( const char* txt, size_t size, int callstack );
+    static void Message( const char* txt, int callstack );
+    static void MessageColor( const char* txt, size_t size, uint32_t color, int callstack );
+    static void MessageColor( const char* txt, uint32_t color, int callstack );
+    static void MessageAppInfo( const char* txt, size_t size );
+    static void MemAlloc( const void* ptr, size_t size, bool secure );
+    static void MemFree( const void* ptr, bool secure );
+    static void MemAllocCallstack( const void* ptr, size_t size, int depth, bool secure );
+    static void MemFreeCallstack( const void* ptr, int depth, bool secure );
+    static void MemAllocNamed( const void* ptr, size_t size, bool secure, const char* name );
+    static void MemFreeNamed( const void* ptr, bool secure, const char* name );
+    static void MemAllocCallstackNamed( const void* ptr, size_t size, int depth, bool secure, const char* name );
+    static void MemFreeCallstackNamed( const void* ptr, int depth, bool secure, const char* name );
+    static void ParameterRegister( ParameterCallback cb );
+    static void ParameterSetup( uint32_t idx, const char* name, bool isBool, int32_t val );
+};
+}
+
+#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK
+# define ZoneNamed( varname, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active );
+# define ZoneNamedN( varname, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active );
+# define ZoneNamedC( varname, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active );
+# define ZoneNamedNC( varname, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), TRACY_CALLSTACK, active );
+
+# define ZoneTransient( varname, active ) tracy::ScopedZone varname( __LINE__, __FILE__, strlen( __FILE__ ), __FUNCTION__, strlen( __FUNCTION__ ), nullptr, 0, TRACY_CALLSTACK, active );
+# define ZoneTransientN( varname, name, active ) tracy::ScopedZone varname( __LINE__, __FILE__, strlen( __FILE__ ), __FUNCTION__, strlen( __FUNCTION__ ), name, strlen( name ), TRACY_CALLSTACK, active );
+#else
+# define ZoneNamed( varname, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), active );
+# define ZoneNamedN( varname, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), active );
+# define ZoneNamedC( varname, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), active );
+# define ZoneNamedNC( varname, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), active );
+
+# define ZoneTransient( varname, active ) tracy::ScopedZone varname( __LINE__, __FILE__, strlen( __FILE__ ), __FUNCTION__, strlen( __FUNCTION__ ), nullptr, 0, active );
+# define ZoneTransientN( varname, name, active ) tracy::ScopedZone varname( __LINE__, __FILE__, strlen( __FILE__ ), __FUNCTION__, strlen( __FUNCTION__ ), name, strlen( name ), active );
+#endif
+
+#define ZoneScoped ZoneNamed( ___tracy_scoped_zone, true )
+#define ZoneScopedN( name ) ZoneNamedN( ___tracy_scoped_zone, name, true )
+#define ZoneScopedC( color ) ZoneNamedC( ___tracy_scoped_zone, color, true )
+#define ZoneScopedNC( name, color ) ZoneNamedNC( ___tracy_scoped_zone, name, color, true )
+
+#define ZoneText( txt, size ) ___tracy_scoped_zone.Text( txt, size );
+#define ZoneTextV( varname, txt, size ) varname.Text( txt, size );
+#define ZoneName( txt, size ) ___tracy_scoped_zone.Name( txt, size );
+#define ZoneNameV( varname, txt, size ) varname.Name( txt, size );
+#define ZoneColor( color ) ___tracy_scoped_zone.Color( color );
+#define ZoneColorV( varname, color ) varname.Color( color );
+#define ZoneValue( value ) ___tracy_scoped_zone.Value( value );
+#define ZoneValueV( varname, value ) varname.Value( value );
+
+#define FrameMark tracy::Profiler::SendFrameMark( nullptr );
+#define FrameMarkNamed( name ) tracy::Profiler::SendFrameMark( name );
+
+#define TracyPlot( name, val ) tracy::Profiler::PlotData( name, val );
+#define TracyPlotConfig( name, type ) tracy::Profiler::ConfigurePlot( name, type );
+
+#define TracyAppInfo( txt, size ) tracy::Profiler::MessageAppInfo( txt, size );
+
+#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK
+# define TracyMessage( txt, size ) tracy::Profiler::Message( txt, size, TRACY_CALLSTACK );
+# define TracyMessageL( txt ) tracy::Profiler::Message( txt, TRACY_CALLSTACK );
+# define TracyMessageC( txt, size, color ) tracy::Profiler::MessageColor( txt, size, color, TRACY_CALLSTACK );
+# define TracyMessageLC( txt, color ) tracy::Profiler::MessageColor( txt, color, TRACY_CALLSTACK );
+
+# define TracyAlloc( ptr, size ) tracy::Profiler::MemAllocCallstack( ptr, size, TRACY_CALLSTACK, false );
+# define TracyFree( ptr ) tracy::Profiler::MemFreeCallstack( ptr, TRACY_CALLSTACK, false );
+# define TracySecureAlloc( ptr, size ) tracy::Profiler::MemAllocCallstack( ptr, size, TRACY_CALLSTACK, true );
+# define TracySecureFree( ptr ) tracy::Profiler::MemFreeCallstack( ptr, TRACY_CALLSTACK, true );
+
+# define TracyAllocN( ptr, size, name ) tracy::Profiler::MemAllocCallstackNamed( ptr, size, TRACY_CALLSTACK, false, name );
+# define TracyFreeN( ptr, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, TRACY_CALLSTACK, false, name );
+# define TracySecureAllocN( ptr, size, name ) tracy::Profiler::MemAllocCallstackNamed( ptr, size, TRACY_CALLSTACK, true, name );
+# define TracySecureFreeN( ptr, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, TRACY_CALLSTACK, true, name );
+#else
+# define TracyMessage( txt, size ) tracy::Profiler::Message( txt, size, 0 );
+# define TracyMessageL( txt ) tracy::Profiler::Message( txt, 0 );
+# define TracyMessageC( txt, size, color ) tracy::Profiler::MessageColor( txt, size, color, 0 );
+# define TracyMessageLC( txt, color ) tracy::Profiler::MessageColor( txt, color, 0 );
+
+# define TracyAlloc( ptr, size ) tracy::Profiler::MemAlloc( ptr, size, false );
+# define TracyFree( ptr ) tracy::Profiler::MemFree( ptr, false );
+# define TracySecureAlloc( ptr, size ) tracy::Profiler::MemAlloc( ptr, size, true );
+# define TracySecureFree( ptr ) tracy::Profiler::MemFree( ptr, true );
+
+# define TracyAllocN( ptr, size, name ) tracy::Profiler::MemAllocNamed( ptr, size, false, name );
+# define TracyFreeN( ptr, name ) tracy::Profiler::MemFreeNamed( ptr, false, name );
+# define TracySecureAllocN( ptr, size, name ) tracy::Profiler::MemAllocNamed( ptr, size, true, name );
+# define TracySecureFreeN( ptr, name ) tracy::Profiler::MemFreeNamed( ptr, true, name );
+#endif
+
+#ifdef TRACY_HAS_CALLSTACK
+# define ZoneNamedS( varname, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), depth, active );
+# define ZoneNamedNS( varname, name, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), depth, active );
+# define ZoneNamedCS( varname, color, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), depth, active );
+# define ZoneNamedNCS( varname, name, color, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,__LINE__), depth, active );
+
+# define ZoneTransientS( varname, depth, active ) tracy::ScopedZone varname( __LINE__, __FILE__, strlen( __FILE__ ), __FUNCTION__, strlen( __FUNCTION__ ), nullptr, 0, depth, active );
+# define ZoneTransientNS( varname, name, depth, active ) tracy::ScopedZone varname( __LINE__, __FILE__, strlen( __FILE__ ), __FUNCTION__, strlen( __FUNCTION__ ), name, strlen( name ), depth, active );
+
+# define ZoneScopedS( depth ) ZoneNamedS( ___tracy_scoped_zone, depth, true )
+# define ZoneScopedNS( name, depth ) ZoneNamedNS( ___tracy_scoped_zone, name, depth, true )
+# define ZoneScopedCS( color, depth ) ZoneNamedCS( ___tracy_scoped_zone, color, depth, true )
+# define ZoneScopedNCS( name, color, depth ) ZoneNamedNCS( ___tracy_scoped_zone, name, color, depth, true )
+
+# define TracyAllocS( ptr, size, depth ) tracy::Profiler::MemAllocCallstack( ptr, size, depth, false );
+# define TracyFreeS( ptr, depth ) tracy::Profiler::MemFreeCallstack( ptr, depth, false );
+# define TracySecureAllocS( ptr, size, depth ) tracy::Profiler::MemAllocCallstack( ptr, size, depth, true );
+# define TracySecureFreeS( ptr, depth ) tracy::Profiler::MemFreeCallstack( ptr, depth, true );
+
+# define TracyAllocNS( ptr, size, depth, name ) tracy::Profiler::MemAllocCallstackNamed( ptr, size, depth, false, name );
+# define TracyFreeNS( ptr, depth, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, depth, false, name );
+# define TracySecureAllocNS( ptr, size, depth, name ) tracy::Profiler::MemAllocCallstackNamed( ptr, size, depth, true, name );
+# define TracySecureFreeNS( ptr, depth, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, depth, true, name );
+
+# define TracyMessageS( txt, size, depth ) tracy::Profiler::Message( txt, size, depth );
+# define TracyMessageLS( txt, depth ) tracy::Profiler::Message( txt, depth );
+# define TracyMessageCS( txt, size, color, depth ) tracy::Profiler::MessageColor( txt, size, color, depth );
+# define TracyMessageLCS( txt, color, depth ) tracy::Profiler::MessageColor( txt, color, depth );
+#else
+# define ZoneNamedS( varname, depth, active ) ZoneNamed( varname, active )
+# define ZoneNamedNS( varname, name, depth, active ) ZoneNamedN( varname, name, active )
+# define ZoneNamedCS( varname, color, depth, active ) ZoneNamedC( varname, color, active )
+# define ZoneNamedNCS( varname, name, color, depth, active ) ZoneNamedNC( varname, name, color, active )
+
+# define ZoneTransientS( varname, depth, active ) ZoneTransient( varname, active )
+# define ZoneTransientNS( varname, name, depth, active ) ZoneTransientN( varname, name, active )
+
+# define ZoneScopedS( depth ) ZoneScoped
+# define ZoneScopedNS( name, depth ) ZoneScopedN( name )
+# define ZoneScopedCS( color, depth ) ZoneScopedC( color )
+# define ZoneScopedNCS( name, color, depth ) ZoneScopedNC( name, color )
+
+# define TracyAllocS( ptr, size, depth ) TracyAlloc( ptr, size )
+# define TracyFreeS( ptr, depth ) TracyFree( ptr )
+# define TracySecureAllocS( ptr, size, depth ) TracySecureAlloc( ptr, size )
+# define TracySecureFreeS( ptr, depth ) TracySecureFree( ptr )
+
+# define TracyAllocNS( ptr, size, depth, name ) TracyAllocN( ptr, size, name )
+# define TracyFreeNS( ptr, depth, name ) TracyFreeN( ptr, name )
+# define TracySecureAllocNS( ptr, size, depth, name ) TracySecureAllocN( ptr, size, name )
+# define TracySecureFreeNS( ptr, depth, name ) TracySecureFreeN( ptr, name )
+
+# define TracyMessageS( txt, size, depth ) TracyMessage( txt, size )
+# define TracyMessageLS( txt, depth ) TracyMessageL( txt )
+# define TracyMessageCS( txt, size, color, depth ) TracyMessageC( txt, size, color )
+# define TracyMessageLCS( txt, color, depth ) TracyMessageLC( txt, color )
+#endif
+
+#define TracyParameterRegister( cb ) tracy::Profiler::ParameterRegister( cb );
+#define TracyParameterSetup( idx, name, isBool, val ) tracy::Profiler::ParameterSetup( idx, name, isBool, val );
+
+#endif
+
+#endif
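Why the editor build prefers ZoneTransient (see the PROFILE_CPU macros earlier): ZoneNamed bakes a static constexpr SourceLocationData into the calling module, and that record dangles if the module is unloaded by a hot-reload, while ZoneTransient hands the file/function strings over at runtime and so survives reloads at a small per-zone cost. Abridged from the definitions above:

    // ZoneNamed: static source-location record, cheapest, tied to the module lifetime
    static constexpr tracy::SourceLocationData __loc { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 };
    tracy::ScopedZone zone1( &__loc, true );

    // ZoneTransient: strings are measured and copied at runtime, hot-reload safe
    tracy::ScopedZone zone2( __LINE__, __FILE__, strlen( __FILE__ ), __FUNCTION__, strlen( __FUNCTION__ ), nullptr, 0, true );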
diff --git a/Source/ThirdParty/tracy/TracyClient.cpp b/Source/ThirdParty/tracy/TracyClient.cpp
new file mode 100644
index 000000000..dd56765c1
--- /dev/null
+++ b/Source/ThirdParty/tracy/TracyClient.cpp
@@ -0,0 +1,53 @@
+//
+//          Tracy profiler
+//          ----------------
+//
+// For fast integration, compile and
+// link with this source file (and none
+// other) in your executable (or in the
+// main DLL / shared object on multi-DLL
+// projects).
+//
+
+// Define TRACY_ENABLE to enable profiler.
+
+#include "common/TracySystem.cpp"
+
+#ifdef TRACY_ENABLE
+
+#ifdef _MSC_VER
+#  pragma warning(push, 0)
+#endif
+
+#include
+#include "client/TracyProfiler.cpp"
+#include "client/TracyCallstack.cpp"
+#include "client/TracySysTime.cpp"
+#include "client/TracySysTrace.cpp"
+#include "common/TracySocket.cpp"
+#include "client/tracy_rpmalloc.cpp"
+
+#if TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 3 || TRACY_HAS_CALLSTACK == 4 || TRACY_HAS_CALLSTACK == 6
+#  include "libbacktrace/alloc.cpp"
+#  include "libbacktrace/dwarf.cpp"
+#  include "libbacktrace/fileline.cpp"
+#  include "libbacktrace/mmapio.cpp"
+#  include "libbacktrace/posix.cpp"
+#  include "libbacktrace/sort.cpp"
+#  include "libbacktrace/state.cpp"
+#  if TRACY_HAS_CALLSTACK == 4
+#    include "libbacktrace/macho.cpp"
+#  else
+#    include "libbacktrace/elf.cpp"
+#  endif
+#endif
+
+#ifdef _MSC_VER
+#  pragma comment(lib, "ws2_32.lib")
+#  pragma comment(lib, "dbghelp.lib")
+#  pragma comment(lib, "advapi32.lib")
+#  pragma comment(lib, "user32.lib")
+#  pragma warning(pop)
+#endif
+
+#endif
diff --git a/Source/ThirdParty/tracy/client/TracyArmCpuTable.hpp b/Source/ThirdParty/tracy/client/TracyArmCpuTable.hpp
new file mode 100644
index 000000000..ff7d976c8
--- /dev/null
+++ b/Source/ThirdParty/tracy/client/TracyArmCpuTable.hpp
@@ -0,0 +1,349 @@
+namespace tracy
+{
+
+#if defined __linux__ && defined __ARM_ARCH
+
+static const char* DecodeArmImplementer( uint32_t v )
+{
+    static char buf[16];
+    switch( v )
+    {
+    case 0x41: return "ARM";
+    case 0x42: return "Broadcom";
+    case 0x43: return "Cavium";
+    case 0x44: return "DEC";
+    case 0x46: return "Fujitsu";
+    case 0x48: return "HiSilicon";
+    case 0x49: return "Infineon";
+    case 0x4d: return "Motorola";
+    case 0x4e: return "Nvidia";
+    case 0x50: return "Applied Micro";
+    case 0x51: return "Qualcomm";
+    case 0x53: return "Samsung";
+    case 0x54: return "Texas Instruments";
+    case 0x56: return "Marvell";
+    case 0x61: return "Apple";
+    case 0x66: return "Faraday";
+    case 0x68: return "HXT";
+    case 0x69: return "Intel";
+    case 0xc0: return "Ampere Computing";
+    default: break;
+    }
+    sprintf( buf, "0x%x", v );
+    return buf;
+}
+
+static const char* DecodeArmPart( uint32_t impl, uint32_t part )
+{
+    static char buf[16];
+    switch( impl )
+    {
+    case 0x41:
+        switch( part )
+        {
+        case 0x810: return "810";
+        case 0x920: return "920";
+        case 0x922: return "922";
+        case 0x926: return "926";
+        case 0x940: return "940";
+        case 0x946: return "946";
+        case 0x966: return "966";
+        case 0xa20: return "1020";
+        case 0xa22: return "1022";
+        case 0xa26: return "1026";
+        case 0xb02: return "11 MPCore";
+        case 0xb36: return "1136";
+        case 0xb56: return "1156";
+        case 0xb76: return "1176";
+        case 0xc05: return " Cortex-A5";
+        case 0xc07: return " Cortex-A7";
+        case 0xc08: return " Cortex-A8";
+        case 0xc09: return " Cortex-A9";
+        case 0xc0c: return " Cortex-A12";
+        case 0xc0d: return " Rockchip RK3288";
+        case 0xc0f: return " Cortex-A15";
+        case 0xc0e: return " Cortex-A17";
+        case 0xc14: return " Cortex-R4";
+        case 0xc15: return " Cortex-R5";
+        case 0xc17: return " Cortex-R7";
+        case 0xc18: return " Cortex-R8";
+        case 0xc20: return " Cortex-M0";
+        case 0xc21: return " Cortex-M1";
+        case 0xc23: return " Cortex-M3";
+        case 0xc24: return " Cortex-M4";
+        case 0xc27: return " Cortex-M7";
+        case 0xc60: return " Cortex-M0+";
+        case 0xd00: return " AArch64 simulator";
+        case 0xd01: return " Cortex-A32";
+        case 0xd02: return " Cortex-A34";
+        case 0xd03: return " Cortex-A53";
+        case 0xd04: return " Cortex-A35";
+        case 0xd05: return " Cortex-A55";
+        case 0xd06: return " Cortex-A65";
+        case 0xd07: return " Cortex-A57";
+        case 0xd08: return " Cortex-A72";
+        case 0xd09: return " Cortex-A73";
+        case 0xd0a: return " Cortex-A75";
+        case 0xd0b: return " Cortex-A76";
+        case 0xd0c: return " Neoverse N1";
+        case 0xd0d: return " Cortex-A77";
+        case 0xd0e: return " Cortex-A76AE";
+        case 0xd0f: return " AEMv8";
+        case 0xd13: return " Cortex-R52";
+        case 0xd20: return " Cortex-M23";
+        case 0xd21: return " Cortex-M33";
+        case 0xd40: return " Zeus";
+        case 0xd41: return " Cortex-A78";
+        case 0xd43: return " Cortex-A65AE";
+        case 0xd44: return " Cortex-X1";
+        case 0xd4a: return " Neoverse E1";
+        default: break;
+        }
+    case 0x42:
+        switch( part )
+        {
+        case 0xf: return " Brahma B15";
+        case 0x100: return " Brahma B53";
+        case 0x516: return " ThunderX2";
+        default: break;
+        }
+    case 0x43:
+        switch( part )
+        {
+        case 0xa0: return " ThunderX";
+        case 0xa1: return " ThunderX 88XX";
+        case 0xa2: return " ThunderX 81XX";
+        case 0xa3: return " ThunderX 83XX";
+        case 0xaf: return " ThunderX2 99xx";
+        case 0xb0: return " OcteonTX2";
+        case 0xb1: return " OcteonTX2 T98";
+        case 0xb2: return " OcteonTX2 T96";
+        case 0xb3: return " OcteonTX2 F95";
+        case 0xb4: return " OcteonTX2 F95N";
+        case 0xb5: return " OcteonTX2 F95MM";
+        case 0xb8: return " ThunderX3 T110";
+        default: break;
+        }
+    case 0x44:
+        switch( part )
+        {
+        case 0xa10: return " SA110";
+        case 0xa11: return " SA1100";
+        default: break;
+        }
+    case 0x46:
+        switch( part )
+        {
+        case 0x1: return " A64FX";
+        default: break;
+        }
+    case 0x48:
+        switch( part )
+        {
+        case 0xd01: return " TSV100";
+        case 0xd40: return " Kirin 980";
+        default: break;
+        }
+    case 0x4e:
+        switch( part )
+        {
+        case 0x0: return " Denver";
+        case 0x3: return " Denver 2";
+        case 0x4: return " Carmel";
+        default: break;
+        }
+    case 0x50:
+        switch( part )
+        {
+        case 0x0: return " X-Gene";
+        default: break;
+        }
+    case 0x51:
+        switch( part )
+        {
+        case 0xf: return " Scorpion";
+        case 0x2d: return " Scorpion";
+        case 0x4d: return " Krait";
+        case 0x6f: return " Krait";
+        case 0x200: return " Kryo";
+        case 0x201: return " Kryo Silver (Snapdragon 821)";
+        case 0x205: return " Kryo Gold";
+        case 0x211: return " Kryo Silver (Snapdragon 820)";
+        case 0x800: return " Kryo 260 / 280 Gold";
+        case 0x801: return " Kryo 260 / 280 Silver";
+        case 0x802: return " Kryo 385 Gold";
+        case 0x803: return " Kryo 385 Silver";
+        case 0x804: return " Kryo 485 Gold";
+        case 0xc00: return " Falkor";
+        case 0xc01: return " Saphira";
+        default: break;
+        }
+    case 0x53:
+        switch( part )
+        {
+        case 0x1: return " Exynos M1/M2";
+        case 0x2: return " Exynos M3";
+        default: break;
+        }
+    case 0x56:
+        switch( part )
+        {
+        case 0x131: return " Feroceon 88FR131";
+        case 0x581: return " PJ4 / PJ4B";
+        case 0x584: return " PJ4B-MP / PJ4C";
+        default: break;
+        }
+    case 0x61:
+        switch( part )
+        {
+        case 0x1: return " Cyclone";
+        case 0x2: return " Typhoon";
+        case 0x3: return " Typhoon/Capri";
+        case 0x4: return " Twister";
+        case 0x5: return " Twister/Elba/Malta";
+        case 0x6: return " Hurricane";
+        case 0x7: return " Hurricane/Myst";
+        default: break;
+        }
+    case 0x66:
+        switch( part )
+        {
+        case 0x526: return " FA526";
+        case 0x626: return " FA626";
+        default: break;
+        }
+    case 0x68:
+        switch( part )
+        {
+        case 0x0: return " Phecda";
+        default: break;
+        }
+    default: break;
+    }
+    sprintf( buf, " 0x%x", part );
+    return buf;
+}
+
+#elif defined __APPLE__ && TARGET_OS_IPHONE == 1
+
+static const char* DecodeIosDevice( const char* id )
+{
+    static const char* DeviceTable[] = {
+        "i386", "32-bit simulator",
+        "x86_64", "64-bit simulator",
+        "iPhone1,1", "iPhone",
+        "iPhone1,2", "iPhone 3G",
+        "iPhone2,1", "iPhone 3GS",
+        "iPhone3,1", "iPhone 4 (GSM)",
+        "iPhone3,2", "iPhone 4 (GSM)",
+        "iPhone3,3", "iPhone 4 (CDMA)",
+        "iPhone4,1", "iPhone 4S",
+        "iPhone5,1", "iPhone 5 (A1428)",
+        "iPhone5,2", "iPhone 5 (A1429)",
+        "iPhone5,3", "iPhone 5c (A1456/A1532)",
+        "iPhone5,4", "iPhone 5c (A1507/A1516/1526/A1529)",
+        "iPhone6,1", "iPhone 5s (A1433/A1533)",
+        "iPhone6,2", "iPhone 5s (A1457/A1518/A1528/A1530)",
+        "iPhone7,1", "iPhone 6 Plus",
+        "iPhone7,2", "iPhone 6",
+        "iPhone8,1", "iPhone 6S",
+        "iPhone8,2", "iPhone 6S Plus",
+        "iPhone8,4", "iPhone SE",
+        "iPhone9,1", "iPhone 7 (CDMA)",
+        "iPhone9,2", "iPhone 7 Plus (CDMA)",
+        "iPhone9,3", "iPhone 7 (GSM)",
+        "iPhone9,4", "iPhone 7 Plus (GSM)",
+        "iPhone10,1", "iPhone 8 (CDMA)",
+        "iPhone10,2", "iPhone 8 Plus (CDMA)",
+        "iPhone10,3", "iPhone X (CDMA)",
+        "iPhone10,4", "iPhone 8 (GSM)",
+        "iPhone10,5", "iPhone 8 Plus (GSM)",
+        "iPhone10,6", "iPhone X (GSM)",
+        "iPhone11,2", "iPhone XS",
+        "iPhone11,4", "iPhone XS Max",
+        "iPhone11,6", "iPhone XS Max China",
+        "iPhone11,8", "iPhone XR",
+        "iPhone12,1", "iPhone 11",
+        "iPhone12,3", "iPhone 11 Pro",
+        "iPhone12,5", "iPhone 11 Pro Max",
+        "iPhone12,8", "iPhone SE 2nd Gen",
+        "iPad1,1", "iPad (A1219/A1337)",
+        "iPad2,1", "iPad 2 (A1395)",
+        "iPad2,2", "iPad 2 (A1396)",
+        "iPad2,3", "iPad 2 (A1397)",
+        "iPad2,4", "iPad 2 (A1395)",
+        "iPad2,5", "iPad Mini (A1432)",
+        "iPad2,6", "iPad Mini (A1454)",
+        "iPad2,7", "iPad Mini (A1455)",
+        "iPad3,1", "iPad 3 (A1416)",
+        "iPad3,2", "iPad 3 (A1403)",
+        "iPad3,3", "iPad 3 (A1430)",
+        "iPad3,4", "iPad 4 (A1458)",
+        "iPad3,5", "iPad 4 (A1459)",
+        "iPad3,6", "iPad 4 (A1460)",
+        "iPad4,1", "iPad Air (A1474)",
+        "iPad4,2", "iPad Air (A1475)",
+        "iPad4,3", "iPad Air (A1476)",
+        "iPad4,4", "iPad Mini 2 (A1489)",
+        "iPad4,5", "iPad Mini 2 (A1490)",
+        "iPad4,6", "iPad Mini 2 (A1491)",
+        "iPad4,7", "iPad Mini 3 (A1599)",
+        "iPad4,8", "iPad Mini 3 (A1600)",
+        "iPad4,9", "iPad Mini 3 (A1601)",
+        "iPad5,1", "iPad Mini 4 (A1538)",
+        "iPad5,2", "iPad Mini 4 (A1550)",
+        "iPad5,3", "iPad Air 2 (A1566)",
+        "iPad5,4", "iPad Air 2 (A1567)",
+        "iPad6,3", "iPad Pro 9.7\" (A1673)",
+        "iPad6,4", "iPad Pro 9.7\" (A1674)",
+        "iPad6,5", "iPad Pro 9.7\" (A1675)",
+        "iPad6,7", "iPad Pro 12.9\" (A1584)",
+        "iPad6,8", "iPad Pro 12.9\" (A1652)",
+        "iPad6,11", "iPad 5th gen (A1822)",
+        "iPad6,12", "iPad 5th gen (A1823)",
+        "iPad7,1", "iPad Pro 12.9\" 2nd gen (A1670)",
+        "iPad7,2", "iPad Pro 12.9\" 2nd gen (A1671/A1821)",
+        "iPad7,3", "iPad Pro 10.5\" (A1701)",
+        "iPad7,4", "iPad Pro 10.5\" (A1709)",
+        "iPad7,5", "iPad 6th gen (A1893)",
+        "iPad7,6", "iPad 6th gen (A1954)",
+        "iPad7,11", "iPad 7th gen 10.2\" (Wifi)",
+        "iPad7,12", "iPad 7th gen 10.2\" (Wifi+Cellular)",
+        "iPad8,1", "iPad Pro 11\" (A1980)",
+        "iPad8,2", "iPad Pro 11\" (A1980)",
+        "iPad8,3", "iPad Pro 11\" (A1934/A1979/A2013)",
+        "iPad8,4", "iPad Pro 11\" (A1934/A1979/A2013)",
+        "iPad8,5", "iPad Pro 12.9\" 3rd gen (A1876)",
+        "iPad8,6", "iPad Pro 12.9\" 3rd gen (A1876)",
+        "iPad8,7", "iPad Pro 12.9\" 3rd gen (A1895/A1983/A2014)",
+        "iPad8,8", "iPad Pro 12.9\" 3rd gen (A1895/A1983/A2014)",
+        "iPad8,9", "iPad Pro 11\" 2nd gen (Wifi)",
+        "iPad8,10", "iPad Pro 11\" 2nd gen (Wifi+Cellular)",
+        "iPad8,11", "iPad Pro 12.9\" 4th gen (Wifi)",
+        "iPad8,12", "iPad Pro 12.9\" 4th gen (Wifi+Cellular)",
+        "iPad11,1", "iPad Mini 5th gen (A2133)",
+        "iPad11,2", "iPad Mini 5th gen (A2124/A2125/A2126)",
+        "iPad11,3", "iPad Air 3rd gen (A2152)",
+        "iPad11,4", "iPad Air 3rd gen (A2123/A2153/A2154)",
+        "iPod1,1", "iPod Touch",
+        "iPod2,1", "iPod Touch 2nd gen",
+        "iPod3,1", "iPod Touch 3rd gen",
+        "iPod4,1", "iPod Touch 4th gen",
+        "iPod5,1", "iPod Touch 5th gen",
+        "iPod7,1", "iPod Touch 6th gen",
+        "iPod9,1", "iPod Touch 7th gen",
+        nullptr
+    };
+
+    auto ptr = DeviceTable;
+    while( *ptr )
+    {
+        if( strcmp( ptr[0], id ) == 0 ) return ptr[1];
+        ptr += 2;
+    }
+    return id;
+}
+
+#endif
+
+}
diff --git a/Source/ThirdParty/tracy/client/TracyCallstack.cpp b/Source/ThirdParty/tracy/client/TracyCallstack.cpp
new file mode 100644
index 000000000..10698cb19
--- /dev/null
+++ b/Source/ThirdParty/tracy/client/TracyCallstack.cpp
@@ -0,0 +1,768 @@
+#include <limits.h>
+#include <stdio.h>
+#include <string.h>
+#include "TracyCallstack.hpp"
+#include "TracyFastVector.hpp"
+#include "../common/TracyAlloc.hpp"
+
+#ifdef TRACY_HAS_CALLSTACK
+
+#if TRACY_HAS_CALLSTACK == 1
+#  ifndef NOMINMAX
+#    define NOMINMAX
+#  endif
+#  include <windows.h>
+#  include <psapi.h>
+#  ifdef _MSC_VER
+#    pragma warning( push )
+#    pragma warning( disable : 4091 )
+#  endif
+#  include <dbghelp.h>
+#  ifdef _MSC_VER
+#    pragma warning( pop )
+#  endif
+#elif TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 3 || TRACY_HAS_CALLSTACK == 4 || TRACY_HAS_CALLSTACK == 6
+#  include "../libbacktrace/backtrace.hpp"
+#  include <dlfcn.h>
+#  include <cxxabi.h>
+#elif TRACY_HAS_CALLSTACK == 5
+#  include <dlfcn.h>
+#  include <cxxabi.h>
+#endif
+
+#ifdef TRACY_DBGHELP_LOCK
+#  include "TracyProfiler.hpp"
+
+#  define DBGHELP_INIT TracyConcat( TRACY_DBGHELP_LOCK, Init() )
+#  define DBGHELP_LOCK TracyConcat( TRACY_DBGHELP_LOCK, Lock() );
+#  define DBGHELP_UNLOCK TracyConcat( TRACY_DBGHELP_LOCK, Unlock() );
+
+extern "C"
+{
+    void DBGHELP_INIT;
+    void DBGHELP_LOCK;
+    void DBGHELP_UNLOCK;
+};
+#endif
+
+namespace tracy
+{
+
+static inline char* CopyString( const char* src, size_t sz )
+{
+    assert( strlen( src ) == sz );
+    auto dst = (char*)tracy_malloc( sz + 1 );
+    memcpy( dst, src, sz );
+    dst[sz] = '\0';
+    return dst;
+}
+
+static inline char* CopyString( const char* src )
+{
+    const auto sz = strlen( src );
+    auto dst = (char*)tracy_malloc( sz + 1 );
+    memcpy( dst, src, sz );
+    dst[sz] = '\0';
+    return dst;
+}
+
+
+#if TRACY_HAS_CALLSTACK == 1
+
+enum { MaxCbTrace = 16 };
+enum { MaxNameSize = 8*1024 };
+
+int cb_num;
+CallstackEntry cb_data[MaxCbTrace];
+
+extern "C"
+{
+    typedef unsigned long (__stdcall *t_RtlWalkFrameChain)( void**, unsigned long, unsigned long );
+    t_RtlWalkFrameChain RtlWalkFrameChain = 0;
+}
+
+#if defined __MINGW32__ && API_VERSION_NUMBER < 12
+extern "C" {
+// Actual required API_VERSION_NUMBER is unknown because it is undocumented. These functions are not present in at least v11.
+DWORD IMAGEAPI SymAddrIncludeInlineTrace(HANDLE hProcess, DWORD64 Address);
+BOOL IMAGEAPI SymQueryInlineTrace(HANDLE hProcess, DWORD64 StartAddress, DWORD StartContext, DWORD64 StartRetAddress,
+    DWORD64 CurAddress, LPDWORD CurContext, LPDWORD CurFrameIndex);
+BOOL IMAGEAPI SymFromInlineContext(HANDLE hProcess, DWORD64 Address, ULONG InlineContext, PDWORD64 Displacement,
+    PSYMBOL_INFO Symbol);
+BOOL IMAGEAPI SymGetLineFromInlineContext(HANDLE hProcess, DWORD64 qwAddr, ULONG InlineContext,
+    DWORD64 qwModuleBaseAddress, PDWORD pdwDisplacement, PIMAGEHLP_LINE64 Line64);
+};
+#endif
+
+#ifndef __CYGWIN__
+struct ModuleCache
+{
+    uint64_t start;
+    uint64_t end;
+    char* name;
+};
+
+static FastVector<ModuleCache>* s_modCache;
+#endif
+
+void InitCallstack()
+{
+    RtlWalkFrameChain = (t_RtlWalkFrameChain)GetProcAddress( GetModuleHandleA( "ntdll.dll" ), "RtlWalkFrameChain" );
+
+#ifdef TRACY_DBGHELP_LOCK
+    DBGHELP_INIT;
+    DBGHELP_LOCK;
+#endif
+
+    //SymInitialize( GetCurrentProcess(), "C:\\Flax\\FlaxEngine\\Binaries\\Editor\\Win64\\Debug;C:\\Flax\\FlaxEngine\\Cache\\Projects", true );
+    SymInitialize( GetCurrentProcess(), nullptr, true );
+    SymSetOptions( SYMOPT_LOAD_LINES );
+
+#ifndef __CYGWIN__
+    HMODULE mod[1024];
+    DWORD needed;
+    HANDLE proc = GetCurrentProcess();
+
+    s_modCache = (FastVector<ModuleCache>*)tracy_malloc( sizeof( FastVector<ModuleCache> ) );
+    new(s_modCache) FastVector<ModuleCache>( 512 );
+
+    if( EnumProcessModules( proc, mod, sizeof( mod ), &needed ) != 0 )
+    {
+        const auto sz = needed / sizeof( HMODULE );
+        for( size_t i=0; i<sz; i++ )
+        {
+            MODULEINFO info;
+            if( GetModuleInformation( proc, mod[i], &info, sizeof( info ) ) != 0 )
+            {
+                const auto base = uint64_t( info.lpBaseOfDll );
+                char name[1024];
+                const auto res = GetModuleFileNameA( mod[i], name, 1021 );
+                if( res > 0 )
+                {
+                    auto ptr = name + res;
+                    while( ptr > name && *ptr != '\\' && *ptr != '/' ) ptr--;
+                    if( ptr > name ) ptr++;
+                    const auto namelen = name + res - ptr;
+                    auto cache = s_modCache->push_next();
+                    cache->start = base;
+                    cache->end = base + info.SizeOfImage;
+                    cache->name = (char*)tracy_malloc( namelen+3 );
+                    cache->name[0] = '[';
+                    memcpy( cache->name+1, ptr, namelen );
+                    cache->name[namelen+1] = ']';
+                    cache->name[namelen+2] = '\0';
+                }
+            }
+        }
+    }
+#endif
+
+#ifdef TRACY_DBGHELP_LOCK
+    DBGHELP_UNLOCK;
+#endif
+}
+
+TRACY_API uintptr_t* CallTrace( int depth )
+{
+    auto trace = (uintptr_t*)tracy_malloc( ( 1 + depth ) * sizeof( uintptr_t ) );
+    const auto num = RtlWalkFrameChain( (void**)( trace + 1 ), depth, 0 );
+    *trace = num;
+    return trace;
+}
+
+const char* DecodeCallstackPtrFast( uint64_t ptr )
+{
+    static char ret[MaxNameSize];
+    const auto proc = GetCurrentProcess();
+
+    char buf[sizeof( SYMBOL_INFO ) + MaxNameSize];
+    auto si = (SYMBOL_INFO*)buf;
+    si->SizeOfStruct = sizeof( SYMBOL_INFO );
+    si->MaxNameLen = MaxNameSize;
+
+#ifdef TRACY_DBGHELP_LOCK
+    DBGHELP_LOCK;
+#endif
+    if( SymFromAddr( proc, ptr, nullptr, si ) == 0 )
+    {
+        *ret = '\0';
+    }
+    else
+    {
+        memcpy( ret, si->Name, si->NameLen );
+        ret[si->NameLen] = '\0';
+    }
+#ifdef TRACY_DBGHELP_LOCK
+    DBGHELP_UNLOCK;
+#endif
+    return ret;
+}
+
+static const char* GetModuleName( uint64_t addr )
+{
+    if( ( addr & 0x8000000000000000 ) != 0 ) return "[kernel]";
+
+#ifndef __CYGWIN__
+    for( auto& v : *s_modCache )
+    {
+        if( addr >= v.start && addr < v.end )
+        {
+            return v.name;
+        }
+    }
+
+    HMODULE mod[1024];
+    DWORD needed;
+    HANDLE proc = GetCurrentProcess();
+
+    if( EnumProcessModules( proc, mod, sizeof( mod ), &needed ) != 0 )
+    {
+        const auto sz = needed / sizeof( HMODULE );
+        for( size_t i=0; i<sz; i++ )
+        {
+            MODULEINFO info;
+            if( GetModuleInformation( proc, mod[i], &info, sizeof( info ) ) != 0 )
+            {
+                const auto base = uint64_t( info.lpBaseOfDll );
+                if( addr >= base && addr < base + info.SizeOfImage )
+                {
+                    char name[1024];
+                    const auto res = GetModuleFileNameA( mod[i], name, 1021 );
+                    if( res > 0 )
+                    {
+                        auto ptr = name + res;
+                        while( ptr > name && *ptr != '\\' && *ptr != '/' ) ptr--;
+                        if( ptr > name ) ptr++;
+                        const auto namelen = name + res - ptr;
+                        auto cache = s_modCache->push_next();
+                        cache->start = base;
+                        cache->end = base + info.SizeOfImage;
+                        cache->name = (char*)tracy_malloc( namelen+3 );
+                        cache->name[0] = '[';
+                        memcpy( cache->name+1, ptr, namelen );
+                        cache->name[namelen+1] = ']';
+                        cache->name[namelen+2] = '\0';
+                        return cache->name;
+                    }
+                }
+            }
+        }
+    }
+#endif
+
+    return "[unknown]";
+}
+
+CallstackSymbolData DecodeSymbolAddress( uint64_t ptr )
+{
+    CallstackSymbolData sym;
+    IMAGEHLP_LINE64 line;
+    DWORD displacement = 0;
+    line.SizeOfStruct = sizeof(IMAGEHLP_LINE64);
+#ifdef TRACY_DBGHELP_LOCK
+    DBGHELP_LOCK;
+#endif
+    const auto res = SymGetLineFromAddr64( GetCurrentProcess(), ptr, &displacement, &line );
+#ifdef TRACY_DBGHELP_LOCK
+    DBGHELP_UNLOCK;
+#endif
+    if( res == 0 )
+    {
+        sym.file = "[unknown]";
+        sym.line = 0;
+    }
+    else
+    {
+        sym.file = line.FileName;
+        sym.line = line.LineNumber;
+    }
+    sym.needFree = false;
+    return sym;
+}
+
+CallstackSymbolData DecodeCodeAddress( uint64_t ptr )
+{
+    CallstackSymbolData sym;
+    const auto proc = GetCurrentProcess();
+    bool done = false;
+
+    IMAGEHLP_LINE64 line;
+    DWORD displacement = 0;
+    line.SizeOfStruct = sizeof(IMAGEHLP_LINE64);
+
+#ifdef TRACY_DBGHELP_LOCK
+    DBGHELP_LOCK;
+#endif
+#ifndef __CYGWIN__
+    DWORD inlineNum = SymAddrIncludeInlineTrace( proc, ptr );
+    DWORD ctx = 0;
+    DWORD idx;
+    BOOL doInline = FALSE;
+    if( inlineNum != 0 ) doInline = SymQueryInlineTrace( proc, ptr, 0, ptr, ptr, &ctx, &idx );
+    if( doInline )
+    {
+        if( SymGetLineFromInlineContext( proc, ptr, ctx, 0, &displacement, &line ) != 0 )
+        {
+            sym.file = line.FileName;
+            sym.line = line.LineNumber;
+            done = true;
+        }
+    }
+#endif
+    if( !done )
+    {
+        if( SymGetLineFromAddr64( proc, ptr, &displacement, &line ) == 0 )
+        {
+            sym.file = "[unknown]";
+            sym.line = 0;
+        }
+        else
+        {
+            sym.file = line.FileName;
+            sym.line = line.LineNumber;
+        }
+    }
+#ifdef TRACY_DBGHELP_LOCK
+    DBGHELP_UNLOCK;
+#endif
+    sym.needFree = false;
+    return sym;
+}
+
+CallstackEntryData DecodeCallstackPtr( uint64_t ptr )
+{
+    int write;
+    const auto proc = GetCurrentProcess();
+#ifdef TRACY_DBGHELP_LOCK
+    DBGHELP_LOCK;
+#endif
+#ifndef __CYGWIN__
+    DWORD inlineNum = SymAddrIncludeInlineTrace( proc, ptr );
+    if( inlineNum > MaxCbTrace - 1 ) inlineNum = MaxCbTrace - 1;
+    DWORD ctx = 0;
+    DWORD idx;
+    BOOL doInline = FALSE;
+    if( inlineNum != 0 ) doInline = SymQueryInlineTrace( proc, ptr, 0, ptr, ptr, &ctx, &idx );
+    if( doInline )
+    {
+        write = inlineNum;
+        cb_num = 1 + inlineNum;
+    }
+    else
+#endif
+    {
+        write = 0;
+        cb_num = 1;
+    }
+
+    char buf[sizeof( SYMBOL_INFO ) + MaxNameSize];
+    auto si = (SYMBOL_INFO*)buf;
+    si->SizeOfStruct = sizeof( SYMBOL_INFO );
+    si->MaxNameLen = MaxNameSize;
+
+    const auto moduleName = GetModuleName( ptr );
+    const auto symValid = SymFromAddr( proc, ptr, nullptr, si ) != 0;
+
+    IMAGEHLP_LINE64 line;
+    DWORD displacement = 0;
+    line.SizeOfStruct = sizeof(IMAGEHLP_LINE64);
+
+    {
+        const char* filename;
+        if( SymGetLineFromAddr64( proc, ptr, &displacement, &line ) == 0 )
+        {
+            filename = "[unknown]";
+            cb_data[write].line = 0;
+        }
+        else
+        {
+            filename = line.FileName;
+            cb_data[write].line = line.LineNumber;
+        }
+
+        cb_data[write].name = symValid ? CopyString( si->Name, si->NameLen ) : CopyString( moduleName );
+        cb_data[write].file = CopyString( filename );
+        if( symValid )
+        {
+            cb_data[write].symLen = si->Size;
+            cb_data[write].symAddr = si->Address;
+        }
+        else
+        {
+            cb_data[write].symLen = 0;
+            cb_data[write].symAddr = 0;
+        }
+    }
+
+#ifndef __CYGWIN__
+    if( doInline )
+    {
+        for( DWORD i=0; i<inlineNum; i++ )
+        {
+            auto& cb = cb_data[i+1];
+            const auto symInlineValid = SymFromInlineContext( proc, ptr, ctx, nullptr, si ) != 0;
+            const char* filename;
+            if( SymGetLineFromInlineContext( proc, ptr, ctx, 0, &displacement, &line ) == 0 )
+            {
+                filename = "[unknown]";
+                cb.line = 0;
+            }
+            else
+            {
+                filename = line.FileName;
+                cb.line = line.LineNumber;
+            }
+
+            cb.name = symInlineValid ? CopyString( si->Name, si->NameLen ) : CopyString( moduleName );
+            cb.file = CopyString( filename );
+            if( symInlineValid )
+            {
+                cb.symLen = si->Size;
+                cb.symAddr = si->Address;
+            }
+            else
+            {
+                cb.symLen = 0;
+                cb.symAddr = 0;
+            }
+
+            ctx++;
+        }
+    }
+#endif
+#ifdef TRACY_DBGHELP_LOCK
+    DBGHELP_UNLOCK;
+#endif
+
+    return { cb_data, uint8_t( cb_num ), moduleName };
+}
+
+#elif TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 3 || TRACY_HAS_CALLSTACK == 4 || TRACY_HAS_CALLSTACK == 6
+
+enum { MaxCbTrace = 16 };
+
+struct backtrace_state* cb_bts;
+int cb_num;
+CallstackEntry cb_data[MaxCbTrace];
+int cb_fixup;
+
+void InitCallstack()
+{
+    cb_bts = backtrace_create_state( nullptr, 0, nullptr, nullptr );
+}
+
+static int FastCallstackDataCb( void* data, uintptr_t pc, uintptr_t lowaddr, const char* fn, int lineno, const char* function )
+{
+    if( function )
+    {
+        strcpy( (char*)data, function );
+    }
+    else
+    {
+        const char* symname = nullptr;
+        auto vptr = (void*)pc;
+        Dl_info dlinfo;
+        if( dladdr( vptr, &dlinfo ) )
+        {
+            symname = dlinfo.dli_sname;
+        }
+        if( symname )
+        {
+            strcpy( (char*)data, symname );
+        }
+        else
+        {
+            *(char*)data = '\0';
+        }
+    }
+    return 1;
+}
+
+static void FastCallstackErrorCb( void* data, const char* /*msg*/, int /*errnum*/ )
+{
+    *(char*)data = '\0';
+}
+
+const char* DecodeCallstackPtrFast( uint64_t ptr )
+{
+    static char ret[1024];
+    backtrace_pcinfo( cb_bts, ptr, FastCallstackDataCb, FastCallstackErrorCb, ret );
+    return ret;
+}
+
+static int SymbolAddressDataCb( void* data, uintptr_t pc, uintptr_t lowaddr, const char* fn, int lineno, const char* function )
+{
+    auto& sym = *(CallstackSymbolData*)data;
+    if( !fn )
+    {
+        sym.file = "[unknown]";
+        sym.line = 0;
+        sym.needFree = false;
+    }
+    else
+    {
+        sym.file = CopyString( fn );
+        sym.line = lineno;
+        sym.needFree = true;
+    }
+
+    return 1;
+}
+
+static void SymbolAddressErrorCb( void* data, const char* /*msg*/, int /*errnum*/ )
+{
+    auto& sym = *(CallstackSymbolData*)data;
+    sym.file = "[unknown]";
+    sym.line = 0;
+    sym.needFree = false;
+}
+
+CallstackSymbolData DecodeSymbolAddress( uint64_t ptr )
+{
+    CallstackSymbolData sym;
+    backtrace_pcinfo( cb_bts, ptr, SymbolAddressDataCb, SymbolAddressErrorCb, &sym );
+    return sym;
+}
+
+CallstackSymbolData DecodeCodeAddress( uint64_t ptr )
+{
+    return DecodeSymbolAddress( ptr );
+}
+
+static int CallstackDataCb( void* /*data*/, uintptr_t pc, uintptr_t lowaddr, const char* fn, int lineno, const char* function )
+{
+    enum { DemangleBufLen = 64*1024 };
+    char demangled[DemangleBufLen];
+
+    cb_data[cb_num].symLen = 0;
+    cb_data[cb_num].symAddr = (uint64_t)lowaddr;
+
+    if( !fn && !function )
+    {
+        const char* symname = nullptr;
+        auto vptr = (void*)pc;
+        ptrdiff_t symoff = 0;
+
+        Dl_info dlinfo;
+        if( dladdr( vptr, &dlinfo ) )
+        {
+            symname = dlinfo.dli_sname;
+            symoff = (char*)pc - (char*)dlinfo.dli_saddr;
+
+            if( symname && symname[0] == '_' )
+            {
+                size_t len = DemangleBufLen;
+                int status;
+                abi::__cxa_demangle( symname, demangled, &len, &status );
+                if( status == 0 )
+                {
+                    symname = demangled;
+                }
+            }
+        }
+
+        if( !symname ) symname = "[unknown]";
+
+        if( symoff == 0 )
+        {
+            cb_data[cb_num].name = CopyString( symname );
+        }
+        else
+        {
+            char buf[32];
+            const auto offlen = sprintf( buf, " + %td", symoff );
+            const auto namelen = strlen( symname );
+            auto name = (char*)tracy_malloc( namelen + offlen + 1 );
+            memcpy( name, symname, namelen );
+            memcpy( name + namelen, buf, offlen );
+            name[namelen + offlen] = '\0';
+            cb_data[cb_num].name = name;
+        }
+
+        cb_data[cb_num].file = CopyString( "[unknown]" );
+        cb_data[cb_num].line = 0;
+    }
+    else
+    {
+        if( !fn ) fn = "[unknown]";
+        if( !function )
+        {
+            function = "[unknown]";
+        }
+        else
+        {
+            if( function[0] == '_' )
+            {
+                size_t len = DemangleBufLen;
+                int status;
+                abi::__cxa_demangle( function, demangled, &len, &status );
+                if( status == 0 )
+                {
+                    function = demangled;
+                }
+            }
+        }
+
+        cb_data[cb_num].name = CopyString( function );
+        cb_data[cb_num].file = CopyString( fn );
+        cb_data[cb_num].line = lineno;
+    }
+
+    if( ++cb_num >= MaxCbTrace )
+    {
+        return 1;
+    }
+    else
+    {
+        return 0;
+    }
+}
+
+static void CallstackErrorCb( void* /*data*/, const char* /*msg*/, int /*errnum*/ )
+{
+    for( int i=0; i<cb_num; i++ )
+    {
+        tracy_free( (void*)cb_data[i].name );
+        tracy_free( (void*)cb_data[i].file );
+    }
+
+    cb_data[0].name = CopyString( "[error]" );
+    cb_data[0].file = CopyString( "[error]" );
+    cb_data[0].line = 0;
+    cb_data[0].symLen = 0;
+    cb_data[0].symAddr = 0;
+
+    cb_num = 1;
+}
+
+void SymInfoCallback( void* /*data*/, uintptr_t /*pc*/, const char* /*symname*/, uintptr_t symval, uintptr_t symsize )
+{
+    cb_data[cb_num-1].symLen = (uint32_t)symsize;
+    cb_data[cb_num-1].symAddr = (uint64_t)symval;
+}
+
+void SymInfoError( void* /*data*/, const char* /*msg*/, int /*errnum*/ )
+{
+    cb_data[cb_num-1].symLen = 0;
+    cb_data[cb_num-1].symAddr = 0;
+}
+
+CallstackEntryData DecodeCallstackPtr( uint64_t ptr )
+{
+    cb_num = 0;
+    backtrace_pcinfo( cb_bts, ptr, CallstackDataCb, CallstackErrorCb, nullptr );
+    assert( cb_num > 0 );
+
+    backtrace_syminfo( cb_bts, ptr, SymInfoCallback, SymInfoError, nullptr );
+
+    const char* symloc = nullptr;
+    Dl_info dlinfo;
+    if( dladdr( (void*)ptr, &dlinfo ) ) symloc = dlinfo.dli_fname;
+
+    return { cb_data, uint8_t( cb_num ), symloc ? symloc : "[unknown]" };
+}
+
+#elif TRACY_HAS_CALLSTACK == 5
+
+void InitCallstack()
+{
+}
+
+const char* DecodeCallstackPtrFast( uint64_t ptr )
+{
+    static char ret[1024];
+    auto vptr = (void*)ptr;
+    const char* symname = nullptr;
+    Dl_info dlinfo;
+    if( dladdr( vptr, &dlinfo ) && dlinfo.dli_sname )
+    {
+        symname = dlinfo.dli_sname;
+    }
+    if( symname )
+    {
+        strcpy( ret, symname );
+    }
+    else
+    {
+        *ret = '\0';
+    }
+    return ret;
+}
+
+CallstackSymbolData DecodeSymbolAddress( uint64_t ptr )
+{
+    const char* symloc = nullptr;
+    Dl_info dlinfo;
+    if( dladdr( (void*)ptr, &dlinfo ) ) symloc = dlinfo.dli_fname;
+    if( !symloc ) symloc = "[unknown]";
+    return CallstackSymbolData { symloc, 0, false };
+}
+
+CallstackSymbolData DecodeCodeAddress( uint64_t ptr )
+{
+    return DecodeSymbolAddress( ptr );
+}
+
+CallstackEntryData DecodeCallstackPtr( uint64_t ptr )
+{
+    static CallstackEntry cb;
+    cb.line = 0;
+
+    char* demangled = nullptr;
+    const char* symname = nullptr;
+    const char* symloc = nullptr;
+    auto vptr = (void*)ptr;
+    ptrdiff_t symoff = 0;
+    void* symaddr = nullptr;
+
+    Dl_info dlinfo;
+    if( dladdr( vptr, &dlinfo ) )
+    {
+        symloc = dlinfo.dli_fname;
+        symname = dlinfo.dli_sname;
+        symoff = (char*)ptr - (char*)dlinfo.dli_saddr;
+        symaddr = dlinfo.dli_saddr;
+
+        if( symname && symname[0] == '_' )
+        {
+            size_t len = 0;
+            int status;
+            demangled = abi::__cxa_demangle( symname, nullptr, &len, &status );
+            if( status == 0 )
+            {
+                symname = demangled;
+            }
+        }
+    }
+
+    if( !symname ) symname = "[unknown]";
+    if( !symloc ) symloc = "[unknown]";
+
+    if( symoff == 0 )
+    {
+        cb.name = CopyString( symname );
+    }
+    else
+    {
+        char buf[32];
+        const auto offlen = sprintf( buf, " + %td", symoff );
+        const auto namelen = strlen( symname );
+        auto name = (char*)tracy_malloc( namelen + offlen + 1 );
+        memcpy( name, symname, namelen );
+        memcpy( name + namelen, buf, offlen );
+        name[namelen + offlen] = '\0';
+        cb.name = name;
+    }
+
+    cb.file = CopyString( "[unknown]" );
+    cb.symLen = 0;
+    cb.symAddr = (uint64_t)symaddr;
+
+    if( demangled ) free( demangled );
+
+    return { &cb, 1, symloc };
+}
+
+#endif
+
+}
+
+#endif
diff --git a/Source/ThirdParty/tracy/client/TracyCallstack.h b/Source/ThirdParty/tracy/client/TracyCallstack.h
new file mode 100644
index 000000000..87d8ce721
--- /dev/null
+++ b/Source/ThirdParty/tracy/client/TracyCallstack.h
@@ -0,0 +1,28 @@
+#ifndef __TRACYCALLSTACK_H__
+#define __TRACYCALLSTACK_H__
+
+#if !defined _WIN32 && !defined __CYGWIN__
+#  include <sys/param.h>
+#endif
+
+#if defined _WIN32 || defined __CYGWIN__
+#  define TRACY_HAS_CALLSTACK 1
+#elif defined __ANDROID__
+#  if !defined __arm__ || __ANDROID_API__ >= 21
+#    define TRACY_HAS_CALLSTACK 2
+#  else
+#    define TRACY_HAS_CALLSTACK 5
+#  endif
+#elif defined __linux
+#  if defined _GNU_SOURCE && defined __GLIBC__
+#    define TRACY_HAS_CALLSTACK 3
+#  else
+#    define TRACY_HAS_CALLSTACK 2
+#  endif
+#elif defined __APPLE__
+#  define TRACY_HAS_CALLSTACK 4
+#elif defined BSD
+#  define TRACY_HAS_CALLSTACK 6
+#endif
+
+#endif
"../common/TracyAlloc.hpp" + +namespace tracy +{ + +struct CallstackSymbolData +{ + const char* file; + uint32_t line; + bool needFree; +}; + +struct CallstackEntry +{ + const char* name; + const char* file; + uint32_t line; + uint32_t symLen; + uint64_t symAddr; +}; + +struct CallstackEntryData +{ + const CallstackEntry* data; + uint8_t size; + const char* imageName; +}; + +CallstackSymbolData DecodeSymbolAddress( uint64_t ptr ); +CallstackSymbolData DecodeCodeAddress( uint64_t ptr ); +const char* DecodeCallstackPtrFast( uint64_t ptr ); +CallstackEntryData DecodeCallstackPtr( uint64_t ptr ); +void InitCallstack(); + +#if TRACY_HAS_CALLSTACK == 1 + +TRACY_API uintptr_t* CallTrace( int depth ); + +static tracy_force_inline void* Callstack( int depth ) +{ + assert( depth >= 1 && depth < 63 ); + return CallTrace( depth ); +} + +#elif TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 5 + +struct BacktraceState +{ + void** current; + void** end; +}; + +static _Unwind_Reason_Code tracy_unwind_callback( struct _Unwind_Context* ctx, void* arg ) +{ + auto state = (BacktraceState*)arg; + uintptr_t pc = _Unwind_GetIP( ctx ); + if( pc ) + { + if( state->current == state->end ) return _URC_END_OF_STACK; + *state->current++ = (void*)pc; + } + return _URC_NO_REASON; +} + +static tracy_force_inline void* Callstack( int depth ) +{ + assert( depth >= 1 && depth < 63 ); + + auto trace = (uintptr_t*)tracy_malloc( ( 1 + depth ) * sizeof( uintptr_t ) ); + BacktraceState state = { (void**)(trace+1), (void**)(trace+1+depth) }; + _Unwind_Backtrace( tracy_unwind_callback, &state ); + + *trace = (uintptr_t*)state.current - trace + 1; + + return trace; +} + +#elif TRACY_HAS_CALLSTACK == 3 || TRACY_HAS_CALLSTACK == 4 || TRACY_HAS_CALLSTACK == 6 + +static tracy_force_inline void* Callstack( int depth ) +{ + assert( depth >= 1 ); + + auto trace = (uintptr_t*)tracy_malloc( ( 1 + (size_t)depth ) * sizeof( uintptr_t ) ); + const auto num = (size_t)backtrace( (void**)(trace+1), depth ); + *trace = num; + + return trace; +} + +#endif + +} + +#endif + +#endif diff --git a/Source/ThirdParty/tracy/client/TracyFastVector.hpp b/Source/ThirdParty/tracy/client/TracyFastVector.hpp new file mode 100644 index 000000000..fc4108016 --- /dev/null +++ b/Source/ThirdParty/tracy/client/TracyFastVector.hpp @@ -0,0 +1,117 @@ +#ifndef __TRACYFASTVECTOR_HPP__ +#define __TRACYFASTVECTOR_HPP__ + +#include +#include + +#include "../common/TracyAlloc.hpp" + +namespace tracy +{ + +template +class FastVector +{ +public: + using iterator = T*; + using const_iterator = const T*; + + FastVector( size_t capacity ) + : m_ptr( (T*)tracy_malloc( sizeof( T ) * capacity ) ) + , m_write( m_ptr ) + , m_end( m_ptr + capacity ) + { + assert( capacity != 0 ); + } + + FastVector( const FastVector& ) = delete; + FastVector( FastVector&& ) = delete; + + ~FastVector() + { + tracy_free( m_ptr ); + } + + FastVector& operator=( const FastVector& ) = delete; + FastVector& operator=( FastVector&& ) = delete; + + bool empty() const { return m_ptr == m_write; } + size_t size() const { return m_write - m_ptr; } + + T* data() { return m_ptr; } + const T* data() const { return m_ptr; }; + + T* begin() { return m_ptr; } + const T* begin() const { return m_ptr; } + T* end() { return m_write; } + const T* end() const { return m_write; } + + T& front() { assert( !empty() ); return m_ptr[0]; } + const T& front() const { assert( !empty() ); return m_ptr[0]; } + + T& back() { assert( !empty() ); return m_write[-1]; } + const T& back() const { assert( !empty() ); return m_write[-1]; } 
+ + T& operator[]( size_t idx ) { return m_ptr[idx]; } + const T& operator[]( size_t idx ) const { return m_ptr[idx]; } + + T* push_next() + { + if( m_write == m_end ) AllocMore(); + return m_write++; + } + + T* prepare_next() + { + if( m_write == m_end ) AllocMore(); + return m_write; + } + + void commit_next() + { + m_write++; + } + + void clear() + { + m_write = m_ptr; + } + + void swap( FastVector& vec ) + { + const auto ptr1 = m_ptr; + const auto ptr2 = vec.m_ptr; + const auto write1 = m_write; + const auto write2 = vec.m_write; + const auto end1 = m_end; + const auto end2 = vec.m_end; + + m_ptr = ptr2; + vec.m_ptr = ptr1; + m_write = write2; + vec.m_write = write1; + m_end = end2; + vec.m_end = end1; + } + +private: + tracy_no_inline void AllocMore() + { + const auto cap = size_t( m_end - m_ptr ) * 2; + const auto size = size_t( m_write - m_ptr ); + T* ptr = (T*)tracy_malloc( sizeof( T ) * cap ); + memcpy( ptr, m_ptr, size * sizeof( T ) ); + tracy_free( m_ptr ); + m_ptr = ptr; + m_write = m_ptr + size; + m_end = m_ptr + cap; + } + + T* m_ptr; + T* m_write; + T* m_end; +}; + +} + +#endif diff --git a/Source/ThirdParty/tracy/client/TracyLock.hpp b/Source/ThirdParty/tracy/client/TracyLock.hpp new file mode 100644 index 000000000..e513cdc5d --- /dev/null +++ b/Source/ThirdParty/tracy/client/TracyLock.hpp @@ -0,0 +1,548 @@ +#ifndef __TRACYLOCK_HPP__ +#define __TRACYLOCK_HPP__ + +#include +#include + +#include "../common/TracySystem.hpp" +#include "../common/TracyAlign.hpp" +#include "TracyProfiler.hpp" + +namespace tracy +{ + +class LockableCtx +{ +public: + tracy_force_inline LockableCtx( const SourceLocationData* srcloc ) + : m_id( GetLockCounter().fetch_add( 1, std::memory_order_relaxed ) ) +#ifdef TRACY_ON_DEMAND + , m_lockCount( 0 ) + , m_active( false ) +#endif + { + assert( m_id != std::numeric_limits::max() ); + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockAnnounce ); + MemWrite( &item->lockAnnounce.id, m_id ); + MemWrite( &item->lockAnnounce.time, Profiler::GetTime() ); + MemWrite( &item->lockAnnounce.lckloc, (uint64_t)srcloc ); + MemWrite( &item->lockAnnounce.type, LockType::Lockable ); +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + Profiler::QueueSerialFinish(); + } + + LockableCtx( const LockableCtx& ) = delete; + LockableCtx& operator=( const LockableCtx& ) = delete; + + tracy_force_inline ~LockableCtx() + { + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockTerminate ); + MemWrite( &item->lockTerminate.id, m_id ); + MemWrite( &item->lockTerminate.time, Profiler::GetTime() ); +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + Profiler::QueueSerialFinish(); + } + + tracy_force_inline bool BeforeLock() + { +#ifdef TRACY_ON_DEMAND + bool queue = false; + const auto locks = m_lockCount.fetch_add( 1, std::memory_order_relaxed ); + const auto active = m_active.load( std::memory_order_relaxed ); + if( locks == 0 || active ) + { + const bool connected = GetProfiler().IsConnected(); + if( active != connected ) m_active.store( connected, std::memory_order_relaxed ); + if( connected ) queue = true; + } + if( !queue ) return false; +#endif + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockWait ); + MemWrite( &item->lockWait.thread, GetThreadHandle() ); + MemWrite( &item->lockWait.id, m_id ); + MemWrite( &item->lockWait.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + return true; + } + + tracy_force_inline void AfterLock() + { + 
auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockObtain ); + MemWrite( &item->lockObtain.thread, GetThreadHandle() ); + MemWrite( &item->lockObtain.id, m_id ); + MemWrite( &item->lockObtain.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline void AfterUnlock() + { +#ifdef TRACY_ON_DEMAND + m_lockCount.fetch_sub( 1, std::memory_order_relaxed ); + if( !m_active.load( std::memory_order_relaxed ) ) return; + if( !GetProfiler().IsConnected() ) + { + m_active.store( false, std::memory_order_relaxed ); + return; + } +#endif + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockRelease ); + MemWrite( &item->lockRelease.thread, GetThreadHandle() ); + MemWrite( &item->lockRelease.id, m_id ); + MemWrite( &item->lockRelease.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline void AfterTryLock( bool acquired ) + { +#ifdef TRACY_ON_DEMAND + if( !acquired ) return; + + bool queue = false; + const auto locks = m_lockCount.fetch_add( 1, std::memory_order_relaxed ); + const auto active = m_active.load( std::memory_order_relaxed ); + if( locks == 0 || active ) + { + const bool connected = GetProfiler().IsConnected(); + if( active != connected ) m_active.store( connected, std::memory_order_relaxed ); + if( connected ) queue = true; + } + if( !queue ) return; +#endif + + if( acquired ) + { + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockObtain ); + MemWrite( &item->lockObtain.thread, GetThreadHandle() ); + MemWrite( &item->lockObtain.id, m_id ); + MemWrite( &item->lockObtain.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + } + } + + tracy_force_inline void Mark( const SourceLocationData* srcloc ) + { +#ifdef TRACY_ON_DEMAND + const auto active = m_active.load( std::memory_order_relaxed ); + if( !active ) return; + const auto connected = GetProfiler().IsConnected(); + if( !connected ) + { + if( active ) m_active.store( false, std::memory_order_relaxed ); + return; + } +#endif + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockMark ); + MemWrite( &item->lockMark.thread, GetThreadHandle() ); + MemWrite( &item->lockMark.id, m_id ); + MemWrite( &item->lockMark.srcloc, (uint64_t)srcloc ); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline void CustomName( const char* name, size_t size ) + { + assert( size < std::numeric_limits::max() ); + auto ptr = (char*)tracy_malloc( size ); + memcpy( ptr, name, size ); + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockName ); + MemWrite( &item->lockNameFat.id, m_id ); + MemWrite( &item->lockNameFat.name, (uint64_t)ptr ); + MemWrite( &item->lockNameFat.size, (uint16_t)size ); +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + Profiler::QueueSerialFinish(); + } + +private: + uint32_t m_id; + +#ifdef TRACY_ON_DEMAND + std::atomic m_lockCount; + std::atomic m_active; +#endif +}; + +template +class Lockable +{ +public: + tracy_force_inline Lockable( const SourceLocationData* srcloc ) + : m_ctx( srcloc ) + { + } + + Lockable( const Lockable& ) = delete; + Lockable& operator=( const Lockable& ) = delete; + + tracy_force_inline void lock() + { + const auto runAfter = m_ctx.BeforeLock(); + m_lockable.lock(); + if( runAfter ) m_ctx.AfterLock(); + } + + tracy_force_inline void unlock() + { + m_lockable.unlock(); + m_ctx.AfterUnlock(); + } + + tracy_force_inline bool try_lock() + { + const auto acquired = 
m_lockable.try_lock(); + m_ctx.AfterTryLock( acquired ); + return acquired; + } + + tracy_force_inline void Mark( const SourceLocationData* srcloc ) + { + m_ctx.Mark( srcloc ); + } + + tracy_force_inline void CustomName( const char* name, size_t size ) + { + m_ctx.CustomName( name, size ); + } + +private: + T m_lockable; + LockableCtx m_ctx; +}; + + +class SharedLockableCtx +{ +public: + tracy_force_inline SharedLockableCtx( const SourceLocationData* srcloc ) + : m_id( GetLockCounter().fetch_add( 1, std::memory_order_relaxed ) ) +#ifdef TRACY_ON_DEMAND + , m_lockCount( 0 ) + , m_active( false ) +#endif + { + assert( m_id != std::numeric_limits::max() ); + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockAnnounce ); + MemWrite( &item->lockAnnounce.id, m_id ); + MemWrite( &item->lockAnnounce.time, Profiler::GetTime() ); + MemWrite( &item->lockAnnounce.lckloc, (uint64_t)srcloc ); + MemWrite( &item->lockAnnounce.type, LockType::SharedLockable ); +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + Profiler::QueueSerialFinish(); + } + + SharedLockableCtx( const SharedLockableCtx& ) = delete; + SharedLockableCtx& operator=( const SharedLockableCtx& ) = delete; + + tracy_force_inline ~SharedLockableCtx() + { + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockTerminate ); + MemWrite( &item->lockTerminate.id, m_id ); + MemWrite( &item->lockTerminate.time, Profiler::GetTime() ); +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + Profiler::QueueSerialFinish(); + } + + tracy_force_inline bool BeforeLock() + { +#ifdef TRACY_ON_DEMAND + bool queue = false; + const auto locks = m_lockCount.fetch_add( 1, std::memory_order_relaxed ); + const auto active = m_active.load( std::memory_order_relaxed ); + if( locks == 0 || active ) + { + const bool connected = GetProfiler().IsConnected(); + if( active != connected ) m_active.store( connected, std::memory_order_relaxed ); + if( connected ) queue = true; + } + if( !queue ) return false; +#endif + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockWait ); + MemWrite( &item->lockWait.thread, GetThreadHandle() ); + MemWrite( &item->lockWait.id, m_id ); + MemWrite( &item->lockWait.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + return true; + } + + tracy_force_inline void AfterLock() + { + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockObtain ); + MemWrite( &item->lockObtain.thread, GetThreadHandle() ); + MemWrite( &item->lockObtain.id, m_id ); + MemWrite( &item->lockObtain.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline void AfterUnlock() + { +#ifdef TRACY_ON_DEMAND + m_lockCount.fetch_sub( 1, std::memory_order_relaxed ); + if( !m_active.load( std::memory_order_relaxed ) ) return; + if( !GetProfiler().IsConnected() ) + { + m_active.store( false, std::memory_order_relaxed ); + return; + } +#endif + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockRelease ); + MemWrite( &item->lockRelease.thread, GetThreadHandle() ); + MemWrite( &item->lockRelease.id, m_id ); + MemWrite( &item->lockRelease.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline void AfterTryLock( bool acquired ) + { +#ifdef TRACY_ON_DEMAND + if( !acquired ) return; + + bool queue = false; + const auto locks = m_lockCount.fetch_add( 1, std::memory_order_relaxed ); + const auto active = m_active.load( 
std::memory_order_relaxed ); + if( locks == 0 || active ) + { + const bool connected = GetProfiler().IsConnected(); + if( active != connected ) m_active.store( connected, std::memory_order_relaxed ); + if( connected ) queue = true; + } + if( !queue ) return; +#endif + + if( acquired ) + { + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockObtain ); + MemWrite( &item->lockObtain.thread, GetThreadHandle() ); + MemWrite( &item->lockObtain.id, m_id ); + MemWrite( &item->lockObtain.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + } + } + + tracy_force_inline bool BeforeLockShared() + { +#ifdef TRACY_ON_DEMAND + bool queue = false; + const auto locks = m_lockCount.fetch_add( 1, std::memory_order_relaxed ); + const auto active = m_active.load( std::memory_order_relaxed ); + if( locks == 0 || active ) + { + const bool connected = GetProfiler().IsConnected(); + if( active != connected ) m_active.store( connected, std::memory_order_relaxed ); + if( connected ) queue = true; + } + if( !queue ) return false; +#endif + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockSharedWait ); + MemWrite( &item->lockWait.thread, GetThreadHandle() ); + MemWrite( &item->lockWait.id, m_id ); + MemWrite( &item->lockWait.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + return true; + } + + tracy_force_inline void AfterLockShared() + { + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockSharedObtain ); + MemWrite( &item->lockObtain.thread, GetThreadHandle() ); + MemWrite( &item->lockObtain.id, m_id ); + MemWrite( &item->lockObtain.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline void AfterUnlockShared() + { +#ifdef TRACY_ON_DEMAND + m_lockCount.fetch_sub( 1, std::memory_order_relaxed ); + if( !m_active.load( std::memory_order_relaxed ) ) return; + if( !GetProfiler().IsConnected() ) + { + m_active.store( false, std::memory_order_relaxed ); + return; + } +#endif + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockSharedRelease ); + MemWrite( &item->lockRelease.thread, GetThreadHandle() ); + MemWrite( &item->lockRelease.id, m_id ); + MemWrite( &item->lockRelease.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline void AfterTryLockShared( bool acquired ) + { +#ifdef TRACY_ON_DEMAND + if( !acquired ) return; + + bool queue = false; + const auto locks = m_lockCount.fetch_add( 1, std::memory_order_relaxed ); + const auto active = m_active.load( std::memory_order_relaxed ); + if( locks == 0 || active ) + { + const bool connected = GetProfiler().IsConnected(); + if( active != connected ) m_active.store( connected, std::memory_order_relaxed ); + if( connected ) queue = true; + } + if( !queue ) return; +#endif + + if( acquired ) + { + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockSharedObtain ); + MemWrite( &item->lockObtain.thread, GetThreadHandle() ); + MemWrite( &item->lockObtain.id, m_id ); + MemWrite( &item->lockObtain.time, Profiler::GetTime() ); + Profiler::QueueSerialFinish(); + } + } + + tracy_force_inline void Mark( const SourceLocationData* srcloc ) + { +#ifdef TRACY_ON_DEMAND + const auto active = m_active.load( std::memory_order_relaxed ); + if( !active ) return; + const auto connected = GetProfiler().IsConnected(); + if( !connected ) + { + if( active ) m_active.store( false, std::memory_order_relaxed ); + return; + } +#endif + + auto item = 
Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockMark ); + MemWrite( &item->lockMark.thread, GetThreadHandle() ); + MemWrite( &item->lockMark.id, m_id ); + MemWrite( &item->lockMark.srcloc, (uint64_t)srcloc ); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline void CustomName( const char* name, size_t size ) + { + assert( size < std::numeric_limits::max() ); + auto ptr = (char*)tracy_malloc( size ); + memcpy( ptr, name, size ); + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::LockName ); + MemWrite( &item->lockNameFat.id, m_id ); + MemWrite( &item->lockNameFat.name, (uint64_t)ptr ); + MemWrite( &item->lockNameFat.size, (uint16_t)size ); +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + Profiler::QueueSerialFinish(); + } + +private: + uint32_t m_id; + +#ifdef TRACY_ON_DEMAND + std::atomic m_lockCount; + std::atomic m_active; +#endif +}; + +template +class SharedLockable +{ +public: + tracy_force_inline SharedLockable( const SourceLocationData* srcloc ) + : m_ctx( srcloc ) + { + } + + SharedLockable( const SharedLockable& ) = delete; + SharedLockable& operator=( const SharedLockable& ) = delete; + + tracy_force_inline void lock() + { + const auto runAfter = m_ctx.BeforeLock(); + m_lockable.lock(); + if( runAfter ) m_ctx.AfterLock(); + } + + tracy_force_inline void unlock() + { + m_lockable.unlock(); + m_ctx.AfterUnlock(); + } + + tracy_force_inline bool try_lock() + { + const auto acquired = m_lockable.try_lock(); + m_ctx.AfterTryLock( acquired ); + return acquired; + } + + tracy_force_inline void lock_shared() + { + const auto runAfter = m_ctx.BeforeLockShared(); + m_lockable.lock_shared(); + if( runAfter ) m_ctx.AfterLockShared(); + } + + tracy_force_inline void unlock_shared() + { + m_lockable.unlock_shared(); + m_ctx.AfterUnlockShared(); + } + + tracy_force_inline bool try_lock_shared() + { + const auto acquired = m_lockable.try_lock_shared(); + m_ctx.AfterTryLockShared( acquired ); + return acquired; + } + + tracy_force_inline void Mark( const SourceLocationData* srcloc ) + { + m_ctx.Mark( srcloc ); + } + + tracy_force_inline void CustomName( const char* name, size_t size ) + { + m_ctx.CustomName( name, size ); + } + +private: + T m_lockable; + SharedLockableCtx m_ctx; +}; + + +} + +#endif diff --git a/Source/ThirdParty/tracy/client/TracyProfiler.cpp b/Source/ThirdParty/tracy/client/TracyProfiler.cpp new file mode 100644 index 000000000..b8783a0eb --- /dev/null +++ b/Source/ThirdParty/tracy/client/TracyProfiler.cpp @@ -0,0 +1,3573 @@ +#ifdef TRACY_ENABLE + +#ifdef _WIN32 +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include +# include +# include +# include +# include +#else +# include +# include +#endif + +#ifdef __CYGWIN__ +# include +# include +# include +#endif + +#ifdef _GNU_SOURCE +# include +#endif + +#ifdef __linux__ +# include +# include +# include +# include +# include +#endif + +#if defined __APPLE__ || defined BSD +# include +# include +#endif + +#if defined __APPLE__ +# include "TargetConditionals.h" +# include +#endif + +#ifdef __ANDROID__ +# include +# include +# include +# include +# include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../common/TracyAlign.hpp" +#include "../common/TracySocket.hpp" +#include "../common/TracySystem.hpp" +#include "tracy_rpmalloc.hpp" +#include "TracyCallstack.hpp" +#include "TracyScoped.hpp" +#include "TracyProfiler.hpp" +#include "TracyThread.hpp" +#include "TracyArmCpuTable.hpp" 
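// (Editor's note: usage sketch for the Lockable/SharedLockable wrappers from
// TracyLock.hpp above; the names are hypothetical, and the field order of
// SourceLocationData { name, function, file, line, color } is assumed. In
// client code the TracyLockable macro normally generates this boilerplate.)
//
//     static constexpr tracy::SourceLocationData assetsLockSrc
//         { "AssetsLock", "AssetsLock", "Content.cpp", 10, 0 };
//     static tracy::Lockable<std::mutex> assetsLock( &assetsLockSrc );
//
//     void Touch()
//     {
//         assetsLock.lock();    // emits LockWait, then LockObtain when held
//         // ... mutate shared state ...
//         assetsLock.unlock();  // emits LockRelease
//     }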
+#include "TracySysTrace.hpp" + +#ifdef TRACY_PORT +# ifndef TRACY_DATA_PORT +# define TRACY_DATA_PORT TRACY_PORT +# endif +# ifndef TRACY_BROADCAST_PORT +# define TRACY_BROADCAST_PORT TRACY_PORT +# endif +#endif + +#ifdef __APPLE__ +# define TRACY_DELAYED_INIT +#else +# ifdef __GNUC__ +# define init_order( val ) __attribute__ ((init_priority(val))) +# else +# define init_order(x) +# endif +#endif + +#if defined _WIN32 || defined __CYGWIN__ +# include +extern "C" typedef LONG (WINAPI *t_RtlGetVersion)( PRTL_OSVERSIONINFOW ); +extern "C" typedef BOOL (WINAPI *t_GetLogicalProcessorInformationEx)( LOGICAL_PROCESSOR_RELATIONSHIP, PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, PDWORD ); +#else +# include +# include +#endif +#if defined __linux__ +# include +# include +#endif + +#if !defined _WIN32 && !defined __CYGWIN__ && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) +# include +#endif + +#if !( ( ( defined _WIN32 || defined __CYGWIN__ ) && _WIN32_WINNT >= _WIN32_WINNT_VISTA ) || defined __linux__ ) +# include +#endif + +namespace tracy +{ + +namespace +{ +# if ( defined _WIN32 || defined __CYGWIN__ ) && _WIN32_WINNT >= _WIN32_WINNT_VISTA + BOOL CALLBACK InitOnceCallback( PINIT_ONCE /*initOnce*/, PVOID /*Parameter*/, PVOID* /*Context*/) + { + rpmalloc_initialize(); + return TRUE; + } + INIT_ONCE InitOnce = INIT_ONCE_STATIC_INIT; +# elif defined __linux__ + void InitOnceCallback() + { + rpmalloc_initialize(); + } + pthread_once_t once_control = PTHREAD_ONCE_INIT; +# else + void InitOnceCallback() + { + rpmalloc_initialize(); + } + std::once_flag once_flag; +# endif +} + +struct RPMallocInit +{ + RPMallocInit() + { +# if ( defined _WIN32 || defined __CYGWIN__ ) && _WIN32_WINNT >= _WIN32_WINNT_VISTA + InitOnceExecuteOnce( &InitOnce, InitOnceCallback, nullptr, nullptr ); +# elif defined __linux__ + pthread_once( &once_control, InitOnceCallback ); +# else + std::call_once( once_flag, InitOnceCallback ); +# endif + rpmalloc_thread_initialize(); + } +}; + +#ifndef TRACY_DELAYED_INIT + +struct InitTimeWrapper +{ + int64_t val; +}; + +struct ProducerWrapper +{ + tracy::moodycamel::ConcurrentQueue::ExplicitProducer* ptr; +}; + +struct ThreadHandleWrapper +{ + uint64_t val; +}; +#endif + + +#if defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 +static inline void CpuId( uint32_t* regs, uint32_t leaf ) +{ + memset(regs, 0, sizeof(uint32_t) * 4); +#if defined _WIN32 || defined __CYGWIN__ + __cpuidex( (int*)regs, leaf, 0 ); +#else + __get_cpuid( leaf, regs, regs+1, regs+2, regs+3 ); +#endif +} + +static void InitFailure( const char* msg ) +{ +#if defined _WIN32 || defined __CYGWIN__ + bool hasConsole = false; + bool reopen = false; + const auto attached = AttachConsole( ATTACH_PARENT_PROCESS ); + if( attached ) + { + hasConsole = true; + reopen = true; + } + else + { + const auto err = GetLastError(); + if( err == ERROR_ACCESS_DENIED ) + { + hasConsole = true; + } + } + if( hasConsole ) + { + fprintf( stderr, "Tracy Profiler initialization failure: %s\n", msg ); + if( reopen ) + { + freopen( "CONOUT$", "w", stderr ); + fprintf( stderr, "Tracy Profiler initialization failure: %s\n", msg ); + } + } + else + { + MessageBoxA( nullptr, msg, "Tracy Profiler initialization failure", MB_ICONSTOP ); + } +#else + fprintf( stderr, "Tracy Profiler initialization failure: %s\n", msg ); +#endif + exit( 0 ); +} + +static int64_t SetupHwTimer() +{ +#if !defined TRACY_TIMER_QPC && !defined TRACY_TIMER_FALLBACK + uint32_t regs[4]; + CpuId( regs, 1 ); + if( !( regs[3] & ( 1 
<< 4 ) ) ) InitFailure( "CPU doesn't support RDTSC instruction." ); + CpuId( regs, 0x80000007 ); + if( !( regs[3] & ( 1 << 8 ) ) ) + { + const char* noCheck = getenv( "TRACY_NO_INVARIANT_CHECK" ); + if( !noCheck || noCheck[0] != '1' ) + { +#if defined _WIN32 || defined __CYGWIN__ + InitFailure( "CPU doesn't support invariant TSC.\nDefine TRACY_NO_INVARIANT_CHECK=1 to ignore this error, *if you know what you are doing*.\nAlternatively you may rebuild the application with the TRACY_TIMER_QPC or TRACY_TIMER_FALLBACK define to use lower resolution timer." ); +#else + InitFailure( "CPU doesn't support invariant TSC.\nDefine TRACY_NO_INVARIANT_CHECK=1 to ignore this error, *if you know what you are doing*.\nAlternatively you may rebuild the application with the TRACY_TIMER_FALLBACK define to use lower resolution timer." ); +#endif + } + } +#endif + + return Profiler::GetTime(); +} +#else +static int64_t SetupHwTimer() +{ + return Profiler::GetTime(); +} +#endif + +static const char* GetProcessName() +{ + const char* processName = "unknown"; +#ifdef _WIN32 + static char buf[_MAX_PATH]; + GetModuleFileNameA( nullptr, buf, _MAX_PATH ); + const char* ptr = buf; + while( *ptr != '\0' ) ptr++; + while( ptr > buf && *ptr != '\\' && *ptr != '/' ) ptr--; + if( ptr > buf ) ptr++; + processName = ptr; +#elif defined __ANDROID__ +# if __ANDROID_API__ >= 21 + auto buf = getprogname(); + if( buf ) processName = buf; +# endif +#elif defined _GNU_SOURCE || defined __CYGWIN__ + if( program_invocation_short_name ) processName = program_invocation_short_name; +#elif defined __APPLE__ || defined BSD + auto buf = getprogname(); + if( buf ) processName = buf; +#endif + return processName; +} + +static const char* GetProcessExecutablePath() +{ +#ifdef _WIN32 + static char buf[_MAX_PATH]; + GetModuleFileNameA( nullptr, buf, _MAX_PATH ); + return buf; +#elif defined __ANDROID__ + return nullptr; +#elif defined _GNU_SOURCE || defined __CYGWIN__ + return program_invocation_name; +#elif defined __APPLE__ + static char buf[1024]; + uint32_t size = 1024; + _NSGetExecutablePath( buf, &size ); + return buf; +#elif defined __DragonFly__ + static char buf[1024]; + readlink( "/proc/curproc/file", buf, 1024 ); + return buf; +#elif defined __FreeBSD__ + static char buf[1024]; + int mib[4]; + mib[0] = CTL_KERN; + mib[1] = KERN_PROC; + mib[2] = KERN_PROC_PATHNAME; + mib[3] = -1; + size_t cb = 1024; + sysctl( mib, 4, buf, &cb, nullptr, 0 ); + return buf; +#elif defined __NetBSD__ + static char buf[1024]; + readlink( "/proc/curproc/exe", buf, 1024 ); + return buf; +#else + return nullptr; +#endif +} + +#if defined __linux__ && defined __ARM_ARCH +static uint32_t GetHex( char*& ptr, int skip ) +{ + uint32_t ret; + ptr += skip; + char* end; + if( ptr[0] == '0' && ptr[1] == 'x' ) + { + ptr += 2; + ret = strtol( ptr, &end, 16 ); + } + else + { + ret = strtol( ptr, &end, 10 ); + } + ptr = end; + return ret; +} +#endif + +static const char* GetHostInfo() +{ + static char buf[1024]; + auto ptr = buf; +#if defined _WIN32 || defined __CYGWIN__ + t_RtlGetVersion RtlGetVersion = (t_RtlGetVersion)GetProcAddress( GetModuleHandleA( "ntdll.dll" ), "RtlGetVersion" ); + if( !RtlGetVersion ) + { +# ifdef __CYGWIN__ + ptr += sprintf( ptr, "OS: Windows (Cygwin)\n" ); +# elif defined __MINGW32__ + ptr += sprintf( ptr, "OS: Windows (MingW)\n" ); +# else + ptr += sprintf( ptr, "OS: Windows\n" ); +# endif + } + else + { + RTL_OSVERSIONINFOW ver = { sizeof( RTL_OSVERSIONINFOW ) }; + RtlGetVersion( &ver ); + +# ifdef __CYGWIN__ + ptr += sprintf( ptr, "OS: 
Windows %i.%i.%i (Cygwin)\n", ver.dwMajorVersion, ver.dwMinorVersion, ver.dwBuildNumber ); +# elif defined __MINGW32__ + ptr += sprintf( ptr, "OS: Windows %i.%i.%i (MingW)\n", (int)ver.dwMajorVersion, (int)ver.dwMinorVersion, (int)ver.dwBuildNumber ); +# else + ptr += sprintf( ptr, "OS: Windows %i.%i.%i\n", ver.dwMajorVersion, ver.dwMinorVersion, ver.dwBuildNumber ); +# endif + } +#elif defined __linux__ + struct utsname utsName; + uname( &utsName ); +# if defined __ANDROID__ + ptr += sprintf( ptr, "OS: Linux %s (Android)\n", utsName.release ); +# else + ptr += sprintf( ptr, "OS: Linux %s\n", utsName.release ); +# endif +#elif defined __APPLE__ +# if TARGET_OS_IPHONE == 1 + ptr += sprintf( ptr, "OS: Darwin (iOS)\n" ); +# elif TARGET_OS_MAC == 1 + ptr += sprintf( ptr, "OS: Darwin (OSX)\n" ); +# else + ptr += sprintf( ptr, "OS: Darwin (unknown)\n" ); +# endif +#elif defined __DragonFly__ + ptr += sprintf( ptr, "OS: BSD (DragonFly)\n" ); +#elif defined __FreeBSD__ + ptr += sprintf( ptr, "OS: BSD (FreeBSD)\n" ); +#elif defined __NetBSD__ + ptr += sprintf( ptr, "OS: BSD (NetBSD)\n" ); +#elif defined __OpenBSD__ + ptr += sprintf( ptr, "OS: BSD (OpenBSD)\n" ); +#else + ptr += sprintf( ptr, "OS: unknown\n" ); +#endif + +#if defined _MSC_VER +# if defined __clang__ + ptr += sprintf( ptr, "Compiler: MSVC clang-cl %i.%i.%i\n", __clang_major__, __clang_minor__, __clang_patchlevel__ ); +# else + ptr += sprintf( ptr, "Compiler: MSVC %i\n", _MSC_VER ); +# endif +#elif defined __clang__ + ptr += sprintf( ptr, "Compiler: clang %i.%i.%i\n", __clang_major__, __clang_minor__, __clang_patchlevel__ ); +#elif defined __GNUC__ + ptr += sprintf( ptr, "Compiler: gcc %i.%i\n", __GNUC__, __GNUC_MINOR__ ); +#else + ptr += sprintf( ptr, "Compiler: unknown\n" ); +#endif + +#if defined _WIN32 || defined __CYGWIN__ +# ifndef __CYGWIN__ + InitWinSock(); +# endif + char hostname[512]; + gethostname( hostname, 512 ); + + DWORD userSz = UNLEN+1; + char user[UNLEN+1]; + GetUserNameA( user, &userSz ); + + ptr += sprintf( ptr, "User: %s@%s\n", user, hostname ); +#else + char hostname[_POSIX_HOST_NAME_MAX]{}; + char user[_POSIX_LOGIN_NAME_MAX]{}; + + gethostname( hostname, _POSIX_HOST_NAME_MAX ); +# if defined __ANDROID__ + const auto login = getlogin(); + if( login ) + { + strcpy( user, login ); + } + else + { + memcpy( user, "(?)", 4 ); + } +# else + getlogin_r( user, _POSIX_LOGIN_NAME_MAX ); +# endif + + ptr += sprintf( ptr, "User: %s@%s\n", user, hostname ); +#endif + +#if defined __i386 || defined _M_IX86 + ptr += sprintf( ptr, "Arch: x86\n" ); +#elif defined __x86_64__ || defined _M_X64 + ptr += sprintf( ptr, "Arch: x64\n" ); +#elif defined __aarch64__ + ptr += sprintf( ptr, "Arch: ARM64\n" ); +#elif defined __ARM_ARCH + ptr += sprintf( ptr, "Arch: ARM\n" ); +#else + ptr += sprintf( ptr, "Arch: unknown\n" ); +#endif + +#if defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 + uint32_t regs[4]; + char cpuModel[4*4*3]; + auto modelPtr = cpuModel; + for( uint32_t i=0x80000002; i<0x80000005; ++i ) + { + CpuId( regs, i ); + memcpy( modelPtr, regs, sizeof( regs ) ); modelPtr += sizeof( regs ); + } + + ptr += sprintf( ptr, "CPU: %s\n", cpuModel ); +#elif defined __linux__ && defined __ARM_ARCH + bool cpuFound = false; + FILE* fcpuinfo = fopen( "/proc/cpuinfo", "rb" ); + if( fcpuinfo ) + { + enum { BufSize = 4*1024 }; + char buf[BufSize]; + const auto sz = fread( buf, 1, BufSize, fcpuinfo ); + fclose( fcpuinfo ); + const auto end = buf + sz; + auto cptr = buf; + + uint32_t impl = 0; + uint32_t var = 0; + 
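// (Editor's note, not upstream:) the loop below scans /proc/cpuinfo for the
// "CPU implementer/variant/part/revision" fields of the first core and maps
// them to a readable name via DecodeArmImplementer/DecodeArmPart.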
uint32_t part = 0; + uint32_t rev = 0; + + while( end - cptr > 20 ) + { + while( end - cptr > 20 && memcmp( cptr, "CPU ", 4 ) != 0 ) + { + cptr += 4; + while( end - cptr > 20 && *cptr != '\n' ) cptr++; + cptr++; + } + if( end - cptr <= 20 ) break; + cptr += 4; + if( memcmp( cptr, "implementer\t: ", 14 ) == 0 ) + { + if( impl != 0 ) break; + impl = GetHex( cptr, 14 ); + } + else if( memcmp( cptr, "variant\t: ", 10 ) == 0 ) var = GetHex( cptr, 10 ); + else if( memcmp( cptr, "part\t: ", 7 ) == 0 ) part = GetHex( cptr, 7 ); + else if( memcmp( cptr, "revision\t: ", 11 ) == 0 ) rev = GetHex( cptr, 11 ); + while( *cptr != '\n' && *cptr != '\0' ) cptr++; + cptr++; + } + + if( impl != 0 || var != 0 || part != 0 || rev != 0 ) + { + cpuFound = true; + ptr += sprintf( ptr, "CPU: %s%s r%ip%i\n", DecodeArmImplementer( impl ), DecodeArmPart( impl, part ), var, rev ); + } + } + if( !cpuFound ) + { + ptr += sprintf( ptr, "CPU: unknown\n" ); + } +#elif defined __APPLE__ && TARGET_OS_IPHONE == 1 + { + size_t sz; + sysctlbyname( "hw.machine", nullptr, &sz, nullptr, 0 ); + auto str = (char*)tracy_malloc( sz ); + sysctlbyname( "hw.machine", str, &sz, nullptr, 0 ); + ptr += sprintf( ptr, "Device: %s\n", DecodeIosDevice( str ) ); + tracy_free( str ); + } +#else + ptr += sprintf( ptr, "CPU: unknown\n" ); +#endif + + ptr += sprintf( ptr, "CPU cores: %i\n", std::thread::hardware_concurrency() ); + +#if defined _WIN32 || defined __CYGWIN__ + MEMORYSTATUSEX statex; + statex.dwLength = sizeof( statex ); + GlobalMemoryStatusEx( &statex ); +# ifdef _MSC_VER + ptr += sprintf( ptr, "RAM: %I64u MB\n", statex.ullTotalPhys / 1024 / 1024 ); +# else + ptr += sprintf( ptr, "RAM: %llu MB\n", statex.ullTotalPhys / 1024 / 1024 ); +# endif +#elif defined __linux__ + struct sysinfo sysInfo; + sysinfo( &sysInfo ); + ptr += sprintf( ptr, "RAM: %lu MB\n", sysInfo.totalram / 1024 / 1024 ); +#elif defined __APPLE__ + size_t memSize; + size_t sz = sizeof( memSize ); + sysctlbyname( "hw.memsize", &memSize, &sz, nullptr, 0 ); + ptr += sprintf( ptr, "RAM: %zu MB\n", memSize / 1024 / 1024 ); +#elif defined BSD + size_t memSize; + size_t sz = sizeof( memSize ); + sysctlbyname( "hw.physmem", &memSize, &sz, nullptr, 0 ); + ptr += sprintf( ptr, "RAM: %zu MB\n", memSize / 1024 / 1024 ); +#else + ptr += sprintf( ptr, "RAM: unknown\n" ); +#endif + + return buf; +} + +static uint64_t GetPid() +{ +#if defined _WIN32 || defined __CYGWIN__ + return uint64_t( GetCurrentProcessId() ); +#else + return uint64_t( getpid() ); +#endif +} + +void Profiler::AckServerQuery() +{ + QueueItem item; + MemWrite( &item.hdr.type, QueueType::AckServerQueryNoop ); + NeedDataSize( QueueDataSize[(int)QueueType::AckServerQueryNoop] ); + AppendDataUnsafe( &item, QueueDataSize[(int)QueueType::AckServerQueryNoop] ); +} + +void Profiler::AckSourceCodeNotAvailable() +{ + QueueItem item; + MemWrite( &item.hdr.type, QueueType::AckSourceCodeNotAvailable ); + NeedDataSize( QueueDataSize[(int)QueueType::AckSourceCodeNotAvailable] ); + AppendDataUnsafe( &item, QueueDataSize[(int)QueueType::AckSourceCodeNotAvailable] ); +} + +static BroadcastMessage& GetBroadcastMessage( const char* procname, size_t pnsz, int& len, int port ) +{ + static BroadcastMessage msg; + + msg.broadcastVersion = BroadcastVersion; + msg.protocolVersion = ProtocolVersion; + msg.listenPort = port; + + memcpy( msg.programName, procname, pnsz ); + memset( msg.programName + pnsz, 0, WelcomeMessageProgramNameSize - pnsz ); + + len = int( offsetof( BroadcastMessage, programName ) + pnsz + 1 ); + return msg; +} + +#if 
defined _WIN32 || defined __CYGWIN__ +static DWORD s_profilerThreadId = 0; +static char s_crashText[1024]; + +LONG WINAPI CrashFilter( PEXCEPTION_POINTERS pExp ) +{ + if( !GetProfiler().IsConnected() ) return EXCEPTION_CONTINUE_SEARCH; + + const unsigned ec = pExp->ExceptionRecord->ExceptionCode; + auto msgPtr = s_crashText; + switch( ec ) + { + case EXCEPTION_ACCESS_VIOLATION: + msgPtr += sprintf( msgPtr, "Exception EXCEPTION_ACCESS_VIOLATION (0x%x). ", ec ); + switch( pExp->ExceptionRecord->ExceptionInformation[0] ) + { + case 0: + msgPtr += sprintf( msgPtr, "Read violation at address 0x%" PRIxPTR ".", pExp->ExceptionRecord->ExceptionInformation[1] ); + break; + case 1: + msgPtr += sprintf( msgPtr, "Write violation at address 0x%" PRIxPTR ".", pExp->ExceptionRecord->ExceptionInformation[1] ); + break; + case 8: + msgPtr += sprintf( msgPtr, "DEP violation at address 0x%" PRIxPTR ".", pExp->ExceptionRecord->ExceptionInformation[1] ); + break; + default: + break; + } + break; + case EXCEPTION_ARRAY_BOUNDS_EXCEEDED: + msgPtr += sprintf( msgPtr, "Exception EXCEPTION_ARRAY_BOUNDS_EXCEEDED (0x%x). ", ec ); + break; + case EXCEPTION_DATATYPE_MISALIGNMENT: + msgPtr += sprintf( msgPtr, "Exception EXCEPTION_DATATYPE_MISALIGNMENT (0x%x). ", ec ); + break; + case EXCEPTION_FLT_DIVIDE_BY_ZERO: + msgPtr += sprintf( msgPtr, "Exception EXCEPTION_FLT_DIVIDE_BY_ZERO (0x%x). ", ec ); + break; + case EXCEPTION_ILLEGAL_INSTRUCTION: + msgPtr += sprintf( msgPtr, "Exception EXCEPTION_ILLEGAL_INSTRUCTION (0x%x). ", ec ); + break; + case EXCEPTION_IN_PAGE_ERROR: + msgPtr += sprintf( msgPtr, "Exception EXCEPTION_IN_PAGE_ERROR (0x%x). ", ec ); + break; + case EXCEPTION_INT_DIVIDE_BY_ZERO: + msgPtr += sprintf( msgPtr, "Exception EXCEPTION_INT_DIVIDE_BY_ZERO (0x%x). ", ec ); + break; + case EXCEPTION_PRIV_INSTRUCTION: + msgPtr += sprintf( msgPtr, "Exception EXCEPTION_PRIV_INSTRUCTION (0x%x). ", ec ); + break; + case EXCEPTION_STACK_OVERFLOW: + msgPtr += sprintf( msgPtr, "Exception EXCEPTION_STACK_OVERFLOW (0x%x). 
", ec ); + break; + default: + return EXCEPTION_CONTINUE_SEARCH; + } + + { + GetProfiler().SendCallstack( 60, "KiUserExceptionDispatcher" ); + + TracyLfqPrepare( QueueType::CrashReport ); + item->crashReport.time = Profiler::GetTime(); + item->crashReport.text = (uint64_t)s_crashText; + TracyLfqCommit; + } + + HANDLE h = CreateToolhelp32Snapshot( TH32CS_SNAPTHREAD, 0 ); + if( h == INVALID_HANDLE_VALUE ) return EXCEPTION_CONTINUE_SEARCH; + + THREADENTRY32 te = { sizeof( te ) }; + if( !Thread32First( h, &te ) ) + { + CloseHandle( h ); + return EXCEPTION_CONTINUE_SEARCH; + } + + const auto pid = GetCurrentProcessId(); + const auto tid = GetCurrentThreadId(); + + do + { + if( te.th32OwnerProcessID == pid && te.th32ThreadID != tid && te.th32ThreadID != s_profilerThreadId ) + { + HANDLE th = OpenThread( THREAD_SUSPEND_RESUME, FALSE, te.th32ThreadID ); + if( th != INVALID_HANDLE_VALUE ) + { + SuspendThread( th ); + CloseHandle( th ); + } + } + } + while( Thread32Next( h, &te ) ); + CloseHandle( h ); + + { + TracyLfqPrepare( QueueType::Crash ); + TracyLfqCommit; + } + + std::this_thread::sleep_for( std::chrono::milliseconds( 500 ) ); + GetProfiler().RequestShutdown(); + while( !GetProfiler().HasShutdownFinished() ) { std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); }; + + TerminateProcess( GetCurrentProcess(), 1 ); + + return EXCEPTION_CONTINUE_SEARCH; +} +#endif + +#ifdef __linux__ +static long s_profilerTid = 0; +static char s_crashText[1024]; +static std::atomic s_alreadyCrashed( false ); + +static void ThreadFreezer( int /*signal*/ ) +{ + for(;;) sleep( 1000 ); +} + +static inline void HexPrint( char*& ptr, uint64_t val ) +{ + if( val == 0 ) + { + *ptr++ = '0'; + return; + } + + static const char HexTable[16] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' }; + char buf[16]; + auto bptr = buf; + + do + { + *bptr++ = HexTable[val%16]; + val /= 16; + } + while( val > 0 ); + + do + { + *ptr++ = *--bptr; + } + while( bptr != buf ); +} + +static void CrashHandler( int signal, siginfo_t* info, void* /*ucontext*/ ) +{ + bool expected = false; + if( !s_alreadyCrashed.compare_exchange_strong( expected, true ) ) ThreadFreezer( signal ); + + struct sigaction act = {}; + act.sa_handler = SIG_DFL; + sigaction( SIGABRT, &act, nullptr ); + + auto msgPtr = s_crashText; + switch( signal ) + { + case SIGILL: + strcpy( msgPtr, "Illegal Instruction.\n" ); + while( *msgPtr ) msgPtr++; + switch( info->si_code ) + { + case ILL_ILLOPC: + strcpy( msgPtr, "Illegal opcode.\n" ); + break; + case ILL_ILLOPN: + strcpy( msgPtr, "Illegal operand.\n" ); + break; + case ILL_ILLADR: + strcpy( msgPtr, "Illegal addressing mode.\n" ); + break; + case ILL_ILLTRP: + strcpy( msgPtr, "Illegal trap.\n" ); + break; + case ILL_PRVOPC: + strcpy( msgPtr, "Privileged opcode.\n" ); + break; + case ILL_PRVREG: + strcpy( msgPtr, "Privileged register.\n" ); + break; + case ILL_COPROC: + strcpy( msgPtr, "Coprocessor error.\n" ); + break; + case ILL_BADSTK: + strcpy( msgPtr, "Internal stack error.\n" ); + break; + default: + break; + } + break; + case SIGFPE: + strcpy( msgPtr, "Floating-point exception.\n" ); + while( *msgPtr ) msgPtr++; + switch( info->si_code ) + { + case FPE_INTDIV: + strcpy( msgPtr, "Integer divide by zero.\n" ); + break; + case FPE_INTOVF: + strcpy( msgPtr, "Integer overflow.\n" ); + break; + case FPE_FLTDIV: + strcpy( msgPtr, "Floating-point divide by zero.\n" ); + break; + case FPE_FLTOVF: + strcpy( msgPtr, "Floating-point overflow.\n" ); + break; + case FPE_FLTUND: + 
strcpy( msgPtr, "Floating-point underflow.\n" ); + break; + case FPE_FLTRES: + strcpy( msgPtr, "Floating-point inexact result.\n" ); + break; + case FPE_FLTINV: + strcpy( msgPtr, "Floating-point invalid operation.\n" ); + break; + case FPE_FLTSUB: + strcpy( msgPtr, "Subscript out of range.\n" ); + break; + default: + break; + } + break; + case SIGSEGV: + strcpy( msgPtr, "Invalid memory reference.\n" ); + while( *msgPtr ) msgPtr++; + switch( info->si_code ) + { + case SEGV_MAPERR: + strcpy( msgPtr, "Address not mapped to object.\n" ); + break; + case SEGV_ACCERR: + strcpy( msgPtr, "Invalid permissions for mapped object.\n" ); + break; +# ifdef SEGV_BNDERR + case SEGV_BNDERR: + strcpy( msgPtr, "Failed address bound checks.\n" ); + break; +# endif +# ifdef SEGV_PKUERR + case SEGV_PKUERR: + strcpy( msgPtr, "Access was denied by memory protection keys.\n" ); + break; +# endif + default: + break; + } + break; + case SIGPIPE: + strcpy( msgPtr, "Broken pipe.\n" ); + while( *msgPtr ) msgPtr++; + break; + case SIGBUS: + strcpy( msgPtr, "Bus error.\n" ); + while( *msgPtr ) msgPtr++; + switch( info->si_code ) + { + case BUS_ADRALN: + strcpy( msgPtr, "Invalid address alignment.\n" ); + break; + case BUS_ADRERR: + strcpy( msgPtr, "Nonexistent physical address.\n" ); + break; + case BUS_OBJERR: + strcpy( msgPtr, "Object-specific hardware error.\n" ); + break; +# ifdef BUS_MCEERR_AR + case BUS_MCEERR_AR: + strcpy( msgPtr, "Hardware memory error consumed on a machine check; action required.\n" ); + break; +# endif +# ifdef BUS_MCEERR_AO + case BUS_MCEERR_AO: + strcpy( msgPtr, "Hardware memory error detected in process but not consumed; action optional.\n" ); + break; +# endif + default: + break; + } + break; + case SIGABRT: + strcpy( msgPtr, "Abort signal from abort().\n" ); + break; + default: + abort(); + } + while( *msgPtr ) msgPtr++; + + if( signal != SIGPIPE ) + { + strcpy( msgPtr, "Fault address: 0x" ); + while( *msgPtr ) msgPtr++; + HexPrint( msgPtr, uint64_t( info->si_addr ) ); + *msgPtr++ = '\n'; + } + + { + GetProfiler().SendCallstack( 60, "__kernel_rt_sigreturn" ); + + TracyLfqPrepare( QueueType::CrashReport ); + item->crashReport.time = Profiler::GetTime(); + item->crashReport.text = (uint64_t)s_crashText; + TracyLfqCommit; + } + + DIR* dp = opendir( "/proc/self/task" ); + if( !dp ) abort(); + + const auto selfTid = syscall( SYS_gettid ); + + struct dirent* ep; + while( ( ep = readdir( dp ) ) != nullptr ) + { + if( ep->d_name[0] == '.' 
) continue; + int tid = atoi( ep->d_name ); + if( tid != selfTid && tid != s_profilerTid ) + { + syscall( SYS_tkill, tid, SIGPWR ); + } + } + closedir( dp ); + + { + TracyLfqPrepare( QueueType::Crash ); + TracyLfqCommit; + } + + std::this_thread::sleep_for( std::chrono::milliseconds( 500 ) ); + GetProfiler().RequestShutdown(); + while( !GetProfiler().HasShutdownFinished() ) { std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); }; + + abort(); +} +#endif + + +enum { QueuePrealloc = 256 * 1024 }; + +static Profiler* s_instance = nullptr; +static Thread* s_thread; + +#ifdef TRACY_HAS_SYSTEM_TRACING +static Thread* s_sysTraceThread = nullptr; +#endif + +TRACY_API bool ProfilerAvailable() { return s_instance != nullptr; } + +TRACY_API int64_t GetFrequencyQpc() +{ +#if defined _WIN32 || defined __CYGWIN__ + LARGE_INTEGER t; + QueryPerformanceFrequency( &t ); + return t.QuadPart; +#else + return 0; +#endif +} + +#ifdef TRACY_DELAYED_INIT +struct ThreadNameData; +TRACY_API moodycamel::ConcurrentQueue& GetQueue(); +TRACY_API void InitRPMallocThread(); + +void InitRPMallocThread() +{ + RPMallocInit rpinit; + rpmalloc_thread_initialize(); +} + +struct ProfilerData +{ + int64_t initTime = SetupHwTimer(); + RPMallocInit rpmalloc_init; + moodycamel::ConcurrentQueue queue; + Profiler profiler; + std::atomic lockCounter { 0 }; + std::atomic gpuCtxCounter { 0 }; + std::atomic threadNameData { nullptr }; +}; + +struct ProducerWrapper +{ + ProducerWrapper( ProfilerData& data ) : detail( data.queue ), ptr( data.queue.get_explicit_producer( detail ) ) {} + moodycamel::ProducerToken detail; + tracy::moodycamel::ConcurrentQueue::ExplicitProducer* ptr; +}; + +struct ProfilerThreadData +{ + ProfilerThreadData( ProfilerData& data ) : token( data ), gpuCtx( { nullptr } ) {} + RPMallocInit rpmalloc_init; + ProducerWrapper token; + GpuCtxWrapper gpuCtx; +# ifdef TRACY_ON_DEMAND + LuaZoneState luaZoneState; +# endif +}; + +# ifdef TRACY_MANUAL_LIFETIME +ProfilerData* s_profilerData = nullptr; +TRACY_API void StartupProfiler() +{ + s_profilerData = new ProfilerData; + s_profilerData->profiler.SpawnWorkerThreads(); +} +static ProfilerData& GetProfilerData() +{ + assert(s_profilerData); + return *s_profilerData; +} +TRACY_API void ShutdownProfiler() +{ + delete s_profilerData; + s_profilerData = nullptr; + rpmalloc_finalize(); +} +# else +static std::atomic profilerDataLock { 0 }; +static std::atomic profilerData { nullptr }; + +static ProfilerData& GetProfilerData() +{ + auto ptr = profilerData.load( std::memory_order_acquire ); + if( !ptr ) + { + int expected = 0; + while( !profilerDataLock.compare_exchange_strong( expected, 1, std::memory_order_release, std::memory_order_relaxed ) ) { expected = 0; } + ptr = profilerData.load( std::memory_order_acquire ); + if( !ptr ) + { + ptr = (ProfilerData*)malloc( sizeof( ProfilerData ) ); + new (ptr) ProfilerData(); + profilerData.store( ptr, std::memory_order_release ); + } + profilerDataLock.store( 0, std::memory_order_release ); + } + return *ptr; +} +# endif + +static ProfilerThreadData& GetProfilerThreadData() +{ + thread_local ProfilerThreadData data( GetProfilerData() ); + return data; +} + +TRACY_API moodycamel::ConcurrentQueue::ExplicitProducer* GetToken() { return GetProfilerThreadData().token.ptr; } +TRACY_API Profiler& GetProfiler() { return GetProfilerData().profiler; } +TRACY_API moodycamel::ConcurrentQueue& GetQueue() { return GetProfilerData().queue; } +TRACY_API int64_t GetInitTime() { return GetProfilerData().initTime; } +TRACY_API std::atomic& 
GetLockCounter() { return GetProfilerData().lockCounter; } +TRACY_API std::atomic& GetGpuCtxCounter() { return GetProfilerData().gpuCtxCounter; } +TRACY_API GpuCtxWrapper& GetGpuCtx() { return GetProfilerThreadData().gpuCtx; } +TRACY_API uint64_t GetThreadHandle() { return detail::GetThreadHandleImpl(); } +std::atomic& GetThreadNameData() { return GetProfilerData().threadNameData; } + +# ifdef TRACY_ON_DEMAND +TRACY_API LuaZoneState& GetLuaZoneState() { return GetProfilerThreadData().luaZoneState; } +# endif + +# ifndef TRACY_MANUAL_LIFETIME +namespace +{ + const auto& __profiler_init = GetProfiler(); +} +# endif + +#else +TRACY_API void InitRPMallocThread() +{ + rpmalloc_thread_initialize(); +} + +// MSVC static initialization order solution. gcc/clang uses init_order() to avoid all this. + +// 1a. But s_queue is needed for initialization of variables in point 2. +extern moodycamel::ConcurrentQueue s_queue; + +thread_local RPMallocInit init_order(106) s_rpmalloc_thread_init; + +// 2. If these variables would be in the .CRT$XCB section, they would be initialized only in main thread. +thread_local moodycamel::ProducerToken init_order(107) s_token_detail( s_queue ); +thread_local ProducerWrapper init_order(108) s_token { s_queue.get_explicit_producer( s_token_detail ) }; +thread_local ThreadHandleWrapper init_order(104) s_threadHandle { detail::GetThreadHandleImpl() }; + +# ifdef _MSC_VER +// 1. Initialize these static variables before all other variables. +# pragma warning( disable : 4075 ) +# pragma init_seg( ".CRT$XCB" ) +# endif + +static InitTimeWrapper init_order(101) s_initTime { SetupHwTimer() }; +static RPMallocInit init_order(102) s_rpmalloc_init; +moodycamel::ConcurrentQueue init_order(103) s_queue( QueuePrealloc ); +std::atomic init_order(104) s_lockCounter( 0 ); +std::atomic init_order(104) s_gpuCtxCounter( 0 ); + +thread_local GpuCtxWrapper init_order(104) s_gpuCtx { nullptr }; + +struct ThreadNameData; +static std::atomic init_order(104) s_threadNameDataInstance( nullptr ); +std::atomic& s_threadNameData = s_threadNameDataInstance; + +# ifdef TRACY_ON_DEMAND +thread_local LuaZoneState init_order(104) s_luaZoneState { 0, false }; +# endif + +static Profiler init_order(105) s_profiler; + +TRACY_API moodycamel::ConcurrentQueue::ExplicitProducer* GetToken() { return s_token.ptr; } +TRACY_API Profiler& GetProfiler() { return s_profiler; } +TRACY_API moodycamel::ConcurrentQueue& GetQueue() { return s_queue; } +TRACY_API int64_t GetInitTime() { return s_initTime.val; } +TRACY_API std::atomic& GetLockCounter() { return s_lockCounter; } +TRACY_API std::atomic& GetGpuCtxCounter() { return s_gpuCtxCounter; } +TRACY_API GpuCtxWrapper& GetGpuCtx() { return s_gpuCtx; } +# ifdef __CYGWIN__ +// Hackfix for cygwin reporting memory frees without matching allocations. WTF? 
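// (Editor's note, not upstream:) the handle is therefore recomputed on every
// call here, instead of returning the cached thread_local s_threadHandle used
// on other platforms below, trading a little speed for memory events that
// pair up correctly.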
+TRACY_API uint64_t GetThreadHandle() { return detail::GetThreadHandleImpl(); } +# else +TRACY_API uint64_t GetThreadHandle() { return s_threadHandle.val; } +# endif + +std::atomic& GetThreadNameData() { return s_threadNameData; } + +# ifdef TRACY_ON_DEMAND +TRACY_API LuaZoneState& GetLuaZoneState() { return s_luaZoneState; } +# endif +#endif + +Profiler::Profiler() + : m_timeBegin( 0 ) + , m_mainThread( detail::GetThreadHandleImpl() ) + , m_epoch( std::chrono::duration_cast( std::chrono::system_clock::now().time_since_epoch() ).count() ) + , m_shutdown( false ) + , m_shutdownManual( false ) + , m_shutdownFinished( false ) + , m_sock( nullptr ) + , m_broadcast( nullptr ) + , m_noExit( false ) + , m_userPort( 0 ) + , m_zoneId( 1 ) + , m_samplingPeriod( 0 ) + , m_stream( LZ4_createStream() ) + , m_buffer( (char*)tracy_malloc( TargetFrameSize*3 ) ) + , m_bufferOffset( 0 ) + , m_bufferStart( 0 ) + , m_lz4Buf( (char*)tracy_malloc( LZ4Size + sizeof( lz4sz_t ) ) ) + , m_serialQueue( 1024*1024 ) + , m_serialDequeue( 1024*1024 ) + , m_frameCount( 0 ) + , m_isConnected( false ) +#ifdef TRACY_ON_DEMAND + , m_connectionId( 0 ) + , m_deferredQueue( 64*1024 ) +#endif + , m_paramCallback( nullptr ) + , m_queryData( nullptr ) +{ + assert( !s_instance ); + s_instance = this; + +#ifndef TRACY_DELAYED_INIT +# ifdef _MSC_VER + // 3. But these variables need to be initialized in main thread within the .CRT$XCB section. Do it here. + s_token_detail = moodycamel::ProducerToken( s_queue ); + s_token = ProducerWrapper { s_queue.get_explicit_producer( s_token_detail ) }; + s_threadHandle = ThreadHandleWrapper { m_mainThread }; +# endif +#endif + + CalibrateTimer(); + CalibrateDelay(); + ReportTopology(); + +#ifndef TRACY_NO_EXIT + const char* noExitEnv = getenv( "TRACY_NO_EXIT" ); + if( noExitEnv && noExitEnv[0] == '1' ) + { + m_noExit = true; + } +#endif + + const char* userPort = getenv( "TRACY_PORT" ); + if( userPort ) + { + m_userPort = atoi( userPort ); + } + +#if !defined(TRACY_DELAYED_INIT) || !defined(TRACY_MANUAL_LIFETIME) + SpawnWorkerThreads(); +#endif +} + +void Profiler::SpawnWorkerThreads() +{ + s_thread = (Thread*)tracy_malloc( sizeof( Thread ) ); + new(s_thread) Thread( LaunchWorker, this ); + +#ifdef TRACY_HAS_SYSTEM_TRACING + if( SysTraceStart( m_samplingPeriod ) ) + { + s_sysTraceThread = (Thread*)tracy_malloc( sizeof( Thread ) ); + new(s_sysTraceThread) Thread( SysTraceWorker, nullptr ); + std::this_thread::sleep_for( std::chrono::milliseconds( 1 ) ); + } +#endif + +#if defined _WIN32 || defined __CYGWIN__ + s_profilerThreadId = GetThreadId( s_thread->Handle() ); + AddVectoredExceptionHandler( 1, CrashFilter ); +#endif + +#ifdef __linux__ + struct sigaction threadFreezer = {}; + threadFreezer.sa_handler = ThreadFreezer; + sigaction( SIGPWR, &threadFreezer, nullptr ); + + struct sigaction crashHandler = {}; + crashHandler.sa_sigaction = CrashHandler; + crashHandler.sa_flags = SA_SIGINFO; + sigaction( SIGILL, &crashHandler, nullptr ); + sigaction( SIGFPE, &crashHandler, nullptr ); + sigaction( SIGSEGV, &crashHandler, nullptr ); + sigaction( SIGPIPE, &crashHandler, nullptr ); + sigaction( SIGBUS, &crashHandler, nullptr ); + sigaction( SIGABRT, &crashHandler, nullptr ); +#endif + +#ifdef TRACY_HAS_CALLSTACK + InitCallstack(); +#endif + + m_timeBegin.store( GetTime(), std::memory_order_relaxed ); +} + +Profiler::~Profiler() +{ + m_shutdown.store( true, std::memory_order_relaxed ); + +#ifdef TRACY_HAS_SYSTEM_TRACING + if( s_sysTraceThread ) + { + SysTraceStop(); + s_sysTraceThread->~Thread(); + 
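// (Editor's note, not upstream:) worker threads are allocated with
// tracy_malloc + placement new in SpawnWorkerThreads, so teardown is an
// explicit destructor call followed by tracy_free rather than delete.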
tracy_free( s_sysTraceThread ); + } +#endif + + s_thread->~Thread(); + tracy_free( s_thread ); + + tracy_free( m_lz4Buf ); + tracy_free( m_buffer ); + LZ4_freeStream( (LZ4_stream_t*)m_stream ); + + if( m_sock ) + { + m_sock->~Socket(); + tracy_free( m_sock ); + } + + if( m_broadcast ) + { + m_broadcast->~UdpBroadcast(); + tracy_free( m_broadcast ); + } + + assert( s_instance ); + s_instance = nullptr; +} + +bool Profiler::ShouldExit() +{ + return s_instance->m_shutdown.load( std::memory_order_relaxed ); +} + +void Profiler::Worker() +{ +#ifdef __linux__ + s_profilerTid = syscall( SYS_gettid ); +#endif + + ThreadExitHandler threadExitHandler; + + SetThreadName( "Tracy Profiler" ); + +#ifdef TRACY_DATA_PORT + const bool dataPortSearch = false; + auto dataPort = m_userPort != 0 ? m_userPort : TRACY_DATA_PORT; +#else + const bool dataPortSearch = m_userPort == 0; + auto dataPort = m_userPort != 0 ? m_userPort : 8086; +#endif +#ifdef TRACY_BROADCAST_PORT + const auto broadcastPort = TRACY_BROADCAST_PORT; +#else + const auto broadcastPort = 8086; +#endif + + while( m_timeBegin.load( std::memory_order_relaxed ) == 0 ) std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + + rpmalloc_thread_initialize(); + + m_exectime = 0; + const auto execname = GetProcessExecutablePath(); + if( execname ) + { + struct stat st; + if( stat( execname, &st ) == 0 ) + { + m_exectime = (uint64_t)st.st_mtime; + } + } + + const auto procname = GetProcessName(); + const auto pnsz = std::min( strlen( procname ), WelcomeMessageProgramNameSize - 1 ); + + const auto hostinfo = GetHostInfo(); + const auto hisz = std::min( strlen( hostinfo ), WelcomeMessageHostInfoSize - 1 ); + + const uint64_t pid = GetPid(); + +#ifdef TRACY_ON_DEMAND + uint8_t onDemand = 1; +#else + uint8_t onDemand = 0; +#endif + +#ifdef __APPLE__ + uint8_t isApple = 1; +#else + uint8_t isApple = 0; +#endif + +#if defined __i386 || defined _M_IX86 + uint8_t cpuArch = CpuArchX86; +#elif defined __x86_64__ || defined _M_X64 + uint8_t cpuArch = CpuArchX64; +#elif defined __aarch64__ + uint8_t cpuArch = CpuArchArm64; +#elif defined __ARM_ARCH + uint8_t cpuArch = CpuArchArm32; +#else + uint8_t cpuArch = CpuArchUnknown; +#endif + +#ifdef TRACY_NO_CODE_TRANSFER + uint8_t codeTransfer = 0; +#else + uint8_t codeTransfer = 1; +#endif + +#if defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 + uint32_t regs[4]; + char manufacturer[12]; + CpuId( regs, 0 ); + memcpy( manufacturer, regs+1, 4 ); + memcpy( manufacturer+4, regs+3, 4 ); + memcpy( manufacturer+8, regs+2, 4 ); + + CpuId( regs, 1 ); + uint32_t cpuId = ( regs[0] & 0xFFF ) | ( ( regs[0] & 0xFFF0000 ) >> 4 ); +#else + const char manufacturer[12] = {}; + uint32_t cpuId = 0; +#endif + + WelcomeMessage welcome; + MemWrite( &welcome.timerMul, m_timerMul ); + MemWrite( &welcome.initBegin, GetInitTime() ); + MemWrite( &welcome.initEnd, m_timeBegin.load( std::memory_order_relaxed ) ); + MemWrite( &welcome.delay, m_delay ); + MemWrite( &welcome.resolution, m_resolution ); + MemWrite( &welcome.epoch, m_epoch ); + MemWrite( &welcome.exectime, m_exectime ); + MemWrite( &welcome.pid, pid ); + MemWrite( &welcome.samplingPeriod, m_samplingPeriod ); + MemWrite( &welcome.onDemand, onDemand ); + MemWrite( &welcome.isApple, isApple ); + MemWrite( &welcome.cpuArch, cpuArch ); + MemWrite( &welcome.codeTransfer, codeTransfer ); + memcpy( welcome.cpuManufacturer, manufacturer, 12 ); + MemWrite( &welcome.cpuId, cpuId ); + memcpy( welcome.programName, procname, pnsz ); + memset( welcome.programName + 
pnsz, 0, WelcomeMessageProgramNameSize - pnsz ); + memcpy( welcome.hostInfo, hostinfo, hisz ); + memset( welcome.hostInfo + hisz, 0, WelcomeMessageHostInfoSize - hisz ); + + moodycamel::ConsumerToken token( GetQueue() ); + + ListenSocket listen; + bool isListening = false; + if( !dataPortSearch ) + { + isListening = listen.Listen( dataPort, 4 ); + } + else + { + for( uint32_t i=0; i<20; i++ ) + { + if( listen.Listen( dataPort+i, 4 ) ) + { + dataPort += i; + isListening = true; + break; + } + } + } + if( !isListening ) + { + for(;;) + { + if( ShouldExit() ) + { + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; + } + + ClearQueues( token ); + std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + } + } + +#ifndef TRACY_NO_BROADCAST + m_broadcast = (UdpBroadcast*)tracy_malloc( sizeof( UdpBroadcast ) ); + new(m_broadcast) UdpBroadcast(); +# ifdef TRACY_ONLY_LOCALHOST + const char* addr = "127.255.255.255"; +# else + const char* addr = "255.255.255.255"; +# endif + if( !m_broadcast->Open( addr, broadcastPort ) ) + { + m_broadcast->~UdpBroadcast(); + tracy_free( m_broadcast ); + m_broadcast = nullptr; + } +#endif + + int broadcastLen = 0; + auto& broadcastMsg = GetBroadcastMessage( procname, pnsz, broadcastLen, dataPort ); + uint64_t lastBroadcast = 0; + + // Connections loop. + // Each iteration of the loop handles whole connection. Multiple iterations will only + // happen in the on-demand mode or when handshake fails. + for(;;) + { + // Wait for incoming connection + for(;;) + { +#ifndef TRACY_NO_EXIT + if( !m_noExit && ShouldExit() ) + { + if( m_broadcast ) + { + broadcastMsg.activeTime = -1; + m_broadcast->Send( broadcastPort, &broadcastMsg, broadcastLen ); + } + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; + } +#endif + m_sock = listen.Accept(); + if( m_sock ) break; +#ifndef TRACY_ON_DEMAND + ProcessSysTime(); +#endif + + if( m_broadcast ) + { + const auto t = std::chrono::high_resolution_clock::now().time_since_epoch().count(); + if( t - lastBroadcast > 3000000000 ) // 3s + { + lastBroadcast = t; + const auto ts = std::chrono::duration_cast( std::chrono::system_clock::now().time_since_epoch() ).count(); + broadcastMsg.activeTime = int32_t( ts - m_epoch ); + assert( broadcastMsg.activeTime >= 0 ); + m_broadcast->Send( broadcastPort, &broadcastMsg, broadcastLen ); + } + } + } + + if( m_broadcast ) + { + lastBroadcast = 0; + broadcastMsg.activeTime = -1; + m_broadcast->Send( broadcastPort, &broadcastMsg, broadcastLen ); + } + + // Handshake + { + char shibboleth[HandshakeShibbolethSize]; + auto res = m_sock->ReadRaw( shibboleth, HandshakeShibbolethSize, 2000 ); + if( !res || memcmp( shibboleth, HandshakeShibboleth, HandshakeShibbolethSize ) != 0 ) + { + m_sock->~Socket(); + tracy_free( m_sock ); + m_sock = nullptr; + continue; + } + + uint32_t protocolVersion; + res = m_sock->ReadRaw( &protocolVersion, sizeof( protocolVersion ), 2000 ); + if( !res ) + { + m_sock->~Socket(); + tracy_free( m_sock ); + m_sock = nullptr; + continue; + } + + if( protocolVersion != ProtocolVersion ) + { + HandshakeStatus status = HandshakeProtocolMismatch; + m_sock->Send( &status, sizeof( status ) ); + m_sock->~Socket(); + tracy_free( m_sock ); + m_sock = nullptr; + continue; + } + } + +#ifdef TRACY_ON_DEMAND + const auto currentTime = GetTime(); + ClearQueues( token ); + m_connectionId.fetch_add( 1, std::memory_order_release ); +#endif + m_isConnected.store( true, std::memory_order_release ); + + HandshakeStatus handshake = HandshakeWelcome; + 
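+        // Editorial recap, grounded in the code above and below (not upstream
+        // commentary): the handshake as seen from this client is
+        //
+        //     server -> client : HandshakeShibboleth bytes (identity check, 2 s timeout)
+        //     server -> client : uint32 protocolVersion
+        //     client -> server : HandshakeStatus (HandshakeWelcome here; a version
+        //                        mismatch answers HandshakeProtocolMismatch above)
+        //     client -> server : WelcomeMessage (timer params, epoch, pid, program name, ...)
+        //
+        // Any failure closes the socket and returns to the accept loop.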
m_sock->Send( &handshake, sizeof( handshake ) ); + + LZ4_resetStream( (LZ4_stream_t*)m_stream ); + m_sock->Send( &welcome, sizeof( welcome ) ); + + m_threadCtx = 0; + m_refTimeSerial = 0; + m_refTimeCtx = 0; + m_refTimeGpu = 0; + +#ifdef TRACY_ON_DEMAND + OnDemandPayloadMessage onDemand; + onDemand.frames = m_frameCount.load( std::memory_order_relaxed ); + onDemand.currentTime = currentTime; + + m_sock->Send( &onDemand, sizeof( onDemand ) ); + + m_deferredLock.lock(); + for( auto& item : m_deferredQueue ) + { + uint64_t ptr; + uint16_t size; + const auto idx = MemRead( &item.hdr.idx ); + switch( (QueueType)idx ) + { + case QueueType::MessageAppInfo: + ptr = MemRead( &item.messageFat.text ); + size = MemRead( &item.messageFat.size ); + SendSingleString( (const char*)ptr, size ); + break; + case QueueType::LockName: + ptr = MemRead( &item.lockNameFat.name ); + size = MemRead( &item.lockNameFat.size ); + SendSingleString( (const char*)ptr, size ); + break; + case QueueType::GpuContextName: + ptr = MemRead( &item.gpuContextNameFat.ptr ); + size = MemRead( &item.gpuContextNameFat.size ); + SendSingleString( (const char*)ptr, size ); + break; + default: + break; + } + AppendData( &item, QueueDataSize[idx] ); + } + m_deferredLock.unlock(); +#endif + + // Main communications loop + int keepAlive = 0; + for(;;) + { + ProcessSysTime(); + const auto status = Dequeue( token ); + const auto serialStatus = DequeueSerial(); + if( status == DequeueStatus::ConnectionLost || serialStatus == DequeueStatus::ConnectionLost ) + { + break; + } + else if( status == DequeueStatus::QueueEmpty && serialStatus == DequeueStatus::QueueEmpty ) + { + if( ShouldExit() ) break; + if( m_bufferOffset != m_bufferStart ) + { + if( !CommitData() ) break; + } + if( keepAlive == 500 ) + { + QueueItem ka; + ka.hdr.type = QueueType::KeepAlive; + AppendData( &ka, QueueDataSize[ka.hdr.idx] ); + if( !CommitData() ) break; + + keepAlive = 0; + } + else + { + keepAlive++; + std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + } + } + else + { + keepAlive = 0; + } + + bool connActive = true; + while( m_sock->HasData() && connActive ) + { + connActive = HandleServerQuery(); + } + if( !connActive ) break; + } + if( ShouldExit() ) break; + + m_isConnected.store( false, std::memory_order_release ); +#ifdef TRACY_ON_DEMAND + m_bufferOffset = 0; + m_bufferStart = 0; +#endif + + m_sock->~Socket(); + tracy_free( m_sock ); + m_sock = nullptr; + +#ifndef TRACY_ON_DEMAND + // Client is no longer available here. Accept incoming connections, but reject handshake. + for(;;) + { + if( ShouldExit() ) + { + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; + } + + ClearQueues( token ); + + m_sock = listen.Accept(); + if( m_sock ) + { + char shibboleth[HandshakeShibbolethSize]; + auto res = m_sock->ReadRaw( shibboleth, HandshakeShibbolethSize, 1000 ); + if( !res || memcmp( shibboleth, HandshakeShibboleth, HandshakeShibbolethSize ) != 0 ) + { + m_sock->~Socket(); + tracy_free( m_sock ); + m_sock = nullptr; + continue; + } + + uint32_t protocolVersion; + res = m_sock->ReadRaw( &protocolVersion, sizeof( protocolVersion ), 1000 ); + if( !res ) + { + m_sock->~Socket(); + tracy_free( m_sock ); + m_sock = nullptr; + continue; + } + + HandshakeStatus status = HandshakeNotAvailable; + m_sock->Send( &status, sizeof( status ) ); + m_sock->~Socket(); + tracy_free( m_sock ); + } + } +#endif + } + // End of connections loop + + // Client is exiting. Send items remaining in queues. 
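+    // Editorial outline of the drain sequence implemented below (derived from
+    // the code itself, not upstream commentary):
+    //   1. Pump Dequeue()/DequeueSerial() until both report QueueEmpty,
+    //      committing buffered data along the way.
+    //   2. Send a one-byte QueueType::Terminate notice to the server.
+    //   3. Keep servicing HandleServerQuery() and flushing via CommitData()
+    //      until the connection drops, then signal m_shutdownFinished.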
+ for(;;) + { + const auto status = Dequeue( token ); + const auto serialStatus = DequeueSerial(); + if( status == DequeueStatus::ConnectionLost || serialStatus == DequeueStatus::ConnectionLost ) + { + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; + } + else if( status == DequeueStatus::QueueEmpty && serialStatus == DequeueStatus::QueueEmpty ) + { + if( m_bufferOffset != m_bufferStart ) CommitData(); + break; + } + + while( m_sock->HasData() ) + { + if( !HandleServerQuery() ) + { + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; + } + } + } + + // Send client termination notice to the server + QueueItem terminate; + MemWrite( &terminate.hdr.type, QueueType::Terminate ); + if( !SendData( (const char*)&terminate, 1 ) ) + { + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; + } + // Handle remaining server queries + for(;;) + { + if( m_sock->HasData() ) + { + while( m_sock->HasData() ) + { + if( !HandleServerQuery() ) + { + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; + } + } + while( Dequeue( token ) == DequeueStatus::DataDequeued ) {} + while( DequeueSerial() == DequeueStatus::DataDequeued ) {} + if( m_bufferOffset != m_bufferStart ) + { + if( !CommitData() ) + { + m_shutdownFinished.store( true, std::memory_order_relaxed ); + return; + } + } + } + else + { + if( m_bufferOffset != m_bufferStart ) CommitData(); + std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + } + } +} + +static void FreeAssociatedMemory( const QueueItem& item ) +{ + if( item.hdr.idx >= (int)QueueType::Terminate ) return; + + uint64_t ptr; + switch( item.hdr.type ) + { + case QueueType::ZoneText: + case QueueType::ZoneName: + ptr = MemRead( &item.zoneTextFat.text ); + tracy_free( (void*)ptr ); + break; + case QueueType::MessageColor: + case QueueType::MessageColorCallstack: + ptr = MemRead( &item.messageColorFat.text ); + tracy_free( (void*)ptr ); + break; + case QueueType::Message: + case QueueType::MessageCallstack: +#ifndef TRACY_ON_DEMAND + case QueueType::MessageAppInfo: +#endif + ptr = MemRead( &item.messageFat.text ); + tracy_free( (void*)ptr ); + break; + case QueueType::ZoneBeginAllocSrcLoc: + case QueueType::ZoneBeginAllocSrcLocCallstack: + ptr = MemRead( &item.zoneBegin.srcloc ); + tracy_free( (void*)ptr ); + break; + case QueueType::GpuZoneBeginAllocSrcLoc: + case QueueType::GpuZoneBeginAllocSrcLocCallstack: + case QueueType::GpuZoneBeginAllocSrcLocSerial: + case QueueType::GpuZoneBeginAllocSrcLocCallstackSerial: + ptr = MemRead( &item.gpuZoneBegin.srcloc ); + tracy_free( (void*)ptr ); + break; + case QueueType::CallstackSerial: + case QueueType::Callstack: + ptr = MemRead( &item.callstackFat.ptr ); + tracy_free( (void*)ptr ); + break; + case QueueType::CallstackAlloc: + ptr = MemRead( &item.callstackAllocFat.nativePtr ); + tracy_free( (void*)ptr ); + ptr = MemRead( &item.callstackAllocFat.ptr ); + tracy_free( (void*)ptr ); + break; + case QueueType::CallstackSample: + ptr = MemRead( &item.callstackSampleFat.ptr ); + tracy_free( (void*)ptr ); + break; + case QueueType::FrameImage: + ptr = MemRead( &item.frameImageFat.image ); + tracy_free( (void*)ptr ); + break; +#ifndef TRACY_ON_DEMAND + case QueueType::LockName: + ptr = MemRead( &item.lockNameFat.name ); + tracy_free( (void*)ptr ); + break; + case QueueType::GpuContextName: + ptr = MemRead( &item.gpuContextNameFat.ptr ); + tracy_free( (void*)ptr ); + break; +#endif +#ifdef TRACY_ON_DEMAND + case QueueType::MessageAppInfo: + case 
QueueType::GpuContextName: + // Don't free memory associated with deferred messages. + break; +#endif + default: + break; + } +} + +void Profiler::ClearQueues( moodycamel::ConsumerToken& token ) +{ + for(;;) + { + const auto sz = GetQueue().try_dequeue_bulk_single( token, [](const uint64_t&){}, []( QueueItem* item, size_t sz ) { assert( sz > 0 ); while( sz-- > 0 ) FreeAssociatedMemory( *item++ ); } ); + if( sz == 0 ) break; + } + + ClearSerial(); +} + +void Profiler::ClearSerial() +{ + bool lockHeld = true; + while( !m_serialLock.try_lock() ) + { + if( m_shutdownManual.load( std::memory_order_relaxed ) ) + { + lockHeld = false; + break; + } + } + for( auto& v : m_serialQueue ) FreeAssociatedMemory( v ); + m_serialQueue.clear(); + if( lockHeld ) + { + m_serialLock.unlock(); + } + + for( auto& v : m_serialDequeue ) FreeAssociatedMemory( v ); + m_serialDequeue.clear(); +} + +Profiler::DequeueStatus Profiler::Dequeue( moodycamel::ConsumerToken& token ) +{ + bool connectionLost = false; + const auto sz = GetQueue().try_dequeue_bulk_single( token, + [this, &connectionLost] ( const uint64_t& threadId ) + { + if( threadId != m_threadCtx ) + { + QueueItem item; + MemWrite( &item.hdr.type, QueueType::ThreadContext ); + MemWrite( &item.threadCtx.thread, threadId ); + if( !AppendData( &item, QueueDataSize[(int)QueueType::ThreadContext] ) ) connectionLost = true; + m_threadCtx = threadId; + m_refTimeThread = 0; + } + }, + [this, &connectionLost] ( QueueItem* item, size_t sz ) + { + if( connectionLost ) return; + assert( sz > 0 ); + int64_t refThread = m_refTimeThread; + int64_t refCtx = m_refTimeCtx; + int64_t refGpu = m_refTimeGpu; + while( sz-- > 0 ) + { + uint64_t ptr; + uint16_t size; + auto idx = MemRead( &item->hdr.idx ); + if( idx < (int)QueueType::Terminate ) + { + switch( (QueueType)idx ) + { + case QueueType::ZoneText: + case QueueType::ZoneName: + ptr = MemRead( &item->zoneTextFat.text ); + size = MemRead( &item->zoneTextFat.size ); + SendSingleString( (const char*)ptr, size ); + tracy_free( (void*)ptr ); + break; + case QueueType::Message: + case QueueType::MessageCallstack: + ptr = MemRead( &item->messageFat.text ); + size = MemRead( &item->messageFat.size ); + SendSingleString( (const char*)ptr, size ); + tracy_free( (void*)ptr ); + break; + case QueueType::MessageColor: + case QueueType::MessageColorCallstack: + ptr = MemRead( &item->messageColorFat.text ); + size = MemRead( &item->messageColorFat.size ); + SendSingleString( (const char*)ptr, size ); + tracy_free( (void*)ptr ); + break; + case QueueType::MessageAppInfo: + ptr = MemRead( &item->messageFat.text ); + size = MemRead( &item->messageFat.size ); + SendSingleString( (const char*)ptr, size ); +#ifndef TRACY_ON_DEMAND + tracy_free( (void*)ptr ); +#endif + break; + case QueueType::ZoneBeginAllocSrcLoc: + case QueueType::ZoneBeginAllocSrcLocCallstack: + { + int64_t t = MemRead( &item->zoneBegin.time ); + int64_t dt = t - refThread; + refThread = t; + MemWrite( &item->zoneBegin.time, dt ); + ptr = MemRead( &item->zoneBegin.srcloc ); + SendSourceLocationPayload( ptr ); + tracy_free( (void*)ptr ); + break; + } + case QueueType::Callstack: + ptr = MemRead( &item->callstackFat.ptr ); + SendCallstackPayload( ptr ); + tracy_free( (void*)ptr ); + break; + case QueueType::CallstackAlloc: + ptr = MemRead( &item->callstackAllocFat.nativePtr ); + if( ptr != 0 ) + { + CutCallstack( (void*)ptr, "lua_pcall" ); + SendCallstackPayload( ptr ); + tracy_free( (void*)ptr ); + } + ptr = MemRead( &item->callstackAllocFat.ptr ); + SendCallstackAlloc( ptr 
); + tracy_free( (void*)ptr ); + break; + case QueueType::CallstackSample: + { + ptr = MemRead( &item->callstackSampleFat.ptr ); + SendCallstackPayload64( ptr ); + tracy_free( (void*)ptr ); + int64_t t = MemRead( &item->callstackSampleFat.time ); + int64_t dt = t - refCtx; + refCtx = t; + MemWrite( &item->callstackSampleFat.time, dt ); + break; + } + case QueueType::FrameImage: + { + ptr = MemRead( &item->frameImageFat.image ); + const auto w = MemRead( &item->frameImageFat.w ); + const auto h = MemRead( &item->frameImageFat.h ); + const auto csz = size_t( w * h / 2 ); + SendLongString( ptr, (const char*)ptr, csz, QueueType::FrameImageData ); + tracy_free( (void*)ptr ); + break; + } + case QueueType::ZoneBegin: + case QueueType::ZoneBeginCallstack: + { + int64_t t = MemRead( &item->zoneBegin.time ); + int64_t dt = t - refThread; + refThread = t; + MemWrite( &item->zoneBegin.time, dt ); + break; + } + case QueueType::ZoneEnd: + { + int64_t t = MemRead( &item->zoneEnd.time ); + int64_t dt = t - refThread; + refThread = t; + MemWrite( &item->zoneEnd.time, dt ); + break; + } + case QueueType::GpuZoneBegin: + case QueueType::GpuZoneBeginCallstack: + { + int64_t t = MemRead( &item->gpuZoneBegin.cpuTime ); + int64_t dt = t - refThread; + refThread = t; + MemWrite( &item->gpuZoneBegin.cpuTime, dt ); + break; + } + case QueueType::GpuZoneBeginAllocSrcLoc: + case QueueType::GpuZoneBeginAllocSrcLocCallstack: + { + int64_t t = MemRead( &item->gpuZoneBegin.cpuTime ); + int64_t dt = t - refThread; + refThread = t; + MemWrite( &item->gpuZoneBegin.cpuTime, dt ); + ptr = MemRead( &item->gpuZoneBegin.srcloc ); + SendSourceLocationPayload( ptr ); + tracy_free( (void*)ptr ); + break; + } + case QueueType::GpuZoneEnd: + { + int64_t t = MemRead( &item->gpuZoneEnd.cpuTime ); + int64_t dt = t - refThread; + refThread = t; + MemWrite( &item->gpuZoneEnd.cpuTime, dt ); + break; + } + case QueueType::GpuContextName: + ptr = MemRead( &item->gpuContextNameFat.ptr ); + size = MemRead( &item->gpuContextNameFat.size ); + SendSingleString( (const char*)ptr, size ); +#ifndef TRACY_ON_DEMAND + tracy_free( (void*)ptr ); +#endif + break; + case QueueType::PlotData: + { + int64_t t = MemRead( &item->plotData.time ); + int64_t dt = t - refThread; + refThread = t; + MemWrite( &item->plotData.time, dt ); + break; + } + case QueueType::ContextSwitch: + { + int64_t t = MemRead( &item->contextSwitch.time ); + int64_t dt = t - refCtx; + refCtx = t; + MemWrite( &item->contextSwitch.time, dt ); + break; + } + case QueueType::ThreadWakeup: + { + int64_t t = MemRead( &item->threadWakeup.time ); + int64_t dt = t - refCtx; + refCtx = t; + MemWrite( &item->threadWakeup.time, dt ); + break; + } + case QueueType::GpuTime: + { + int64_t t = MemRead( &item->gpuTime.gpuTime ); + int64_t dt = t - refGpu; + refGpu = t; + MemWrite( &item->gpuTime.gpuTime, dt ); + break; + } + default: + assert( false ); + break; + } + } + if( !AppendData( item++, QueueDataSize[idx] ) ) + { + connectionLost = true; + m_refTimeThread = refThread; + m_refTimeCtx = refCtx; + m_refTimeGpu = refGpu; + return; + } + } + m_refTimeThread = refThread; + m_refTimeCtx = refCtx; + m_refTimeGpu = refGpu; + } + ); + if( connectionLost ) return DequeueStatus::ConnectionLost; + return sz > 0 ? 
DequeueStatus::DataDequeued : DequeueStatus::QueueEmpty; +} + +Profiler::DequeueStatus Profiler::DequeueContextSwitches( tracy::moodycamel::ConsumerToken& token, int64_t& timeStop ) +{ + const auto sz = GetQueue().try_dequeue_bulk_single( token, [] ( const uint64_t& ) {}, + [this, &timeStop] ( QueueItem* item, size_t sz ) + { + assert( sz > 0 ); + int64_t refCtx = m_refTimeCtx; + while( sz-- > 0 ) + { + FreeAssociatedMemory( *item ); + if( timeStop < 0 ) return; + const auto idx = MemRead( &item->hdr.idx ); + if( idx == (uint8_t)QueueType::ContextSwitch ) + { + const auto csTime = MemRead( &item->contextSwitch.time ); + if( csTime > timeStop ) + { + timeStop = -1; + m_refTimeCtx = refCtx; + return; + } + int64_t dt = csTime - refCtx; + refCtx = csTime; + MemWrite( &item->contextSwitch.time, dt ); + if( !AppendData( item, QueueDataSize[(int)QueueType::ContextSwitch] ) ) + { + timeStop = -2; + m_refTimeCtx = refCtx; + return; + } + } + else if( idx == (uint8_t)QueueType::ThreadWakeup ) + { + const auto csTime = MemRead( &item->threadWakeup.time ); + if( csTime > timeStop ) + { + timeStop = -1; + m_refTimeCtx = refCtx; + return; + } + int64_t dt = csTime - refCtx; + refCtx = csTime; + MemWrite( &item->threadWakeup.time, dt ); + if( !AppendData( item, QueueDataSize[(int)QueueType::ThreadWakeup] ) ) + { + timeStop = -2; + m_refTimeCtx = refCtx; + return; + } + } + item++; + } + m_refTimeCtx = refCtx; + } + ); + + if( timeStop == -2 ) return DequeueStatus::ConnectionLost; + return ( timeStop == -1 || sz > 0 ) ? DequeueStatus::DataDequeued : DequeueStatus::QueueEmpty; +} + +Profiler::DequeueStatus Profiler::DequeueSerial() +{ + { + bool lockHeld = true; + while( !m_serialLock.try_lock() ) + { + if( m_shutdownManual.load( std::memory_order_relaxed ) ) + { + lockHeld = false; + break; + } + } + if( !m_serialQueue.empty() ) m_serialQueue.swap( m_serialDequeue ); + if( lockHeld ) + { + m_serialLock.unlock(); + } + } + + const auto sz = m_serialDequeue.size(); + if( sz > 0 ) + { + int64_t refSerial = m_refTimeSerial; + int64_t refGpu = m_refTimeGpu; + auto item = m_serialDequeue.data(); + auto end = item + sz; + while( item != end ) + { + uint64_t ptr; + auto idx = MemRead( &item->hdr.idx ); + if( idx < (int)QueueType::Terminate ) + { + switch( (QueueType)idx ) + { + case QueueType::CallstackSerial: + ptr = MemRead( &item->callstackFat.ptr ); + SendCallstackPayload( ptr ); + tracy_free( (void*)ptr ); + break; + case QueueType::LockWait: + case QueueType::LockSharedWait: + { + int64_t t = MemRead( &item->lockWait.time ); + int64_t dt = t - refSerial; + refSerial = t; + MemWrite( &item->lockWait.time, dt ); + break; + } + case QueueType::LockObtain: + case QueueType::LockSharedObtain: + { + int64_t t = MemRead( &item->lockObtain.time ); + int64_t dt = t - refSerial; + refSerial = t; + MemWrite( &item->lockObtain.time, dt ); + break; + } + case QueueType::LockRelease: + case QueueType::LockSharedRelease: + { + int64_t t = MemRead( &item->lockRelease.time ); + int64_t dt = t - refSerial; + refSerial = t; + MemWrite( &item->lockRelease.time, dt ); + break; + } + case QueueType::LockName: + { + ptr = MemRead( &item->lockNameFat.name ); + uint16_t size = MemRead( &item->lockNameFat.size ); + SendSingleString( (const char*)ptr, size ); +#ifndef TRACY_ON_DEMAND + tracy_free( (void*)ptr ); +#endif + break; + } + case QueueType::MemAlloc: + case QueueType::MemAllocNamed: + case QueueType::MemAllocCallstack: + case QueueType::MemAllocCallstackNamed: + { + int64_t t = MemRead( &item->memAlloc.time ); + int64_t dt 
= t - refSerial; + refSerial = t; + MemWrite( &item->memAlloc.time, dt ); + break; + } + case QueueType::MemFree: + case QueueType::MemFreeNamed: + case QueueType::MemFreeCallstack: + case QueueType::MemFreeCallstackNamed: + { + int64_t t = MemRead( &item->memFree.time ); + int64_t dt = t - refSerial; + refSerial = t; + MemWrite( &item->memFree.time, dt ); + break; + } + case QueueType::GpuZoneBeginSerial: + case QueueType::GpuZoneBeginCallstackSerial: + { + int64_t t = MemRead( &item->gpuZoneBegin.cpuTime ); + int64_t dt = t - refSerial; + refSerial = t; + MemWrite( &item->gpuZoneBegin.cpuTime, dt ); + break; + } + case QueueType::GpuZoneBeginAllocSrcLocSerial: + case QueueType::GpuZoneBeginAllocSrcLocCallstackSerial: + { + int64_t t = MemRead( &item->gpuZoneBegin.cpuTime ); + int64_t dt = t - refSerial; + refSerial = t; + MemWrite( &item->gpuZoneBegin.cpuTime, dt ); + ptr = MemRead( &item->gpuZoneBegin.srcloc ); + SendSourceLocationPayload( ptr ); + tracy_free( (void*)ptr ); + break; + } + case QueueType::GpuZoneEndSerial: + { + int64_t t = MemRead( &item->gpuZoneEnd.cpuTime ); + int64_t dt = t - refSerial; + refSerial = t; + MemWrite( &item->gpuZoneEnd.cpuTime, dt ); + break; + } + case QueueType::GpuTime: + { + int64_t t = MemRead( &item->gpuTime.gpuTime ); + int64_t dt = t - refGpu; + refGpu = t; + MemWrite( &item->gpuTime.gpuTime, dt ); + break; + } + case QueueType::GpuContextName: + { + ptr = MemRead( &item->gpuContextNameFat.ptr ); + uint16_t size = MemRead( &item->gpuContextNameFat.size ); + SendSingleString( (const char*)ptr, size ); +#ifndef TRACY_ON_DEMAND + tracy_free( (void*)ptr ); +#endif + break; + } + default: + assert( false ); + break; + } + } + if( !AppendData( item, QueueDataSize[idx] ) ) return DequeueStatus::ConnectionLost; + item++; + } + m_refTimeSerial = refSerial; + m_refTimeGpu = refGpu; + m_serialDequeue.clear(); + } + else + { + return DequeueStatus::QueueEmpty; + } + return DequeueStatus::DataDequeued; +} + +bool Profiler::CommitData() +{ + bool ret = SendData( m_buffer + m_bufferStart, m_bufferOffset - m_bufferStart ); + if( m_bufferOffset > TargetFrameSize * 2 ) m_bufferOffset = 0; + m_bufferStart = m_bufferOffset; + return ret; +} + +bool Profiler::SendData( const char* data, size_t len ) +{ + const lz4sz_t lz4sz = LZ4_compress_fast_continue( (LZ4_stream_t*)m_stream, data, m_lz4Buf + sizeof( lz4sz_t ), (int)len, LZ4Size, 1 ); + memcpy( m_lz4Buf, &lz4sz, sizeof( lz4sz ) ); + return m_sock->Send( m_lz4Buf, lz4sz + sizeof( lz4sz_t ) ) != -1; +} + +void Profiler::SendString( uint64_t str, const char* ptr, size_t len, QueueType type ) +{ + assert( type == QueueType::StringData || + type == QueueType::ThreadName || + type == QueueType::PlotName || + type == QueueType::FrameName || + type == QueueType::ExternalName || + type == QueueType::ExternalThreadName ); + + QueueItem item; + MemWrite( &item.hdr.type, type ); + MemWrite( &item.stringTransfer.ptr, str ); + + assert( len <= std::numeric_limits::max() ); + auto l16 = uint16_t( len ); + + NeedDataSize( QueueDataSize[(int)type] + sizeof( l16 ) + l16 ); + + AppendDataUnsafe( &item, QueueDataSize[(int)type] ); + AppendDataUnsafe( &l16, sizeof( l16 ) ); + AppendDataUnsafe( ptr, l16 ); +} + +void Profiler::SendSingleString( const char* ptr, size_t len ) +{ + QueueItem item; + MemWrite( &item.hdr.type, QueueType::SingleStringData ); + + assert( len <= std::numeric_limits::max() ); + auto l16 = uint16_t( len ); + + NeedDataSize( QueueDataSize[(int)QueueType::SingleStringData] + sizeof( l16 ) + l16 ); + + 
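+    // Editorial sketch of the wire layout produced below (not upstream text).
+    // NeedDataSize() above has already flushed the buffer through CommitData()
+    // if this record would not fit; the record itself is
+    //
+    //     [ QueueItem header (SingleStringData) ][ uint16 len ][ len bytes of text ]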
AppendDataUnsafe( &item, QueueDataSize[(int)QueueType::SingleStringData] ); + AppendDataUnsafe( &l16, sizeof( l16 ) ); + AppendDataUnsafe( ptr, l16 ); +} + +void Profiler::SendSecondString( const char* ptr, size_t len ) +{ + QueueItem item; + MemWrite( &item.hdr.type, QueueType::SecondStringData ); + + assert( len <= std::numeric_limits::max() ); + auto l16 = uint16_t( len ); + + NeedDataSize( QueueDataSize[(int)QueueType::SecondStringData] + sizeof( l16 ) + l16 ); + + AppendDataUnsafe( &item, QueueDataSize[(int)QueueType::SecondStringData] ); + AppendDataUnsafe( &l16, sizeof( l16 ) ); + AppendDataUnsafe( ptr, l16 ); +} + +void Profiler::SendLongString( uint64_t str, const char* ptr, size_t len, QueueType type ) +{ + assert( type == QueueType::FrameImageData || + type == QueueType::SymbolCode || + type == QueueType::SourceCode ); + + QueueItem item; + MemWrite( &item.hdr.type, type ); + MemWrite( &item.stringTransfer.ptr, str ); + + assert( len <= std::numeric_limits::max() ); + assert( QueueDataSize[(int)type] + sizeof( uint32_t ) + len <= TargetFrameSize ); + auto l32 = uint32_t( len ); + + NeedDataSize( QueueDataSize[(int)type] + sizeof( l32 ) + l32 ); + + AppendDataUnsafe( &item, QueueDataSize[(int)type] ); + AppendDataUnsafe( &l32, sizeof( l32 ) ); + AppendDataUnsafe( ptr, l32 ); +} + +void Profiler::SendSourceLocation( uint64_t ptr ) +{ + auto srcloc = (const SourceLocationData*)ptr; + QueueItem item; + MemWrite( &item.hdr.type, QueueType::SourceLocation ); + MemWrite( &item.srcloc.name, (uint64_t)srcloc->name ); + MemWrite( &item.srcloc.file, (uint64_t)srcloc->file ); + MemWrite( &item.srcloc.function, (uint64_t)srcloc->function ); + MemWrite( &item.srcloc.line, srcloc->line ); + MemWrite( &item.srcloc.r, uint8_t( ( srcloc->color ) & 0xFF ) ); + MemWrite( &item.srcloc.g, uint8_t( ( srcloc->color >> 8 ) & 0xFF ) ); + MemWrite( &item.srcloc.b, uint8_t( ( srcloc->color >> 16 ) & 0xFF ) ); + AppendData( &item, QueueDataSize[(int)QueueType::SourceLocation] ); +} + +void Profiler::SendSourceLocationPayload( uint64_t _ptr ) +{ + auto ptr = (const char*)_ptr; + + QueueItem item; + MemWrite( &item.hdr.type, QueueType::SourceLocationPayload ); + MemWrite( &item.stringTransfer.ptr, _ptr ); + + uint16_t len; + memcpy( &len, ptr, sizeof( len ) ); + assert( len > 2 ); + len -= 2; + ptr += 2; + + NeedDataSize( QueueDataSize[(int)QueueType::SourceLocationPayload] + sizeof( len ) + len ); + + AppendDataUnsafe( &item, QueueDataSize[(int)QueueType::SourceLocationPayload] ); + AppendDataUnsafe( &len, sizeof( len ) ); + AppendDataUnsafe( ptr, len ); +} + +void Profiler::SendCallstackPayload( uint64_t _ptr ) +{ + auto ptr = (uintptr_t*)_ptr; + + QueueItem item; + MemWrite( &item.hdr.type, QueueType::CallstackPayload ); + MemWrite( &item.stringTransfer.ptr, _ptr ); + + const auto sz = *ptr++; + const auto len = sz * sizeof( uint64_t ); + const auto l16 = uint16_t( len ); + + NeedDataSize( QueueDataSize[(int)QueueType::CallstackPayload] + sizeof( l16 ) + l16 ); + + AppendDataUnsafe( &item, QueueDataSize[(int)QueueType::CallstackPayload] ); + AppendDataUnsafe( &l16, sizeof( l16 ) ); + + if( compile_time_condition::value ) + { + AppendDataUnsafe( ptr, sizeof( uint64_t ) * sz ); + } + else + { + for( uintptr_t i=0; iRead( &payload, sizeof( payload ), 10 ) ) return false; + + uint8_t type; + uint64_t ptr; + uint32_t extra; + memcpy( &type, &payload.type, sizeof( payload.type ) ); + memcpy( &ptr, &payload.ptr, sizeof( payload.ptr ) ); + memcpy( &extra, &payload.extra, sizeof( payload.extra ) ); + + switch( type 
) + { + case ServerQueryString: + SendString( ptr, (const char*)ptr, QueueType::StringData ); + break; + case ServerQueryThreadString: + if( ptr == m_mainThread ) + { + SendString( ptr, "Main thread", 11, QueueType::ThreadName ); + } + else + { + SendString( ptr, GetThreadName( ptr ), QueueType::ThreadName ); + } + break; + case ServerQuerySourceLocation: + SendSourceLocation( ptr ); + break; + case ServerQueryPlotName: + SendString( ptr, (const char*)ptr, QueueType::PlotName ); + break; + case ServerQueryTerminate: + return false; + case ServerQueryCallstackFrame: + SendCallstackFrame( ptr ); + break; + case ServerQueryFrameName: + SendString( ptr, (const char*)ptr, QueueType::FrameName ); + break; + case ServerQueryDisconnect: + HandleDisconnect(); + return false; +#ifdef TRACY_HAS_SYSTEM_TRACING + case ServerQueryExternalName: + SysTraceSendExternalName( ptr ); + break; +#endif + case ServerQueryParameter: + HandleParameter( ptr ); + break; + case ServerQuerySymbol: + HandleSymbolQuery( ptr ); + break; +#ifndef TRACY_NO_CODE_TRANSFER + case ServerQuerySymbolCode: + HandleSymbolCodeQuery( ptr, extra ); + break; +#endif + case ServerQueryCodeLocation: + SendCodeLocation( ptr ); + break; + case ServerQuerySourceCode: + HandleSourceCodeQuery(); + break; + case ServerQueryDataTransfer: + assert( !m_queryData ); + m_queryDataPtr = m_queryData = (char*)tracy_malloc( ptr + 11 ); + AckServerQuery(); + break; + case ServerQueryDataTransferPart: + memcpy( m_queryDataPtr, &ptr, 8 ); + memcpy( m_queryDataPtr+8, &extra, 4 ); + m_queryDataPtr += 12; + AckServerQuery(); + break; + default: + assert( false ); + break; + } + + return true; +} + +void Profiler::HandleDisconnect() +{ + moodycamel::ConsumerToken token( GetQueue() ); + +#ifdef TRACY_HAS_SYSTEM_TRACING + if( s_sysTraceThread ) + { + auto timestamp = GetTime(); + for(;;) + { + const auto status = DequeueContextSwitches( token, timestamp ); + if( status == DequeueStatus::ConnectionLost ) + { + return; + } + else if( status == DequeueStatus::QueueEmpty ) + { + if( m_bufferOffset != m_bufferStart ) + { + if( !CommitData() ) return; + } + } + if( timestamp < 0 ) + { + if( m_bufferOffset != m_bufferStart ) + { + if( !CommitData() ) return; + } + break; + } + ClearSerial(); + if( m_sock->HasData() ) + { + while( m_sock->HasData() ) + { + if( !HandleServerQuery() ) return; + } + if( m_bufferOffset != m_bufferStart ) + { + if( !CommitData() ) return; + } + } + else + { + if( m_bufferOffset != m_bufferStart ) + { + if( !CommitData() ) return; + } + std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + } + } + } +#endif + + QueueItem terminate; + MemWrite( &terminate.hdr.type, QueueType::Terminate ); + if( !SendData( (const char*)&terminate, 1 ) ) return; + for(;;) + { + ClearQueues( token ); + if( m_sock->HasData() ) + { + while( m_sock->HasData() ) + { + if( !HandleServerQuery() ) return; + } + if( m_bufferOffset != m_bufferStart ) + { + if( !CommitData() ) return; + } + } + else + { + if( m_bufferOffset != m_bufferStart ) + { + if( !CommitData() ) return; + } + std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + } + } +} + +void Profiler::CalibrateTimer() +{ +#ifdef TRACY_HW_TIMER + std::atomic_signal_fence( std::memory_order_acq_rel ); + const auto t0 = std::chrono::high_resolution_clock::now(); + const auto r0 = GetTime(); + std::atomic_signal_fence( std::memory_order_acq_rel ); + std::this_thread::sleep_for( std::chrono::milliseconds( 200 ) ); + std::atomic_signal_fence( std::memory_order_acq_rel ); + const auto t1 = 
std::chrono::high_resolution_clock::now(); + const auto r1 = GetTime(); + std::atomic_signal_fence( std::memory_order_acq_rel ); + + const auto dt = std::chrono::duration_cast( t1 - t0 ).count(); + const auto dr = r1 - r0; + + m_timerMul = double( dt ) / double( dr ); +#else + m_timerMul = 1.; +#endif +} + +void Profiler::CalibrateDelay() +{ + constexpr int Iterations = 50000; + + auto mindiff = std::numeric_limits::max(); + for( int i=0; i 0 && dti < mindiff ) mindiff = dti; + } + m_resolution = mindiff; + +#ifdef TRACY_DELAYED_INIT + m_delay = m_resolution; +#else + constexpr int Events = Iterations * 2; // start + end + static_assert( Events < QueuePrealloc, "Delay calibration loop will allocate memory in queue" ); + + static const tracy::SourceLocationData __tracy_source_location { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; + const auto t0 = GetTime(); + for( int i=0; izoneBegin.time, Profiler::GetTime() ); + MemWrite( &item->zoneBegin.srcloc, (uint64_t)&__tracy_source_location ); + TracyLfqCommit; + } + { + TracyLfqPrepare( QueueType::ZoneEnd ); + MemWrite( &item->zoneEnd.time, GetTime() ); + TracyLfqCommit; + } + } + const auto t1 = GetTime(); + const auto dt = t1 - t0; + m_delay = dt / Events; + + moodycamel::ConsumerToken token( GetQueue() ); + int left = Events; + while( left != 0 ) + { + const auto sz = GetQueue().try_dequeue_bulk_single( token, [](const uint64_t&){}, [](QueueItem* item, size_t sz){} ); + assert( sz > 0 ); + left -= (int)sz; + } + assert( GetQueue().size_approx() == 0 ); +#endif +} + +void Profiler::ReportTopology() +{ +#ifndef TRACY_DELAYED_INIT + struct CpuData + { + uint32_t package; + uint32_t core; + uint32_t thread; + }; + +#if defined _WIN32 || defined __CYGWIN__ + t_GetLogicalProcessorInformationEx _GetLogicalProcessorInformationEx = (t_GetLogicalProcessorInformationEx)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "GetLogicalProcessorInformationEx" ); + if( !_GetLogicalProcessorInformationEx ) return; + + DWORD psz = 0; + _GetLogicalProcessorInformationEx( RelationProcessorPackage, nullptr, &psz ); + auto packageInfo = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)tracy_malloc( psz ); + auto res = _GetLogicalProcessorInformationEx( RelationProcessorPackage, packageInfo, &psz ); + assert( res ); + + DWORD csz = 0; + _GetLogicalProcessorInformationEx( RelationProcessorCore, nullptr, &csz ); + auto coreInfo = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)tracy_malloc( csz ); + res = _GetLogicalProcessorInformationEx( RelationProcessorCore, coreInfo, &csz ); + assert( res ); + + SYSTEM_INFO sysinfo; + GetSystemInfo( &sysinfo ); + const uint32_t numcpus = sysinfo.dwNumberOfProcessors; + + auto cpuData = (CpuData*)tracy_malloc( sizeof( CpuData ) * numcpus ); + for( uint32_t i=0; iRelationship == RelationProcessorPackage ); + // FIXME account for GroupCount + auto mask = ptr->Processor.GroupMask[0].Mask; + int core = 0; + while( mask != 0 ) + { + if( mask & 1 ) cpuData[core].package = idx; + core++; + mask >>= 1; + } + ptr = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)(((char*)ptr) + ptr->Size); + idx++; + } + + idx = 0; + ptr = coreInfo; + while( (char*)ptr < ((char*)coreInfo) + csz ) + { + assert( ptr->Relationship == RelationProcessorCore ); + // FIXME account for GroupCount + auto mask = ptr->Processor.GroupMask[0].Mask; + int core = 0; + while( mask != 0 ) + { + if( mask & 1 ) cpuData[core].core = idx; + core++; + mask >>= 1; + } + ptr = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)(((char*)ptr) + ptr->Size); + idx++; + } + + for( uint32_t i=0; 
icpuTopology.package, data.package ); + MemWrite( &item->cpuTopology.core, data.core ); + MemWrite( &item->cpuTopology.thread, data.thread ); + +#ifdef TRACY_ON_DEMAND + DeferItem( *item ); +#endif + + TracyLfqCommit; + } + + tracy_free( cpuData ); + tracy_free( coreInfo ); + tracy_free( packageInfo ); +#elif defined __linux__ + const int numcpus = std::thread::hardware_concurrency(); + auto cpuData = (CpuData*)tracy_malloc( sizeof( CpuData ) * numcpus ); + memset( cpuData, 0, sizeof( CpuData ) * numcpus ); + + const char* basePath = "/sys/devices/system/cpu/cpu"; + for( int i=0; icpuTopology.package, data.package ); + MemWrite( &item->cpuTopology.core, data.core ); + MemWrite( &item->cpuTopology.thread, data.thread ); + +#ifdef TRACY_ON_DEMAND + DeferItem( *item ); +#endif + + TracyLfqCommit; + } + + tracy_free( cpuData ); +#endif +#endif +} + +void Profiler::SendFrameMark( const char* name ) +{ + if( !name ) GetProfiler().m_frameCount.fetch_add( 1, std::memory_order_relaxed ); +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + TracyLfqPrepare( QueueType::FrameMarkMsg ); + MemWrite( &item->frameMark.time, GetTime() ); + MemWrite( &item->frameMark.name, uint64_t( name ) ); + TracyLfqCommit; +} + +void Profiler::SendFrameMark( const char* name, QueueType type ) +{ + assert( type == QueueType::FrameMarkMsgStart || type == QueueType::FrameMarkMsgEnd ); +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + auto item = QueueSerial(); + MemWrite( &item->hdr.type, type ); + MemWrite( &item->frameMark.time, GetTime() ); + MemWrite( &item->frameMark.name, uint64_t( name ) ); + QueueSerialFinish(); +} + +void Profiler::PlotData( const char* name, int64_t val ) +{ +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + TracyLfqPrepare( QueueType::PlotData ); + MemWrite( &item->plotData.name, (uint64_t)name ); + MemWrite( &item->plotData.time, GetTime() ); + MemWrite( &item->plotData.type, PlotDataType::Int ); + MemWrite( &item->plotData.data.i, val ); + TracyLfqCommit; +} + +void Profiler::PlotData( const char* name, float val ) +{ +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + TracyLfqPrepare( QueueType::PlotData ); + MemWrite( &item->plotData.name, (uint64_t)name ); + MemWrite( &item->plotData.time, GetTime() ); + MemWrite( &item->plotData.type, PlotDataType::Float ); + MemWrite( &item->plotData.data.f, val ); + TracyLfqCommit; +} + +void Profiler::PlotData( const char* name, double val ) +{ +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + TracyLfqPrepare( QueueType::PlotData ); + MemWrite( &item->plotData.name, (uint64_t)name ); + MemWrite( &item->plotData.time, GetTime() ); + MemWrite( &item->plotData.type, PlotDataType::Double ); + MemWrite( &item->plotData.data.d, val ); + TracyLfqCommit; +} + +void Profiler::ConfigurePlot( const char* name, PlotFormatType type ) +{ + TracyLfqPrepare( QueueType::PlotConfig ); + MemWrite( &item->plotConfig.name, (uint64_t)name ); + MemWrite( &item->plotConfig.type, (uint8_t)type ); + +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + + TracyLfqCommit; +} + + void Profiler::Message( const char* txt, size_t size, int callstack ) +{ + assert( size < std::numeric_limits::max() ); +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + if( callstack != 0 ) + { + InitRPMallocThread(); + tracy::GetProfiler().SendCallstack( callstack ); + } + + TracyLfqPrepare( callstack == 0 ? 
QueueType::Message : QueueType::MessageCallstack ); + auto ptr = (char*)tracy_malloc( size ); + memcpy( ptr, txt, size ); + MemWrite( &item->messageFat.time, GetTime() ); + MemWrite( &item->messageFat.text, (uint64_t)ptr ); + MemWrite( &item->messageFat.size, (uint16_t)size ); + TracyLfqCommit; +} + +void Profiler::Message( const char* txt, int callstack ) +{ +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + if( callstack != 0 ) + { + InitRPMallocThread(); + tracy::GetProfiler().SendCallstack( callstack ); + } + + TracyLfqPrepare( callstack == 0 ? QueueType::MessageLiteral : QueueType::MessageLiteralCallstack ); + MemWrite( &item->messageLiteral.time, GetTime() ); + MemWrite( &item->messageLiteral.text, (uint64_t)txt ); + TracyLfqCommit; +} + +void Profiler::MessageColor( const char* txt, size_t size, uint32_t color, int callstack ) +{ + assert( size < std::numeric_limits::max() ); +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + if( callstack != 0 ) + { + InitRPMallocThread(); + tracy::GetProfiler().SendCallstack( callstack ); + } + + TracyLfqPrepare( callstack == 0 ? QueueType::MessageColor : QueueType::MessageColorCallstack ); + auto ptr = (char*)tracy_malloc( size ); + memcpy( ptr, txt, size ); + MemWrite( &item->messageColorFat.time, GetTime() ); + MemWrite( &item->messageColorFat.text, (uint64_t)ptr ); + MemWrite( &item->messageColorFat.r, uint8_t( ( color ) & 0xFF ) ); + MemWrite( &item->messageColorFat.g, uint8_t( ( color >> 8 ) & 0xFF ) ); + MemWrite( &item->messageColorFat.b, uint8_t( ( color >> 16 ) & 0xFF ) ); + MemWrite( &item->messageColorFat.size, (uint16_t)size ); + TracyLfqCommit; +} + +void Profiler::MessageColor( const char* txt, uint32_t color, int callstack ) +{ +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + if( callstack != 0 ) + { + InitRPMallocThread(); + tracy::GetProfiler().SendCallstack( callstack ); + } + + TracyLfqPrepare( callstack == 0 ? 
QueueType::MessageLiteralColor : QueueType::MessageLiteralColorCallstack ); + MemWrite( &item->messageColorLiteral.time, GetTime() ); + MemWrite( &item->messageColorLiteral.text, (uint64_t)txt ); + MemWrite( &item->messageColorLiteral.r, uint8_t( ( color ) & 0xFF ) ); + MemWrite( &item->messageColorLiteral.g, uint8_t( ( color >> 8 ) & 0xFF ) ); + MemWrite( &item->messageColorLiteral.b, uint8_t( ( color >> 16 ) & 0xFF ) ); + TracyLfqCommit; +} + +void Profiler::MessageAppInfo( const char* txt, size_t size ) +{ + assert( size < std::numeric_limits::max() ); + InitRPMallocThread(); + auto ptr = (char*)tracy_malloc( size ); + memcpy( ptr, txt, size ); + TracyLfqPrepare( QueueType::MessageAppInfo ); + MemWrite( &item->messageFat.time, GetTime() ); + MemWrite( &item->messageFat.text, (uint64_t)ptr ); + MemWrite( &item->messageFat.size, (uint16_t)size ); + +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + + TracyLfqCommit; +} + +void Profiler::MemAlloc(const void* ptr, size_t size, bool secure) +{ + if( secure && !ProfilerAvailable() ) return; +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + const auto thread = GetThreadHandle(); + + GetProfiler().m_serialLock.lock(); + SendMemAlloc( QueueType::MemAlloc, thread, ptr, size ); + GetProfiler().m_serialLock.unlock(); +} + +void Profiler::MemFree( const void* ptr, bool secure ) +{ + if( secure && !ProfilerAvailable() ) return; +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + const auto thread = GetThreadHandle(); + + GetProfiler().m_serialLock.lock(); + SendMemFree( QueueType::MemFree, thread, ptr ); + GetProfiler().m_serialLock.unlock(); +} + +void Profiler::MemAllocCallstack( const void* ptr, size_t size, int depth, bool secure ) +{ + if( secure && !ProfilerAvailable() ) return; +#ifdef TRACY_HAS_CALLSTACK + auto& profiler = GetProfiler(); +# ifdef TRACY_ON_DEMAND + if( !profiler.IsConnected() ) return; +# endif + const auto thread = GetThreadHandle(); + + InitRPMallocThread(); + auto callstack = Callstack( depth ); + + profiler.m_serialLock.lock(); + SendCallstackSerial( callstack ); + SendMemAlloc( QueueType::MemAllocCallstack, thread, ptr, size ); + profiler.m_serialLock.unlock(); +#else + MemAlloc( ptr, size, secure ); +#endif +} + +void Profiler::MemFreeCallstack( const void* ptr, int depth, bool secure ) +{ + if( secure && !ProfilerAvailable() ) return; +#ifdef TRACY_HAS_CALLSTACK + auto& profiler = GetProfiler(); +# ifdef TRACY_ON_DEMAND + if( !profiler.IsConnected() ) return; +# endif + const auto thread = GetThreadHandle(); + + InitRPMallocThread(); + auto callstack = Callstack( depth ); + + profiler.m_serialLock.lock(); + SendCallstackSerial( callstack ); + SendMemFree( QueueType::MemFreeCallstack, thread, ptr ); + profiler.m_serialLock.unlock(); +#else + MemFree( ptr, secure ); +#endif +} + +void Profiler::MemAllocNamed( const void* ptr, size_t size, bool secure, const char* name ) +{ + if( secure && !ProfilerAvailable() ) return; +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + const auto thread = GetThreadHandle(); + + GetProfiler().m_serialLock.lock(); + SendMemName( name ); + SendMemAlloc( QueueType::MemAllocNamed, thread, ptr, size ); + GetProfiler().m_serialLock.unlock(); +} + +void Profiler::MemFreeNamed( const void* ptr, bool secure, const char* name ) +{ + if( secure && !ProfilerAvailable() ) return; +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + const auto thread = GetThreadHandle(); + + 
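+    // Editorial note: every memory event funnels through m_serialLock so that
+    // allocations and frees keep a single global order across threads; reordered
+    // alloc/free pairs would desynchronize the server's view of live memory. A
+    // hedged usage sketch with Tracy's public macros (TracyAlloc/TracyFree
+    // forward to MemAlloc/MemFree; illustrative, not part of this file):
+    //
+    //     void* operator new( std::size_t count )
+    //     {
+    //         void* ptr = malloc( count );
+    //         TracyAlloc( ptr, count );
+    //         return ptr;
+    //     }
+    //     void operator delete( void* ptr ) noexcept
+    //     {
+    //         TracyFree( ptr );
+    //         free( ptr );
+    //     }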
GetProfiler().m_serialLock.lock(); + SendMemName( name ); + SendMemFree( QueueType::MemFreeNamed, thread, ptr ); + GetProfiler().m_serialLock.unlock(); +} + +void Profiler::MemAllocCallstackNamed( const void* ptr, size_t size, int depth, bool secure, const char* name ) +{ + if( secure && !ProfilerAvailable() ) return; +#ifdef TRACY_HAS_CALLSTACK + auto& profiler = GetProfiler(); +# ifdef TRACY_ON_DEMAND + if( !profiler.IsConnected() ) return; +# endif + const auto thread = GetThreadHandle(); + + InitRPMallocThread(); + auto callstack = Callstack( depth ); + + profiler.m_serialLock.lock(); + SendCallstackSerial( callstack ); + SendMemName( name ); + SendMemAlloc( QueueType::MemAllocCallstackNamed, thread, ptr, size ); + profiler.m_serialLock.unlock(); +#else + MemAlloc( ptr, size, secure ); +#endif +} + +void Profiler::MemFreeCallstackNamed( const void* ptr, int depth, bool secure, const char* name ) +{ + if( secure && !ProfilerAvailable() ) return; +#ifdef TRACY_HAS_CALLSTACK + auto& profiler = GetProfiler(); +# ifdef TRACY_ON_DEMAND + if( !profiler.IsConnected() ) return; +# endif + const auto thread = GetThreadHandle(); + + InitRPMallocThread(); + auto callstack = Callstack( depth ); + + profiler.m_serialLock.lock(); + SendCallstackSerial( callstack ); + SendMemName( name ); + SendMemFree( QueueType::MemFreeCallstackNamed, thread, ptr ); + profiler.m_serialLock.unlock(); +#else + MemFree( ptr, secure ); +#endif +} + +void Profiler::SendCallstack( int depth ) +{ +#ifdef TRACY_HAS_CALLSTACK + auto ptr = Callstack( depth ); + TracyLfqPrepare( QueueType::Callstack ); + MemWrite( &item->callstackFat.ptr, (uint64_t)ptr ); + TracyLfqCommit; +#endif +} + +void Profiler::ParameterRegister( ParameterCallback cb ) { GetProfiler().m_paramCallback = cb; } +void Profiler::ParameterSetup( uint32_t idx, const char* name, bool isBool, int32_t val ) +{ + TracyLfqPrepare( QueueType::ParamSetup ); + tracy::MemWrite( &item->paramSetup.idx, idx ); + tracy::MemWrite( &item->paramSetup.name, (uint64_t)name ); + tracy::MemWrite( &item->paramSetup.isBool, (uint8_t)isBool ); + tracy::MemWrite( &item->paramSetup.val, val ); + +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + + TracyLfqCommit; +} + +void Profiler::SendCallstack( int depth, const char* skipBefore ) +{ +#ifdef TRACY_HAS_CALLSTACK + TracyLfqPrepare( QueueType::Callstack ); + auto ptr = Callstack( depth ); + CutCallstack( ptr, skipBefore ); + MemWrite( &item->callstackFat.ptr, (uint64_t)ptr ); + TracyLfqCommit; +#endif +} + +void Profiler::CutCallstack( void* callstack, const char* skipBefore ) +{ +#ifdef TRACY_HAS_CALLSTACK + auto data = (uintptr_t*)callstack; + const auto sz = *data++; + uintptr_t i; + for( i=0; i 100000000 ) // 100 ms + { + auto sysTime = m_sysTime.Get(); + if( sysTime >= 0 ) + { + m_sysTimeLast = t; + + TracyLfqPrepare( QueueType::SysTimeReport ); + MemWrite( &item->sysTime.time, GetTime() ); + MemWrite( &item->sysTime.sysTime, sysTime ); + TracyLfqCommit; + } + } +} +#endif + +void Profiler::HandleParameter( uint64_t payload ) +{ + assert( m_paramCallback ); + const auto idx = uint32_t( payload >> 32 ); + const auto val = int32_t( payload & 0xFFFFFFFF ); + m_paramCallback( idx, val ); + AckServerQuery(); +} + +#ifdef __ANDROID__ +// Implementation helpers of EnsureReadable(address). +// This is so far only needed on Android, where it is common for libraries to be mapped +// with only executable, not readable, permissions. 
Typical example (line from /proc/self/maps):
+/*
+746b63b000-746b6dc000 --xp 00042000 07:48 35 /apex/com.android.runtime/lib64/bionic/libc.so
+*/
+// See https://github.com/wolfpld/tracy/issues/125 .
+// To work around this, we parse /proc/self/maps and we use mprotect to set read permissions
+// on any mappings that contain symbol addresses hit by HandleSymbolCodeQuery.
+
+namespace {
+// Holds some information about a single memory mapping.
+struct MappingInfo {
+    // Start of address range. Inclusive.
+    uintptr_t start_address;
+    // End of address range. Exclusive, so the mapping is the half-open interval
+    // [start, end) and its length in bytes is `end - start`. As in /proc/self/maps.
+    uintptr_t end_address;
+    // Read/Write/Executable permissions.
+    bool perm_r, perm_w, perm_x;
+};
+}  // anonymous namespace
+
+// Internal implementation helper for LookUpMapping(address).
+//
+// Parses /proc/self/maps returning a vector<MappingInfo>.
+// /proc/self/maps is assumed to be sorted by ascending address, so the resulting
+// vector is sorted by ascending address too.
+static std::vector<MappingInfo> ParseMappings()
+{
+    std::vector<MappingInfo> result;
+    FILE* file = fopen( "/proc/self/maps", "r" );
+    if( !file ) return result;
+    char line[1024];
+    while( fgets( line, sizeof( line ), file ) )
+    {
+        uintptr_t start_addr;
+        uintptr_t end_addr;
+        if( sscanf( line, "%lx-%lx", &start_addr, &end_addr ) != 2 ) continue;
+        char* first_space = strchr( line, ' ' );
+        if( !first_space ) continue;
+        char* perm = first_space + 1;
+        char* second_space = strchr( perm, ' ' );
+        if( !second_space || second_space - perm != 4 ) continue;
+        result.emplace_back();
+        auto& mapping = result.back();
+        mapping.start_address = start_addr;
+        mapping.end_address = end_addr;
+        mapping.perm_r = perm[0] == 'r';
+        mapping.perm_w = perm[1] == 'w';
+        mapping.perm_x = perm[2] == 'x';
+    }
+    fclose( file );
+    return result;
+}
+
+// Internal implementation helper for LookUpMapping(address).
+//
+// Takes as input an `address` and a known vector<MappingInfo> `mappings`, assumed to be
+// sorted by increasing addresses, as /proc/self/maps seems to be.
+// Returns a pointer to the MappingInfo describing the mapping that this
+// address belongs to, or nullptr if the address isn't in `mappings`.
+static MappingInfo* LookUpMapping(std::vector<MappingInfo>& mappings, uintptr_t address)
+{
+    // Comparison function for std::lower_bound. Returns true if all addresses in `m1`
+    // are lower than `addr`.
+    auto Compare = []( const MappingInfo& m1, uintptr_t addr ) {
+        // '<=' because the address ranges are half-open intervals, [start, end).
+        return m1.end_address <= addr;
+    };
+    auto iter = std::lower_bound( mappings.begin(), mappings.end(), address, Compare );
+    if( iter == mappings.end() || iter->start_address > address) {
+        return nullptr;
+    }
+    return &*iter;
+}
+
+// Internal implementation helper for EnsureReadable(address).
+//
+// Takes as input an `address` and returns a pointer to a MappingInfo
+// describing the mapping that this address belongs to, or nullptr if
+// the address isn't in any known mapping.
+//
+// This function is stateful and not reentrant (assumes to be called from
+// only one thread). It holds a vector of mappings parsed from /proc/self/maps.
+//
+// Attempts to react to mappings changes by re-parsing /proc/self/maps.
+static MappingInfo* LookUpMapping(uintptr_t address)
+{
+    // Static state managed by this function. Not constant, we mutate that state as
+    // we turn some mappings readable. Initially parsed once here, updated as needed below.
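+    // Editorial example (hypothetical, not upstream code): the re-parse below
+    // exists because mappings can appear after the first parse, e.g.
+    //
+    //     void* h = dlopen( "libplugin.so", RTLD_NOW );  // maps new segments
+    //
+    // a symbol inside that library then misses the cached vector, and the
+    // second ParseMappings() pass picks the new mapping up.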
+ static std::vector s_mappings = ParseMappings(); + MappingInfo* mapping = LookUpMapping( s_mappings, address ); + if( mapping ) return mapping; + + // This address isn't in any known mapping. Try parsing again, maybe + // mappings changed. + s_mappings = ParseMappings(); + return LookUpMapping( s_mappings, address ); +} + +// Internal implementation helper for EnsureReadable(address). +// +// Attempts to make the specified `mapping` readable if it isn't already. +// Returns true if and only if the mapping is readable. +static bool EnsureReadable( MappingInfo& mapping ) +{ + if( mapping.perm_r ) + { + // The mapping is already readable. + return true; + } + int prot = PROT_READ; + if( mapping.perm_w ) prot |= PROT_WRITE; + if( mapping.perm_x ) prot |= PROT_EXEC; + if( mprotect( reinterpret_cast( mapping.start_address ), + mapping.end_address - mapping.start_address, prot ) == -1 ) + { + // Failed to make the mapping readable. Shouldn't happen, hasn't + // been observed yet. If it happened in practice, we should consider + // adding a bool to MappingInfo to track this to avoid retrying mprotect + // everytime on such mappings. + return false; + } + // The mapping is now readable. Update `mapping` so the next call will be fast. + mapping.perm_r = true; + return true; +} + +// Attempts to set the read permission on the entire mapping containing the +// specified address. Returns true if and only if the mapping is now readable. +static bool EnsureReadable( uintptr_t address ) +{ + MappingInfo* mapping = LookUpMapping(address); + return mapping && EnsureReadable( *mapping ); +} + +#endif // defined __ANDROID__ + +void Profiler::HandleSymbolQuery( uint64_t symbol ) +{ +#ifdef TRACY_HAS_CALLSTACK +#ifdef __ANDROID__ + // On Android it's common for code to be in mappings that are only executable + // but not readable. + if( !EnsureReadable( symbol ) ) + { + return; + } +#endif + const auto sym = DecodeSymbolAddress( symbol ); + + SendSingleString( sym.file ); + + QueueItem item; + MemWrite( &item.hdr.type, QueueType::SymbolInformation ); + MemWrite( &item.symbolInformation.line, sym.line ); + MemWrite( &item.symbolInformation.symAddr, symbol ); + + AppendData( &item, QueueDataSize[(int)QueueType::SymbolInformation] ); + + if( sym.needFree ) tracy_free( (void*)sym.file ); +#endif +} + +void Profiler::HandleSymbolCodeQuery( uint64_t symbol, uint32_t size ) +{ +#ifdef __ANDROID__ + // On Android it's common for code to be in mappings that are only executable + // but not readable. 
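+    // Editorial note: SendLongString() below reads `size` bytes straight from
+    // the symbol's address, which would fault on an --xp mapping, so the whole
+    // mapping is first promoted with mprotect( start, len, PROT_READ | ... )
+    // while preserving its existing write/execute bits (see EnsureReadable above).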
+    if( !EnsureReadable( symbol ) )
+    {
+        return;
+    }
+#endif
+    SendLongString( symbol, (const char*)symbol, size, QueueType::SymbolCode );
+}
+
+void Profiler::HandleSourceCodeQuery()
+{
+    assert( m_exectime != 0 );
+    assert( m_queryData );
+
+    struct stat st;
+    if( stat( m_queryData, &st ) == 0 && (uint64_t)st.st_mtime < m_exectime && st.st_size < ( TargetFrameSize - 16 ) )
+    {
+        FILE* f = fopen( m_queryData, "rb" );
+        tracy_free( m_queryData );
+        if( f )
+        {
+            auto ptr = (char*)tracy_malloc( st.st_size );
+            auto rd = fread( ptr, 1, st.st_size, f );
+            fclose( f );
+            if( rd == (size_t)st.st_size )
+            {
+                SendLongString( (uint64_t)ptr, ptr, rd, QueueType::SourceCode );
+            }
+            else
+            {
+                AckSourceCodeNotAvailable();
+            }
+            tracy_free( ptr );
+        }
+        else
+        {
+            AckSourceCodeNotAvailable();
+        }
+    }
+    else
+    {
+        tracy_free( m_queryData );
+        AckSourceCodeNotAvailable();
+    }
+    m_queryData = nullptr;
+}
+
+void Profiler::SendCodeLocation( uint64_t ptr )
+{
+#ifdef TRACY_HAS_CALLSTACK
+    const auto sym = DecodeCodeAddress( ptr );
+
+    SendSingleString( sym.file );
+
+    QueueItem item;
+    MemWrite( &item.hdr.type, QueueType::CodeInformation );
+    MemWrite( &item.codeInformation.ptr, ptr );
+    MemWrite( &item.codeInformation.line, sym.line );
+
+    AppendData( &item, QueueDataSize[(int)QueueType::CodeInformation] );
+
+    if( sym.needFree ) tracy_free( (void*)sym.file );
+#endif
+}
+
+#if ( defined _WIN32 || defined __CYGWIN__ ) && defined TRACY_TIMER_QPC
+int64_t Profiler::GetTimeQpc()
+{
+    LARGE_INTEGER t;
+    QueryPerformanceCounter( &t );
+    return t.QuadPart;
+}
+#endif
+
+}
+
+#endif
diff --git a/Source/ThirdParty/tracy/client/TracyProfiler.hpp b/Source/ThirdParty/tracy/client/TracyProfiler.hpp
new file mode 100644
index 000000000..0cec00f11
--- /dev/null
+++ b/Source/ThirdParty/tracy/client/TracyProfiler.hpp
@@ -0,0 +1,451 @@
+#ifndef __TRACYPROFILER_HPP__
+#define __TRACYPROFILER_HPP__
+
+#include <assert.h>
+#include <atomic>
+#include <stdint.h>
+#include <string.h>
+#include <time.h>
+
+#include "tracy_concurrentqueue.h"
+#include "TracyCallstack.hpp"
+#include "TracySysTime.hpp"
+#include "TracyFastVector.hpp"
+#include "../common/TracyQueue.hpp"
+#include "../common/TracyAlign.hpp"
+#include "../common/TracyAlloc.hpp"
+#include "../common/TracyMutex.hpp"
+#include "../common/TracyProtocol.hpp"
+
+#if defined _WIN32 || defined __CYGWIN__
+# include <intrin.h>
+#endif
+#ifdef __APPLE__
+# include <TargetConditionals.h>
+# include <mach/mach_time.h>
+#endif
+
+#if !defined TRACY_TIMER_FALLBACK && ( defined _WIN32 || defined __CYGWIN__ || ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) || ( defined TARGET_OS_IOS && TARGET_OS_IOS == 1 ) )
+# define TRACY_HW_TIMER
+#endif
+
+#if !defined TRACY_HW_TIMER
+# include <chrono>
+#endif
+
+namespace tracy
+{
+#if defined(TRACY_DELAYED_INIT) && defined(TRACY_MANUAL_LIFETIME)
+TRACY_API void StartupProfiler();
+TRACY_API void ShutdownProfiler();
+#endif
+
+class GpuCtx;
+class Profiler;
+class Socket;
+class UdpBroadcast;
+
+struct GpuCtxWrapper
+{
+    GpuCtx* ptr;
+};
+
+TRACY_API moodycamel::ConcurrentQueue<QueueItem>::ExplicitProducer* GetToken();
+TRACY_API Profiler& GetProfiler();
+TRACY_API std::atomic<uint32_t>& GetLockCounter();
+TRACY_API std::atomic<uint8_t>& GetGpuCtxCounter();
+TRACY_API GpuCtxWrapper& GetGpuCtx();
+TRACY_API uint64_t GetThreadHandle();
+TRACY_API void InitRPMallocThread();
+TRACY_API bool ProfilerAvailable();
+TRACY_API int64_t GetFrequencyQpc();
+
+#ifdef TRACY_ON_DEMAND
+struct LuaZoneState
+{
+    uint32_t counter;
+    bool active;
+};
+#endif
+
+
+#define TracyLfqPrepare( _type ) \
+    moodycamel::ConcurrentQueueDefaultTraits::index_t __magic; \
+    auto
__token = GetToken(); \ + auto& __tail = __token->get_tail_index(); \ + auto item = __token->enqueue_begin( __magic ); \ + MemWrite( &item->hdr.type, _type ); + +#define TracyLfqCommit \ + __tail.store( __magic + 1, std::memory_order_release ); + +#define TracyLfqPrepareC( _type ) \ + tracy::moodycamel::ConcurrentQueueDefaultTraits::index_t __magic; \ + auto __token = tracy::GetToken(); \ + auto& __tail = __token->get_tail_index(); \ + auto item = __token->enqueue_begin( __magic ); \ + tracy::MemWrite( &item->hdr.type, _type ); + +#define TracyLfqCommitC \ + __tail.store( __magic + 1, std::memory_order_release ); + + +class TRACY_API Profiler +{ + struct FrameImageQueueItem + { + void* image; + uint32_t frame; + uint16_t w; + uint16_t h; + uint8_t offset; + bool flip; + }; + +public: + Profiler(); + ~Profiler(); + + void SpawnWorkerThreads(); + + static tracy_force_inline int64_t GetTime() + { +#ifdef TRACY_HW_TIMER +# if defined TARGET_OS_IOS && TARGET_OS_IOS == 1 + return mach_absolute_time(); +# elif defined _WIN32 || defined __CYGWIN__ +# ifdef TRACY_TIMER_QPC + return GetTimeQpc(); +# else + return int64_t( __rdtsc() ); +# endif +# elif defined __i386 || defined _M_IX86 + uint32_t eax, edx; + asm volatile ( "rdtsc" : "=a" (eax), "=d" (edx) ); + return ( uint64_t( edx ) << 32 ) + uint64_t( eax ); +# elif defined __x86_64__ || defined _M_X64 + uint64_t rax, rdx; + asm volatile ( "rdtsc" : "=a" (rax), "=d" (rdx) ); + return (int64_t)(( rdx << 32 ) + rax); +# else +# error "TRACY_HW_TIMER detection logic needs fixing" +# endif +#else +# if defined __linux__ && defined CLOCK_MONOTONIC_RAW + struct timespec ts; + clock_gettime( CLOCK_MONOTONIC_RAW, &ts ); + return int64_t( ts.tv_sec ) * 1000000000ll + int64_t( ts.tv_nsec ); +# else + return std::chrono::duration_cast( std::chrono::high_resolution_clock::now().time_since_epoch() ).count(); +# endif +#endif + } + + tracy_force_inline uint32_t GetNextZoneId() + { + return m_zoneId.fetch_add( 1, std::memory_order_relaxed ); + } + + static tracy_force_inline QueueItem* QueueSerial() + { + auto& p = GetProfiler(); + p.m_serialLock.lock(); + return p.m_serialQueue.prepare_next(); + } + + static tracy_force_inline QueueItem* QueueSerialCallstack( void* ptr ) + { + auto& p = GetProfiler(); + p.m_serialLock.lock(); + p.SendCallstackSerial( ptr ); + return p.m_serialQueue.prepare_next(); + } + + static tracy_force_inline void QueueSerialFinish() + { + auto& p = GetProfiler(); + p.m_serialQueue.commit_next(); + p.m_serialLock.unlock(); + } + + static void SendFrameMark( const char* name ); + static void SendFrameMark( const char* name, QueueType type ); + static void PlotData( const char* name, int64_t val ); + static void PlotData( const char* name, float val ); + static void PlotData( const char* name, double val ); + static void ConfigurePlot( const char* name, PlotFormatType type ); + static void Message( const char* txt, size_t size, int callstack ); + static void Message( const char* txt, int callstack ); + static void MessageColor( const char* txt, size_t size, uint32_t color, int callstack ); + static void MessageColor( const char* txt, uint32_t color, int callstack ); + static void MessageAppInfo( const char* txt, size_t size ); + static void MemAlloc( const void* ptr, size_t size, bool secure ); + static void MemFree( const void* ptr, bool secure ); + static void MemAllocCallstack( const void* ptr, size_t size, int depth, bool secure ); + static void MemFreeCallstack( const void* ptr, int depth, bool secure ); + static void MemAllocNamed( 
const void* ptr, size_t size, bool secure, const char* name ); + static void MemFreeNamed( const void* ptr, bool secure, const char* name ); + static void MemAllocCallstackNamed( const void* ptr, size_t size, int depth, bool secure, const char* name ); + static void MemFreeCallstackNamed( const void* ptr, int depth, bool secure, const char* name ); + static void SendCallstack( int depth ); + static void ParameterRegister( ParameterCallback cb ); + + void SendCallstack( int depth, const char* skipBefore ); + static void CutCallstack( void* callstack, const char* skipBefore ); + + static bool ShouldExit(); + + tracy_force_inline bool IsConnected() const + { + return m_isConnected.load( std::memory_order_acquire ); + } + +#ifdef TRACY_ON_DEMAND + tracy_force_inline uint64_t ConnectionId() const + { + return m_connectionId.load( std::memory_order_acquire ); + } + + tracy_force_inline void DeferItem( const QueueItem& item ) + { + m_deferredLock.lock(); + auto dst = m_deferredQueue.push_next(); + memcpy( dst, &item, sizeof( item ) ); + m_deferredLock.unlock(); + } +#endif + + void RequestShutdown() { m_shutdown.store( true, std::memory_order_relaxed ); m_shutdownManual.store( true, std::memory_order_relaxed ); } + bool HasShutdownFinished() const { return m_shutdownFinished.load( std::memory_order_relaxed ); } + + void SendString( uint64_t str, const char* ptr, QueueType type ) { SendString( str, ptr, strlen( ptr ), type ); } + void SendString( uint64_t str, const char* ptr, size_t len, QueueType type ); + void SendSingleString( const char* ptr ) { SendSingleString( ptr, strlen( ptr ) ); } + void SendSingleString( const char* ptr, size_t len ); + void SendSecondString( const char* ptr ) { SendSecondString( ptr, strlen( ptr ) ); } + void SendSecondString( const char* ptr, size_t len ); + + + // Allocated source location data layout: + // 2b payload size + // 4b color + // 4b source line + // fsz function name + // 1b null terminator + // ssz source file name + // 1b null terminator + // nsz zone name (optional) + + static tracy_force_inline uint64_t AllocSourceLocation( uint32_t line, const char* source, const char* function ) + { + return AllocSourceLocation( line, source, function, nullptr, 0 ); + } + + static tracy_force_inline uint64_t AllocSourceLocation( uint32_t line, const char* source, const char* function, const char* name, size_t nameSz ) + { + return AllocSourceLocation( line, source, strlen(source), function, strlen(function), name, nameSz ); + } + + static tracy_force_inline uint64_t AllocSourceLocation( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz ) + { + return AllocSourceLocation( line, source, sourceSz, function, functionSz, nullptr, 0 ); + } + + static tracy_force_inline uint64_t AllocSourceLocation( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz ) + { + const auto sz32 = uint32_t( 2 + 4 + 4 + functionSz + 1 + sourceSz + 1 + nameSz ); + assert( sz32 <= std::numeric_limits::max() ); + const auto sz = uint16_t( sz32 ); + auto ptr = (char*)tracy_malloc( sz ); + memcpy( ptr, &sz, 2 ); + memset( ptr + 2, 0, 4 ); + memcpy( ptr + 6, &line, 4 ); + memcpy( ptr + 10, function, functionSz ); + ptr[10 + functionSz] = '\0'; + memcpy( ptr + 10 + functionSz + 1, source, sourceSz ); + ptr[10 + functionSz + 1 + sourceSz] = '\0'; + if( nameSz != 0 ) + { + memcpy( ptr + 10 + functionSz + 1 + sourceSz + 1, name, nameSz ); + } + return uint64_t( ptr ); + } + +private: + enum 
class DequeueStatus { DataDequeued, ConnectionLost, QueueEmpty }; + + static void LaunchWorker( void* ptr ) { ((Profiler*)ptr)->Worker(); } + void Worker(); + + void ClearQueues( tracy::moodycamel::ConsumerToken& token ); + void ClearSerial(); + DequeueStatus Dequeue( tracy::moodycamel::ConsumerToken& token ); + DequeueStatus DequeueContextSwitches( tracy::moodycamel::ConsumerToken& token, int64_t& timeStop ); + DequeueStatus DequeueSerial(); + bool CommitData(); + + tracy_force_inline bool AppendData( const void* data, size_t len ) + { + const auto ret = NeedDataSize( len ); + AppendDataUnsafe( data, len ); + return ret; + } + + tracy_force_inline bool NeedDataSize( size_t len ) + { + assert( len <= TargetFrameSize ); + bool ret = true; + if( m_bufferOffset - m_bufferStart + (int)len > TargetFrameSize ) + { + ret = CommitData(); + } + return ret; + } + + tracy_force_inline void AppendDataUnsafe( const void* data, size_t len ) + { + memcpy( m_buffer + m_bufferOffset, data, len ); + m_bufferOffset += int( len ); + } + + bool SendData( const char* data, size_t len ); + void SendLongString( uint64_t ptr, const char* str, size_t len, QueueType type ); + void SendSourceLocation( uint64_t ptr ); + void SendSourceLocationPayload( uint64_t ptr ); + void SendCallstackPayload( uint64_t ptr ); + void SendCallstackPayload64( uint64_t ptr ); + void SendCallstackAlloc( uint64_t ptr ); + void SendCallstackFrame( uint64_t ptr ); + void SendCodeLocation( uint64_t ptr ); + + bool HandleServerQuery(); + void HandleDisconnect(); + void HandleParameter( uint64_t payload ); + void HandleSymbolQuery( uint64_t symbol ); + void HandleSymbolCodeQuery( uint64_t symbol, uint32_t size ); + void HandleSourceCodeQuery(); + + void AckServerQuery(); + void AckSourceCodeNotAvailable(); + + void CalibrateTimer(); + void CalibrateDelay(); + void ReportTopology(); + + static tracy_force_inline void SendCallstackSerial( void* ptr ) + { +#ifdef TRACY_HAS_CALLSTACK + auto item = GetProfiler().m_serialQueue.prepare_next(); + MemWrite( &item->hdr.type, QueueType::CallstackSerial ); + MemWrite( &item->callstackFat.ptr, (uint64_t)ptr ); + GetProfiler().m_serialQueue.commit_next(); +#endif + } + + static tracy_force_inline void SendMemAlloc( QueueType type, const uint64_t thread, const void* ptr, size_t size ) + { + assert( type == QueueType::MemAlloc || type == QueueType::MemAllocCallstack || type == QueueType::MemAllocNamed || type == QueueType::MemAllocCallstackNamed ); + + auto item = GetProfiler().m_serialQueue.prepare_next(); + MemWrite( &item->hdr.type, type ); + MemWrite( &item->memAlloc.time, GetTime() ); + MemWrite( &item->memAlloc.thread, thread ); + MemWrite( &item->memAlloc.ptr, (uint64_t)ptr ); + if( compile_time_condition::value ) + { + memcpy( &item->memAlloc.size, &size, 4 ); + memset( &item->memAlloc.size + 4, 0, 2 ); + } + else + { + assert( sizeof( size ) == 8 ); + memcpy( &item->memAlloc.size, &size, 4 ); + memcpy( ((char*)&item->memAlloc.size)+4, ((char*)&size)+4, 2 ); + } + GetProfiler().m_serialQueue.commit_next(); + } + + static tracy_force_inline void SendMemFree( QueueType type, const uint64_t thread, const void* ptr ) + { + assert( type == QueueType::MemFree || type == QueueType::MemFreeCallstack || type == QueueType::MemFreeNamed || type == QueueType::MemFreeCallstackNamed ); + + auto item = GetProfiler().m_serialQueue.prepare_next(); + MemWrite( &item->hdr.type, type ); + MemWrite( &item->memFree.time, GetTime() ); + MemWrite( &item->memFree.thread, thread ); + MemWrite( &item->memFree.ptr, (uint64_t)ptr 
);
+        GetProfiler().m_serialQueue.commit_next();
+    }
+
+    static tracy_force_inline void SendMemName( const char* name )
+    {
+        assert( name );
+        auto item = GetProfiler().m_serialQueue.prepare_next();
+        MemWrite( &item->hdr.type, QueueType::MemNamePayload );
+        MemWrite( &item->memName.name, (uint64_t)name );
+        GetProfiler().m_serialQueue.commit_next();
+    }
+
+#if ( defined _WIN32 || defined __CYGWIN__ ) && defined TRACY_TIMER_QPC
+    static int64_t GetTimeQpc();
+#endif
+
+    double m_timerMul;
+    uint64_t m_resolution;
+    uint64_t m_delay;
+    std::atomic<int64_t> m_timeBegin;
+    uint64_t m_mainThread;
+    uint64_t m_epoch, m_exectime;
+    std::atomic<bool> m_shutdown;
+    std::atomic<bool> m_shutdownManual;
+    std::atomic<bool> m_shutdownFinished;
+    Socket* m_sock;
+    UdpBroadcast* m_broadcast;
+    bool m_noExit;
+    uint32_t m_userPort;
+    std::atomic<uint32_t> m_zoneId;
+    int64_t m_samplingPeriod;
+
+    uint64_t m_threadCtx;
+    int64_t m_refTimeThread;
+    int64_t m_refTimeSerial;
+    int64_t m_refTimeCtx;
+    int64_t m_refTimeGpu;
+
+    void* m_stream;     // LZ4_stream_t*
+    char* m_buffer;
+    int m_bufferOffset;
+    int m_bufferStart;
+
+    char* m_lz4Buf;
+
+    FastVector<QueueItem> m_serialQueue, m_serialDequeue;
+    TracyMutex m_serialLock;
+
+    std::atomic<uint64_t> m_frameCount;
+    std::atomic<bool> m_isConnected;
+#ifdef TRACY_ON_DEMAND
+    std::atomic<uint64_t> m_connectionId;
+
+    TracyMutex m_deferredLock;
+    FastVector<QueueItem> m_deferredQueue;
+#endif
+
+#ifdef TRACY_HAS_SYSTIME
+    void ProcessSysTime();
+
+    SysTime m_sysTime;
+    uint64_t m_sysTimeLast = 0;
+#else
+    void ProcessSysTime() {}
+#endif
+
+    ParameterCallback m_paramCallback;
+
+    char* m_queryData;
+    char* m_queryDataPtr;
+};
+
+}
+
+#endif
diff --git a/Source/ThirdParty/tracy/client/TracyRingBuffer.hpp b/Source/ThirdParty/tracy/client/TracyRingBuffer.hpp
new file mode 100644
index 000000000..29d935596
--- /dev/null
+++ b/Source/ThirdParty/tracy/client/TracyRingBuffer.hpp
@@ -0,0 +1,116 @@
+namespace tracy
+{
+
+template<size_t Size>
+class RingBuffer
+{
+public:
+    RingBuffer( int fd )
+        : m_fd( fd )
+    {
+        const auto pageSize = uint32_t( getpagesize() );
+        assert( Size >= pageSize );
+        assert( __builtin_popcount( Size ) == 1 );
+        m_mapSize = Size + pageSize;
+        auto mapAddr = mmap( nullptr, m_mapSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0 );
+        // mmap signals failure with MAP_FAILED, not a null pointer.
+        if( mapAddr == MAP_FAILED )
+        {
+            m_metadata = nullptr;
+            m_fd = 0;
+            close( fd );
+            return;
+        }
+        m_metadata = (perf_event_mmap_page*)mapAddr;
+        assert( m_metadata->data_offset == pageSize );
+        m_buffer = ((char*)mapAddr) + pageSize;
+    }
+
+    ~RingBuffer()
+    {
+        if( m_metadata ) munmap( m_metadata, m_mapSize );
+        if( m_fd ) close( m_fd );
+    }
+
+    RingBuffer( const RingBuffer& ) = delete;
+    RingBuffer& operator=( const RingBuffer& ) = delete;
+
+    RingBuffer( RingBuffer&& other )
+    {
+        // Take ownership of the mapping and leave `other` empty, so that its
+        // destructor is a no-op.
+        memcpy( (char*)this, (char*)&other, sizeof( RingBuffer ) );
+        other.m_metadata = nullptr;
+        other.m_fd = 0;
+    }
+
+    RingBuffer& operator=( RingBuffer&& other )
+    {
+        memcpy( (char*)this, (char*)&other, sizeof( RingBuffer ) );
+        other.m_metadata = nullptr;
+        other.m_fd = 0;
+        return *this;
+    }
+
+    bool IsValid() const { return m_metadata != nullptr; }
+
+    void Enable()
+    {
+        ioctl( m_fd, PERF_EVENT_IOC_ENABLE, 0 );
+    }
+
+    bool HasData() const
+    {
+        const auto head = LoadHead();
+        return head > m_metadata->data_tail;
+    }
+
+    void Read( void* dst, uint64_t offset, uint64_t cnt )
+    {
+        auto src = ( m_metadata->data_tail + offset ) % Size;
+        if( src + cnt <= Size )
+        {
+            memcpy( dst, m_buffer + src, cnt );
+        }
+        else
+        {
+            const auto s0 = Size - src;
+            memcpy( dst, m_buffer + src, s0 );
+            memcpy( (char*)dst + s0, m_buffer, cnt - s0 );
+        }
+    }
+
+    void Advance( uint64_t cnt )
+    {
+        StoreTail(
m_metadata->data_tail + cnt ); + } + + bool CheckTscCaps() const + { + return m_metadata->cap_user_time_zero; + } + + int64_t ConvertTimeToTsc( int64_t timestamp ) const + { + assert( m_metadata->cap_user_time_zero ); + const auto time = timestamp - m_metadata->time_zero; + const auto quot = time / m_metadata->time_mult; + const auto rem = time % m_metadata->time_mult; + return ( quot << m_metadata->time_shift ) + ( rem << m_metadata->time_shift ) / m_metadata->time_mult; + } + +private: + uint64_t LoadHead() const + { + return std::atomic_load_explicit( (const volatile std::atomic*)&m_metadata->data_head, std::memory_order_acquire ); + } + + void StoreTail( uint64_t tail ) + { + std::atomic_store_explicit( (volatile std::atomic*)&m_metadata->data_tail, tail, std::memory_order_release ); + } + + perf_event_mmap_page* m_metadata; + char* m_buffer; + + size_t m_mapSize; + int m_fd; +}; + +} diff --git a/Source/ThirdParty/tracy/client/TracyScoped.hpp b/Source/ThirdParty/tracy/client/TracyScoped.hpp new file mode 100644 index 000000000..fa6a52808 --- /dev/null +++ b/Source/ThirdParty/tracy/client/TracyScoped.hpp @@ -0,0 +1,174 @@ +#ifndef __TRACYSCOPED_HPP__ +#define __TRACYSCOPED_HPP__ + +#include +#include +#include + +#include "../common/TracySystem.hpp" +#include "../common/TracyAlign.hpp" +#include "../common/TracyAlloc.hpp" +#include "TracyProfiler.hpp" + +namespace tracy +{ +inline ScopedZone::ScopedZone( const SourceLocationData* srcloc, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( is_active ) +#endif +{ + if( !m_active ) return; +#ifdef TRACY_ON_DEMAND + m_connectionId = GetProfiler().ConnectionId(); +#endif + TracyLfqPrepare( QueueType::ZoneBegin ); + MemWrite( &item->zoneBegin.time, Profiler::GetTime() ); + MemWrite( &item->zoneBegin.srcloc, (uint64_t)srcloc ); + TracyLfqCommit; +} + +inline ScopedZone::ScopedZone( const SourceLocationData* srcloc, int depth, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( is_active ) +#endif +{ + if( !m_active ) return; +#ifdef TRACY_ON_DEMAND + m_connectionId = GetProfiler().ConnectionId(); +#endif + GetProfiler().SendCallstack( depth ); + + TracyLfqPrepare( QueueType::ZoneBeginCallstack ); + MemWrite( &item->zoneBegin.time, Profiler::GetTime() ); + MemWrite( &item->zoneBegin.srcloc, (uint64_t)srcloc ); + TracyLfqCommit; +} + +inline ScopedZone::ScopedZone( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( is_active ) +#endif +{ + if( !m_active ) return; +#ifdef TRACY_ON_DEMAND + m_connectionId = GetProfiler().ConnectionId(); +#endif + TracyLfqPrepare( QueueType::ZoneBeginAllocSrcLoc ); + const auto srcloc = Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz ); + MemWrite( &item->zoneBegin.time, Profiler::GetTime() ); + MemWrite( &item->zoneBegin.srcloc, srcloc ); + TracyLfqCommit; +} + +inline ScopedZone::ScopedZone( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, int depth, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( is_active ) +#endif +{ + if( !m_active ) return; +#ifdef TRACY_ON_DEMAND + m_connectionId = 
GetProfiler().ConnectionId(); +#endif + GetProfiler().SendCallstack( depth ); + + TracyLfqPrepare( QueueType::ZoneBeginAllocSrcLocCallstack ); + const auto srcloc = Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz ); + MemWrite( &item->zoneBegin.time, Profiler::GetTime() ); + MemWrite( &item->zoneBegin.srcloc, srcloc ); + TracyLfqCommit; +} + +inline ScopedZone::~ScopedZone() +{ + if( !m_active ) return; +#ifdef TRACY_ON_DEMAND + if( GetProfiler().ConnectionId() != m_connectionId ) return; +#endif + TracyLfqPrepare( QueueType::ZoneEnd ); + MemWrite( &item->zoneEnd.time, Profiler::GetTime() ); + TracyLfqCommit; +} + +inline void ScopedZone::Text( const char* txt, size_t size ) +{ + assert( size < std::numeric_limits::max() ); + if( !m_active ) return; +#ifdef TRACY_ON_DEMAND + if( GetProfiler().ConnectionId() != m_connectionId ) return; +#endif + auto ptr = (char*)tracy_malloc( size ); + memcpy( ptr, txt, size ); + TracyLfqPrepare( QueueType::ZoneText ); + MemWrite( &item->zoneTextFat.text, (uint64_t)ptr ); + MemWrite( &item->zoneTextFat.size, (uint16_t)size ); + TracyLfqCommit; +} + +inline void ScopedZone::Name( const char* txt, size_t size ) +{ + assert( size < std::numeric_limits::max() ); + if( !m_active ) return; +#ifdef TRACY_ON_DEMAND + if( GetProfiler().ConnectionId() != m_connectionId ) return; +#endif + auto ptr = (char*)tracy_malloc( size ); + memcpy( ptr, txt, size ); + TracyLfqPrepare( QueueType::ZoneName ); + MemWrite( &item->zoneTextFat.text, (uint64_t)ptr ); + MemWrite( &item->zoneTextFat.size, (uint16_t)size ); + TracyLfqCommit; +} + +inline void ScopedZone::Name( const Char* txt, size_t size ) +{ + assert( size < std::numeric_limits::max() ); + if( !m_active ) return; +#ifdef TRACY_ON_DEMAND + if( GetProfiler().ConnectionId() != m_connectionId ) return; +#endif + auto ptr = (char*)tracy_malloc( size ); + for( int i = 0; i < size; i++) + ptr[i] = (char)txt[i]; + TracyLfqPrepare( QueueType::ZoneName ); + MemWrite( &item->zoneTextFat.text, (uint64_t)ptr ); + MemWrite( &item->zoneTextFat.size, (uint16_t)size ); + TracyLfqCommit; +} + +inline void ScopedZone::Color( uint32_t color ) +{ + if( !m_active ) return; +#ifdef TRACY_ON_DEMAND + if( GetProfiler().ConnectionId() != m_connectionId ) return; +#endif + TracyLfqPrepare( QueueType::ZoneColor ); + MemWrite( &item->zoneColor.r, uint8_t( ( color ) & 0xFF ) ); + MemWrite( &item->zoneColor.g, uint8_t( ( color >> 8 ) & 0xFF ) ); + MemWrite( &item->zoneColor.b, uint8_t( ( color >> 16 ) & 0xFF ) ); + TracyLfqCommit; +} + +inline void ScopedZone::Value( uint64_t value ) +{ + if( !m_active ) return; +#ifdef TRACY_ON_DEMAND + if( GetProfiler().ConnectionId() != m_connectionId ) return; +#endif + TracyLfqPrepare( QueueType::ZoneValue ); + MemWrite( &item->zoneValue.value, value ); + TracyLfqCommit; +} + +inline bool ScopedZone::IsActive() const { return m_active; } + +} + +#endif diff --git a/Source/ThirdParty/tracy/client/TracySysTime.cpp b/Source/ThirdParty/tracy/client/TracySysTime.cpp new file mode 100644 index 000000000..e5903467d --- /dev/null +++ b/Source/ThirdParty/tracy/client/TracySysTime.cpp @@ -0,0 +1,108 @@ +#include "TracySysTime.hpp" + +#ifdef TRACY_HAS_SYSTIME + +# if defined _WIN32 || defined __CYGWIN__ +# include +# elif defined __linux__ +# include +# include +# elif defined __APPLE__ +# include +# include +# elif defined BSD +# include +# include +# endif + +namespace tracy +{ + +# if defined _WIN32 || defined __CYGWIN__ + +static inline uint64_t ConvertTime( const FILETIME& t 
)
+{
+    return ( uint64_t( t.dwHighDateTime ) << 32 ) | uint64_t( t.dwLowDateTime );
+}
+
+void SysTime::ReadTimes()
+{
+    FILETIME idleTime;
+    FILETIME kernelTime;
+    FILETIME userTime;
+
+    GetSystemTimes( &idleTime, &kernelTime, &userTime );
+
+    idle = ConvertTime( idleTime );
+    const auto kernel = ConvertTime( kernelTime );
+    const auto user = ConvertTime( userTime );
+    used = kernel + user;
+}
+
+# elif defined __linux__
+
+void SysTime::ReadTimes()
+{
+    uint64_t user, nice, system;
+    FILE* f = fopen( "/proc/stat", "r" );
+    if( f )
+    {
+        int read = fscanf( f, "cpu %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64, &user, &nice, &system, &idle );
+        fclose( f );
+        if( read == 4 )
+        {
+            used = user + nice + system;
+        }
+    }
+}
+
+# elif defined __APPLE__
+
+void SysTime::ReadTimes()
+{
+    host_cpu_load_info_data_t info;
+    mach_msg_type_number_t cnt = HOST_CPU_LOAD_INFO_COUNT;
+    host_statistics( mach_host_self(), HOST_CPU_LOAD_INFO, reinterpret_cast<host_info_t>( &info ), &cnt );
+    used = info.cpu_ticks[CPU_STATE_USER] + info.cpu_ticks[CPU_STATE_NICE] + info.cpu_ticks[CPU_STATE_SYSTEM];
+    idle = info.cpu_ticks[CPU_STATE_IDLE];
+}
+
+# elif defined BSD
+
+void SysTime::ReadTimes()
+{
+    u_long data[5];
+    size_t sz = sizeof( data );
+    sysctlbyname( "kern.cp_time", &data, &sz, nullptr, 0 );
+    used = data[0] + data[1] + data[2] + data[3];
+    idle = data[4];
+}
+
+#endif
+
+SysTime::SysTime()
+{
+    ReadTimes();
+}
+
+float SysTime::Get()
+{
+    const auto oldUsed = used;
+    const auto oldIdle = idle;
+
+    ReadTimes();
+
+    const auto diffIdle = idle - oldIdle;
+    const auto diffUsed = used - oldUsed;
+
+#if defined _WIN32 || defined __CYGWIN__
+    // On Windows the kernel time reported by GetSystemTimes() includes idle
+    // time, so subtract it to get the busy share.
+    return diffUsed == 0 ? -1 : ( diffUsed - diffIdle ) * 100.f / diffUsed;
+#elif defined __linux__ || defined __APPLE__ || defined BSD
+    const auto total = diffUsed + diffIdle;
+    return total == 0 ?
-1 : diffUsed * 100.f / total; +#endif +} + +} + +#endif diff --git a/Source/ThirdParty/tracy/client/TracySysTime.hpp b/Source/ThirdParty/tracy/client/TracySysTime.hpp new file mode 100644 index 000000000..fc6ba321a --- /dev/null +++ b/Source/ThirdParty/tracy/client/TracySysTime.hpp @@ -0,0 +1,36 @@ +#ifndef __TRACYSYSTIME_HPP__ +#define __TRACYSYSTIME_HPP__ + +#if defined _WIN32 || defined __CYGWIN__ || defined __linux__ || defined __APPLE__ +# define TRACY_HAS_SYSTIME +#else +# include +#endif + +#ifdef BSD +# define TRACY_HAS_SYSTIME +#endif + +#ifdef TRACY_HAS_SYSTIME + +#include + +namespace tracy +{ + +class SysTime +{ +public: + SysTime(); + float Get(); + + void ReadTimes(); + +private: + uint64_t idle, used; +}; + +} +#endif + +#endif diff --git a/Source/ThirdParty/tracy/client/TracySysTrace.cpp b/Source/ThirdParty/tracy/client/TracySysTrace.cpp new file mode 100644 index 000000000..972779770 --- /dev/null +++ b/Source/ThirdParty/tracy/client/TracySysTrace.cpp @@ -0,0 +1,1326 @@ +#include "TracySysTrace.hpp" + +#ifdef TRACY_HAS_SYSTEM_TRACING + +# if defined _WIN32 || defined __CYGWIN__ + +# ifndef NOMINMAX +# define NOMINMAX +# endif + +# define INITGUID +# include +# include +# include +# include +# include +# include +# include +# include + +# include "../common/TracyAlloc.hpp" +# include "../common/TracySystem.hpp" +# include "TracyProfiler.hpp" +# include "TracyThread.hpp" + +namespace tracy +{ + +static const GUID PerfInfoGuid = { 0xce1dbfb4, 0x137e, 0x4da6, { 0x87, 0xb0, 0x3f, 0x59, 0xaa, 0x10, 0x2c, 0xbc } }; +static const GUID DxgKrnlGuid = { 0x802ec45a, 0x1e99, 0x4b83, { 0x99, 0x20, 0x87, 0xc9, 0x82, 0x77, 0xba, 0x9d } }; + + +static TRACEHANDLE s_traceHandle; +static TRACEHANDLE s_traceHandle2; +static EVENT_TRACE_PROPERTIES* s_prop; +static DWORD s_pid; + +static EVENT_TRACE_PROPERTIES* s_propVsync; +static TRACEHANDLE s_traceHandleVsync; +static TRACEHANDLE s_traceHandleVsync2; +Thread* s_threadVsync = nullptr; + +struct CSwitch +{ + uint32_t newThreadId; + uint32_t oldThreadId; + int8_t newThreadPriority; + int8_t oldThreadPriority; + uint8_t previousCState; + int8_t spareByte; + int8_t oldThreadWaitReason; + int8_t oldThreadWaitMode; + int8_t oldThreadState; + int8_t oldThreadWaitIdealProcessor; + uint32_t newThreadWaitTime; + uint32_t reserved; +}; + +struct ReadyThread +{ + uint32_t threadId; + int8_t adjustReason; + int8_t adjustIncrement; + int8_t flag; + int8_t reserverd; +}; + +struct ThreadTrace +{ + uint32_t processId; + uint32_t threadId; + uint32_t stackBase; + uint32_t stackLimit; + uint32_t userStackBase; + uint32_t userStackLimit; + uint32_t startAddr; + uint32_t win32StartAddr; + uint32_t tebBase; + uint32_t subProcessTag; +}; + +struct StackWalkEvent +{ + uint64_t eventTimeStamp; + uint32_t stackProcess; + uint32_t stackThread; + uint64_t stack[192]; +}; + +struct VSyncInfo +{ + void* dxgAdapter; + uint32_t vidPnTargetId; + uint64_t scannedPhysicalAddress; + uint32_t vidPnSourceId; + uint32_t frameNumber; + int64_t frameQpcTime; + void* hFlipDevice; + uint32_t flipType; + uint64_t flipFenceId; +}; + +#ifdef __CYGWIN__ +extern "C" typedef DWORD (WINAPI *t_GetProcessIdOfThread)( HANDLE ); +extern "C" typedef DWORD (WINAPI *t_GetProcessImageFileNameA)( HANDLE, LPSTR, DWORD ); +extern "C" ULONG WMIAPI TraceSetInformation(TRACEHANDLE SessionHandle, TRACE_INFO_CLASS InformationClass, PVOID TraceInformation, ULONG InformationLength); +t_GetProcessIdOfThread GetProcessIdOfThread = (t_GetProcessIdOfThread)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), 
"GetProcessIdOfThread" ); +t_GetProcessImageFileNameA GetProcessImageFileNameA = (t_GetProcessImageFileNameA)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "K32GetProcessImageFileNameA" ); +#endif + +extern "C" typedef NTSTATUS (WINAPI *t_NtQueryInformationThread)( HANDLE, THREADINFOCLASS, PVOID, ULONG, PULONG ); +extern "C" typedef BOOL (WINAPI *t_EnumProcessModules)( HANDLE, HMODULE*, DWORD, LPDWORD ); +extern "C" typedef BOOL (WINAPI *t_GetModuleInformation)( HANDLE, HMODULE, LPMODULEINFO, DWORD ); +extern "C" typedef DWORD (WINAPI *t_GetModuleBaseNameA)( HANDLE, HMODULE, LPSTR, DWORD ); +extern "C" typedef HRESULT (WINAPI *t_GetThreadDescription)( HANDLE, PWSTR* ); + +t_NtQueryInformationThread NtQueryInformationThread = (t_NtQueryInformationThread)GetProcAddress( GetModuleHandleA( "ntdll.dll" ), "NtQueryInformationThread" ); +t_EnumProcessModules _EnumProcessModules = (t_EnumProcessModules)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "K32EnumProcessModules" ); +t_GetModuleInformation _GetModuleInformation = (t_GetModuleInformation)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "K32GetModuleInformation" ); +t_GetModuleBaseNameA _GetModuleBaseNameA = (t_GetModuleBaseNameA)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "K32GetModuleBaseNameA" ); + +static t_GetThreadDescription _GetThreadDescription = 0; + + +void WINAPI EventRecordCallback( PEVENT_RECORD record ) +{ +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + + const auto& hdr = record->EventHeader; + switch( hdr.ProviderId.Data1 ) + { + case 0x3d6fa8d1: // Thread Guid + if( hdr.EventDescriptor.Opcode == 36 ) + { + const auto cswitch = (const CSwitch*)record->UserData; + + TracyLfqPrepare( QueueType::ContextSwitch ); + MemWrite( &item->contextSwitch.time, hdr.TimeStamp.QuadPart ); + memcpy( &item->contextSwitch.oldThread, &cswitch->oldThreadId, sizeof( cswitch->oldThreadId ) ); + memcpy( &item->contextSwitch.newThread, &cswitch->newThreadId, sizeof( cswitch->newThreadId ) ); + memset( ((char*)&item->contextSwitch.oldThread)+4, 0, 4 ); + memset( ((char*)&item->contextSwitch.newThread)+4, 0, 4 ); + MemWrite( &item->contextSwitch.cpu, record->BufferContext.ProcessorNumber ); + MemWrite( &item->contextSwitch.reason, cswitch->oldThreadWaitReason ); + MemWrite( &item->contextSwitch.state, cswitch->oldThreadState ); + TracyLfqCommit; + } + else if( hdr.EventDescriptor.Opcode == 50 ) + { + const auto rt = (const ReadyThread*)record->UserData; + + TracyLfqPrepare( QueueType::ThreadWakeup ); + MemWrite( &item->threadWakeup.time, hdr.TimeStamp.QuadPart ); + memcpy( &item->threadWakeup.thread, &rt->threadId, sizeof( rt->threadId ) ); + memset( ((char*)&item->threadWakeup.thread)+4, 0, 4 ); + TracyLfqCommit; + } + else if( hdr.EventDescriptor.Opcode == 1 || hdr.EventDescriptor.Opcode == 3 ) + { + const auto tt = (const ThreadTrace*)record->UserData; + + uint64_t tid = tt->threadId; + if( tid == 0 ) return; + uint64_t pid = tt->processId; + TracyLfqPrepare( QueueType::TidToPid ); + MemWrite( &item->tidToPid.tid, tid ); + MemWrite( &item->tidToPid.pid, pid ); + TracyLfqCommit; + } + break; + case 0xdef2fe46: // StackWalk Guid + if( hdr.EventDescriptor.Opcode == 32 ) + { + const auto sw = (const StackWalkEvent*)record->UserData; + if( sw->stackProcess == s_pid && ( sw->stack[0] & 0x8000000000000000 ) == 0 ) + { + const uint64_t sz = ( record->UserDataLength - 16 ) / 8; + if( sz > 0 ) + { + auto trace = (uint64_t*)tracy_malloc( ( 1 + sz ) * sizeof( uint64_t ) ); + memcpy( trace, &sz, sizeof( 
uint64_t ) ); + memcpy( trace+1, sw->stack, sizeof( uint64_t ) * sz ); + TracyLfqPrepare( QueueType::CallstackSample ); + MemWrite( &item->callstackSampleFat.time, sw->eventTimeStamp ); + MemWrite( &item->callstackSampleFat.thread, (uint64_t)sw->stackThread ); + MemWrite( &item->callstackSampleFat.ptr, (uint64_t)trace ); + TracyLfqCommit; + } + } + } + break; + default: + break; + } +} + +static constexpr const char* VsyncName[] = { + "[0] Vsync", + "[1] Vsync", + "[2] Vsync", + "[3] Vsync", + "[4] Vsync", + "[5] Vsync", + "[6] Vsync", + "[7] Vsync", + "Vsync" +}; + +static uint32_t VsyncTarget[8] = {}; + +void WINAPI EventRecordCallbackVsync( PEVENT_RECORD record ) +{ +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif + + const auto& hdr = record->EventHeader; + assert( hdr.ProviderId.Data1 == 0x802EC45A ); + assert( hdr.EventDescriptor.Id == 0x0011 ); + + const auto vs = (const VSyncInfo*)record->UserData; + + int idx = 0; + do + { + if( VsyncTarget[idx] == 0 ) + { + VsyncTarget[idx] = vs->vidPnTargetId; + break; + } + else if( VsyncTarget[idx] == vs->vidPnTargetId ) + { + break; + } + } + while( ++idx < 8 ); + + TracyLfqPrepare( QueueType::FrameMarkMsg ); + MemWrite( &item->frameMark.time, hdr.TimeStamp.QuadPart ); + MemWrite( &item->frameMark.name, uint64_t( VsyncName[idx] ) ); + TracyLfqCommit; +} + +static void SetupVsync() +{ +#if _WIN32_WINNT >= _WIN32_WINNT_WINBLUE + const auto psz = sizeof( EVENT_TRACE_PROPERTIES ) + MAX_PATH; + s_propVsync = (EVENT_TRACE_PROPERTIES*)tracy_malloc( psz ); + memset( s_propVsync, 0, sizeof( EVENT_TRACE_PROPERTIES ) ); + s_propVsync->LogFileMode = EVENT_TRACE_REAL_TIME_MODE; + s_propVsync->Wnode.BufferSize = psz; +#ifdef TRACY_TIMER_QPC + s_propVsync->Wnode.ClientContext = 1; +#else + s_propVsync->Wnode.ClientContext = 3; +#endif + s_propVsync->LoggerNameOffset = sizeof( EVENT_TRACE_PROPERTIES ); + strcpy( ((char*)s_propVsync) + sizeof( EVENT_TRACE_PROPERTIES ), "TracyVsync" ); + + auto backup = tracy_malloc( psz ); + memcpy( backup, s_propVsync, psz ); + + const auto controlStatus = ControlTraceA( 0, "TracyVsync", s_propVsync, EVENT_TRACE_CONTROL_STOP ); + if( controlStatus != ERROR_SUCCESS && controlStatus != ERROR_WMI_INSTANCE_NOT_FOUND ) + { + tracy_free( backup ); + tracy_free( s_propVsync ); + return; + } + + memcpy( s_propVsync, backup, psz ); + tracy_free( backup ); + + const auto startStatus = StartTraceA( &s_traceHandleVsync, "TracyVsync", s_propVsync ); + if( startStatus != ERROR_SUCCESS ) + { + tracy_free( s_propVsync ); + return; + } + + EVENT_FILTER_EVENT_ID fe = {}; + fe.FilterIn = TRUE; + fe.Count = 1; + fe.Events[0] = 0x0011; // VSyncDPC_Info + + EVENT_FILTER_DESCRIPTOR desc = {}; + desc.Ptr = (ULONGLONG)&fe; + desc.Size = sizeof( fe ); + desc.Type = EVENT_FILTER_TYPE_EVENT_ID; + + ENABLE_TRACE_PARAMETERS params = {}; + params.Version = ENABLE_TRACE_PARAMETERS_VERSION_2; + params.EnableProperty = EVENT_ENABLE_PROPERTY_IGNORE_KEYWORD_0; + params.SourceId = s_propVsync->Wnode.Guid; + params.EnableFilterDesc = &desc; + params.FilterDescCount = 1; + + uint64_t mask = 0x4000000000000001; // Microsoft_Windows_DxgKrnl_Performance | Base + if( EnableTraceEx2( s_traceHandleVsync, &DxgKrnlGuid, EVENT_CONTROL_CODE_ENABLE_PROVIDER, TRACE_LEVEL_INFORMATION, mask, mask, 0, ¶ms ) != ERROR_SUCCESS ) + { + tracy_free( s_propVsync ); + return; + } + + char loggerName[MAX_PATH]; + strcpy( loggerName, "TracyVsync" ); + + EVENT_TRACE_LOGFILEA log = {}; + log.LoggerName = loggerName; + log.ProcessTraceMode = 
PROCESS_TRACE_MODE_REAL_TIME | PROCESS_TRACE_MODE_EVENT_RECORD | PROCESS_TRACE_MODE_RAW_TIMESTAMP; + log.EventRecordCallback = EventRecordCallbackVsync; + + s_traceHandleVsync2 = OpenTraceA( &log ); + if( s_traceHandleVsync2 == (TRACEHANDLE)INVALID_HANDLE_VALUE ) + { + CloseTrace( s_traceHandleVsync ); + tracy_free( s_propVsync ); + return; + } + + s_threadVsync = (Thread*)tracy_malloc( sizeof( Thread ) ); + new(s_threadVsync) Thread( [] (void*) { + ThreadExitHandler threadExitHandler; + SetThreadPriority( GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL ); + SetThreadName( "Tracy Vsync" ); + ProcessTrace( &s_traceHandleVsync2, 1, nullptr, nullptr ); + }, nullptr ); +#endif +} + +bool SysTraceStart( int64_t& samplingPeriod ) +{ + if( !_GetThreadDescription ) _GetThreadDescription = (t_GetThreadDescription)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "GetThreadDescription" ); + + s_pid = GetCurrentProcessId(); + +#if defined _WIN64 + constexpr bool isOs64Bit = true; +#else + BOOL _iswow64; + IsWow64Process( GetCurrentProcess(), &_iswow64 ); + const bool isOs64Bit = _iswow64; +#endif + + TOKEN_PRIVILEGES priv = {}; + priv.PrivilegeCount = 1; + priv.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + if( LookupPrivilegeValue( nullptr, SE_SYSTEM_PROFILE_NAME, &priv.Privileges[0].Luid ) == 0 ) return false; + + HANDLE pt; + if( OpenProcessToken( GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &pt ) == 0 ) return false; + const auto adjust = AdjustTokenPrivileges( pt, FALSE, &priv, 0, nullptr, nullptr ); + CloseHandle( pt ); + if( adjust == 0 ) return false; + const auto status = GetLastError(); + if( status != ERROR_SUCCESS ) return false; + + if( isOs64Bit ) + { + TRACE_PROFILE_INTERVAL interval = {}; + interval.Interval = 1250; // 8 kHz + const auto intervalStatus = TraceSetInformation( 0, TraceSampledProfileIntervalInfo, &interval, sizeof( interval ) ); + if( intervalStatus != ERROR_SUCCESS ) return false; + samplingPeriod = 125*1000; + } + + const auto psz = sizeof( EVENT_TRACE_PROPERTIES ) + sizeof( KERNEL_LOGGER_NAME ); + s_prop = (EVENT_TRACE_PROPERTIES*)tracy_malloc( psz ); + memset( s_prop, 0, sizeof( EVENT_TRACE_PROPERTIES ) ); + ULONG flags = 0; +#ifndef TRACY_NO_CONTEXT_SWITCH + flags = EVENT_TRACE_FLAG_CSWITCH | EVENT_TRACE_FLAG_DISPATCHER | EVENT_TRACE_FLAG_THREAD; +#endif +#ifndef TRACY_NO_SAMPLING + if( isOs64Bit ) flags |= EVENT_TRACE_FLAG_PROFILE; +#endif + s_prop->EnableFlags = flags; + s_prop->LogFileMode = EVENT_TRACE_REAL_TIME_MODE; + s_prop->Wnode.BufferSize = psz; + s_prop->Wnode.Flags = WNODE_FLAG_TRACED_GUID; +#ifdef TRACY_TIMER_QPC + s_prop->Wnode.ClientContext = 1; +#else + s_prop->Wnode.ClientContext = 3; +#endif + s_prop->Wnode.Guid = SystemTraceControlGuid; + s_prop->BufferSize = 1024; + s_prop->MinimumBuffers = std::thread::hardware_concurrency() * 4; + s_prop->MaximumBuffers = std::thread::hardware_concurrency() * 6; + s_prop->LoggerNameOffset = sizeof( EVENT_TRACE_PROPERTIES ); + memcpy( ((char*)s_prop) + sizeof( EVENT_TRACE_PROPERTIES ), KERNEL_LOGGER_NAME, sizeof( KERNEL_LOGGER_NAME ) ); + + auto backup = tracy_malloc( psz ); + memcpy( backup, s_prop, psz ); + + const auto controlStatus = ControlTrace( 0, KERNEL_LOGGER_NAME, s_prop, EVENT_TRACE_CONTROL_STOP ); + if( controlStatus != ERROR_SUCCESS && controlStatus != ERROR_WMI_INSTANCE_NOT_FOUND ) + { + tracy_free( backup ); + tracy_free( s_prop ); + return false; + } + + memcpy( s_prop, backup, psz ); + tracy_free( backup ); + + const auto startStatus = StartTrace( &s_traceHandle, KERNEL_LOGGER_NAME, 
s_prop ); + if( startStatus != ERROR_SUCCESS ) + { + tracy_free( s_prop ); + return false; + } + + if( isOs64Bit ) + { + CLASSIC_EVENT_ID stackId; + stackId.EventGuid = PerfInfoGuid; + stackId.Type = 46; + const auto stackStatus = TraceSetInformation( s_traceHandle, TraceStackTracingInfo, &stackId, sizeof( stackId ) ); + if( stackStatus != ERROR_SUCCESS ) + { + tracy_free( s_prop ); + return false; + } + } + +#ifdef UNICODE + WCHAR KernelLoggerName[sizeof( KERNEL_LOGGER_NAME )]; +#else + char KernelLoggerName[sizeof( KERNEL_LOGGER_NAME )]; +#endif + memcpy( KernelLoggerName, KERNEL_LOGGER_NAME, sizeof( KERNEL_LOGGER_NAME ) ); + EVENT_TRACE_LOGFILE log = {}; + log.LoggerName = KernelLoggerName; + log.ProcessTraceMode = PROCESS_TRACE_MODE_REAL_TIME | PROCESS_TRACE_MODE_EVENT_RECORD | PROCESS_TRACE_MODE_RAW_TIMESTAMP; + log.EventRecordCallback = EventRecordCallback; + + s_traceHandle2 = OpenTrace( &log ); + if( s_traceHandle2 == (TRACEHANDLE)INVALID_HANDLE_VALUE ) + { + CloseTrace( s_traceHandle ); + tracy_free( s_prop ); + return false; + } + +#ifndef TRACY_NO_VSYNC_CAPTURE + SetupVsync(); +#endif + + return true; +} + +void SysTraceStop() +{ + if( s_threadVsync ) + { + CloseTrace( s_traceHandleVsync2 ); + CloseTrace( s_traceHandleVsync ); + s_threadVsync->~Thread(); + tracy_free( s_threadVsync ); + } + + CloseTrace( s_traceHandle2 ); + CloseTrace( s_traceHandle ); +} + +void SysTraceWorker( void* ptr ) +{ + ThreadExitHandler threadExitHandler; + SetThreadPriority( GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL ); + SetThreadName( "Tracy SysTrace" ); + ProcessTrace( &s_traceHandle2, 1, 0, 0 ); + ControlTrace( 0, KERNEL_LOGGER_NAME, s_prop, EVENT_TRACE_CONTROL_STOP ); + tracy_free( s_prop ); +} + +void SysTraceSendExternalName( uint64_t thread ) +{ + bool threadSent = false; + auto hnd = OpenThread( THREAD_QUERY_INFORMATION, FALSE, DWORD( thread ) ); + if( hnd == 0 ) + { + hnd = OpenThread( THREAD_QUERY_LIMITED_INFORMATION, FALSE, DWORD( thread ) ); + } + if( hnd != 0 ) + { + PWSTR tmp; + _GetThreadDescription( hnd, &tmp ); + char buf[256]; + if( tmp ) + { + auto ret = wcstombs( buf, tmp, 256 ); + if( ret != 0 ) + { + GetProfiler().SendString( thread, buf, ret, QueueType::ExternalThreadName ); + threadSent = true; + } + } + const auto pid = GetProcessIdOfThread( hnd ); + if( !threadSent && NtQueryInformationThread && _EnumProcessModules && _GetModuleInformation && _GetModuleBaseNameA ) + { + void* ptr; + ULONG retlen; + auto status = NtQueryInformationThread( hnd, (THREADINFOCLASS)9 /*ThreadQuerySetWin32StartAddress*/, &ptr, sizeof( &ptr ), &retlen ); + if( status == 0 ) + { + const auto phnd = OpenProcess( PROCESS_QUERY_INFORMATION | PROCESS_VM_READ, FALSE, pid ); + if( phnd != INVALID_HANDLE_VALUE ) + { + HMODULE modules[1024]; + DWORD needed; + if( _EnumProcessModules( phnd, modules, 1024 * sizeof( HMODULE ), &needed ) != 0 ) + { + const auto sz = std::min( DWORD( needed / sizeof( HMODULE ) ), DWORD( 1024 ) ); + for( DWORD i=0; i= (uint64_t)info.lpBaseOfDll && (uint64_t)ptr <= (uint64_t)info.lpBaseOfDll + (uint64_t)info.SizeOfImage ) + { + char buf2[1024]; + const auto modlen = _GetModuleBaseNameA( phnd, modules[i], buf2, 1024 ); + if( modlen != 0 ) + { + GetProfiler().SendString( thread, buf2, modlen, QueueType::ExternalThreadName ); + threadSent = true; + } + } + } + } + } + CloseHandle( phnd ); + } + } + } + CloseHandle( hnd ); + if( !threadSent ) + { + GetProfiler().SendString( thread, "???", 3, QueueType::ExternalThreadName ); + threadSent = true; + } + if( pid != 0 ) + { + { + 
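+            // Widen the DWORD pid to 64 bits first: the TidToPid queue event
+            // stores both the thread and the process id as uint64_t.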
uint64_t _pid = pid; + TracyLfqPrepare( QueueType::TidToPid ); + MemWrite( &item->tidToPid.tid, thread ); + MemWrite( &item->tidToPid.pid, _pid ); + TracyLfqCommit; + } + if( pid == 4 ) + { + GetProfiler().SendString( thread, "System", 6, QueueType::ExternalName ); + return; + } + else + { + const auto phnd = OpenProcess( PROCESS_QUERY_LIMITED_INFORMATION, FALSE, pid ); + if( phnd != INVALID_HANDLE_VALUE ) + { + char buf2[1024]; + const auto sz = GetProcessImageFileNameA( phnd, buf2, 1024 ); + CloseHandle( phnd ); + if( sz != 0 ) + { + auto ptr = buf2 + sz - 1; + while( ptr > buf2 && *ptr != '\\' ) ptr--; + if( *ptr == '\\' ) ptr++; + GetProfiler().SendString( thread, ptr, QueueType::ExternalName ); + return; + } + } + } + } + } + + if( !threadSent ) + { + GetProfiler().SendString( thread, "???", 3, QueueType::ExternalThreadName ); + } + GetProfiler().SendString( thread, "???", 3, QueueType::ExternalName ); +} + +} + +# elif defined __linux__ + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include "TracyProfiler.hpp" +# include "TracyRingBuffer.hpp" +# include "TracyThread.hpp" + +# ifdef __ANDROID__ +# include "TracySysTracePayload.hpp" +# endif + +namespace tracy +{ + +static const char BasePath[] = "/sys/kernel/debug/tracing/"; +static const char TracingOn[] = "tracing_on"; +static const char CurrentTracer[] = "current_tracer"; +static const char TraceOptions[] = "trace_options"; +static const char TraceClock[] = "trace_clock"; +static const char SchedSwitch[] = "events/sched/sched_switch/enable"; +static const char SchedWakeup[] = "events/sched/sched_wakeup/enable"; +static const char BufferSizeKb[] = "buffer_size_kb"; +static const char TracePipe[] = "trace_pipe"; + +static std::atomic traceActive { false }; +static Thread* s_threadSampling = nullptr; +static int s_numCpus = 0; + +static constexpr size_t RingBufSize = 64*1024; +static RingBuffer* s_ring = nullptr; + +static int perf_event_open( struct perf_event_attr* hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags ) +{ + return syscall( __NR_perf_event_open, hw_event, pid, cpu, group_fd, flags ); +} + +static void SetupSampling( int64_t& samplingPeriod ) +{ +#ifndef CLOCK_MONOTONIC_RAW + return; +#endif + + samplingPeriod = 100*1000; + + s_numCpus = (int)std::thread::hardware_concurrency(); + s_ring = (RingBuffer*)tracy_malloc( sizeof( RingBuffer ) * s_numCpus ); + + perf_event_attr pe = {}; + + pe.type = PERF_TYPE_SOFTWARE; + pe.size = sizeof( perf_event_attr ); + pe.config = PERF_COUNT_SW_CPU_CLOCK; + + pe.sample_freq = 10000; + pe.sample_type = PERF_SAMPLE_TID | PERF_SAMPLE_TIME | PERF_SAMPLE_CALLCHAIN; +#if LINUX_VERSION_CODE >= KERNEL_VERSION( 4, 8, 0 ) + pe.sample_max_stack = 127; +#endif + pe.exclude_callchain_kernel = 1; + + pe.disabled = 1; + pe.freq = 1; +#if !defined TRACY_HW_TIMER || !( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) + pe.use_clockid = 1; + pe.clockid = CLOCK_MONOTONIC_RAW; +#endif + + for( int i=0; i(); + tracy_free( s_ring ); + return; + } + new( s_ring+i ) RingBuffer( fd ); + } + + s_threadSampling = (Thread*)tracy_malloc( sizeof( Thread ) ); + new(s_threadSampling) Thread( [] (void*) { + ThreadExitHandler threadExitHandler; + SetThreadName( "Tracy Sampling" ); + sched_param sp = { 5 }; + pthread_setschedparam( pthread_self(), SCHED_FIFO, &sp ); + uint32_t currentPid = (uint32_t)getpid(); +#if defined 
TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) + for( int i=0; i(); + tracy_free( s_ring ); + const char* err = "Tracy Profiler: sampling is disabled due to non-native scheduler clock. Are you running under a VM?"; + Profiler::MessageAppInfo( err, strlen( err ) ); + return; + } + } +#endif + for( int i=0; i> 63; + const auto m2 = test >> 47; + if( m1 == m2 ) break; + } + while( --cnt > 0 ); + for( uint64_t j=1; j> 63; + const auto m2 = test >> 47; + if( m1 != m2 ) trace[j] = 0; + } + + // skip kernel frames + uint64_t j; + for( j=0; j= 0 ) break; + } + if( j == cnt ) + { + tracy_free( trace ); + } + else + { + if( j > 0 ) + { + cnt -= j; + memmove( trace+1, trace+1+j, sizeof( uint64_t ) * cnt ); + } + memcpy( trace, &cnt, sizeof( uint64_t ) ); + +#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) + t0 = s_ring[i].ConvertTimeToTsc( t0 ); +#endif + + TracyLfqPrepare( QueueType::CallstackSample ); + MemWrite( &item->callstackSampleFat.time, t0 ); + MemWrite( &item->callstackSampleFat.thread, (uint64_t)tid ); + MemWrite( &item->callstackSampleFat.ptr, (uint64_t)trace ); + TracyLfqCommit; + } + } + } + s_ring[i].Advance( hdr.size ); + } + if( !traceActive.load( std::memory_order_relaxed) ) break; + if( !hadData ) + { + std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + } + } + + for( int i=0; i(); + tracy_free( s_ring ); + }, nullptr ); +} + +#ifdef __ANDROID__ +static bool TraceWrite( const char* path, size_t psz, const char* val, size_t vsz ) +{ + // Explanation for "su root sh -c": there are 2 flavors of "su" in circulation + // on Android. The default Android su has the following syntax to run a command + // as root: + // su root 'command' + // and 'command' is exec'd not passed to a shell, so if shell interpretation is + // wanted, one needs to do: + // su root sh -c 'command' + // Besides that default Android 'su' command, some Android devices use a different + // su with a command-line interface closer to the familiar util-linux su found + // on Linux distributions. Fortunately, both the util-linux su and the one + // in https://github.com/topjohnwu/Magisk seem to be happy with the above + // `su root sh -c 'command'` command line syntax. 
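+    // For example, enabling the scheduler-switch event through this helper runs:
+    //   su root sh -c 'echo "1" > /sys/kernel/debug/tracing/events/sched/sched_switch/enable'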
+ char tmp[256]; + sprintf( tmp, "su root sh -c 'echo \"%s\" > %s%s'", val, BasePath, path ); + return system( tmp ) == 0; +} +#else +static bool TraceWrite( const char* path, size_t psz, const char* val, size_t vsz ) +{ + char tmp[256]; + memcpy( tmp, BasePath, sizeof( BasePath ) - 1 ); + memcpy( tmp + sizeof( BasePath ) - 1, path, psz ); + + int fd = open( tmp, O_WRONLY ); + if( fd < 0 ) return false; + + for(;;) + { + ssize_t cnt = write( fd, val, vsz ); + if( cnt == (ssize_t)vsz ) + { + close( fd ); + return true; + } + if( cnt < 0 ) + { + close( fd ); + return false; + } + vsz -= cnt; + val += cnt; + } +} +#endif + +#ifdef __ANDROID__ +void SysTraceInjectPayload() +{ + int pipefd[2]; + if( pipe( pipefd ) == 0 ) + { + const auto pid = fork(); + if( pid == 0 ) + { + // child + close( pipefd[1] ); + if( dup2( pipefd[0], STDIN_FILENO ) >= 0 ) + { + close( pipefd[0] ); + execlp( "su", "su", "root", "sh", "-c", "cat > /data/tracy_systrace", (char*)nullptr ); + exit( 1 ); + } + } + else if( pid > 0 ) + { + // parent + close( pipefd[0] ); + +#ifdef __aarch64__ + write( pipefd[1], tracy_systrace_aarch64_data, tracy_systrace_aarch64_size ); +#else + write( pipefd[1], tracy_systrace_armv7_data, tracy_systrace_armv7_size ); +#endif + close( pipefd[1] ); + waitpid( pid, nullptr, 0 ); + + system( "su root sh -c 'chmod 700 /data/tracy_systrace'" ); + } + } +} +#endif + +bool SysTraceStart( int64_t& samplingPeriod ) +{ +#ifndef CLOCK_MONOTONIC_RAW + return false; +#endif + + if( !TraceWrite( TracingOn, sizeof( TracingOn ), "0", 2 ) ) return false; + if( !TraceWrite( CurrentTracer, sizeof( CurrentTracer ), "nop", 4 ) ) return false; + TraceWrite( TraceOptions, sizeof( TraceOptions ), "norecord-cmd", 13 ); + TraceWrite( TraceOptions, sizeof( TraceOptions ), "norecord-tgid", 14 ); + TraceWrite( TraceOptions, sizeof( TraceOptions ), "noirq-info", 11 ); + TraceWrite( TraceOptions, sizeof( TraceOptions ), "noannotate", 11 ); +#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) + if( !TraceWrite( TraceClock, sizeof( TraceClock ), "x86-tsc", 8 ) ) return false; +#else + if( !TraceWrite( TraceClock, sizeof( TraceClock ), "mono_raw", 9 ) ) return false; +#endif + if( !TraceWrite( SchedSwitch, sizeof( SchedSwitch ), "1", 2 ) ) return false; + if( !TraceWrite( SchedWakeup, sizeof( SchedWakeup ), "1", 2 ) ) return false; + if( !TraceWrite( BufferSizeKb, sizeof( BufferSizeKb ), "4096", 5 ) ) return false; + +#if defined __ANDROID__ && ( defined __aarch64__ || defined __ARM_ARCH ) + SysTraceInjectPayload(); +#endif + + if( !TraceWrite( TracingOn, sizeof( TracingOn ), "1", 2 ) ) return false; + traceActive.store( true, std::memory_order_relaxed ); + + SetupSampling( samplingPeriod ); + + return true; +} + +void SysTraceStop() +{ + TraceWrite( TracingOn, sizeof( TracingOn ), "0", 2 ); + traceActive.store( false, std::memory_order_relaxed ); + if( s_threadSampling ) + { + s_threadSampling->~Thread(); + tracy_free( s_threadSampling ); + } +} + +static uint64_t ReadNumber( const char*& data ) +{ + auto ptr = data; + assert( *ptr >= '0' && *ptr <= '9' ); + uint64_t val = *ptr++ - '0'; + for(;;) + { + const uint8_t v = uint8_t( *ptr - '0' ); + if( v > 9 ) break; + val = val * 10 + v; + ptr++; + } + data = ptr; + return val; +} + +static uint8_t ReadState( char state ) +{ + switch( state ) + { + case 'D': return 101; + case 'I': return 102; + case 'R': return 103; + case 'S': return 104; + case 'T': return 105; + case 't': return 106; + case 'W': return 107; + case 
'X': return 108; + case 'Z': return 109; + default: return 100; + } +} + +#if defined __ANDROID__ && defined __ANDROID_API__ && __ANDROID_API__ < 18 +/*- + * Copyright (c) 2011 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Christos Zoulas. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +ssize_t getdelim(char **buf, size_t *bufsiz, int delimiter, FILE *fp) +{ + char *ptr, *eptr; + + if (*buf == NULL || *bufsiz == 0) { + *bufsiz = BUFSIZ; + if ((*buf = (char*)malloc(*bufsiz)) == NULL) + return -1; + } + + for (ptr = *buf, eptr = *buf + *bufsiz;;) { + int c = fgetc(fp); + if (c == -1) { + if (feof(fp)) + return ptr == *buf ? -1 : ptr - *buf; + else + return -1; + } + *ptr++ = c; + if (c == delimiter) { + *ptr = '\0'; + return ptr - *buf; + } + if (ptr + 2 >= eptr) { + char *nbuf; + size_t nbufsiz = *bufsiz * 2; + ssize_t d = ptr - *buf; + if ((nbuf = (char*)realloc(*buf, nbufsiz)) == NULL) + return -1; + *buf = nbuf; + *bufsiz = nbufsiz; + eptr = nbuf + nbufsiz; + ptr = nbuf + d; + } + } +} + +ssize_t getline(char **buf, size_t *bufsiz, FILE *fp) +{ + return getdelim(buf, bufsiz, '\n', fp); +} +#endif + +static void HandleTraceLine( const char* line ) +{ + line += 23; + while( *line != '[' ) line++; + line++; + const auto cpu = (uint8_t)ReadNumber( line ); + line++; // ']' + while( *line == ' ' ) line++; + +#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) + const auto time = ReadNumber( line ); +#else + const auto ts = ReadNumber( line ); + line++; // '.' 
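+    // Without the TSC trace clock the timestamp is printed as two dot-separated
+    // fields, seconds and microseconds; recombine them into nanoseconds below.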
+ const auto tus = ReadNumber( line ); + const auto time = ts * 1000000000ll + tus * 1000ll; +#endif + + line += 2; // ': ' + if( memcmp( line, "sched_switch", 12 ) == 0 ) + { + line += 14; + + while( memcmp( line, "prev_pid", 8 ) != 0 ) line++; + line += 9; + + const auto oldPid = ReadNumber( line ); + line++; + + while( memcmp( line, "prev_state", 10 ) != 0 ) line++; + line += 11; + + const auto oldState = (uint8_t)ReadState( *line ); + line += 5; + + while( memcmp( line, "next_pid", 8 ) != 0 ) line++; + line += 9; + + const auto newPid = ReadNumber( line ); + + uint8_t reason = 100; + + TracyLfqPrepare( QueueType::ContextSwitch ); + MemWrite( &item->contextSwitch.time, time ); + MemWrite( &item->contextSwitch.oldThread, oldPid ); + MemWrite( &item->contextSwitch.newThread, newPid ); + MemWrite( &item->contextSwitch.cpu, cpu ); + MemWrite( &item->contextSwitch.reason, reason ); + MemWrite( &item->contextSwitch.state, oldState ); + TracyLfqCommit; + } + else if( memcmp( line, "sched_wakeup", 12 ) == 0 ) + { + line += 14; + + while( memcmp( line, "pid=", 4 ) != 0 ) line++; + line += 4; + + const auto pid = ReadNumber( line ); + + TracyLfqPrepare( QueueType::ThreadWakeup ); + MemWrite( &item->threadWakeup.time, time ); + MemWrite( &item->threadWakeup.thread, pid ); + TracyLfqCommit; + } +} + +#ifdef __ANDROID__ +static void ProcessTraceLines( int fd ) +{ + // Linux pipe buffer is 64KB, additional 1KB is for unfinished lines + char* buf = (char*)tracy_malloc( (64+1)*1024 ); + char* line = buf; + + for(;;) + { + if( !traceActive.load( std::memory_order_relaxed ) ) break; + + const auto rd = read( fd, line, 64*1024 ); + if( rd <= 0 ) break; + +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) + { + if( rd < 64*1024 ) + { + assert( line[rd-1] == '\n' ); + line = buf; + std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + } + else + { + const auto end = line + rd; + line = end - 1; + while( line > buf && *line != '\n' ) line--; + if( line > buf ) + { + line++; + const auto lsz = end - line; + memmove( buf, line, lsz ); + line = buf + lsz; + } + } + continue; + } +#endif + + const auto end = line + rd; + line = buf; + for(;;) + { + auto next = (char*)memchr( line, '\n', end - line ); + if( !next ) + { + const auto lsz = end - line; + memmove( buf, line, lsz ); + line = buf + lsz; + break; + } + HandleTraceLine( line ); + line = ++next; + } + if( rd < 64*1024 ) + { + std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + } + } + + tracy_free( buf ); +} + +void SysTraceWorker( void* ptr ) +{ + ThreadExitHandler threadExitHandler; + SetThreadName( "Tracy SysTrace" ); + int pipefd[2]; + if( pipe( pipefd ) == 0 ) + { + const auto pid = fork(); + if( pid == 0 ) + { + // child + close( pipefd[0] ); + dup2( open( "/dev/null", O_WRONLY ), STDERR_FILENO ); + if( dup2( pipefd[1], STDOUT_FILENO ) >= 0 ) + { + close( pipefd[1] ); + sched_param sp = { 4 }; + pthread_setschedparam( pthread_self(), SCHED_FIFO, &sp ); +#if defined __ANDROID__ && ( defined __aarch64__ || defined __ARM_ARCH ) + execlp( "su", "su", "root", "sh", "-c", "/data/tracy_systrace", (char*)nullptr ); +#endif + execlp( "su", "su", "root", "sh", "-c", "cat /sys/kernel/debug/tracing/trace_pipe", (char*)nullptr ); + exit( 1 ); + } + } + else if( pid > 0 ) + { + // parent + close( pipefd[1] ); + sched_param sp = { 5 }; + pthread_setschedparam( pthread_self(), SCHED_FIFO, &sp ); + ProcessTraceLines( pipefd[0] ); + close( pipefd[0] ); + waitpid( pid, nullptr, 0 ); + } + } +} +#else +static void ProcessTraceLines( 
+static void ProcessTraceLines( int fd ) +{ + char* buf = (char*)tracy_malloc( 64*1024 ); + + struct pollfd pfd; + pfd.fd = fd; + pfd.events = POLLIN | POLLERR; + + for(;;) + { + while( poll( &pfd, 1, 0 ) <= 0 ) + { + if( !traceActive.load( std::memory_order_relaxed ) ) break; + std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); + } + + const auto rd = read( fd, buf, 64*1024 ); + if( rd <= 0 ) break; + +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) continue; +#endif + + auto line = buf; + const auto end = buf + rd; + for(;;) + { + auto next = (char*)memchr( line, '\n', end - line ); + if( !next ) break; + HandleTraceLine( line ); + line = ++next; + } + } + + tracy_free( buf ); +} + +void SysTraceWorker( void* ptr ) +{ + ThreadExitHandler threadExitHandler; + SetThreadName( "Tracy SysTrace" ); + char tmp[256]; + memcpy( tmp, BasePath, sizeof( BasePath ) - 1 ); + memcpy( tmp + sizeof( BasePath ) - 1, TracePipe, sizeof( TracePipe ) ); + + int fd = open( tmp, O_RDONLY ); + if( fd < 0 ) return; + sched_param sp = { 5 }; + pthread_setschedparam( pthread_self(), SCHED_FIFO, &sp ); + ProcessTraceLines( fd ); + close( fd ); +} +#endif + +void SysTraceSendExternalName( uint64_t thread ) +{ + FILE* f; + char fn[256]; + sprintf( fn, "/proc/%" PRIu64 "/comm", thread ); + f = fopen( fn, "rb" ); + if( f ) + { + char buf[256]; + const auto sz = fread( buf, 1, 256, f ); + if( sz > 0 && buf[sz-1] == '\n' ) buf[sz-1] = '\0'; + GetProfiler().SendString( thread, buf, QueueType::ExternalThreadName ); + fclose( f ); + } + else + { + GetProfiler().SendString( thread, "???", 3, QueueType::ExternalThreadName ); + } + + sprintf( fn, "/proc/%" PRIu64 "/status", thread ); + f = fopen( fn, "rb" ); + if( f ) + { + int pid = -1; + size_t lsz = 1024; + auto line = (char*)tracy_malloc( lsz ); + for(;;) + { + auto rd = getline( &line, &lsz, f ); + if( rd <= 0 ) break; + if( memcmp( "Tgid:\t", line, 6 ) == 0 ) + { + pid = atoi( line + 6 ); + break; + } + } + tracy_free( line ); + fclose( f ); + if( pid >= 0 ) + { + { + uint64_t _pid = pid; + TracyLfqPrepare( QueueType::TidToPid ); + MemWrite( &item->tidToPid.tid, thread ); + MemWrite( &item->tidToPid.pid, _pid ); + TracyLfqCommit; + } + sprintf( fn, "/proc/%i/comm", pid ); + f = fopen( fn, "rb" ); + if( f ) + { + char buf[256]; + const auto sz = fread( buf, 1, 256, f ); + if( sz > 0 && buf[sz-1] == '\n' ) buf[sz-1] = '\0'; + GetProfiler().SendString( thread, buf, QueueType::ExternalName ); + fclose( f ); + return; + } + } + } + GetProfiler().SendString( thread, "???", 3, QueueType::ExternalName ); +} + +} + +# endif + +#endif diff --git a/Source/ThirdParty/tracy/client/TracySysTrace.hpp b/Source/ThirdParty/tracy/client/TracySysTrace.hpp new file mode 100644 index 000000000..688cbf2ae --- /dev/null +++ b/Source/ThirdParty/tracy/client/TracySysTrace.hpp @@ -0,0 +1,25 @@ +#ifndef __TRACYSYSTRACE_HPP__ +#define __TRACYSYSTRACE_HPP__ + +#if !defined TRACY_NO_SYSTEM_TRACING && ( defined _WIN32 || defined __CYGWIN__ || defined __linux__ ) +# define TRACY_HAS_SYSTEM_TRACING +#endif + +#ifdef TRACY_HAS_SYSTEM_TRACING + +#include <stdint.h> + +namespace tracy +{ + +bool SysTraceStart( int64_t& samplingPeriod ); +void SysTraceStop(); +void SysTraceWorker( void* ptr ); + +void SysTraceSendExternalName( uint64_t thread ); + +} + +#endif + +#endif diff --git a/Source/ThirdParty/tracy/client/TracySysTracePayload.hpp b/Source/ThirdParty/tracy/client/TracySysTracePayload.hpp new file mode 100644 index 000000000..7c292f9d0 --- /dev/null +++ b/Source/ThirdParty/tracy/client/TracySysTracePayload.hpp @@ -0,0
+1,78 @@ +// File: 'extra/systrace/tracy_systrace.armv7' (1149 bytes) +// File: 'extra/systrace/tracy_systrace.aarch64' (1650 bytes) + +// Exported using binary_to_compressed_c.cpp + +namespace tracy +{ + +static const unsigned int tracy_systrace_armv7_size = 1149; +static const unsigned int tracy_systrace_armv7_data[1152/4] = +{ + 0x464c457f, 0x00010101, 0x00000000, 0x00000000, 0x00280003, 0x00000001, 0x000001f0, 0x00000034, 0x00000000, 0x05000200, 0x00200034, 0x00280007, + 0x00000000, 0x00000006, 0x00000034, 0x00000034, 0x00000034, 0x000000e0, 0x000000e0, 0x00000004, 0x00000004, 0x00000003, 0x00000114, 0x00000114, + 0x00000114, 0x00000013, 0x00000013, 0x00000004, 0x00000001, 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x000003fd, 0x000003fd, 0x00000005, + 0x00001000, 0x00000001, 0x000003fd, 0x000013fd, 0x000013fd, 0x00000080, 0x000000b3, 0x00000006, 0x00001000, 0x00000002, 0x00000400, 0x00001400, + 0x00001400, 0x0000007d, 0x000000b0, 0x00000006, 0x00000004, 0x6474e551, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000006, + 0x00000004, 0x70000001, 0x000003a4, 0x000003a4, 0x000003a4, 0x00000008, 0x00000008, 0x00000004, 0x00000004, 0x7379732f, 0x2f6d6574, 0x2f6e6962, + 0x6b6e696c, 0x00007265, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000001, 0x00000000, 0x00000000, 0x00000012, 0x00000016, 0x00000000, + 0x00000000, 0x00000012, 0x6f6c6400, 0x006e6570, 0x4342494c, 0x62696c00, 0x732e6c64, 0x6c64006f, 0x006d7973, 0x00000001, 0x00000003, 0x00000001, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00010001, 0x0000000d, 0x00000010, 0x00000000, 0x00050d63, 0x00020000, 0x00000008, + 0x00000000, 0x000014bc, 0x00000116, 0x000014c0, 0x00000216, 0xe52de004, 0xe59fe004, 0xe08fe00e, 0xe5bef008, 0x000012dc, 0xe28fc600, 0xe28cca01, + 0xe5bcf2dc, 0xe28fc600, 0xe28cca01, 0xe5bcf2d4, 0xe92d4ff0, 0xe28db01c, 0xe24dd024, 0xe24dd801, 0xe59f017c, 0xe3a01001, 0xe3a08001, 0xe08f0000, + 0xebfffff0, 0xe59f116c, 0xe1a04000, 0xe08f1001, 0xebffffef, 0xe59f1160, 0xe1a06000, 0xe1a00004, 0xe08f1001, 0xebffffea, 0xe59f1150, 0xe1a07000, + 0xe1a00004, 0xe08f1001, 0xebffffe5, 0xe59f1140, 0xe1a05000, 0xe1a00004, 0xe08f1001, 0xebffffe0, 0xe58d0004, 0xe1a00004, 0xe59f1128, 0xe08f1001, + 0xebffffdb, 0xe59f1120, 0xe1a0a000, 0xe1a00004, 0xe08f1001, 0xebffffd6, 0xe1a04000, 0xe59f010c, 0xe3a01000, 0xe3a09000, 0xe08f0000, 0xe12fff36, + 0xe1a06000, 0xe3700001, 0xca000001, 0xe3a00000, 0xe12fff37, 0xe3a00009, 0xe3a01001, 0xe1cd01bc, 0xe3a00008, 0xe1cd01b4, 0xe3090680, 0xe3400098, + 0xe3a02000, 0xe58d000c, 0xe28d0010, 0xe58d7000, 0xe58d6018, 0xe58d8010, 0xe58d9008, 0xe12fff35, 0xe3500000, 0xca00001d, 0xe28d7018, 0xe28d8010, + 0xe28d9020, 0xe1a00007, 0xe3a01001, 0xe3a02000, 0xe12fff35, 0xe3500000, 0xda00000a, 0xe1a00006, 0xe1a01009, 0xe3a02801, 0xe12fff3a, 0xe3500001, + 0xba00000e, 0xe1a02000, 0xe3a00001, 0xe1a01009, 0xe12fff34, 0xea000003, 0xe59d2004, 0xe28d0008, 0xe3a01000, 0xe12fff32, 0xe1a00008, 0xe3a01001, + 0xe3a02000, 0xe12fff35, 0xe3500001, 0xbaffffe4, 0xe59d1000, 0xe3a00000, 0xe12fff31, 0xe24bd01c, 0xe8bd8ff0, 0x00000198, 0x00000190, 0x00000181, + 0x00000172, 0x00000163, 0x00000159, 0x0000014a, 0x00000138, 0x7ffffe4c, 0x00000001, 0x6362696c, 0x006f732e, 0x6e65706f, 0x69786500, 0x6f700074, + 0x6e006c6c, 0x736f6e61, 0x7065656c, 0x61657200, 0x72770064, 0x00657469, 0x7379732f, 0x72656b2f, 0x2f6c656e, 0x75626564, 0x72742f67, 0x6e696361, + 0x72742f67, 0x5f656361, 0x65706970, 0x00000000, 0x00000003, 0x000014b0, 0x00000002, 0x00000010, 0x00000017, 0x000001b4, 0x00000014, 0x00000011, + 
0x00000015, 0x00000000, 0x00000006, 0x00000128, 0x0000000b, 0x00000010, 0x00000005, 0x00000158, 0x0000000a, 0x0000001c, 0x6ffffef5, 0x00000174, + 0x00000001, 0x0000000d, 0x0000001e, 0x00000008, 0x6ffffffb, 0x00000001, 0x6ffffff0, 0x0000018c, 0x6ffffffe, 0x00000194, 0x6fffffff, 0x00000001, +}; + +static const unsigned int tracy_systrace_aarch64_size = 1650; +static const unsigned int tracy_systrace_aarch64_data[1652/4] = +{ + 0x464c457f, 0x00010102, 0x00000000, 0x00000000, 0x00b70003, 0x00000001, 0x000002e0, 0x00000000, 0x00000040, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00380040, 0x00400006, 0x00000000, 0x00000006, 0x00000005, 0x00000040, 0x00000000, 0x00000040, 0x00000000, 0x00000040, 0x00000000, + 0x00000150, 0x00000000, 0x00000150, 0x00000000, 0x00000008, 0x00000000, 0x00000003, 0x00000004, 0x00000190, 0x00000000, 0x00000190, 0x00000000, + 0x00000190, 0x00000000, 0x00000015, 0x00000000, 0x00000015, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000005, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x000004e1, 0x00000000, 0x000004e1, 0x00000000, 0x00001000, 0x00000000, 0x00000001, 0x00000006, + 0x000004e8, 0x00000000, 0x000014e8, 0x00000000, 0x000014e8, 0x00000000, 0x0000018a, 0x00000000, 0x00000190, 0x00000000, 0x00001000, 0x00000000, + 0x00000002, 0x00000006, 0x000004e8, 0x00000000, 0x000014e8, 0x00000000, 0x000014e8, 0x00000000, 0x00000160, 0x00000000, 0x00000160, 0x00000000, + 0x00000008, 0x00000000, 0x6474e551, 0x00000006, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000008, 0x00000000, 0x7379732f, 0x2f6d6574, 0x2f6e6962, 0x6b6e696c, 0x34367265, 0x00000000, 0x00000001, 0x00000001, + 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00090003, 0x000002e0, 0x00000000, 0x00000000, 0x00000000, 0x00000010, 0x00000012, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x0000000a, 0x00000012, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x62696c00, 0x732e6c64, 0x6c64006f, 0x006d7973, 0x706f6c64, 0x4c006e65, + 0x00434249, 0x00000000, 0x00000000, 0x00000000, 0x00010001, 0x00000001, 0x00000010, 0x00000000, 0x00050d63, 0x00020000, 0x00000017, 0x00000000, + 0x00001668, 0x00000000, 0x00000402, 0x00000002, 0x00000000, 0x00000000, 0x00001670, 0x00000000, 0x00000402, 0x00000003, 0x00000000, 0x00000000, + 0xa9bf7bf0, 0xb0000010, 0xf9433211, 0x91198210, 0xd61f0220, 0xd503201f, 0xd503201f, 0xd503201f, 0xb0000010, 0xf9433611, 0x9119a210, 0xd61f0220, + 0xb0000010, 0xf9433a11, 0x9119c210, 0xd61f0220, 0xa9bb67fc, 0xa9015ff8, 0xa90257f6, 0xa9034ff4, 0xa9047bfd, 0x910103fd, 0xd14043ff, 0xd10083ff, + 0x90000000, 0x91124000, 0x52800021, 0x52800039, 0x97ffffec, 0x90000001, 0x91126021, 0xaa0003f7, 0x97ffffec, 0x90000001, 0xaa0003f8, 0x91127421, + 0xaa1703e0, 0x97ffffe7, 0x90000001, 0xaa0003f3, 0x91128821, 0xaa1703e0, 0x97ffffe2, 0x90000001, 0xaa0003f4, 0x91129c21, 0xaa1703e0, 0x97ffffdd, + 0x90000001, 0xaa0003f5, 0x9112c421, 0xaa1703e0, 0x97ffffd8, 0x90000001, 0xaa0003f6, 0x9112d821, 0xaa1703e0, 0x97ffffd3, 0xaa0003f7, 0x90000000, + 0x9112f000, 0x2a1f03e1, 0xd63f0300, 0x2a0003f8, 0x36f80060, 0x2a1f03e0, 0xd63f0260, 0x90000009, 0x3dc12120, 0x52800128, 0x79003be8, 0x52800108, + 0x910043e0, 0x52800021, 0x2a1f03e2, 0xb9001bf8, 0xb90013f9, 0x79002be8, 0x3d8003e0, 0xd63f0280, 0x7100001f, 0x5400036c, 0x910063e0, 0x52800021, + 0x2a1f03e2, 0xd63f0280, 0x7100001f, 0x5400018d, 0x910083e1, 
0x52a00022, 0x2a1803e0, 0xd63f02c0, 0xf100041f, 0x540001eb, 0xaa0003e2, 0x910083e1, + 0x52800020, 0xd63f02e0, 0x14000004, 0x910003e0, 0xaa1f03e1, 0xd63f02a0, 0x910043e0, 0x52800021, 0x2a1f03e2, 0xd63f0280, 0x7100041f, 0x54fffceb, + 0x2a1f03e0, 0xd63f0260, 0x914043ff, 0x910083ff, 0xa9447bfd, 0xa9434ff4, 0xa94257f6, 0xa9415ff8, 0xa8c567fc, 0xd65f03c0, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00989680, 0x00000000, 0x6362696c, 0x006f732e, 0x6e65706f, 0x69786500, 0x6f700074, 0x6e006c6c, 0x736f6e61, 0x7065656c, + 0x61657200, 0x72770064, 0x00657469, 0x7379732f, 0x72656b2f, 0x2f6c656e, 0x75626564, 0x72742f67, 0x6e696361, 0x72742f67, 0x5f656361, 0x65706970, + 0x00000000, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x6ffffef5, 0x00000000, 0x000001a8, 0x00000000, 0x00000005, 0x00000000, + 0x00000228, 0x00000000, 0x00000006, 0x00000000, 0x000001c8, 0x00000000, 0x0000000a, 0x00000000, 0x0000001c, 0x00000000, 0x0000000b, 0x00000000, + 0x00000018, 0x00000000, 0x00000015, 0x00000000, 0x00000000, 0x00000000, 0x00000003, 0x00000000, 0x00001650, 0x00000000, 0x00000002, 0x00000000, + 0x00000030, 0x00000000, 0x00000014, 0x00000000, 0x00000007, 0x00000000, 0x00000017, 0x00000000, 0x00000270, 0x00000000, 0x0000001e, 0x00000000, + 0x00000008, 0x00000000, 0x6ffffffb, 0x00000000, 0x00000001, 0x00000000, 0x6ffffffe, 0x00000000, 0x00000250, 0x00000000, 0x6fffffff, 0x00000000, + 0x00000001, 0x00000000, 0x6ffffff0, 0x00000000, 0x00000244, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x000002a0, 0x00000000, 0x000002a0, +}; + +} diff --git a/Source/ThirdParty/tracy/client/TracyThread.hpp b/Source/ThirdParty/tracy/client/TracyThread.hpp new file mode 100644 index 000000000..edd255e87 --- /dev/null +++ b/Source/ThirdParty/tracy/client/TracyThread.hpp @@ -0,0 +1,85 @@ +#ifndef __TRACYTHREAD_HPP__ +#define __TRACYTHREAD_HPP__ + +#if defined _WIN32 || defined __CYGWIN__ +# include +#else +# include +#endif + +#ifdef TRACY_MANUAL_LIFETIME +# include "tracy_rpmalloc.hpp" +#endif + +namespace tracy +{ + +class ThreadExitHandler +{ +public: + ~ThreadExitHandler() + { +#ifdef TRACY_MANUAL_LIFETIME + rpmalloc_thread_finalize(); +#endif + } +}; + +#if defined _WIN32 || defined __CYGWIN__ + +class Thread +{ +public: + Thread( void(*func)( void* ptr ), void* ptr ) + : m_func( func ) + , m_ptr( ptr ) + , m_hnd( CreateThread( nullptr, 0, Launch, this, 0, nullptr ) ) + {} + + ~Thread() + { + WaitForSingleObject( m_hnd, INFINITE ); + CloseHandle( m_hnd ); + } + + HANDLE Handle() const { return m_hnd; } + +private: + static DWORD WINAPI Launch( void* ptr ) { ((Thread*)ptr)->m_func( ((Thread*)ptr)->m_ptr ); return 0; } + + void(*m_func)( void* ptr ); + void* m_ptr; + HANDLE m_hnd; +}; + +#else + +class Thread +{ +public: + Thread( void(*func)( void* ptr ), void* ptr ) + : m_func( func ) + , m_ptr( ptr ) + { + pthread_create( &m_thread, nullptr, Launch, this ); + } + + ~Thread() + { + pthread_join( m_thread, nullptr ); + } + + pthread_t Handle() const { return m_thread; } + +private: + static void* Launch( void* ptr ) { ((Thread*)ptr)->m_func( ((Thread*)ptr)->m_ptr ); return nullptr; } + void(*m_func)( void* ptr ); + void* m_ptr; + pthread_t m_thread; +}; + +#endif 
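Both Thread variants above join in the destructor (WaitForSingleObject / pthread_join), so letting a Thread object go out of scope blocks until its worker function returns. A minimal usage sketch under that assumption; the Worker function and workerActive flag are hypothetical, mirroring how SysTraceWorker polls the traceActive flag:

#include "TracyThread.hpp"
#include <atomic>

static std::atomic<bool> workerActive { true };

static void Worker( void* ptr )
{
    while( workerActive.load( std::memory_order_relaxed ) )
    {
        // ...poll for data, emit events...
    }
}

int main()
{
    {
        tracy::Thread worker( Worker, nullptr );
        // ...main work...
        workerActive.store( false, std::memory_order_relaxed );
    } // ~Thread joins here, so the worker has fully exited past this brace
    return 0;
}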
+ +} + +#endif diff --git a/Source/ThirdParty/tracy/client/tracy_concurrentqueue.h b/Source/ThirdParty/tracy/client/tracy_concurrentqueue.h new file mode 100644 index 000000000..bf095bc36 --- /dev/null +++ b/Source/ThirdParty/tracy/client/tracy_concurrentqueue.h @@ -0,0 +1,1445 @@ +// Provides a C++11 implementation of a multi-producer, multi-consumer lock-free queue. +// An overview, including benchmark results, is provided here: +// http://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++ +// The full design is also described in excruciating detail at: +// http://moodycamel.com/blog/2014/detailed-design-of-a-lock-free-queue + +// Simplified BSD license: +// Copyright (c) 2013-2016, Cameron Desrochers. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// - Redistributions of source code must retain the above copyright notice, this list of +// conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, this list of +// conditions and the following disclaimer in the documentation and/or other materials +// provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL +// THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR +// TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +#pragma once + +#include "../common/TracyAlloc.hpp" +#include "../common/TracySystem.hpp" + +#if defined(__GNUC__) +// Disable -Wconversion warnings (spuriously triggered when Traits::size_t and +// Traits::index_t are set to < 32 bits, causing integer promotion, causing warnings +// upon assigning any computed values) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + +#if defined(__APPLE__) +#include "TargetConditionals.h" +#endif + +#include <atomic> // Requires C++11. Sorry VS2010.
+#include <cassert> +#include <cstddef> // for max_align_t +#include <cstdint> +#include <cstdlib> +#include <type_traits> +#include <algorithm> +#include <utility> +#include <limits> +#include <climits> // for CHAR_BIT +#include <array> +#include <thread> // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading + +namespace tracy +{ + +// Compiler-specific likely/unlikely hints +namespace moodycamel { namespace details { +#if defined(__GNUC__) + inline bool cqLikely(bool x) { return __builtin_expect((x), true); } + inline bool cqUnlikely(bool x) { return __builtin_expect((x), false); } +#else + inline bool cqLikely(bool x) { return x; } + inline bool cqUnlikely(bool x) { return x; } +#endif +} } + +namespace +{ + // to avoid MSVC warning 4127: conditional expression is constant + template <bool condition> + struct compile_time_condition + { + static const bool value = false; + }; + template <> + struct compile_time_condition<true> + { + static const bool value = true; + }; +} + +namespace moodycamel { +namespace details { + template<typename T> + struct const_numeric_max { + static_assert(std::is_integral<T>::value, "const_numeric_max can only be used with integers"); + static const T value = std::numeric_limits<T>::is_signed + ? (static_cast<T>(1) << (sizeof(T) * CHAR_BIT - 1)) - static_cast<T>(1) + : static_cast<T>(-1); + }; + +#if defined(__GLIBCXX__) + typedef ::max_align_t std_max_align_t; // libstdc++ forgot to add it to std:: for a while +#else + typedef std::max_align_t std_max_align_t; // Others (e.g. MSVC) insist it can *only* be accessed via std:: +#endif + + // Some platforms have incorrectly set max_align_t to a type with <8 bytes alignment even while supporting + // 8-byte aligned scalar values (*cough* 32-bit iOS). Work around this with our own union. See issue #64. + typedef union { + std_max_align_t x; + long long y; + void* z; + } max_align_t; +} + +// Default traits for the ConcurrentQueue. To change some of the +// traits without re-implementing all of them, inherit from this +// struct and shadow the declarations you wish to be different; +// since the traits are used as a template type parameter, the +// shadowed declarations will be used where defined, and the defaults +// otherwise. +struct ConcurrentQueueDefaultTraits +{ + // General-purpose size type. std::size_t is strongly recommended. + typedef std::size_t size_t; + + // The type used for the enqueue and dequeue indices. Must be at least as + // large as size_t. Should be significantly larger than the number of elements + // you expect to hold at once, especially if you have a high turnover rate; + // for example, on 32-bit x86, if you expect to have over a hundred million + // elements or pump several million elements through your queue in a very + // short space of time, using a 32-bit type *may* trigger a race condition. + // A 64-bit int type is recommended in that case, and in practice will + // prevent a race condition no matter the usage of the queue. Note that + // whether the queue is lock-free with a 64-bit int type depends on whether + // std::atomic<std::uint64_t> is lock-free, which is platform-specific. + typedef std::size_t index_t; + + // Internally, all elements are enqueued and dequeued from multi-element + // blocks; this is the smallest controllable unit. If you expect few elements + // but many producers, a smaller block size should be favoured. For few producers + // and/or many elements, a larger block size is preferred. A sane default + // is provided. Must be a power of 2. + static const size_t BLOCK_SIZE = 64*1024; +
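As the traits comment above says, customization works by inheriting and shadowing individual declarations. A hypothetical override (not used by this port, which instead bakes its choices, such as the 64KB BLOCK_SIZE, directly into the defaults) might look like:

struct SmallBlockTraits : tracy::moodycamel::ConcurrentQueueDefaultTraits
{
    // Many producers with few elements each: favour small blocks.
    static const size_t BLOCK_SIZE = 256; // must remain a power of 2

    // 64-bit indices sidestep the 32-bit wrap-around race described above.
    typedef std::uint64_t index_t;
};

// The traits then travel as the second template argument:
// tracy::moodycamel::ConcurrentQueue<Event, SmallBlockTraits> queue;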
+ // For explicit producers (i.e. when using a producer token), the block is + // checked for being empty by iterating through a list of flags, one per element. + // For large block sizes, this is too inefficient, and switching to an atomic + // counter-based approach is faster. The switch is made for block sizes strictly + // larger than this threshold. + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32; + + // How many full blocks can be expected for a single explicit producer? This should + // reflect that number's maximum for optimal performance. Must be a power of 2. + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32; + + // Controls the number of items that an explicit consumer (i.e. one with a token) + // must consume before it causes all consumers to rotate and move on to the next + // internal queue. + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256; + + // The maximum number of elements (inclusive) that can be enqueued to a sub-queue. + // Enqueue operations that would cause this limit to be surpassed will fail. Note + // that this limit is enforced at the block level (for performance reasons), i.e. + // it's rounded up to the nearest block size. + static const size_t MAX_SUBQUEUE_SIZE = details::const_numeric_max<size_t>::value; + + + // Memory allocation can be customized if needed. + // malloc should return nullptr on failure, and handle alignment like std::malloc. +#if defined(malloc) || defined(free) + // Gah, this is 2015, stop defining macros that break standard code already! + // Work around malloc/free being special macros: + static inline void* WORKAROUND_malloc(size_t size) { return malloc(size); } + static inline void WORKAROUND_free(void* ptr) { return free(ptr); } + static inline void* (malloc)(size_t size) { return WORKAROUND_malloc(size); } + static inline void (free)(void* ptr) { return WORKAROUND_free(ptr); } +#else + static inline void* malloc(size_t size) { return tracy::tracy_malloc(size); } + static inline void free(void* ptr) { return tracy::tracy_free(ptr); } +#endif +}; + + +// When producing or consuming many elements, the most efficient way is to: +// 1) Use one of the bulk-operation methods of the queue with a token +// 2) Failing that, use the bulk-operation methods without a token +// 3) Failing that, create a token and use that with the single-item methods +// 4) Failing that, use the single-parameter methods of the queue +// Having said that, don't create tokens willy-nilly -- ideally there should be +// a maximum of one token per thread (of each kind).
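In the upstream moodycamel queue, that checklist translates into code like the sketch below. This is upstream API only: the trimmed fork in this file exposes just enqueue_begin and try_dequeue_bulk_single, which Tracy drives through its TracyLfqPrepare/TracyLfqCommit macros seen earlier in this diff.

#include "concurrentqueue.h" // upstream moodycamel header, not this fork
#include <cstddef>

int main()
{
    moodycamel::ConcurrentQueue<int> q;

    moodycamel::ProducerToken ptok( q ); // at most one per producing thread
    for( int i = 0; i < 1000; i++ ) q.enqueue( ptok, i ); // token + single-item (option 3)

    moodycamel::ConsumerToken ctok( q ); // at most one per consuming thread
    int items[64];
    std::size_t n;
    while( ( n = q.try_dequeue_bulk( ctok, items, 64 ) ) != 0 ) // token + bulk (option 1)
    {
        // ...process n items...
    }
    return 0;
}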
+struct ProducerToken; +struct ConsumerToken; + +template class ConcurrentQueue; + + +namespace details +{ + struct ConcurrentQueueProducerTypelessBase + { + ConcurrentQueueProducerTypelessBase* next; + std::atomic inactive; + ProducerToken* token; + uint64_t threadId; + + ConcurrentQueueProducerTypelessBase() + : next(nullptr), inactive(false), token(nullptr), threadId(0) + { + } + }; + + template + static inline bool circular_less_than(T a, T b) + { +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4554) +#endif + static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "circular_less_than is intended to be used only with unsigned integer types"); + return static_cast(a - b) > static_cast(static_cast(1) << static_cast(sizeof(T) * CHAR_BIT - 1)); +#ifdef _MSC_VER +#pragma warning(pop) +#endif + } + + template + static inline char* align_for(char* ptr) + { + const std::size_t alignment = std::alignment_of::value; + return ptr + (alignment - (reinterpret_cast(ptr) % alignment)) % alignment; + } + + template + static inline T ceil_to_pow_2(T x) + { + static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "ceil_to_pow_2 is intended to be used only with unsigned integer types"); + + // Adapted from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + for (std::size_t i = 1; i < sizeof(T); i <<= 1) { + x |= x >> (i << 3); + } + ++x; + return x; + } + + template + static inline void swap_relaxed(std::atomic& left, std::atomic& right) + { + T temp = std::move(left.load(std::memory_order_relaxed)); + left.store(std::move(right.load(std::memory_order_relaxed)), std::memory_order_relaxed); + right.store(std::move(temp), std::memory_order_relaxed); + } + + template + static inline T const& nomove(T const& x) + { + return x; + } + + template + struct nomove_if + { + template + static inline T const& eval(T const& x) + { + return x; + } + }; + + template<> + struct nomove_if + { + template + static inline auto eval(U&& x) + -> decltype(std::forward(x)) + { + return std::forward(x); + } + }; + + template + static inline auto deref_noexcept(It& it) noexcept -> decltype(*it) + { + return *it; + } + +#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) + template struct is_trivially_destructible : std::is_trivially_destructible { }; +#else + template struct is_trivially_destructible : std::has_trivial_destructor { }; +#endif + + template struct static_is_lock_free_num { enum { value = 0 }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_CHAR_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_SHORT_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_INT_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_LONG_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_LLONG_LOCK_FREE }; }; + template struct static_is_lock_free : static_is_lock_free_num::type> { }; + template<> struct static_is_lock_free { enum { value = ATOMIC_BOOL_LOCK_FREE }; }; + template struct static_is_lock_free { enum { value = ATOMIC_POINTER_LOCK_FREE }; }; +} + + +struct ProducerToken +{ + template + explicit ProducerToken(ConcurrentQueue& queue); + + ProducerToken(ProducerToken&& other) noexcept + : producer(other.producer) + { + other.producer = nullptr; + if (producer != nullptr) { + producer->token = this; + } + } + + 
inline ProducerToken& operator=(ProducerToken&& other) noexcept + { + swap(other); + return *this; + } + + void swap(ProducerToken& other) noexcept + { + std::swap(producer, other.producer); + if (producer != nullptr) { + producer->token = this; + } + if (other.producer != nullptr) { + other.producer->token = &other; + } + } + + // A token is always valid unless: + // 1) Memory allocation failed during construction + // 2) It was moved via the move constructor + // (Note: assignment does a swap, leaving both potentially valid) + // 3) The associated queue was destroyed + // Note that if valid() returns true, that only indicates + // that the token is valid for use with a specific queue, + // but not which one; that's up to the user to track. + inline bool valid() const { return producer != nullptr; } + + ~ProducerToken() + { + if (producer != nullptr) { + producer->token = nullptr; + producer->inactive.store(true, std::memory_order_release); + } + } + + // Disable copying and assignment + ProducerToken(ProducerToken const&) = delete; + ProducerToken& operator=(ProducerToken const&) = delete; + +private: + template friend class ConcurrentQueue; + +protected: + details::ConcurrentQueueProducerTypelessBase* producer; +}; + + +struct ConsumerToken +{ + template + explicit ConsumerToken(ConcurrentQueue& q); + + ConsumerToken(ConsumerToken&& other) noexcept + : initialOffset(other.initialOffset), lastKnownGlobalOffset(other.lastKnownGlobalOffset), itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), currentProducer(other.currentProducer), desiredProducer(other.desiredProducer) + { + } + + inline ConsumerToken& operator=(ConsumerToken&& other) noexcept + { + swap(other); + return *this; + } + + void swap(ConsumerToken& other) noexcept + { + std::swap(initialOffset, other.initialOffset); + std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset); + std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent); + std::swap(currentProducer, other.currentProducer); + std::swap(desiredProducer, other.desiredProducer); + } + + // Disable copying and assignment + ConsumerToken(ConsumerToken const&) = delete; + ConsumerToken& operator=(ConsumerToken const&) = delete; + +private: + template friend class ConcurrentQueue; + +private: // but shared with ConcurrentQueue + std::uint32_t initialOffset; + std::uint32_t lastKnownGlobalOffset; + std::uint32_t itemsConsumedFromCurrent; + details::ConcurrentQueueProducerTypelessBase* currentProducer; + details::ConcurrentQueueProducerTypelessBase* desiredProducer; +}; + + +template +class ConcurrentQueue +{ +public: + struct ExplicitProducer; + + typedef moodycamel::ProducerToken producer_token_t; + typedef moodycamel::ConsumerToken consumer_token_t; + + typedef typename Traits::index_t index_t; + typedef typename Traits::size_t size_t; + + static const size_t BLOCK_SIZE = static_cast(Traits::BLOCK_SIZE); + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = static_cast(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD); + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = static_cast(Traits::EXPLICIT_INITIAL_INDEX_SIZE); + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = static_cast(Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE); +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4307) // + integral constant overflow (that's what the ternary expression is for!) 
+#pragma warning(disable: 4309) // static_cast: Truncation of constant value +#endif + static const size_t MAX_SUBQUEUE_SIZE = (details::const_numeric_max::value - static_cast(Traits::MAX_SUBQUEUE_SIZE) < BLOCK_SIZE) ? details::const_numeric_max::value : ((static_cast(Traits::MAX_SUBQUEUE_SIZE) + (BLOCK_SIZE - 1)) / BLOCK_SIZE * BLOCK_SIZE); +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + static_assert(!std::numeric_limits::is_signed && std::is_integral::value, "Traits::size_t must be an unsigned integral type"); + static_assert(!std::numeric_limits::is_signed && std::is_integral::value, "Traits::index_t must be an unsigned integral type"); + static_assert(sizeof(index_t) >= sizeof(size_t), "Traits::index_t must be at least as wide as Traits::size_t"); + static_assert((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)"); + static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) && !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a power of 2 (and greater than 1)"); + static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) && !(EXPLICIT_INITIAL_INDEX_SIZE & (EXPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)"); + +public: + // Creates a queue with at least `capacity` element slots; note that the + // actual number of elements that can be inserted without additional memory + // allocation depends on the number of producers and the block size (e.g. if + // the block size is equal to `capacity`, only a single block will be allocated + // up-front, which means only a single producer will be able to enqueue elements + // without an extra allocation -- blocks aren't shared between producers). + // This method is not thread safe -- it is up to the user to ensure that the + // queue is fully constructed before it starts being used by other threads (this + // includes making the memory effects of construction visible, possibly with a + // memory barrier). + explicit ConcurrentQueue(size_t capacity = 6 * BLOCK_SIZE) + : producerListTail(nullptr), + producerCount(0), + initialBlockPoolIndex(0), + nextExplicitConsumerId(0), + globalExplicitConsumerOffset(0) + { + populate_initial_block_list(capacity / BLOCK_SIZE + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1)); + } + + // Computes the correct amount of pre-allocated blocks for you based + // on the minimum number of elements you want available at any given + // time, and the maximum concurrent number of each type of producer. + ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers) + : producerListTail(nullptr), + producerCount(0), + initialBlockPoolIndex(0), + nextExplicitConsumerId(0), + globalExplicitConsumerOffset(0) + { + size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * (maxExplicitProducers + 1) + 2 * (maxExplicitProducers); + populate_initial_block_list(blocks); + } + + // Note: The queue should not be accessed concurrently while it's + // being deleted. It's up to the user to synchronize this. + // This method is not thread safe. 
+ ~ConcurrentQueue() + { + // Destroy producers + auto ptr = producerListTail.load(std::memory_order_relaxed); + while (ptr != nullptr) { + auto next = ptr->next_prod(); + if (ptr->token != nullptr) { + ptr->token->producer = nullptr; + } + destroy(ptr); + ptr = next; + } + + // Destroy global free list + auto block = freeList.head_unsafe(); + while (block != nullptr) { + auto next = block->freeListNext.load(std::memory_order_relaxed); + if (block->dynamicallyAllocated) { + destroy(block); + } + block = next; + } + + // Destroy initial free list + destroy_array(initialBlockPool, initialBlockPoolSize); + } + + // Disable copying and copy assignment + ConcurrentQueue(ConcurrentQueue const&) = delete; + ConcurrentQueue(ConcurrentQueue&& other) = delete; + ConcurrentQueue& operator=(ConcurrentQueue const&) = delete; + ConcurrentQueue& operator=(ConcurrentQueue&& other) = delete; + +public: + tracy_force_inline T* enqueue_begin(producer_token_t const& token, index_t& currentTailIndex) + { + return static_cast(token.producer)->ConcurrentQueue::ExplicitProducer::enqueue_begin(currentTailIndex); + } + + template + size_t try_dequeue_bulk_single(consumer_token_t& token, NotifyThread notifyThread, ProcessData processData ) + { + if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { + if (!update_current_producer_after_rotation(token)) { + return 0; + } + } + + size_t count = static_cast(token.currentProducer)->dequeue_bulk(notifyThread, processData); + token.itemsConsumedFromCurrent += static_cast(count); + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = static_cast(token.currentProducer)->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + if( count == 0 ) + { + while (ptr != static_cast(token.currentProducer)) { + auto dequeued = ptr->dequeue_bulk(notifyThread, processData); + if (dequeued != 0) { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = static_cast(dequeued); + return dequeued; + } + ptr = ptr->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + } + return 0; + } + else + { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = 0; + return count; + } + } + + + // Returns an estimate of the total number of elements currently in the queue. This + // estimate is only accurate if the queue has completely stabilized before it is called + // (i.e. all enqueue and dequeue operations have completed and their memory effects are + // visible on the calling thread, and no further operations start while this method is + // being called). + // Thread-safe. + size_t size_approx() const + { + size_t size = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + size += ptr->size_approx(); + } + return size; + } + + + // Returns true if the underlying atomic variables used by + // the queue are lock-free (they should be on most platforms). + // Thread-safe. 
+ static bool is_lock_free() + { + return + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2; + } + + +private: + friend struct ProducerToken; + friend struct ConsumerToken; + friend struct ExplicitProducer; + + + /////////////////////////////// + // Queue methods + /////////////////////////////// + + inline bool update_current_producer_after_rotation(consumer_token_t& token) + { + // Ah, there's been a rotation, figure out where we should be! + auto tail = producerListTail.load(std::memory_order_acquire); + if (token.desiredProducer == nullptr && tail == nullptr) { + return false; + } + auto prodCount = producerCount.load(std::memory_order_relaxed); + auto globalOffset = globalExplicitConsumerOffset.load(std::memory_order_relaxed); + if (details::cqUnlikely(token.desiredProducer == nullptr)) { + // Aha, first time we're dequeueing anything. + // Figure out our local position + // Note: offset is from start, not end, but we're traversing from end -- subtract from count first + std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount); + token.desiredProducer = tail; + for (std::uint32_t i = 0; i != offset; ++i) { + token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + } + + std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset; + if (delta >= prodCount) { + delta = delta % prodCount; + } + for (std::uint32_t i = 0; i != delta; ++i) { + token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + + token.lastKnownGlobalOffset = globalOffset; + token.currentProducer = token.desiredProducer; + token.itemsConsumedFromCurrent = 0; + return true; + } + + + /////////////////////////// + // Free list + /////////////////////////// + + template + struct FreeListNode + { + FreeListNode() : freeListRefs(0), freeListNext(nullptr) { } + + std::atomic freeListRefs; + std::atomic freeListNext; + }; + + // A simple CAS-based lock-free free list. Not the fastest thing in the world under heavy contention, but + // simple and correct (assuming nodes are never freed until after the free list is destroyed), and fairly + // speedy under low contention. + template // N must inherit FreeListNode or have the same fields (and initialization of them) + struct FreeList + { + FreeList() : freeListHead(nullptr) { } + FreeList(FreeList&& other) : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) { other.freeListHead.store(nullptr, std::memory_order_relaxed); } + void swap(FreeList& other) { details::swap_relaxed(freeListHead, other.freeListHead); } + + FreeList(FreeList const&) = delete; + FreeList& operator=(FreeList const&) = delete; + + inline void add(N* node) + { + // We know that the should-be-on-freelist bit is 0 at this point, so it's safe to + // set it using a fetch_add + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, std::memory_order_acq_rel) == 0) { + // Oh look! We were the last ones referencing this node, and we know + // we want to add it to the free list, so let's do it! 
+ add_knowing_refcount_is_zero(node); + } + } + + inline N* try_get() + { + auto head = freeListHead.load(std::memory_order_acquire); + while (head != nullptr) { + auto prevHead = head; + auto refs = head->freeListRefs.load(std::memory_order_relaxed); + if ((refs & REFS_MASK) == 0 || !head->freeListRefs.compare_exchange_strong(refs, refs + 1, std::memory_order_acquire, std::memory_order_relaxed)) { + head = freeListHead.load(std::memory_order_acquire); + continue; + } + + // Good, reference count has been incremented (it wasn't at zero), which means we can read the + // next and not worry about it changing between now and the time we do the CAS + auto next = head->freeListNext.load(std::memory_order_relaxed); + if (freeListHead.compare_exchange_strong(head, next, std::memory_order_acquire, std::memory_order_relaxed)) { + // Yay, got the node. This means it was on the list, which means shouldBeOnFreeList must be false no + // matter the refcount (because nobody else knows it's been taken off yet, it can't have been put back on). + assert((head->freeListRefs.load(std::memory_order_relaxed) & SHOULD_BE_ON_FREELIST) == 0); + + // Decrease refcount twice, once for our ref, and once for the list's ref + head->freeListRefs.fetch_sub(2, std::memory_order_release); + return head; + } + + // OK, the head must have changed on us, but we still need to decrease the refcount we increased. + // Note that we don't need to release any memory effects, but we do need to ensure that the reference + // count decrement happens-after the CAS on the head. + refs = prevHead->freeListRefs.fetch_sub(1, std::memory_order_acq_rel); + if (refs == SHOULD_BE_ON_FREELIST + 1) { + add_knowing_refcount_is_zero(prevHead); + } + } + + return nullptr; + } + + // Useful for traversing the list when there's no contention (e.g. to destroy remaining nodes) + N* head_unsafe() const { return freeListHead.load(std::memory_order_relaxed); } + + private: + inline void add_knowing_refcount_is_zero(N* node) + { + // Since the refcount is zero, and nobody can increase it once it's zero (except us, and we run + // only one copy of this method per node at a time, i.e. the single thread case), then we know + // we can safely change the next pointer of the node; however, once the refcount is back above + // zero, then other threads could increase it (happens under heavy contention, when the refcount + // goes to zero in between a load and a refcount increment of a node in try_get, then back up to + // something non-zero, then the refcount increment is done by the other thread) -- so, if the CAS + // to add the node to the actual list fails, decrease the refcount and leave the add operation to + // the next thread who puts the refcount back at zero (which could be us, hence the loop). 
+ auto head = freeListHead.load(std::memory_order_relaxed); + while (true) { + node->freeListNext.store(head, std::memory_order_relaxed); + node->freeListRefs.store(1, std::memory_order_release); + if (!freeListHead.compare_exchange_strong(head, node, std::memory_order_release, std::memory_order_relaxed)) { + // Hmm, the add failed, but we can only try again when the refcount goes back to zero + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1, std::memory_order_release) == 1) { + continue; + } + } + return; + } + } + + private: + // Implemented like a stack, but where node order doesn't matter (nodes are inserted out of order under contention) + std::atomic freeListHead; + + static const std::uint32_t REFS_MASK = 0x7FFFFFFF; + static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000; + }; + + + /////////////////////////// + // Block + /////////////////////////// + + struct Block + { + Block() + : next(nullptr), elementsCompletelyDequeued(0), freeListRefs(0), freeListNext(nullptr), shouldBeOnFreeList(false), dynamicallyAllocated(true) + { + } + + inline bool is_empty() const + { + if (compile_time_condition::value) { + // Check flags + for (size_t i = 0; i < BLOCK_SIZE; ++i) { + if (!emptyFlags[i].load(std::memory_order_relaxed)) { + return false; + } + } + + // Aha, empty; make sure we have all other memory effects that happened before the empty flags were set + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + else { + // Check counter + if (elementsCompletelyDequeued.load(std::memory_order_relaxed) == BLOCK_SIZE) { + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <= BLOCK_SIZE); + return false; + } + } + + // Returns true if the block is now empty (does not apply in explicit context) + inline bool set_empty(index_t i) + { + if (BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set flag + assert(!emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].load(std::memory_order_relaxed)); + emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].store(true, std::memory_order_release); + return false; + } + else { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add(1, std::memory_order_release); + assert(prevVal < BLOCK_SIZE); + return prevVal == BLOCK_SIZE - 1; + } + } + + // Sets multiple contiguous item statuses to 'empty' (assumes no wrapping and count > 0). + // Returns true if the block is now empty (does not apply in explicit context). 
+ inline bool set_many_empty(index_t i, size_t count) + { + if (compile_time_condition::value) { + // Set flags + std::atomic_thread_fence(std::memory_order_release); + i = BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1)) - count + 1; + for (size_t j = 0; j != count; ++j) { + assert(!emptyFlags[i + j].load(std::memory_order_relaxed)); + emptyFlags[i + j].store(true, std::memory_order_relaxed); + } + return false; + } + else { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add(count, std::memory_order_release); + assert(prevVal + count <= BLOCK_SIZE); + return prevVal + count == BLOCK_SIZE; + } + } + + inline void set_all_empty() + { + if (BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set all flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(true, std::memory_order_relaxed); + } + } + else { + // Reset counter + elementsCompletelyDequeued.store(BLOCK_SIZE, std::memory_order_relaxed); + } + } + + inline void reset_empty() + { + if (compile_time_condition::value) { + // Reset flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(false, std::memory_order_relaxed); + } + } + else { + // Reset counter + elementsCompletelyDequeued.store(0, std::memory_order_relaxed); + } + } + + inline T* operator[](index_t idx) noexcept { return static_cast(static_cast(elements)) + static_cast(idx & static_cast(BLOCK_SIZE - 1)); } + inline T const* operator[](index_t idx) const noexcept { return static_cast(static_cast(elements)) + static_cast(idx & static_cast(BLOCK_SIZE - 1)); } + + private: + // IMPORTANT: This must be the first member in Block, so that if T depends on the alignment of + // addresses returned by malloc, that alignment will be preserved. Apparently clang actually + // generates code that uses this assumption for AVX instructions in some cases. Ideally, we + // should also align Block to the alignment of T in case it's higher than malloc's 16-byte + // alignment, but this is hard to do in a cross-platform way. Assert for this case: + static_assert(std::alignment_of::value <= std::alignment_of::value, "The queue does not support super-aligned types at this time"); + // Additionally, we need the alignment of Block itself to be a multiple of max_align_t since + // otherwise the appropriate padding will not be added at the end of Block in order to make + // arrays of Blocks all be properly aligned (not just the first one). We use a union to force + // this. + union { + char elements[sizeof(T) * BLOCK_SIZE]; + details::max_align_t dummy; + }; + public: + Block* next; + std::atomic elementsCompletelyDequeued; + std::atomic emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD ? 
BLOCK_SIZE : 1]; + public: + std::atomic freeListRefs; + std::atomic freeListNext; + std::atomic shouldBeOnFreeList; + bool dynamicallyAllocated; // Perhaps a better name for this would be 'isNotPartOfInitialBlockPool' + }; + static_assert(std::alignment_of::value >= std::alignment_of::value, "Internal error: Blocks must be at least as aligned as the type they are wrapping"); + + + /////////////////////////// + // Producer base + /////////////////////////// + + struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase + { + ProducerBase(ConcurrentQueue* parent_) : + tailIndex(0), + headIndex(0), + dequeueOptimisticCount(0), + dequeueOvercommit(0), + tailBlock(nullptr), + parent(parent_) + { + } + + virtual ~ProducerBase() { }; + + template + inline size_t dequeue_bulk(NotifyThread notifyThread, ProcessData processData) + { + return static_cast(this)->dequeue_bulk(notifyThread, processData); + } + + inline ProducerBase* next_prod() const { return static_cast(next); } + + inline size_t size_approx() const + { + auto tail = tailIndex.load(std::memory_order_relaxed); + auto head = headIndex.load(std::memory_order_relaxed); + return details::circular_less_than(head, tail) ? static_cast(tail - head) : 0; + } + + inline index_t getTail() const { return tailIndex.load(std::memory_order_relaxed); } + protected: + std::atomic tailIndex; // Where to enqueue to next + std::atomic headIndex; // Where to dequeue from next + + std::atomic dequeueOptimisticCount; + std::atomic dequeueOvercommit; + + Block* tailBlock; + + public: + ConcurrentQueue* parent; + }; + + + public: + /////////////////////////// + // Explicit queue + /////////////////////////// + struct ExplicitProducer : public ProducerBase + { + explicit ExplicitProducer(ConcurrentQueue* _parent) : + ProducerBase(_parent), + blockIndex(nullptr), + pr_blockIndexSlotsUsed(0), + pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1), + pr_blockIndexFront(0), + pr_blockIndexEntries(nullptr), + pr_blockIndexRaw(nullptr) + { + size_t poolBasedIndexSize = details::ceil_to_pow_2(_parent->initialBlockPoolSize) >> 1; + if (poolBasedIndexSize > pr_blockIndexSize) { + pr_blockIndexSize = poolBasedIndexSize; + } + + new_block_index(0); // This creates an index with double the number of current entries, i.e. EXPLICIT_INITIAL_INDEX_SIZE + } + + ~ExplicitProducer() + { + // Destruct any elements not yet dequeued. + // Since we're in the destructor, we can assume all elements + // are either completely dequeued or completely not (no halfways). 
+ if (this->tailBlock != nullptr) { // Note this means there must be a block index too + // First find the block that's partially dequeued, if any + Block* halfDequeuedBlock = nullptr; + if ((this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) != 0) { + // The head's not on a block boundary, meaning a block somewhere is partially dequeued + // (or the head block is the tail block and was fully dequeued, but the head/tail are still not on a boundary) + size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & (pr_blockIndexSize - 1); + while (details::circular_less_than(pr_blockIndexEntries[i].base + BLOCK_SIZE, this->headIndex.load(std::memory_order_relaxed))) { + i = (i + 1) & (pr_blockIndexSize - 1); + } + assert(details::circular_less_than(pr_blockIndexEntries[i].base, this->headIndex.load(std::memory_order_relaxed))); + halfDequeuedBlock = pr_blockIndexEntries[i].block; + } + + // Start at the head block (note the first line in the loop gives us the head from the tail on the first iteration) + auto block = this->tailBlock; + do { + block = block->next; + if (block->ConcurrentQueue::Block::is_empty()) { + continue; + } + + size_t i = 0; // Offset into block + if (block == halfDequeuedBlock) { + i = static_cast(this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); + } + + // Walk through all the items in the block; if this is the tail block, we need to stop when we reach the tail index + auto lastValidIndex = (this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) == 0 ? BLOCK_SIZE : static_cast(this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); + while (i != BLOCK_SIZE && (block != this->tailBlock || i != lastValidIndex)) { + (*block)[i++]->~T(); + } + } while (block != this->tailBlock); + } + + // Destroy all blocks that we own + if (this->tailBlock != nullptr) { + auto block = this->tailBlock; + do { + auto nextBlock = block->next; + if (block->dynamicallyAllocated) { + destroy(block); + } + else { + this->parent->add_block_to_free_list(block); + } + block = nextBlock; + } while (block != this->tailBlock); + } + + // Destroy the block indices + auto header = static_cast(pr_blockIndexRaw); + while (header != nullptr) { + auto prev = static_cast(header->prev); + header->~BlockIndexHeader(); + (Traits::free)(header); + header = prev; + } + } + + inline void enqueue_begin_alloc(index_t currentTailIndex) + { + // We reached the end of a block, start a new one + if (this->tailBlock != nullptr && this->tailBlock->next->ConcurrentQueue::Block::is_empty()) { + // We can re-use the block ahead of us, it's empty! + this->tailBlock = this->tailBlock->next; + this->tailBlock->ConcurrentQueue::Block::reset_empty(); + + // We'll put the block on the block index (guaranteed to be room since we're conceptually removing the + // last block from it first -- except instead of removing then adding, we can just overwrite). + // Note that there must be a valid block index here, since even if allocation failed in the ctor, + // it would have been re-attempted when adding the first block to the queue; since there is such + // a block, a block index must have been successfully allocated. + } + else { + // We're going to need a new block; check that the block index has room + if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize) { + // Hmm, the circular block index is already full -- we'll need + // to allocate a new index. 
Note pr_blockIndexRaw can only be nullptr if + // the initial allocation failed in the constructor. + new_block_index(pr_blockIndexSlotsUsed); + } + + // Insert a new block in the circular linked list + auto newBlock = this->parent->ConcurrentQueue::requisition_block(); + newBlock->ConcurrentQueue::Block::reset_empty(); + if (this->tailBlock == nullptr) { + newBlock->next = newBlock; + } + else { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + ++pr_blockIndexSlotsUsed; + } + + // Add block to block index + auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + blockIndex.load(std::memory_order_relaxed)->front.store(pr_blockIndexFront, std::memory_order_release); + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + tracy_force_inline T* enqueue_begin(index_t& currentTailIndex) + { + currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); + if (details::cqUnlikely((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0)) { + this->enqueue_begin_alloc(currentTailIndex); + } + return (*this->tailBlock)[currentTailIndex]; + } + + tracy_force_inline std::atomic& get_tail_index() + { + return this->tailIndex; + } + + template + size_t dequeue_bulk(NotifyThread notifyThread, ProcessData processData) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); + if (details::circular_less_than(0, desiredCount)) { + desiredCount = desiredCount < 8192 ? desiredCount : 8192; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); + assert(overcommit <= myDequeueCount); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) { + actualCount = desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) { + this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this + // will never exceed tail. + auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); + + // Determine which block the first element is in + auto localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); + + auto headBase = localBlockIndex->entries[localBlockIndexHead].base; + auto firstBlockBaseIndex = firstIndex & ~static_cast(BLOCK_SIZE - 1); + auto offset = static_cast(static_cast::type>(firstBlockBaseIndex - headBase) / BLOCK_SIZE); + auto indexIndex = (localBlockIndexHead + offset) & (localBlockIndex->size - 1); + + notifyThread( this->threadId ); + + // Iterate the blocks and dequeue + auto index = firstIndex; + do { + auto firstIndexInBlock = index; + auto endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? 
firstIndex + static_cast(actualCount) : endIndex; + auto block = localBlockIndex->entries[indexIndex].block; + + const auto sz = endIndex - index; + processData( (*block)[index], sz ); + index += sz; + + block->ConcurrentQueue::Block::set_many_empty(firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } + else { + // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent + this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); + } + } + + return 0; + } + + private: + struct BlockIndexEntry + { + index_t base; + Block* block; + }; + + struct BlockIndexHeader + { + size_t size; + std::atomic front; // Current slot (not next, like pr_blockIndexFront) + BlockIndexEntry* entries; + void* prev; + }; + + + bool new_block_index(size_t numberOfFilledSlotsToExpose) + { + auto prevBlockSizeMask = pr_blockIndexSize - 1; + + // Create the new block + pr_blockIndexSize <<= 1; + auto newRawPtr = static_cast((Traits::malloc)(sizeof(BlockIndexHeader) + std::alignment_of::value - 1 + sizeof(BlockIndexEntry) * pr_blockIndexSize)); + if (newRawPtr == nullptr) { + pr_blockIndexSize >>= 1; // Reset to allow graceful retry + return false; + } + + auto newBlockIndexEntries = reinterpret_cast(details::align_for(newRawPtr + sizeof(BlockIndexHeader))); + + // Copy in all the old indices, if any + size_t j = 0; + if (pr_blockIndexSlotsUsed != 0) { + auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask; + do { + newBlockIndexEntries[j++] = pr_blockIndexEntries[i]; + i = (i + 1) & prevBlockSizeMask; + } while (i != pr_blockIndexFront); + } + + // Update everything + auto header = new (newRawPtr) BlockIndexHeader; + header->size = pr_blockIndexSize; + header->front.store(numberOfFilledSlotsToExpose - 1, std::memory_order_relaxed); + header->entries = newBlockIndexEntries; + header->prev = pr_blockIndexRaw; // we link the new block to the old one so we can free it later + + pr_blockIndexFront = j; + pr_blockIndexEntries = newBlockIndexEntries; + pr_blockIndexRaw = newRawPtr; + blockIndex.store(header, std::memory_order_release); + + return true; + } + + private: + std::atomic blockIndex; + + // To be used by producer only -- consumer must use the ones in referenced by blockIndex + size_t pr_blockIndexSlotsUsed; + size_t pr_blockIndexSize; + size_t pr_blockIndexFront; // Next slot (not current) + BlockIndexEntry* pr_blockIndexEntries; + void* pr_blockIndexRaw; + }; + + ExplicitProducer* get_explicit_producer(producer_token_t const& token) + { + return static_cast(token.producer); + } + + private: + + ////////////////////////////////// + // Block pool manipulation + ////////////////////////////////// + + void populate_initial_block_list(size_t blockCount) + { + initialBlockPoolSize = blockCount; + if (initialBlockPoolSize == 0) { + initialBlockPool = nullptr; + return; + } + + initialBlockPool = create_array(blockCount); + if (initialBlockPool == nullptr) { + initialBlockPoolSize = 0; + } + for (size_t i = 0; i < initialBlockPoolSize; ++i) { + initialBlockPool[i].dynamicallyAllocated = false; + } + } + + inline Block* try_get_block_from_initial_pool() + { + if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= initialBlockPoolSize) { + return nullptr; + } + + auto index = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed); + + return index < initialBlockPoolSize ? 
(initialBlockPool + index) : nullptr; + } + + inline void add_block_to_free_list(Block* block) + { + freeList.add(block); + } + + inline void add_blocks_to_free_list(Block* block) + { + while (block != nullptr) { + auto next = block->next; + add_block_to_free_list(block); + block = next; + } + } + + inline Block* try_get_block_from_free_list() + { + return freeList.try_get(); + } + + // Gets a free block from one of the memory pools, or allocates a new one (if applicable) + Block* requisition_block() + { + auto block = try_get_block_from_initial_pool(); + if (block != nullptr) { + return block; + } + + block = try_get_block_from_free_list(); + if (block != nullptr) { + return block; + } + + return create(); + } + + + ////////////////////////////////// + // Producer list manipulation + ////////////////////////////////// + + ProducerBase* recycle_or_create_producer() + { + bool recycled; + return recycle_or_create_producer(recycled); + } + + ProducerBase* recycle_or_create_producer(bool& recycled) + { + // Try to re-use one first + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr->inactive.load(std::memory_order_relaxed)) { + if( ptr->size_approx() == 0 ) + { + bool expected = true; + if (ptr->inactive.compare_exchange_strong(expected, /* desired */ false, std::memory_order_acquire, std::memory_order_relaxed)) { + // We caught one! It's been marked as activated, the caller can have it + recycled = true; + return ptr; + } + } + } + } + + recycled = false; + return add_producer(static_cast(create(this))); + } + + ProducerBase* add_producer(ProducerBase* producer) + { + // Handle failed memory allocation + if (producer == nullptr) { + return nullptr; + } + + producerCount.fetch_add(1, std::memory_order_relaxed); + + // Add it to the lock-free list + auto prevTail = producerListTail.load(std::memory_order_relaxed); + do { + producer->next = prevTail; + } while (!producerListTail.compare_exchange_weak(prevTail, producer, std::memory_order_release, std::memory_order_relaxed)); + + return producer; + } + + void reown_producers() + { + // After another instance is moved-into/swapped-with this one, all the + // producers we stole still think their parents are the other queue. + // So fix them up! 
+ for (auto ptr = producerListTail.load(std::memory_order_relaxed); ptr != nullptr; ptr = ptr->next_prod()) { + ptr->parent = this; + } + } + + ////////////////////////////////// + // Utility functions + ////////////////////////////////// + + template + static inline U* create_array(size_t count) + { + assert(count > 0); + return static_cast((Traits::malloc)(sizeof(U) * count)); + } + + template + static inline void destroy_array(U* p, size_t count) + { + ((void)count); + if (p != nullptr) { + assert(count > 0); + (Traits::free)(p); + } + } + + template + static inline U* create() + { + auto p = (Traits::malloc)(sizeof(U)); + return new (p) U; + } + + template + static inline U* create(A1&& a1) + { + auto p = (Traits::malloc)(sizeof(U)); + return new (p) U(std::forward(a1)); + } + + template + static inline void destroy(U* p) + { + if (p != nullptr) { + p->~U(); + } + (Traits::free)(p); + } + +private: + std::atomic producerListTail; + std::atomic producerCount; + + std::atomic initialBlockPoolIndex; + Block* initialBlockPool; + size_t initialBlockPoolSize; + + FreeList freeList; + + std::atomic nextExplicitConsumerId; + std::atomic globalExplicitConsumerOffset; +}; + + +template +ProducerToken::ProducerToken(ConcurrentQueue& queue) + : producer(queue.recycle_or_create_producer()) +{ + if (producer != nullptr) { + producer->token = this; + producer->threadId = detail::GetThreadHandleImpl(); + } +} + +template +ConsumerToken::ConsumerToken(ConcurrentQueue& queue) + : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr) +{ + initialOffset = queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release); + lastKnownGlobalOffset = static_cast(-1); +} + +template +inline void swap(ConcurrentQueue& a, ConcurrentQueue& b) noexcept +{ + a.swap(b); +} + +inline void swap(ProducerToken& a, ProducerToken& b) noexcept +{ + a.swap(b); +} + +inline void swap(ConsumerToken& a, ConsumerToken& b) noexcept +{ + a.swap(b); +} + +} + +} /* namespace tracy */ + +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif diff --git a/Source/ThirdParty/tracy/client/tracy_rpmalloc.cpp b/Source/ThirdParty/tracy/client/tracy_rpmalloc.cpp new file mode 100644 index 000000000..8aae78e03 --- /dev/null +++ b/Source/ThirdParty/tracy/client/tracy_rpmalloc.cpp @@ -0,0 +1,2495 @@ +#ifdef TRACY_ENABLE + +/* rpmalloc.c - Memory allocator - Public Domain - 2016 Mattias Jansson + * + * This library provides a cross-platform lock free thread caching malloc implementation in C11. + * The latest source code is always available at + * + * https://github.com/mjansson/rpmalloc + * + * This library is put in the public domain; you can redistribute it and/or modify it without any restrictions. + * + */ + +#include "tracy_rpmalloc.hpp" + +/// Build time configurable limits +#ifndef HEAP_ARRAY_SIZE +//! Size of heap hashmap +#define HEAP_ARRAY_SIZE 47 +#endif +#ifndef ENABLE_THREAD_CACHE +//! Enable per-thread cache +#define ENABLE_THREAD_CACHE 1 +#endif +#ifndef ENABLE_GLOBAL_CACHE +//! Enable global cache shared between all threads, requires thread cache +#define ENABLE_GLOBAL_CACHE 1 +#endif +#ifndef ENABLE_VALIDATE_ARGS +//! Enable validation of args to public entry points +#define ENABLE_VALIDATE_ARGS 0 +#endif +#ifndef ENABLE_STATISTICS +//! Enable statistics collection +#define ENABLE_STATISTICS 0 +#endif +#ifndef ENABLE_ASSERTS +//! Enable asserts +#define ENABLE_ASSERTS 0 +#endif +#ifndef ENABLE_OVERRIDE +//! 
Override standard library malloc/free and new/delete entry points +#define ENABLE_OVERRIDE 0 +#endif +#ifndef ENABLE_PRELOAD +//! Support preloading +#define ENABLE_PRELOAD 0 +#endif +#ifndef DISABLE_UNMAP +//! Disable unmapping memory pages +#define DISABLE_UNMAP 0 +#endif +#ifndef DEFAULT_SPAN_MAP_COUNT +//! Default number of spans to map in call to map more virtual memory (default values yield 4MiB here) +#define DEFAULT_SPAN_MAP_COUNT 64 +#endif + +#if ENABLE_THREAD_CACHE +#ifndef ENABLE_UNLIMITED_CACHE +//! Unlimited thread and global cache +#define ENABLE_UNLIMITED_CACHE 0 +#endif +#ifndef ENABLE_UNLIMITED_THREAD_CACHE +//! Unlimited cache disables any thread cache limitations +#define ENABLE_UNLIMITED_THREAD_CACHE ENABLE_UNLIMITED_CACHE +#endif +#if !ENABLE_UNLIMITED_THREAD_CACHE +#ifndef THREAD_CACHE_MULTIPLIER +//! Multiplier for thread cache (cache limit will be span release count multiplied by this value) +#define THREAD_CACHE_MULTIPLIER 16 +#endif +#ifndef ENABLE_ADAPTIVE_THREAD_CACHE +//! Enable adaptive size of per-thread cache (still bounded by THREAD_CACHE_MULTIPLIER hard limit) +#define ENABLE_ADAPTIVE_THREAD_CACHE 0 +#endif +#endif +#endif + +#if ENABLE_GLOBAL_CACHE && ENABLE_THREAD_CACHE +#ifndef ENABLE_UNLIMITED_GLOBAL_CACHE +//! Unlimited cache disables any global cache limitations +#define ENABLE_UNLIMITED_GLOBAL_CACHE ENABLE_UNLIMITED_CACHE +#endif +#if !ENABLE_UNLIMITED_GLOBAL_CACHE +//! Multiplier for global cache (cache limit will be span release count multiplied by this value) +#define GLOBAL_CACHE_MULTIPLIER (THREAD_CACHE_MULTIPLIER * 6) +#endif +#else +# undef ENABLE_GLOBAL_CACHE +# define ENABLE_GLOBAL_CACHE 0 +#endif + +#if !ENABLE_THREAD_CACHE || ENABLE_UNLIMITED_THREAD_CACHE +# undef ENABLE_ADAPTIVE_THREAD_CACHE +# define ENABLE_ADAPTIVE_THREAD_CACHE 0 +#endif + +#if DISABLE_UNMAP && !ENABLE_GLOBAL_CACHE +# error Must use global cache if unmap is disabled +#endif + +#if defined( _WIN32 ) || defined( __WIN32__ ) || defined( _WIN64 ) +# define PLATFORM_WINDOWS 1 +# define PLATFORM_POSIX 0 +#else +# define PLATFORM_WINDOWS 0 +# define PLATFORM_POSIX 1 +#endif + +#define _Static_assert static_assert + +/// Platform and arch specifics +#ifndef FORCEINLINE +# if defined(_MSC_VER) && !defined(__clang__) +# define FORCEINLINE inline __forceinline +# else +# define FORCEINLINE inline __attribute__((__always_inline__)) +# endif +#endif +#if PLATFORM_WINDOWS +# ifndef WIN32_LEAN_AND_MEAN +# define WIN32_LEAN_AND_MEAN +# endif +# include +# if ENABLE_VALIDATE_ARGS +# include +# endif +#else +# include +# include +# include +# if defined(__APPLE__) +# include +# include +# include +# endif +# if defined(__HAIKU__) +# include +# include +# endif +#endif + +#include +#include + +#if ENABLE_ASSERTS +# undef NDEBUG +# if defined(_MSC_VER) && !defined(_DEBUG) +# define _DEBUG +# endif +# include +#else +# undef assert +# define assert(x) do {} while(0) +#endif +#if ENABLE_STATISTICS +# include +#endif + +#include + +namespace tracy +{ + +typedef std::atomic atomic32_t; +typedef std::atomic atomic64_t; +typedef std::atomic atomicptr_t; + +#define atomic_thread_fence_acquire() std::atomic_thread_fence(std::memory_order_acquire) +#define atomic_thread_fence_release() std::atomic_thread_fence(std::memory_order_release) + +static FORCEINLINE int32_t atomic_load32(atomic32_t* src) { return std::atomic_load_explicit(src, std::memory_order_relaxed); } +static FORCEINLINE void atomic_store32(atomic32_t* dst, int32_t val) { std::atomic_store_explicit(dst, val, 
std::memory_order_relaxed); }
+static FORCEINLINE int32_t atomic_incr32(atomic32_t* val) { return std::atomic_fetch_add_explicit(val, 1, std::memory_order_relaxed) + 1; }
+#if ENABLE_STATISTICS || ENABLE_ADAPTIVE_THREAD_CACHE
+static FORCEINLINE int32_t atomic_decr32(atomic32_t* val) { return std::atomic_fetch_add_explicit(val, -1, std::memory_order_relaxed) - 1; }
+#endif
+static FORCEINLINE int32_t atomic_add32(atomic32_t* val, int32_t add) { return std::atomic_fetch_add_explicit(val, add, std::memory_order_relaxed) + add; }
+static FORCEINLINE void* atomic_load_ptr(atomicptr_t* src) { return std::atomic_load_explicit(src, std::memory_order_relaxed); }
+static FORCEINLINE void atomic_store_ptr(atomicptr_t* dst, void* val) { std::atomic_store_explicit(dst, val, std::memory_order_relaxed); }
+static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { return std::atomic_compare_exchange_weak_explicit(dst, &ref, val, std::memory_order_release, std::memory_order_acquire); }
+
+#if defined(_MSC_VER) && !defined(__clang__)
+# define EXPECTED(x) (x)
+# define UNEXPECTED(x) (x)
+#else
+# define EXPECTED(x) __builtin_expect((x), 1)
+# define UNEXPECTED(x) __builtin_expect((x), 0)
+#endif
+
+/// Preconfigured limits and sizes
+//! Granularity of a small allocation block
+#define SMALL_GRANULARITY 16
+//! Small granularity shift count
+#define SMALL_GRANULARITY_SHIFT 4
+//! Number of small block size classes
+#define SMALL_CLASS_COUNT 65
+//! Maximum size of a small block
+#define SMALL_SIZE_LIMIT (SMALL_GRANULARITY * (SMALL_CLASS_COUNT - 1))
+//! Granularity of a medium allocation block
+#define MEDIUM_GRANULARITY 512
+//! Medium granularity shift count
+#define MEDIUM_GRANULARITY_SHIFT 9
+//! Number of medium block size classes
+#define MEDIUM_CLASS_COUNT 61
+//! Total number of small + medium size classes
+#define SIZE_CLASS_COUNT (SMALL_CLASS_COUNT + MEDIUM_CLASS_COUNT)
+//! Number of large block size classes
+#define LARGE_CLASS_COUNT 32
+//! Maximum size of a medium block
+#define MEDIUM_SIZE_LIMIT (SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY * MEDIUM_CLASS_COUNT))
+//! Maximum size of a large block
+#define LARGE_SIZE_LIMIT ((LARGE_CLASS_COUNT * _memory_span_size) - SPAN_HEADER_SIZE)
+//! Size of a span header (must be a multiple of SMALL_GRANULARITY)
+#define SPAN_HEADER_SIZE 96
+
+#if ENABLE_VALIDATE_ARGS
+//! Maximum allocation size to avoid integer overflow
+#undef MAX_ALLOC_SIZE
+#define MAX_ALLOC_SIZE (((size_t)-1) - _memory_span_size)
+#endif
+
+#define pointer_offset(ptr, ofs) (void*)((char*)(ptr) + (ptrdiff_t)(ofs))
+#define pointer_diff(first, second) (ptrdiff_t)((const char*)(first) - (const char*)(second))
+
+#define INVALID_POINTER ((void*)((uintptr_t)-1))
+
+/// Data types
+//! A memory heap, per thread
+typedef struct heap_t heap_t;
+//! Heap spans per size class
+typedef struct heap_class_t heap_class_t;
+//! Span of memory pages
+typedef struct span_t span_t;
+//! Span list
+typedef struct span_list_t span_list_t;
+//! Span active data
+typedef struct span_active_t span_active_t;
+//! Size class definition
+typedef struct size_class_t size_class_t;
+//! Global cache
+typedef struct global_cache_t global_cache_t;
+
+//! Flag indicating span is the first (master) span of a split superspan
+#define SPAN_FLAG_MASTER 1U
+//! Flag indicating span is a secondary (sub) span of a split superspan
+#define SPAN_FLAG_SUBSPAN 2U
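
The wrappers above commit every atomic access to an explicit std::memory_order, so each acquire/release pairing can be audited at the call site instead of relying on the sequentially consistent defaults. A minimal compilable sketch of the same idiom, using hypothetical names (counter32_t, counter_incr) that are not part of this patch:

#include <atomic>
#include <cstdint>

using counter32_t = std::atomic<int32_t>;

// fetch_add returns the value before the addition, so report the new value.
static inline int32_t counter_incr(counter32_t* v) {
    return std::atomic_fetch_add_explicit(v, 1, std::memory_order_relaxed) + 1;
}

static inline int32_t counter_decr(counter32_t* v) {
    return std::atomic_fetch_add_explicit(v, -1, std::memory_order_relaxed) - 1;
}

Relaxed ordering is enough for these counters because they are bookkeeping, not synchronization points; the allocator issues explicit fences where publication actually matters.

+//!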
Flag indicating span has blocks with increased alignment +#define SPAN_FLAG_ALIGNED_BLOCKS 4U + +#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS +struct span_use_t { + //! Current number of spans used (actually used, not in cache) + atomic32_t current; + //! High water mark of spans used + uint32_t high; +#if ENABLE_STATISTICS + //! Number of spans transitioned to global cache + uint32_t spans_to_global; + //! Number of spans transitioned from global cache + uint32_t spans_from_global; + //! Number of spans transitioned to thread cache + uint32_t spans_to_cache; + //! Number of spans transitioned from thread cache + uint32_t spans_from_cache; + //! Number of spans transitioned to reserved state + uint32_t spans_to_reserved; + //! Number of spans transitioned from reserved state + uint32_t spans_from_reserved; + //! Number of raw memory map calls + uint32_t spans_map_calls; +#endif +}; +typedef struct span_use_t span_use_t; +#endif + +#if ENABLE_STATISTICS +struct size_class_use_t { + //! Current number of allocations + atomic32_t alloc_current; + //! Peak number of allocations + int32_t alloc_peak; + //! Total number of allocations + int32_t alloc_total; + //! Total number of frees + atomic32_t free_total; + //! Number of spans in use + uint32_t spans_current; + //! Number of spans transitioned to cache + uint32_t spans_peak; + //! Number of spans transitioned to cache + uint32_t spans_to_cache; + //! Number of spans transitioned from cache + uint32_t spans_from_cache; + //! Number of spans transitioned from reserved state + uint32_t spans_from_reserved; + //! Number of spans mapped + uint32_t spans_map_calls; +}; +typedef struct size_class_use_t size_class_use_t; +#endif + +typedef enum span_state_t { + SPAN_STATE_ACTIVE = 0, + SPAN_STATE_PARTIAL, + SPAN_STATE_FULL +} span_state_t; + +//A span can either represent a single span of memory pages with size declared by span_map_count configuration variable, +//or a set of spans in a continuous region, a super span. Any reference to the term "span" usually refers to both a single +//span or a super span. A super span can further be divided into multiple spans (or this, super spans), where the first +//(super)span is the master and subsequent (super)spans are subspans. The master span keeps track of how many subspans +//that are still alive and mapped in virtual memory, and once all subspans and master have been unmapped the entire +//superspan region is released and unmapped (on Windows for example, the entire superspan range has to be released +//in the same call to release the virtual memory range, but individual subranges can be decommitted individually +//to reduce physical memory use). +struct span_t { + //! Free list + void* free_list; + //! State + uint32_t state; + //! Used count when not active (not including deferred free list) + uint32_t used_count; + //! Block count + uint32_t block_count; + //! Size class + uint32_t size_class; + //! Index of last block initialized in free list + uint32_t free_list_limit; + //! Span list size when part of a cache list, or size of deferred free list when partial/full + uint32_t list_size; + //! Deferred free list + atomicptr_t free_list_deferred; + //! Size of a block + uint32_t block_size; + //! Flags and counters + uint32_t flags; + //! Number of spans + uint32_t span_count; + //! Total span counter for master spans, distance for subspans + uint32_t total_spans_or_distance; + //! Remaining span counter, for master spans + atomic32_t remaining_spans; + //! 
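
The span_t comment above describes the master/subspan scheme: a master records how many spans of its reservation are still live, while a subspan records only its distance back to the master. A sketch of the address recovery that scheme enables, assuming 64 KiB spans and illustrative names:

#include <cstdint>

constexpr uintptr_t kSpanSize = 64 * 1024;   // assumed span size

struct SpanHeader {
    bool isSubspan;                  // stands in for SPAN_FLAG_SUBSPAN
    uint32_t totalSpansOrDistance;   // span count for masters, distance for subspans
};

SpanHeader* MasterOf(SpanHeader* span) {
    if (!span->isSubspan)
        return span;                 // a master is its own owner
    // Step back 'distance' whole spans to land on the master's header.
    return (SpanHeader*)((char*)span - (uintptr_t)span->totalSpansOrDistance * kSpanSize);
}

+//!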
Alignment offset + uint32_t align_offset; + //! Owning heap + heap_t* heap; + //! Next span + span_t* next; + //! Previous span + span_t* prev; +}; +_Static_assert(sizeof(span_t) <= SPAN_HEADER_SIZE, "span size mismatch"); + +struct heap_class_t { + //! Free list of active span + void* free_list; + //! Double linked list of partially used spans with free blocks for each size class. + // Current active span is at head of list. Previous span pointer in head points to tail span of list. + span_t* partial_span; +}; + +struct heap_t { + //! Active and semi-used span data per size class + heap_class_t span_class[SIZE_CLASS_COUNT]; +#if ENABLE_THREAD_CACHE + //! List of free spans (single linked list) + span_t* span_cache[LARGE_CLASS_COUNT]; + //! List of deferred free spans of class 0 (single linked list) + atomicptr_t span_cache_deferred; +#endif +#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS + //! Current and high water mark of spans used per span count + span_use_t span_use[LARGE_CLASS_COUNT]; +#endif + //! Mapped but unused spans + span_t* span_reserve; + //! Master span for mapped but unused spans + span_t* span_reserve_master; + //! Number of mapped but unused spans + size_t spans_reserved; + //! Next heap in id list + heap_t* next_heap; + //! Next heap in orphan list + heap_t* next_orphan; + //! Memory pages alignment offset + size_t align_offset; + //! Heap ID + int32_t id; +#if ENABLE_STATISTICS + //! Number of bytes transitioned thread -> global + size_t thread_to_global; + //! Number of bytes transitioned global -> thread + size_t global_to_thread; + //! Allocation stats per size class + size_class_use_t size_class_use[SIZE_CLASS_COUNT + 1]; +#endif +}; + +struct size_class_t { + //! Size of blocks in this class + uint32_t block_size; + //! Number of blocks in each chunk + uint16_t block_count; + //! Class index this class is merged with + uint16_t class_idx; +}; +_Static_assert(sizeof(size_class_t) == 8, "Size class size mismatch"); + +struct global_cache_t { + //! Cache list pointer + atomicptr_t cache; + //! Cache size + atomic32_t size; + //! ABA counter + atomic32_t counter; +}; + +/// Global data +//! Initialized flag +static int _rpmalloc_initialized; +//! Configuration +static rpmalloc_config_t _memory_config; +//! Memory page size +static size_t _memory_page_size; +//! Shift to divide by page size +static size_t _memory_page_size_shift; +//! Granularity at which memory pages are mapped by OS +static size_t _memory_map_granularity; +#if RPMALLOC_CONFIGURABLE +//! Size of a span of memory pages +static size_t _memory_span_size; +//! Shift to divide by span size +static size_t _memory_span_size_shift; +//! Mask to get to start of a memory span +static uintptr_t _memory_span_mask; +#else +//! Hardwired span size (64KiB) +#define _memory_span_size (64 * 1024) +#define _memory_span_size_shift 16 +#define _memory_span_mask (~((uintptr_t)(_memory_span_size - 1))) +#endif +//! Number of spans to map in each map call +static size_t _memory_span_map_count; +//! Number of spans to release from thread cache to global cache (single spans) +static size_t _memory_span_release_count; +//! Number of spans to release from thread cache to global cache (large multiple spans) +static size_t _memory_span_release_count_large; +//! Global size classes +static size_class_t _memory_size_class[SIZE_CLASS_COUNT]; +//! Run-time size limit of medium blocks +static size_t _memory_medium_size_limit; +//! Heap ID counter +static atomic32_t _memory_heap_id; +//! 
Huge page support +static int _memory_huge_pages; +#if ENABLE_GLOBAL_CACHE +//! Global span cache +static global_cache_t _memory_span_cache[LARGE_CLASS_COUNT]; +#endif +//! All heaps +static atomicptr_t _memory_heaps[HEAP_ARRAY_SIZE]; +//! Orphaned heaps +static atomicptr_t _memory_orphan_heaps; +//! Running orphan counter to avoid ABA issues in linked list +static atomic32_t _memory_orphan_counter; +#if ENABLE_STATISTICS +//! Active heap count +static atomic32_t _memory_active_heaps; +//! Number of currently mapped memory pages +static atomic32_t _mapped_pages; +//! Peak number of concurrently mapped memory pages +static int32_t _mapped_pages_peak; +//! Number of currently unused spans +static atomic32_t _reserved_spans; +//! Running counter of total number of mapped memory pages since start +static atomic32_t _mapped_total; +//! Running counter of total number of unmapped memory pages since start +static atomic32_t _unmapped_total; +//! Number of currently mapped memory pages in OS calls +static atomic32_t _mapped_pages_os; +//! Number of currently allocated pages in huge allocations +static atomic32_t _huge_pages_current; +//! Peak number of currently allocated pages in huge allocations +static int32_t _huge_pages_peak; +#endif + +//! Current thread heap +#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD +static pthread_key_t _memory_thread_heap; +#else +# ifdef _MSC_VER +# define _Thread_local __declspec(thread) +# define TLS_MODEL +# else +# define TLS_MODEL __attribute__((tls_model("initial-exec"))) +# if !defined(__clang__) && defined(__GNUC__) +# define _Thread_local __thread +# endif +# endif +static _Thread_local heap_t* _memory_thread_heap TLS_MODEL; +#endif + +static inline heap_t* +get_thread_heap_raw(void) { +#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD + return pthread_getspecific(_memory_thread_heap); +#else + return _memory_thread_heap; +#endif +} + +//! Get the current thread heap +static inline heap_t* +get_thread_heap(void) { + heap_t* heap = get_thread_heap_raw(); +#if ENABLE_PRELOAD + if (EXPECTED(heap != 0)) + return heap; + rpmalloc_initialize(); + return get_thread_heap_raw(); +#else + return heap; +#endif +} + +//! Set the current thread heap +static void +set_thread_heap(heap_t* heap) { +#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD + pthread_setspecific(_memory_thread_heap, heap); +#else + _memory_thread_heap = heap; +#endif +} + +//! Default implementation to map more virtual memory +static void* +_memory_map_os(size_t size, size_t* offset); + +//! Default implementation to unmap virtual memory +static void +_memory_unmap_os(void* address, size_t size, size_t offset, size_t release); + +//! 
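
get_thread_heap_raw above is the whole fast path: one heap pointer per thread, read without any locking. A stripped-down sketch of the same pattern (the pthread_key_t branch exists only for the ENABLE_PRELOAD platforms and is omitted here; names are illustrative):

#include <cstddef>

struct Heap { /* per-thread allocator state */ };

static thread_local Heap* t_heap = nullptr;

Heap* GetThreadHeap() {
    return t_heap;   // no synchronization: each thread only reads its own slot
}

void SetThreadHeap(Heap* heap) {
    t_heap = heap;
}

+//!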
Lookup a memory heap from heap ID +static heap_t* +_memory_heap_lookup(int32_t id) { + uint32_t list_idx = id % HEAP_ARRAY_SIZE; + heap_t* heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); + while (heap && (heap->id != id)) + heap = heap->next_heap; + return heap; +} + +#if ENABLE_STATISTICS +# define _memory_statistics_inc(counter, value) counter += value +# define _memory_statistics_dec(counter, value) counter -= value +# define _memory_statistics_add(atomic_counter, value) atomic_add32(atomic_counter, (int32_t)(value)) +# define _memory_statistics_add_peak(atomic_counter, value, peak) do { int32_t _cur_count = atomic_add32(atomic_counter, (int32_t)(value)); if (_cur_count > (peak)) peak = _cur_count; } while (0) +# define _memory_statistics_sub(atomic_counter, value) atomic_add32(atomic_counter, -(int32_t)(value)) +# define _memory_statistics_inc_alloc(heap, class_idx) do { \ + int32_t alloc_current = atomic_incr32(&heap->size_class_use[class_idx].alloc_current); \ + if (alloc_current > heap->size_class_use[class_idx].alloc_peak) \ + heap->size_class_use[class_idx].alloc_peak = alloc_current; \ + heap->size_class_use[class_idx].alloc_total++; \ +} while(0) +# define _memory_statistics_inc_free(heap, class_idx) do { \ + atomic_decr32(&heap->size_class_use[class_idx].alloc_current); \ + atomic_incr32(&heap->size_class_use[class_idx].free_total); \ +} while(0) +#else +# define _memory_statistics_inc(counter, value) do {} while(0) +# define _memory_statistics_dec(counter, value) do {} while(0) +# define _memory_statistics_add(atomic_counter, value) do {} while(0) +# define _memory_statistics_add_peak(atomic_counter, value, peak) do {} while (0) +# define _memory_statistics_sub(atomic_counter, value) do {} while(0) +# define _memory_statistics_inc_alloc(heap, class_idx) do {} while(0) +# define _memory_statistics_inc_free(heap, class_idx) do {} while(0) +#endif + +static void +_memory_heap_cache_insert(heap_t* heap, span_t* span); + +//! Map more virtual memory +static void* +_memory_map(size_t size, size_t* offset) { + assert(!(size % _memory_page_size)); + assert(size >= _memory_page_size); + _memory_statistics_add_peak(&_mapped_pages, (size >> _memory_page_size_shift), _mapped_pages_peak); + _memory_statistics_add(&_mapped_total, (size >> _memory_page_size_shift)); + return _memory_config.memory_map(size, offset); +} + +//! Unmap virtual memory +static void +_memory_unmap(void* address, size_t size, size_t offset, size_t release) { + assert(!release || (release >= size)); + assert(!release || (release >= _memory_page_size)); + if (release) { + assert(!(release % _memory_page_size)); + _memory_statistics_sub(&_mapped_pages, (release >> _memory_page_size_shift)); + _memory_statistics_add(&_unmapped_total, (release >> _memory_page_size_shift)); + } + _memory_config.memory_unmap(address, size, offset, release); +} + +//! Declare the span to be a subspan and store distance from master span and span count +static void +_memory_span_mark_as_subspan_unless_master(span_t* master, span_t* subspan, size_t span_count) { + assert((subspan != master) || (subspan->flags & SPAN_FLAG_MASTER)); + if (subspan != master) { + subspan->flags = SPAN_FLAG_SUBSPAN; + subspan->total_spans_or_distance = (uint32_t)((uintptr_t)pointer_diff(subspan, master) >> _memory_span_size_shift); + subspan->align_offset = 0; + } + subspan->span_count = (uint32_t)span_count; +} + +//! 
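
_memory_heap_lookup above resolves a heap ID through a fixed-size hash table whose buckets are intrusive singly linked chains. A self-contained sketch of that lookup, single-threaded for brevity (the real table entries are read with atomic loads), reusing the 47-bucket HEAP_ARRAY_SIZE from this file:

#include <cstdint>

constexpr uint32_t kHeapArraySize = 47;   // HEAP_ARRAY_SIZE above

struct Heap {
    int32_t id;
    Heap* nextHeap;   // intrusive chain within one bucket
};

Heap* g_heaps[kHeapArraySize];   // zero-initialized bucket heads

Heap* LookupHeap(int32_t id) {
    Heap* heap = g_heaps[(uint32_t)id % kHeapArraySize];
    while (heap && heap->id != id)
        heap = heap->nextHeap;
    return heap;
}

+//!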
Use reserved spans to fulfill a memory map request (reserve size must be checked by caller)
+static span_t*
+_memory_map_from_reserve(heap_t* heap, size_t span_count) {
+	//Update the heap span reserve
+	span_t* span = heap->span_reserve;
+	heap->span_reserve = (span_t*)pointer_offset(span, span_count * _memory_span_size);
+	heap->spans_reserved -= span_count;
+
+	_memory_span_mark_as_subspan_unless_master(heap->span_reserve_master, span, span_count);
+	if (span_count <= LARGE_CLASS_COUNT)
+		_memory_statistics_inc(heap->span_use[span_count - 1].spans_from_reserved, 1);
+
+	return span;
+}
+
+//! Get the aligned number of spans to map in based on wanted count, configured mapping granularity and the page size
+static size_t
+_memory_map_align_span_count(size_t span_count) {
+	size_t request_count = (span_count > _memory_span_map_count) ? span_count : _memory_span_map_count;
+	if ((_memory_page_size > _memory_span_size) && ((request_count * _memory_span_size) % _memory_page_size))
+		request_count += _memory_span_map_count - (request_count % _memory_span_map_count);
+	return request_count;
+}
+
+//! Store the given spans as reserve in the given heap
+static void
+_memory_heap_set_reserved_spans(heap_t* heap, span_t* master, span_t* reserve, size_t reserve_span_count) {
+	heap->span_reserve_master = master;
+	heap->span_reserve = reserve;
+	heap->spans_reserved = reserve_span_count;
+}
+
+//! Setup a newly mapped span
+static void
+_memory_span_initialize(span_t* span, size_t total_span_count, size_t span_count, size_t align_offset) {
+	span->total_spans_or_distance = (uint32_t)total_span_count;
+	span->span_count = (uint32_t)span_count;
+	span->align_offset = (uint32_t)align_offset;
+	span->flags = SPAN_FLAG_MASTER;
+	atomic_store32(&span->remaining_spans, (int32_t)total_span_count);
+}
+
+//! Map an aligned set of spans, taking configured mapping granularity and the page size into account
+static span_t*
+_memory_map_aligned_span_count(heap_t* heap, size_t span_count) {
+	//If we already have some, but not enough, reserved spans, release those to heap cache and map a new
+	//full set of spans. Otherwise we would waste memory if page size > span size (huge pages)
+	size_t aligned_span_count = _memory_map_align_span_count(span_count);
+	size_t align_offset = 0;
+	span_t* span = (span_t*)_memory_map(aligned_span_count * _memory_span_size, &align_offset);
+	if (!span)
+		return 0;
+	_memory_span_initialize(span, aligned_span_count, span_count, align_offset);
+	_memory_statistics_add(&_reserved_spans, aligned_span_count);
+	if (span_count <= LARGE_CLASS_COUNT)
+		_memory_statistics_inc(heap->span_use[span_count - 1].spans_map_calls, 1);
+	if (aligned_span_count > span_count) {
+		if (heap->spans_reserved) {
+			_memory_span_mark_as_subspan_unless_master(heap->span_reserve_master, heap->span_reserve, heap->spans_reserved);
+			_memory_heap_cache_insert(heap, heap->span_reserve);
+		}
+		_memory_heap_set_reserved_spans(heap, span, (span_t*)pointer_offset(span, span_count * _memory_span_size), aligned_span_count - span_count);
+	}
+	return span;
+}
+
+//! Map in memory pages for the given number of spans (or use previously reserved pages)
+static span_t*
+_memory_map_spans(heap_t* heap, size_t span_count) {
+	if (span_count <= heap->spans_reserved)
+		return _memory_map_from_reserve(heap, span_count);
+	return _memory_map_aligned_span_count(heap, span_count);
+}
+
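
To make the rounding in _memory_map_align_span_count concrete, here is a worked example with assumed values (2 MiB huge pages, 64 KiB spans, the default map count of 64): a request for 70 spans gets padded to 128 so the mapping ends on a page boundary.

#include <cstdio>
#include <cstddef>

int main() {
    const size_t spanSize = 64 * 1024;
    const size_t pageSize = 2 * 1024 * 1024;   // huge pages, so page > span
    const size_t mapCount = 64;                // DEFAULT_SPAN_MAP_COUNT
    size_t request = 70;
    if (request < mapCount)
        request = mapCount;                    // never map fewer than mapCount spans
    if (pageSize > spanSize && (request * spanSize) % pageSize)
        request += mapCount - (request % mapCount);
    printf("mapping %zu spans\n", request);    // prints 128
    return 0;
}

+//!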
Unmap memory pages for the given number of spans (or mark as unused if no partial unmappings) +static void +_memory_unmap_span(span_t* span) { + assert((span->flags & SPAN_FLAG_MASTER) || (span->flags & SPAN_FLAG_SUBSPAN)); + assert(!(span->flags & SPAN_FLAG_MASTER) || !(span->flags & SPAN_FLAG_SUBSPAN)); + + int is_master = !!(span->flags & SPAN_FLAG_MASTER); + span_t* master = is_master ? span : (span_t*)(pointer_offset(span, -(int32_t)(span->total_spans_or_distance * _memory_span_size))); + assert(is_master || (span->flags & SPAN_FLAG_SUBSPAN)); + assert(master->flags & SPAN_FLAG_MASTER); + + size_t span_count = span->span_count; + if (!is_master) { + //Directly unmap subspans (unless huge pages, in which case we defer and unmap entire page range with master) + assert(span->align_offset == 0); + if (_memory_span_size >= _memory_page_size) { + _memory_unmap(span, span_count * _memory_span_size, 0, 0); + _memory_statistics_sub(&_reserved_spans, span_count); + } + } else { + //Special double flag to denote an unmapped master + //It must be kept in memory since span header must be used + span->flags |= SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN; + } + + if (atomic_add32(&master->remaining_spans, -(int32_t)span_count) <= 0) { + //Everything unmapped, unmap the master span with release flag to unmap the entire range of the super span + assert(!!(master->flags & SPAN_FLAG_MASTER) && !!(master->flags & SPAN_FLAG_SUBSPAN)); + size_t unmap_count = master->span_count; + if (_memory_span_size < _memory_page_size) + unmap_count = master->total_spans_or_distance; + _memory_statistics_sub(&_reserved_spans, unmap_count); + _memory_unmap(master, unmap_count * _memory_span_size, master->align_offset, master->total_spans_or_distance * _memory_span_size); + } +} + +#if ENABLE_THREAD_CACHE + +//! Unmap a single linked list of spans +static void +_memory_unmap_span_list(span_t* span) { + size_t list_size = span->list_size; + for (size_t ispan = 0; ispan < list_size; ++ispan) { + span_t* next_span = span->next; + _memory_unmap_span(span); + span = next_span; + } + assert(!span); +} + +//! Add span to head of single linked span list +static size_t +_memory_span_list_push(span_t** head, span_t* span) { + span->next = *head; + if (*head) + span->list_size = (*head)->list_size + 1; + else + span->list_size = 1; + *head = span; + return span->list_size; +} + +//! Remove span from head of single linked span list, returns the new list head +static span_t* +_memory_span_list_pop(span_t** head) { + span_t* span = *head; + span_t* next_span = 0; + if (span->list_size > 1) { + assert(span->next); + next_span = span->next; + assert(next_span); + next_span->list_size = span->list_size - 1; + } + *head = next_span; + return span; +} + +//! Split a single linked span list +static span_t* +_memory_span_list_split(span_t* span, size_t limit) { + span_t* next = 0; + if (limit < 2) + limit = 2; + if (span->list_size > limit) { + uint32_t list_size = 1; + span_t* last = span; + next = span->next; + while (list_size < limit) { + last = next; + next = next->next; + ++list_size; + } + last->next = 0; + assert(next); + next->list_size = span->list_size - list_size; + span->list_size = list_size; + span->prev = 0; + } + return next; +} + +#endif + +//! 
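
The unmap path above is reference counted: every subspan release decrements the master's remaining_spans, and whichever caller drives it to zero unmaps the whole superspan range. A simplified sketch of that protocol with hypothetical names:

#include <atomic>
#include <cstdint>

struct MasterSpan {
    std::atomic<int32_t> remainingSpans;   // spans still mapped in this reservation
};

// Returns true when the caller is responsible for unmapping the entire range.
bool ReleaseSpans(MasterSpan* master, int32_t spanCount) {
    // fetch_add returns the prior value; <= 0 after our decrement means we were last.
    return master->remainingSpans.fetch_add(-spanCount, std::memory_order_relaxed) - spanCount <= 0;
}

+//!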
Add a span to partial span double linked list at the head +static void +_memory_span_partial_list_add(span_t** head, span_t* span) { + if (*head) { + span->next = *head; + //Maintain pointer to tail span + span->prev = (*head)->prev; + (*head)->prev = span; + } else { + span->next = 0; + span->prev = span; + } + *head = span; +} + +//! Add a span to partial span double linked list at the tail +static void +_memory_span_partial_list_add_tail(span_t** head, span_t* span) { + span->next = 0; + if (*head) { + span_t* tail = (*head)->prev; + tail->next = span; + span->prev = tail; + //Maintain pointer to tail span + (*head)->prev = span; + } else { + span->prev = span; + *head = span; + } +} + +//! Pop head span from partial span double linked list +static void +_memory_span_partial_list_pop_head(span_t** head) { + span_t* span = *head; + *head = span->next; + if (*head) { + //Maintain pointer to tail span + (*head)->prev = span->prev; + } +} + +//! Remove a span from partial span double linked list +static void +_memory_span_partial_list_remove(span_t** head, span_t* span) { + if (UNEXPECTED(*head == span)) { + _memory_span_partial_list_pop_head(head); + } else { + span_t* next_span = span->next; + span_t* prev_span = span->prev; + prev_span->next = next_span; + if (EXPECTED(next_span != 0)) { + next_span->prev = prev_span; + } else { + //Update pointer to tail span + (*head)->prev = prev_span; + } + } +} + +#if ENABLE_GLOBAL_CACHE + +//! Insert the given list of memory page spans in the global cache +static void +_memory_cache_insert(global_cache_t* cache, span_t* span, size_t cache_limit) { + assert((span->list_size == 1) || (span->next != 0)); + int32_t list_size = (int32_t)span->list_size; + //Unmap if cache has reached the limit + if (atomic_add32(&cache->size, list_size) > (int32_t)cache_limit) { +#if !ENABLE_UNLIMITED_GLOBAL_CACHE + _memory_unmap_span_list(span); + atomic_add32(&cache->size, -list_size); + return; +#endif + } + void* current_cache, *new_cache; + do { + current_cache = atomic_load_ptr(&cache->cache); + span->prev = (span_t*)((uintptr_t)current_cache & _memory_span_mask); + new_cache = (void*)((uintptr_t)span | ((uintptr_t)atomic_incr32(&cache->counter) & ~_memory_span_mask)); + } while (!atomic_cas_ptr(&cache->cache, new_cache, current_cache)); +} + +//! Extract a number of memory page spans from the global cache +static span_t* +_memory_cache_extract(global_cache_t* cache) { + uintptr_t span_ptr; + do { + void* global_span = atomic_load_ptr(&cache->cache); + span_ptr = (uintptr_t)global_span & _memory_span_mask; + if (span_ptr) { + span_t* span = (span_t*)span_ptr; + //By accessing the span ptr before it is swapped out of list we assume that a contending thread + //does not manage to traverse the span to being unmapped before we access it + void* new_cache = (void*)((uintptr_t)span->prev | ((uintptr_t)atomic_incr32(&cache->counter) & ~_memory_span_mask)); + if (atomic_cas_ptr(&cache->cache, new_cache, global_span)) { + atomic_add32(&cache->size, -(int32_t)span->list_size); + return span; + } + } + } while (span_ptr); + return 0; +} + +//! 
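
_memory_cache_insert below guards its lock-free stack against ABA by packing a counter into the pointer itself: spans are span-size aligned, so the low bits of the head pointer are free to hold a tag. A sketch of that tagging, assuming 64 KiB alignment and illustrative names:

#include <atomic>
#include <cstdint>

constexpr uintptr_t kSpanMask = ~(uintptr_t)(64 * 1024 - 1);   // assumed 64 KiB spans

std::atomic<uintptr_t> g_cacheHead{0};
std::atomic<uint32_t> g_abaCounter{0};

void PushTagged(uintptr_t span /* span-size aligned */, uintptr_t* nextField) {
    uintptr_t oldHead = g_cacheHead.load(std::memory_order_relaxed);
    uintptr_t newHead;
    do {
        *nextField = oldHead & kSpanMask;   // strip the tag before linking
        // A fresh counter in the low bits makes a recycled head compare unequal.
        newHead = span | ((uintptr_t)g_abaCounter.fetch_add(1, std::memory_order_relaxed) & ~kSpanMask);
    } while (!g_cacheHead.compare_exchange_weak(oldHead, newHead,
                                                std::memory_order_release,
                                                std::memory_order_relaxed));
}

+//!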
Finalize a global cache, only valid from allocator finalization (not thread safe) +static void +_memory_cache_finalize(global_cache_t* cache) { + void* current_cache = atomic_load_ptr(&cache->cache); + span_t* span = (span_t*)((uintptr_t)current_cache & _memory_span_mask); + while (span) { + span_t* skip_span = (span_t*)((uintptr_t)span->prev & _memory_span_mask); + atomic_add32(&cache->size, -(int32_t)span->list_size); + _memory_unmap_span_list(span); + span = skip_span; + } + assert(!atomic_load32(&cache->size)); + atomic_store_ptr(&cache->cache, 0); + atomic_store32(&cache->size, 0); +} + +//! Insert the given list of memory page spans in the global cache +static void +_memory_global_cache_insert(span_t* span) { + size_t span_count = span->span_count; +#if ENABLE_UNLIMITED_GLOBAL_CACHE + _memory_cache_insert(&_memory_span_cache[span_count - 1], span, 0); +#else + const size_t cache_limit = (GLOBAL_CACHE_MULTIPLIER * ((span_count == 1) ? _memory_span_release_count : _memory_span_release_count_large)); + _memory_cache_insert(&_memory_span_cache[span_count - 1], span, cache_limit); +#endif +} + +//! Extract a number of memory page spans from the global cache for large blocks +static span_t* +_memory_global_cache_extract(size_t span_count) { + span_t* span = _memory_cache_extract(&_memory_span_cache[span_count - 1]); + assert(!span || (span->span_count == span_count)); + return span; +} + +#endif + +#if ENABLE_THREAD_CACHE +//! Adopt the deferred span cache list +static void +_memory_heap_cache_adopt_deferred(heap_t* heap) { + atomic_thread_fence_acquire(); + span_t* span = (span_t*)atomic_load_ptr(&heap->span_cache_deferred); + if (!span) + return; + do { + span = (span_t*)atomic_load_ptr(&heap->span_cache_deferred); + } while (!atomic_cas_ptr(&heap->span_cache_deferred, 0, span)); + while (span) { + span_t* next_span = span->next; + _memory_span_list_push(&heap->span_cache[0], span); +#if ENABLE_STATISTICS + atomic_decr32(&heap->span_use[span->span_count - 1].current); + ++heap->size_class_use[span->size_class].spans_to_cache; + --heap->size_class_use[span->size_class].spans_current; +#endif + span = next_span; + } +} +#endif + +//! Insert a single span into thread heap cache, releasing to global cache if overflow +static void +_memory_heap_cache_insert(heap_t* heap, span_t* span) { +#if ENABLE_THREAD_CACHE + size_t span_count = span->span_count; + size_t idx = span_count - 1; + _memory_statistics_inc(heap->span_use[idx].spans_to_cache, 1); + if (!idx) + _memory_heap_cache_adopt_deferred(heap); +#if ENABLE_UNLIMITED_THREAD_CACHE + _memory_span_list_push(&heap->span_cache[idx], span); +#else + const size_t release_count = (!idx ? 
_memory_span_release_count : _memory_span_release_count_large); + size_t current_cache_size = _memory_span_list_push(&heap->span_cache[idx], span); + if (current_cache_size <= release_count) + return; + const size_t hard_limit = release_count * THREAD_CACHE_MULTIPLIER; + if (current_cache_size <= hard_limit) { +#if ENABLE_ADAPTIVE_THREAD_CACHE + //Require 25% of high water mark to remain in cache (and at least 1, if use is 0) + const size_t high_mark = heap->span_use[idx].high; + const size_t min_limit = (high_mark >> 2) + release_count + 1; + if (current_cache_size < min_limit) + return; +#else + return; +#endif + } + heap->span_cache[idx] = _memory_span_list_split(span, release_count); + assert(span->list_size == release_count); +#if ENABLE_STATISTICS + heap->thread_to_global += (size_t)span->list_size * span_count * _memory_span_size; + heap->span_use[idx].spans_to_global += span->list_size; +#endif +#if ENABLE_GLOBAL_CACHE + _memory_global_cache_insert(span); +#else + _memory_unmap_span_list(span); +#endif +#endif +#else + (void)sizeof(heap); + _memory_unmap_span(span); +#endif +} + +//! Extract the given number of spans from the different cache levels +static span_t* +_memory_heap_thread_cache_extract(heap_t* heap, size_t span_count) { +#if ENABLE_THREAD_CACHE + size_t idx = span_count - 1; + if (!idx) + _memory_heap_cache_adopt_deferred(heap); + if (heap->span_cache[idx]) { +#if ENABLE_STATISTICS + heap->span_use[idx].spans_from_cache++; +#endif + return _memory_span_list_pop(&heap->span_cache[idx]); + } +#endif + return 0; +} + +static span_t* +_memory_heap_reserved_extract(heap_t* heap, size_t span_count) { + if (heap->spans_reserved >= span_count) + return _memory_map_spans(heap, span_count); + return 0; +} + +//! Extract a span from the global cache +static span_t* +_memory_heap_global_cache_extract(heap_t* heap, size_t span_count) { +#if ENABLE_GLOBAL_CACHE + size_t idx = span_count - 1; + heap->span_cache[idx] = _memory_global_cache_extract(span_count); + if (heap->span_cache[idx]) { +#if ENABLE_STATISTICS + heap->global_to_thread += (size_t)heap->span_cache[idx]->list_size * span_count * _memory_span_size; + heap->span_use[idx].spans_from_global += heap->span_cache[idx]->list_size; +#endif + return _memory_span_list_pop(&heap->span_cache[idx]); + } +#endif + return 0; +} + +//! 
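
The thresholds in _memory_heap_cache_insert above work in two stages: nothing is released while the thread cache holds at most release_count spans, and a batch is only split off to the global cache once the cache exceeds release_count * THREAD_CACHE_MULTIPLIER. Worked numbers under an assumed release count of 64 (the actual value is derived from the span map count at initialization):

#include <cstdio>
#include <cstddef>

int main() {
    const size_t releaseCount = 64;              // assumed _memory_span_release_count
    const size_t hardLimit = releaseCount * 16;  // THREAD_CACHE_MULTIPLIER above
    size_t cacheSize = 1025;                     // spans now sitting in the thread cache
    if (cacheSize > releaseCount && cacheSize > hardLimit)
        printf("split %zu spans off to the global cache\n", releaseCount);   // prints 64
    return 0;
}

+//!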
Get a span from one of the cache levels (thread cache, reserved, global cache) or fallback to mapping more memory +static span_t* +_memory_heap_extract_new_span(heap_t* heap, size_t span_count, uint32_t class_idx) { + (void)sizeof(class_idx); +#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS + uint32_t idx = (uint32_t)span_count - 1; + uint32_t current_count = (uint32_t)atomic_incr32(&heap->span_use[idx].current); + if (current_count > heap->span_use[idx].high) + heap->span_use[idx].high = current_count; +#if ENABLE_STATISTICS + uint32_t spans_current = ++heap->size_class_use[class_idx].spans_current; + if (spans_current > heap->size_class_use[class_idx].spans_peak) + heap->size_class_use[class_idx].spans_peak = spans_current; +#endif +#endif + span_t* span = _memory_heap_thread_cache_extract(heap, span_count); + if (EXPECTED(span != 0)) { + _memory_statistics_inc(heap->size_class_use[class_idx].spans_from_cache, 1); + return span; + } + span = _memory_heap_reserved_extract(heap, span_count); + if (EXPECTED(span != 0)) { + _memory_statistics_inc(heap->size_class_use[class_idx].spans_from_reserved, 1); + return span; + } + span = _memory_heap_global_cache_extract(heap, span_count); + if (EXPECTED(span != 0)) { + _memory_statistics_inc(heap->size_class_use[class_idx].spans_from_cache, 1); + return span; + } + //Final fallback, map in more virtual memory + span = _memory_map_spans(heap, span_count); + _memory_statistics_inc(heap->size_class_use[class_idx].spans_map_calls, 1); + return span; +} + +//! Move the span (used for small or medium allocations) to the heap thread cache +static void +_memory_span_release_to_cache(heap_t* heap, span_t* span) { + heap_class_t* heap_class = heap->span_class + span->size_class; + assert(heap_class->partial_span != span); + if (span->state == SPAN_STATE_PARTIAL) + _memory_span_partial_list_remove(&heap_class->partial_span, span); +#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS + atomic_decr32(&heap->span_use[0].current); +#endif + _memory_statistics_inc(heap->span_use[0].spans_to_cache, 1); + _memory_statistics_inc(heap->size_class_use[span->size_class].spans_to_cache, 1); + _memory_statistics_dec(heap->size_class_use[span->size_class].spans_current, 1); + _memory_heap_cache_insert(heap, span); +} + +//! Initialize a (partial) free list up to next system memory page, while reserving the first block +//! as allocated, returning number of blocks in list +static uint32_t +free_list_partial_init(void** list, void** first_block, void* page_start, void* block_start, + uint32_t block_count, uint32_t block_size) { + assert(block_count); + *first_block = block_start; + if (block_count > 1) { + void* free_block = pointer_offset(block_start, block_size); + void* block_end = pointer_offset(block_start, block_size * block_count); + //If block size is less than half a memory page, bound init to next memory page boundary + if (block_size < (_memory_page_size >> 1)) { + void* page_end = pointer_offset(page_start, _memory_page_size); + if (page_end < block_end) + block_end = page_end; + } + *list = free_block; + block_count = 2; + void* next_block = pointer_offset(free_block, block_size); + while (next_block < block_end) { + *((void**)free_block) = next_block; + free_block = next_block; + ++block_count; + next_block = pointer_offset(next_block, block_size); + } + *((void**)free_block) = 0; + } else { + *list = 0; + } + return block_count; +} + +//! 
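
free_list_partial_init above builds an intrusive free list, where each free block's first word points at the next block, and it deliberately stops at the next page boundary so cold pages are never touched just to thread pointers through them. A minimal sketch of the linking step alone, without the page clamp:

#include <cstdint>

// Threads 'count' blocks of 'blockSize' bytes starting at 'start' into a singly
// linked free list and returns the head; the final block terminates the list.
void* BuildFreeList(void* start, uint32_t blockSize, uint32_t count) {
    if (count == 0)
        return nullptr;
    char* block = (char*)start;
    for (uint32_t i = 0; i + 1 < count; ++i, block += blockSize)
        *(void**)block = block + blockSize;   // first word links to the next block
    *(void**)block = nullptr;
    return start;
}

+//!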
Initialize an unused span (from cache or mapped) to be new active span +static void* +_memory_span_set_new_active(heap_t* heap, heap_class_t* heap_class, span_t* span, uint32_t class_idx) { + assert(span->span_count == 1); + size_class_t* size_class = _memory_size_class + class_idx; + span->size_class = class_idx; + span->heap = heap; + span->flags &= ~SPAN_FLAG_ALIGNED_BLOCKS; + span->block_count = size_class->block_count; + span->block_size = size_class->block_size; + span->state = SPAN_STATE_ACTIVE; + span->free_list = 0; + + //Setup free list. Only initialize one system page worth of free blocks in list + void* block; + span->free_list_limit = free_list_partial_init(&heap_class->free_list, &block, + span, pointer_offset(span, SPAN_HEADER_SIZE), size_class->block_count, size_class->block_size); + atomic_store_ptr(&span->free_list_deferred, 0); + span->list_size = 0; + atomic_thread_fence_release(); + + _memory_span_partial_list_add(&heap_class->partial_span, span); + return block; +} + +//! Promote a partially used span (from heap used list) to be new active span +static void +_memory_span_set_partial_active(heap_class_t* heap_class, span_t* span) { + assert(span->state == SPAN_STATE_PARTIAL); + assert(span->block_count == _memory_size_class[span->size_class].block_count); + //Move data to heap size class and set span as active + heap_class->free_list = span->free_list; + span->state = SPAN_STATE_ACTIVE; + span->free_list = 0; + assert(heap_class->free_list); +} + +//! Mark span as full (from active) +static void +_memory_span_set_active_full(heap_class_t* heap_class, span_t* span) { + assert(span->state == SPAN_STATE_ACTIVE); + assert(span == heap_class->partial_span); + _memory_span_partial_list_pop_head(&heap_class->partial_span); + span->used_count = span->block_count; + span->state = SPAN_STATE_FULL; + span->free_list = 0; +} + +//! Move span from full to partial state +static void +_memory_span_set_full_partial(heap_t* heap, span_t* span) { + assert(span->state == SPAN_STATE_FULL); + heap_class_t* heap_class = &heap->span_class[span->size_class]; + span->state = SPAN_STATE_PARTIAL; + _memory_span_partial_list_add_tail(&heap_class->partial_span, span); +} + +static void* +_memory_span_extract_deferred(span_t* span) { + void* free_list; + do { + free_list = atomic_load_ptr(&span->free_list_deferred); + } while ((free_list == INVALID_POINTER) || !atomic_cas_ptr(&span->free_list_deferred, INVALID_POINTER, free_list)); + span->list_size = 0; + atomic_store_ptr(&span->free_list_deferred, 0); + atomic_thread_fence_release(); + return free_list; +} + +//! Pop first block from a free list +static void* +free_list_pop(void** list) { + void* block = *list; + *list = *((void**)block); + return block; +} + +//! 
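
free_list_pop above is the entire small-allocation fast path: the head block is the result and its first word becomes the new head. A tiny self-checking example of the same pop, built on a hand-made three-slot list:

#include <cassert>

static void* pop(void** list) {
    void* block = *list;
    *list = *(void**)block;   // a free block stores its successor in its first word
    return block;
}

int main() {
    void* slots[3] = { &slots[1], &slots[2], nullptr };   // slots[i] links to slots[i+1]
    void* head = &slots[0];
    assert(pop(&head) == &slots[0]);
    assert(head == &slots[1]);
    return 0;
}

+//!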
Allocate a small/medium sized memory block from the given heap +static void* +_memory_allocate_from_heap_fallback(heap_t* heap, uint32_t class_idx) { + heap_class_t* heap_class = &heap->span_class[class_idx]; + void* block; + + span_t* active_span = heap_class->partial_span; + if (EXPECTED(active_span != 0)) { + assert(active_span->state == SPAN_STATE_ACTIVE); + assert(active_span->block_count == _memory_size_class[active_span->size_class].block_count); + //Swap in free list if not empty + if (active_span->free_list) { + heap_class->free_list = active_span->free_list; + active_span->free_list = 0; + return free_list_pop(&heap_class->free_list); + } + //If the span did not fully initialize free list, link up another page worth of blocks + if (active_span->free_list_limit < active_span->block_count) { + void* block_start = pointer_offset(active_span, SPAN_HEADER_SIZE + (active_span->free_list_limit * active_span->block_size)); + active_span->free_list_limit += free_list_partial_init(&heap_class->free_list, &block, + (void*)((uintptr_t)block_start & ~(_memory_page_size - 1)), block_start, + active_span->block_count - active_span->free_list_limit, active_span->block_size); + return block; + } + //Swap in deferred free list + atomic_thread_fence_acquire(); + if (atomic_load_ptr(&active_span->free_list_deferred)) { + heap_class->free_list = _memory_span_extract_deferred(active_span); + return free_list_pop(&heap_class->free_list); + } + + //If the active span is fully allocated, mark span as free floating (fully allocated and not part of any list) + assert(!heap_class->free_list); + assert(active_span->free_list_limit >= active_span->block_count); + _memory_span_set_active_full(heap_class, active_span); + } + assert(!heap_class->free_list); + + //Try promoting a semi-used span to active + active_span = heap_class->partial_span; + if (EXPECTED(active_span != 0)) { + _memory_span_set_partial_active(heap_class, active_span); + return free_list_pop(&heap_class->free_list); + } + assert(!heap_class->free_list); + assert(!heap_class->partial_span); + + //Find a span in one of the cache levels + active_span = _memory_heap_extract_new_span(heap, 1, class_idx); + + //Mark span as owned by this heap and set base data, return first block + return _memory_span_set_new_active(heap, heap_class, active_span, class_idx); +} + +//! Allocate a small sized memory block from the given heap +static void* +_memory_allocate_small(heap_t* heap, size_t size) { + //Small sizes have unique size classes + const uint32_t class_idx = (uint32_t)((size + (SMALL_GRANULARITY - 1)) >> SMALL_GRANULARITY_SHIFT); + _memory_statistics_inc_alloc(heap, class_idx); + if (EXPECTED(heap->span_class[class_idx].free_list != 0)) + return free_list_pop(&heap->span_class[class_idx].free_list); + return _memory_allocate_from_heap_fallback(heap, class_idx); +} + +//! Allocate a medium sized memory block from the given heap +static void* +_memory_allocate_medium(heap_t* heap, size_t size) { + //Calculate the size class index and do a dependent lookup of the final class index (in case of merged classes) + const uint32_t base_idx = (uint32_t)(SMALL_CLASS_COUNT + ((size - (SMALL_SIZE_LIMIT + 1)) >> MEDIUM_GRANULARITY_SHIFT)); + const uint32_t class_idx = _memory_size_class[base_idx].class_idx; + _memory_statistics_inc_alloc(heap, class_idx); + if (EXPECTED(heap->span_class[class_idx].free_list != 0)) + return free_list_pop(&heap->span_class[class_idx].free_list); + return _memory_allocate_from_heap_fallback(heap, class_idx); +} + +//! 
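
Because small classes advance in fixed 16-byte steps, _memory_allocate_small above turns a size into a class index with one add and one shift. A worked example of that mapping (SMALL_GRANULARITY 16, shift 4, as defined earlier in this file):

#include <cstdio>
#include <cstddef>

int main() {
    const size_t granularity = 16, shift = 4;
    for (size_t size : {1, 16, 24, 100}) {
        size_t classIdx = (size + granularity - 1) >> shift;
        printf("size %3zu -> class %2zu (block %3zu bytes)\n",
               size, classIdx, classIdx * granularity);   // 24 bytes lands in the 32-byte class
    }
    return 0;
}

+//!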
Allocate a large sized memory block from the given heap +static void* +_memory_allocate_large(heap_t* heap, size_t size) { + //Calculate number of needed max sized spans (including header) + //Since this function is never called if size > LARGE_SIZE_LIMIT + //the span_count is guaranteed to be <= LARGE_CLASS_COUNT + size += SPAN_HEADER_SIZE; + size_t span_count = size >> _memory_span_size_shift; + if (size & (_memory_span_size - 1)) + ++span_count; + size_t idx = span_count - 1; + + //Find a span in one of the cache levels + span_t* span = _memory_heap_extract_new_span(heap, span_count, SIZE_CLASS_COUNT); + + //Mark span as owned by this heap and set base data + assert(span->span_count == span_count); + span->size_class = (uint32_t)(SIZE_CLASS_COUNT + idx); + span->heap = heap; + atomic_thread_fence_release(); + + return pointer_offset(span, SPAN_HEADER_SIZE); +} + +//! Allocate a huge block by mapping memory pages directly +static void* +_memory_allocate_huge(size_t size) { + size += SPAN_HEADER_SIZE; + size_t num_pages = size >> _memory_page_size_shift; + if (size & (_memory_page_size - 1)) + ++num_pages; + size_t align_offset = 0; + span_t* span = (span_t*)_memory_map(num_pages * _memory_page_size, &align_offset); + if (!span) + return span; + //Store page count in span_count + span->size_class = (uint32_t)-1; + span->span_count = (uint32_t)num_pages; + span->align_offset = (uint32_t)align_offset; + _memory_statistics_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak); + + return pointer_offset(span, SPAN_HEADER_SIZE); +} + +//! Allocate a block larger than medium size +static void* +_memory_allocate_oversized(heap_t* heap, size_t size) { + if (size <= LARGE_SIZE_LIMIT) + return _memory_allocate_large(heap, size); + return _memory_allocate_huge(size); +} + +//! Allocate a block of the given size +static void* +_memory_allocate(heap_t* heap, size_t size) { + if (EXPECTED(size <= SMALL_SIZE_LIMIT)) + return _memory_allocate_small(heap, size); + else if (size <= _memory_medium_size_limit) + return _memory_allocate_medium(heap, size); + return _memory_allocate_oversized(heap, size); +} + +//! Allocate a new heap +static heap_t* +_memory_allocate_heap(void) { + void* raw_heap; + void* next_raw_heap; + uintptr_t orphan_counter; + heap_t* heap; + heap_t* next_heap; + //Try getting an orphaned heap + atomic_thread_fence_acquire(); + do { + raw_heap = atomic_load_ptr(&_memory_orphan_heaps); + heap = (heap_t*)((uintptr_t)raw_heap & ~(uintptr_t)0x1FF); + if (!heap) + break; + next_heap = heap->next_orphan; + orphan_counter = (uintptr_t)atomic_incr32(&_memory_orphan_counter); + next_raw_heap = (void*)((uintptr_t)next_heap | (orphan_counter & (uintptr_t)0x1FF)); + } while (!atomic_cas_ptr(&_memory_orphan_heaps, next_raw_heap, raw_heap)); + + if (!heap) { + //Map in pages for a new heap + size_t align_offset = 0; + heap = (heap_t*)_memory_map((1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size, &align_offset); + if (!heap) + return heap; + memset((char*)heap, 0, sizeof(heap_t)); + heap->align_offset = align_offset; + + //Get a new heap ID + do { + heap->id = atomic_incr32(&_memory_heap_id); + if (_memory_heap_lookup(heap->id)) + heap->id = 0; + } while (!heap->id); + + //Link in heap in heap ID map + size_t list_idx = heap->id % HEAP_ARRAY_SIZE; + do { + next_heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); + heap->next_heap = next_heap; + } while (!atomic_cas_ptr(&_memory_heaps[list_idx], heap, next_heap)); + } + + return heap; +} + +//! 
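
The large path above sizes a request in whole spans: add the 96-byte span header, divide by the span size, and round up. Worked numbers with the assumed 64 KiB span size:

#include <cstdio>
#include <cstddef>

int main() {
    const size_t spanSize = 64 * 1024, headerSize = 96, shift = 16;
    size_t request = 128 * 1024 - 96;      // fits exactly two spans once the header is added
    size_t total = request + headerSize;
    size_t spanCount = total >> shift;
    if (total & (spanSize - 1))
        ++spanCount;                       // a partial tail still costs a full span
    printf("%zu bytes -> %zu spans\n", request, spanCount);   // prints 2
    return 0;
}

+//!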
Deallocate the given small/medium memory block in the current thread local heap +static void +_memory_deallocate_direct(span_t* span, void* block) { + assert(span->heap == get_thread_heap_raw()); + uint32_t state = span->state; + //Add block to free list + *((void**)block) = span->free_list; + span->free_list = block; + if (UNEXPECTED(state == SPAN_STATE_ACTIVE)) + return; + uint32_t used = --span->used_count; + uint32_t free = span->list_size; + if (UNEXPECTED(used == free)) + _memory_span_release_to_cache(span->heap, span); + else if (UNEXPECTED(state == SPAN_STATE_FULL)) + _memory_span_set_full_partial(span->heap, span); +} + +//! Put the block in the deferred free list of the owning span +static void +_memory_deallocate_defer(span_t* span, void* block) { + atomic_thread_fence_acquire(); + if (span->state == SPAN_STATE_FULL) { + if ((span->list_size + 1) == span->block_count) { + //Span will be completely freed by deferred deallocations, no other thread can + //currently touch it. Safe to move to owner heap deferred cache + span_t* last_head; + heap_t* heap = span->heap; + do { + last_head = (span_t*)atomic_load_ptr(&heap->span_cache_deferred); + span->next = last_head; + } while (!atomic_cas_ptr(&heap->span_cache_deferred, span, last_head)); + return; + } + } + + void* free_list; + do { + atomic_thread_fence_acquire(); + free_list = atomic_load_ptr(&span->free_list_deferred); + *((void**)block) = free_list; + } while ((free_list == INVALID_POINTER) || !atomic_cas_ptr(&span->free_list_deferred, INVALID_POINTER, free_list)); + ++span->list_size; + atomic_store_ptr(&span->free_list_deferred, block); +} + +static void +_memory_deallocate_small_or_medium(span_t* span, void* p) { + _memory_statistics_inc_free(span->heap, span->size_class); + if (span->flags & SPAN_FLAG_ALIGNED_BLOCKS) { + //Realign pointer to block start + void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); + uint32_t block_offset = (uint32_t)pointer_diff(p, blocks_start); + p = pointer_offset(p, -(int32_t)(block_offset % span->block_size)); + } + //Check if block belongs to this heap or if deallocation should be deferred + if (span->heap == get_thread_heap_raw()) + _memory_deallocate_direct(span, p); + else + _memory_deallocate_defer(span, p); +} + +//! 
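
Cross-thread frees above never touch the owner's structures directly; they park blocks on a deferred list whose head doubles as a spinlock, with INVALID_POINTER as the "locked" sentinel. A heavily simplified single-list sketch of that handshake (the real code also maintains list_size and per-span state):

#include <atomic>
#include <cstdint>

static void* const kLocked = (void*)(uintptr_t)-1;    // INVALID_POINTER stand-in
std::atomic<void*> g_deferredHead{nullptr};

void DeferFree(void* block) {
    void* head;
    do {
        head = g_deferredHead.load(std::memory_order_acquire);
        *(void**)block = head;   // tentatively link onto the current list
        // Retry while the list is locked or another thread moved the head.
    } while (head == kLocked ||
             !g_deferredHead.compare_exchange_weak(head, kLocked,
                                                   std::memory_order_acquire,
                                                   std::memory_order_relaxed));
    // We hold the sentinel; publishing the block releases the list again.
    g_deferredHead.store(block, std::memory_order_release);
}

+//!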
Deallocate the given large memory block to the current heap +static void +_memory_deallocate_large(span_t* span) { + //Decrease counter + assert(span->span_count == ((size_t)span->size_class - SIZE_CLASS_COUNT + 1)); + assert(span->size_class >= SIZE_CLASS_COUNT); + assert(span->size_class - SIZE_CLASS_COUNT < LARGE_CLASS_COUNT); + assert(!(span->flags & SPAN_FLAG_MASTER) || !(span->flags & SPAN_FLAG_SUBSPAN)); + assert((span->flags & SPAN_FLAG_MASTER) || (span->flags & SPAN_FLAG_SUBSPAN)); + //Large blocks can always be deallocated and transferred between heaps + //Investigate if it is better to defer large spans as well through span_cache_deferred, + //possibly with some heuristics to pick either scheme at runtime per deallocation + heap_t* heap = get_thread_heap(); + if (!heap) return; +#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS + size_t idx = span->span_count - 1; + atomic_decr32(&span->heap->span_use[idx].current); +#endif + if ((span->span_count > 1) && !heap->spans_reserved) { + heap->span_reserve = span; + heap->spans_reserved = span->span_count; + if (span->flags & SPAN_FLAG_MASTER) { + heap->span_reserve_master = span; + } else { //SPAN_FLAG_SUBSPAN + uint32_t distance = span->total_spans_or_distance; + span_t* master = (span_t*)pointer_offset(span, -(int32_t)(distance * _memory_span_size)); + heap->span_reserve_master = master; + assert(master->flags & SPAN_FLAG_MASTER); + assert(atomic_load32(&master->remaining_spans) >= (int32_t)span->span_count); + } + _memory_statistics_inc(heap->span_use[idx].spans_to_reserved, 1); + } else { + //Insert into cache list + _memory_heap_cache_insert(heap, span); + } +} + +//! Deallocate the given huge span +static void +_memory_deallocate_huge(span_t* span) { + //Oversized allocation, page count is stored in span_count + size_t num_pages = span->span_count; + _memory_unmap(span, num_pages * _memory_page_size, span->align_offset, num_pages * _memory_page_size); + _memory_statistics_sub(&_huge_pages_current, num_pages); +} + +//! Deallocate the given block +static void +_memory_deallocate(void* p) { + //Grab the span (always at start of span, using span alignment) + span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask); + if (UNEXPECTED(!span)) + return; + if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) + _memory_deallocate_small_or_medium(span, p); + else if (span->size_class != (uint32_t)-1) + _memory_deallocate_large(span); + else + _memory_deallocate_huge(span); +} + +//! 
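
_memory_deallocate above finds the owning span with a single mask: spans sit on span-size boundaries, so clearing the low bits of any interior pointer yields the span header. A sketch, assuming 64 KiB spans and illustrative names:

#include <cstdint>

constexpr uintptr_t kSpanMask = ~(uintptr_t)(64 * 1024 - 1);   // assumed span size

struct Span { uint32_t sizeClass; };   // header lives in the span's first bytes

Span* SpanOf(void* p) {
    // Works for any pointer inside the span, not just the first block.
    return (Span*)((uintptr_t)p & kSpanMask);
}

This is also why the size_class field alone can dispatch small/medium versus large versus huge frees without a size ever being passed to free.

+//!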
Reallocate the given block to the given size +static void* +_memory_reallocate(void* p, size_t size, size_t oldsize, unsigned int flags) { + if (p) { + //Grab the span using guaranteed span alignment + span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask); + if (span->heap) { + if (span->size_class < SIZE_CLASS_COUNT) { + //Small/medium sized block + assert(span->span_count == 1); + void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); + uint32_t block_offset = (uint32_t)pointer_diff(p, blocks_start); + uint32_t block_idx = block_offset / span->block_size; + void* block = pointer_offset(blocks_start, block_idx * span->block_size); + if (!oldsize) + oldsize = span->block_size - (uint32_t)pointer_diff(p, block); + if ((size_t)span->block_size >= size) { + //Still fits in block, never mind trying to save memory, but preserve data if alignment changed + if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE)) + memmove(block, p, oldsize); + return block; + } + } else { + //Large block + size_t total_size = size + SPAN_HEADER_SIZE; + size_t num_spans = total_size >> _memory_span_size_shift; + if (total_size & (_memory_span_mask - 1)) + ++num_spans; + size_t current_spans = span->span_count; + assert(current_spans == ((span->size_class - SIZE_CLASS_COUNT) + 1)); + void* block = pointer_offset(span, SPAN_HEADER_SIZE); + if (!oldsize) + oldsize = (current_spans * _memory_span_size) - (size_t)pointer_diff(p, block) - SPAN_HEADER_SIZE; + if ((current_spans >= num_spans) && (num_spans >= (current_spans / 2))) { + //Still fits in block, never mind trying to save memory, but preserve data if alignment changed + if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE)) + memmove(block, p, oldsize); + return block; + } + } + } else { + //Oversized block + size_t total_size = size + SPAN_HEADER_SIZE; + size_t num_pages = total_size >> _memory_page_size_shift; + if (total_size & (_memory_page_size - 1)) + ++num_pages; + //Page count is stored in span_count + size_t current_pages = span->span_count; + void* block = pointer_offset(span, SPAN_HEADER_SIZE); + if (!oldsize) + oldsize = (current_pages * _memory_page_size) - (size_t)pointer_diff(p, block) - SPAN_HEADER_SIZE; + if ((current_pages >= num_pages) && (num_pages >= (current_pages / 2))) { + //Still fits in block, never mind trying to save memory, but preserve data if alignment changed + if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE)) + memmove(block, p, oldsize); + return block; + } + } + } else { + oldsize = 0; + } + + //Size is greater than block size, need to allocate a new block and deallocate the old + heap_t* heap = get_thread_heap(); + //Avoid hysteresis by overallocating if increase is small (below 37%) + size_t lower_bound = oldsize + (oldsize >> 2) + (oldsize >> 3); + size_t new_size = (size > lower_bound) ? size : ((size > oldsize) ? lower_bound : size); + void* block = _memory_allocate(heap, new_size); + if (p && block) { + if (!(flags & RPMALLOC_NO_PRESERVE)) + memcpy(block, p, oldsize < new_size ? oldsize : new_size); + _memory_deallocate(p); + } + + return block; +} + +//! 
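The growth policy at the tail of `_memory_reallocate` deserves a closer look: `(oldsize >> 2) + (oldsize >> 3)` is 37.5% of `oldsize`, so any request growing by less than that is rounded up to 1.375x the old size, which keeps a buffer that grows a few bytes at a time from copying on every call. The decision distilled into runnable form (the helper name is illustrative):

```cpp
#include <cstddef>
#include <cstdio>

// Mirrors the new_size computation at the end of _memory_reallocate.
static size_t ReallocTarget(size_t oldsize, size_t size)
{
    size_t lowerBound = oldsize + (oldsize >> 2) + (oldsize >> 3); // 1.375 * oldsize
    return (size > lowerBound) ? size : ((size > oldsize) ? lowerBound : size);
}

int main()
{
    printf("%zu\n", ReallocTarget(1000, 1100)); // small growth -> rounded up to 1375
    printf("%zu\n", ReallocTarget(1000, 4000)); // large growth -> exactly 4000
    printf("%zu\n", ReallocTarget(1000, 500));  // shrink       -> exactly 500
    return 0;
}
```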
Get the usable size of the given block +static size_t +_memory_usable_size(void* p) { + //Grab the span using guaranteed span alignment + span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask); + if (span->heap) { + //Small/medium block + if (span->size_class < SIZE_CLASS_COUNT) { + void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); + return span->block_size - ((size_t)pointer_diff(p, blocks_start) % span->block_size); + } + + //Large block + size_t current_spans = (span->size_class - SIZE_CLASS_COUNT) + 1; + return (current_spans * _memory_span_size) - (size_t)pointer_diff(p, span); + } + + //Oversized block, page count is stored in span_count + size_t current_pages = span->span_count; + return (current_pages * _memory_page_size) - (size_t)pointer_diff(p, span); +} + +//! Adjust and optimize the size class properties for the given class +static void +_memory_adjust_size_class(size_t iclass) { + size_t block_size = _memory_size_class[iclass].block_size; + size_t block_count = (_memory_span_size - SPAN_HEADER_SIZE) / block_size; + + _memory_size_class[iclass].block_count = (uint16_t)block_count; + _memory_size_class[iclass].class_idx = (uint16_t)iclass; + + //Check if previous size classes can be merged + size_t prevclass = iclass; + while (prevclass > 0) { + --prevclass; + //A class can be merged if number of pages and number of blocks are equal + if (_memory_size_class[prevclass].block_count == _memory_size_class[iclass].block_count) + memcpy(_memory_size_class + prevclass, _memory_size_class + iclass, sizeof(_memory_size_class[iclass])); + else + break; + } +} + +static void +_memory_heap_finalize(void* heapptr) { + heap_t* heap = (heap_t*)heapptr; + if (!heap) + return; + //Release thread cache spans back to global cache +#if ENABLE_THREAD_CACHE + _memory_heap_cache_adopt_deferred(heap); + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + span_t* span = heap->span_cache[iclass]; +#if ENABLE_GLOBAL_CACHE + while (span) { + assert(span->span_count == (iclass + 1)); + size_t release_count = (!iclass ? 
_memory_span_release_count : _memory_span_release_count_large); + span_t* next = _memory_span_list_split(span, (uint32_t)release_count); +#if ENABLE_STATISTICS + heap->thread_to_global += (size_t)span->list_size * span->span_count * _memory_span_size; + heap->span_use[iclass].spans_to_global += span->list_size; +#endif + _memory_global_cache_insert(span); + span = next; + } +#else + if (span) + _memory_unmap_span_list(span); +#endif + heap->span_cache[iclass] = 0; + } +#endif + + //Orphan the heap + void* raw_heap; + uintptr_t orphan_counter; + heap_t* last_heap; + do { + last_heap = (heap_t*)atomic_load_ptr(&_memory_orphan_heaps); + heap->next_orphan = (heap_t*)((uintptr_t)last_heap & ~(uintptr_t)0x1FF); + orphan_counter = (uintptr_t)atomic_incr32(&_memory_orphan_counter); + raw_heap = (void*)((uintptr_t)heap | (orphan_counter & (uintptr_t)0x1FF)); + } while (!atomic_cas_ptr(&_memory_orphan_heaps, raw_heap, last_heap)); + + set_thread_heap(0); + +#if ENABLE_STATISTICS + atomic_decr32(&_memory_active_heaps); + assert(atomic_load32(&_memory_active_heaps) >= 0); +#endif +} + +#if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) +#include <fibersapi.h> +static DWORD fls_key; +static void NTAPI +rp_thread_destructor(void* value) { + if (value) + rpmalloc_thread_finalize(); +} +#endif + +#if PLATFORM_POSIX +# include <sys/mman.h> +# include <sched.h> +# ifdef __FreeBSD__ +# include <sys/sysctl.h> +# define MAP_HUGETLB MAP_ALIGNED_SUPER +# endif +# ifndef MAP_UNINITIALIZED +# define MAP_UNINITIALIZED 0 +# endif +#endif +#include <errno.h> + +//! Initialize the allocator and setup global data +TRACY_API int +rpmalloc_initialize(void) { + if (_rpmalloc_initialized) { + rpmalloc_thread_initialize(); + return 0; + } + memset(&_memory_config, 0, sizeof(rpmalloc_config_t)); + return rpmalloc_initialize_config(0); +} + +int +rpmalloc_initialize_config(const rpmalloc_config_t* config) { + if (_rpmalloc_initialized) { + rpmalloc_thread_initialize(); + return 0; + } + _rpmalloc_initialized = 1; + + if (config) + memcpy(&_memory_config, config, sizeof(rpmalloc_config_t)); + + if (!_memory_config.memory_map || !_memory_config.memory_unmap) { + _memory_config.memory_map = _memory_map_os; + _memory_config.memory_unmap = _memory_unmap_os; + } + +#if RPMALLOC_CONFIGURABLE + _memory_page_size = _memory_config.page_size; +#else + _memory_page_size = 0; +#endif + _memory_huge_pages = 0; + _memory_map_granularity = _memory_page_size; + if (!_memory_page_size) { +#if PLATFORM_WINDOWS + SYSTEM_INFO system_info; + memset(&system_info, 0, sizeof(system_info)); + GetSystemInfo(&system_info); + _memory_page_size = system_info.dwPageSize; + _memory_map_granularity = system_info.dwAllocationGranularity; + if (config && config->enable_huge_pages) { + HANDLE token = 0; + size_t large_page_minimum = GetLargePageMinimum(); + if (large_page_minimum) + OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &token); + if (token) { + LUID luid; + if (LookupPrivilegeValue(0, SE_LOCK_MEMORY_NAME, &luid)) { + TOKEN_PRIVILEGES token_privileges; + memset(&token_privileges, 0, sizeof(token_privileges)); + token_privileges.PrivilegeCount = 1; + token_privileges.Privileges[0].Luid = luid; + token_privileges.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + if (AdjustTokenPrivileges(token, FALSE, &token_privileges, 0, 0, 0)) { + DWORD err = GetLastError(); + if (err == ERROR_SUCCESS) { + _memory_huge_pages = 1; + _memory_page_size = large_page_minimum; + _memory_map_granularity = large_page_minimum; + } + } + } +
CloseHandle(token); + } + } +#else + _memory_page_size = (size_t)sysconf(_SC_PAGESIZE); + _memory_map_granularity = _memory_page_size; + if (config && config->enable_huge_pages) { +#if defined(__linux__) + size_t huge_page_size = 0; + FILE* meminfo = fopen("/proc/meminfo", "r"); + if (meminfo) { + char line[128]; + while (!huge_page_size && fgets(line, sizeof(line) - 1, meminfo)) { + line[sizeof(line) - 1] = 0; + if (strstr(line, "Hugepagesize:")) + huge_page_size = (size_t)strtol(line + 13, 0, 10) * 1024; + } + fclose(meminfo); + } + if (huge_page_size) { + _memory_huge_pages = 1; + _memory_page_size = huge_page_size; + _memory_map_granularity = huge_page_size; + } +#elif defined(__FreeBSD__) + int rc; + size_t sz = sizeof(rc); + + if (sysctlbyname("vm.pmap.pg_ps_enabled", &rc, &sz, NULL, 0) == 0 && rc == 1) { + _memory_huge_pages = 1; + _memory_page_size = 2 * 1024 * 1024; + _memory_map_granularity = _memory_page_size; + } +#elif defined(__APPLE__) + _memory_huge_pages = 1; + _memory_page_size = 2 * 1024 * 1024; + _memory_map_granularity = _memory_page_size; +#endif + } +#endif + } else { + if (config && config->enable_huge_pages) + _memory_huge_pages = 1; + } + + //The ABA counter in heap orphan list is tied to using 512 (bitmask 0x1FF) + if (_memory_page_size < 512) + _memory_page_size = 512; + if (_memory_page_size > (64 * 1024 * 1024)) + _memory_page_size = (64 * 1024 * 1024); + _memory_page_size_shift = 0; + size_t page_size_bit = _memory_page_size; + while (page_size_bit != 1) { + ++_memory_page_size_shift; + page_size_bit >>= 1; + } + _memory_page_size = ((size_t)1 << _memory_page_size_shift); + +#if RPMALLOC_CONFIGURABLE + size_t span_size = _memory_config.span_size; + if (!span_size) + span_size = (64 * 1024); + if (span_size > (256 * 1024)) + span_size = (256 * 1024); + _memory_span_size = 4096; + _memory_span_size_shift = 12; + while (_memory_span_size < span_size) { + _memory_span_size <<= 1; + ++_memory_span_size_shift; + } + _memory_span_mask = ~(uintptr_t)(_memory_span_size - 1); +#endif + + _memory_span_map_count = ( _memory_config.span_map_count ? _memory_config.span_map_count : DEFAULT_SPAN_MAP_COUNT); + if ((_memory_span_size * _memory_span_map_count) < _memory_page_size) + _memory_span_map_count = (_memory_page_size / _memory_span_size); + if ((_memory_page_size >= _memory_span_size) && ((_memory_span_map_count * _memory_span_size) % _memory_page_size)) + _memory_span_map_count = (_memory_page_size / _memory_span_size); + + _memory_config.page_size = _memory_page_size; + _memory_config.span_size = _memory_span_size; + _memory_config.span_map_count = _memory_span_map_count; + _memory_config.enable_huge_pages = _memory_huge_pages; + + _memory_span_release_count = (_memory_span_map_count > 4 ? ((_memory_span_map_count < 64) ? _memory_span_map_count : 64) : 4); + _memory_span_release_count_large = (_memory_span_release_count > 8 ? 
(_memory_span_release_count / 4) : 2); + +#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD + if (pthread_key_create(&_memory_thread_heap, _memory_heap_finalize)) + return -1; +#endif +#if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) + fls_key = FlsAlloc(&rp_thread_destructor); +#endif + + atomic_store32(&_memory_heap_id, 0); + atomic_store32(&_memory_orphan_counter, 0); +#if ENABLE_STATISTICS + atomic_store32(&_memory_active_heaps, 0); + atomic_store32(&_reserved_spans, 0); + atomic_store32(&_mapped_pages, 0); + _mapped_pages_peak = 0; + atomic_store32(&_mapped_total, 0); + atomic_store32(&_unmapped_total, 0); + atomic_store32(&_mapped_pages_os, 0); + atomic_store32(&_huge_pages_current, 0); + _huge_pages_peak = 0; +#endif + + //Setup all small and medium size classes + size_t iclass = 0; + _memory_size_class[iclass].block_size = SMALL_GRANULARITY; + _memory_adjust_size_class(iclass); + for (iclass = 1; iclass < SMALL_CLASS_COUNT; ++iclass) { + size_t size = iclass * SMALL_GRANULARITY; + _memory_size_class[iclass].block_size = (uint32_t)size; + _memory_adjust_size_class(iclass); + } + //At least two blocks per span, then fall back to large allocations + _memory_medium_size_limit = (_memory_span_size - SPAN_HEADER_SIZE) >> 1; + if (_memory_medium_size_limit > MEDIUM_SIZE_LIMIT) + _memory_medium_size_limit = MEDIUM_SIZE_LIMIT; + for (iclass = 0; iclass < MEDIUM_CLASS_COUNT; ++iclass) { + size_t size = SMALL_SIZE_LIMIT + ((iclass + 1) * MEDIUM_GRANULARITY); + if (size > _memory_medium_size_limit) + break; + _memory_size_class[SMALL_CLASS_COUNT + iclass].block_size = (uint32_t)size; + _memory_adjust_size_class(SMALL_CLASS_COUNT + iclass); + } + + for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) + atomic_store_ptr(&_memory_heaps[list_idx], 0); + + //Initialize this thread + rpmalloc_thread_initialize(); + return 0; +} + +//! 
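The merge rule in `_memory_adjust_size_class` is easiest to see numerically: once block sizes grow relative to the span, neighbouring classes start yielding the same blocks-per-span count, and the smaller class is overwritten by the larger one. A standalone sketch (the header size and the step are assumed placeholder values, not rpmalloc's exact constants):

```cpp
#include <cstddef>
#include <cstdio>

int main()
{
    const size_t spanSize = 64 * 1024;
    const size_t headerSize = 96; // stand-in for SPAN_HEADER_SIZE

    size_t prevCount = 0;
    for (size_t blockSize = 12 * 1024; blockSize <= 32 * 1024; blockSize += 2 * 1024)
    {
        size_t blockCount = (spanSize - headerSize) / blockSize;
        // Equal counts mean the smaller class would gain nothing from its
        // smaller block size, so rpmalloc merges it into the larger class.
        printf("block %5zu -> %zu per span%s\n", blockSize, blockCount, blockCount == prevCount ? "  (merged with previous)" : "");
        prevCount = blockCount;
    }
    return 0;
}
```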
Finalize the allocator +TRACY_API void +rpmalloc_finalize(void) { + atomic_thread_fence_acquire(); + + rpmalloc_thread_finalize(); + //rpmalloc_dump_statistics(stderr); + + //Free all thread caches + for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { + heap_t* heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); + while (heap) { + if (heap->spans_reserved) { + span_t* span = _memory_map_spans(heap, heap->spans_reserved); + _memory_unmap_span(span); + } + + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + heap_class_t* heap_class = heap->span_class + iclass; + span_t* span = heap_class->partial_span; + while (span) { + span_t* next = span->next; + if (span->state == SPAN_STATE_ACTIVE) { + uint32_t used_blocks = span->block_count; + if (span->free_list_limit < span->block_count) + used_blocks = span->free_list_limit; + uint32_t free_blocks = 0; + void* block = heap_class->free_list; + while (block) { + ++free_blocks; + block = *((void**)block); + } + block = span->free_list; + while (block) { + ++free_blocks; + block = *((void**)block); + } + if (used_blocks == (free_blocks + span->list_size)) + _memory_heap_cache_insert(heap, span); + } else { + if (span->used_count == span->list_size) + _memory_heap_cache_insert(heap, span); + } + span = next; + } + } + +#if ENABLE_THREAD_CACHE + //Free span caches (other thread might have deferred after the thread using this heap finalized) + _memory_heap_cache_adopt_deferred(heap); + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + if (heap->span_cache[iclass]) + _memory_unmap_span_list(heap->span_cache[iclass]); + } +#endif + heap_t* next_heap = heap->next_heap; + size_t heap_size = (1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size; + _memory_unmap(heap, heap_size, heap->align_offset, heap_size); + heap = next_heap; + } + } + +#if ENABLE_GLOBAL_CACHE + //Free global caches + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) + _memory_cache_finalize(&_memory_span_cache[iclass]); +#endif + + atomic_store_ptr(&_memory_orphan_heaps, 0); + atomic_thread_fence_release(); + +#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD + pthread_key_delete(_memory_thread_heap); +#endif +#if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) + FlsFree(fls_key); +#endif + +#if ENABLE_STATISTICS + //If you hit these asserts you probably have memory leaks or double frees in your code + assert(!atomic_load32(&_mapped_pages)); + assert(!atomic_load32(&_reserved_spans)); + assert(!atomic_load32(&_mapped_pages_os)); +#endif + + _rpmalloc_initialized = 0; +} + +//! Initialize thread, assign heap +TRACY_API void +rpmalloc_thread_initialize(void) { + if (!get_thread_heap_raw()) { + heap_t* heap = _memory_allocate_heap(); + if (heap) { + atomic_thread_fence_acquire(); +#if ENABLE_STATISTICS + atomic_incr32(&_memory_active_heaps); +#endif + set_thread_heap(heap); +#if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) + FlsSetValue(fls_key, heap); +#endif + } + } +} + +//! Finalize thread, orphan heap +TRACY_API void +rpmalloc_thread_finalize(void) { + heap_t* heap = get_thread_heap_raw(); + if (heap) + _memory_heap_finalize(heap); +} + +int +rpmalloc_is_thread_initialized(void) { + return (get_thread_heap_raw() != 0) ? 1 : 0; +} + +const rpmalloc_config_t* +rpmalloc_config(void) { + return &_memory_config; +} + +//! 
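Between `rpmalloc_thread_initialize` and `rpmalloc_thread_finalize` sits the whole per-thread heap lifecycle, and the FLS/pthread-key destructors registered during initialization only cover the platforms guarded above. Where deterministic teardown is wanted, a thread can bracket the calls itself. A hedged usage sketch against the API declared in tracy_rpmalloc.hpp (the RAII guard and the include path are illustrative, not part of this patch):

```cpp
#include <thread>
#include "tracy_rpmalloc.hpp" // adjust path to the vendored header as needed

// Assigns a heap on construction, orphans it on destruction.
struct RpmallocThreadScope
{
    RpmallocThreadScope() { tracy::rpmalloc_thread_initialize(); }
    ~RpmallocThreadScope() { tracy::rpmalloc_thread_finalize(); }
};

int main()
{
    tracy::rpmalloc_initialize();
    std::thread worker([]
    {
        RpmallocThreadScope scope;
        void* p = tracy::rpmalloc(256);
        tracy::rpfree(p);
    });
    worker.join();
    tracy::rpmalloc_finalize();
    return 0;
}
```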
Map new pages to virtual memory +static void* +_memory_map_os(size_t size, size_t* offset) { + //Either size is a heap (a single page) or a (multiple) span - we only need to align spans, and only if larger than map granularity + size_t padding = ((size >= _memory_span_size) && (_memory_span_size > _memory_map_granularity)) ? _memory_span_size : 0; + assert(size >= _memory_page_size); +#if PLATFORM_WINDOWS + //Ok to MEM_COMMIT - according to MSDN, "actual physical pages are not allocated unless/until the virtual addresses are actually accessed" + void* ptr = VirtualAlloc(0, size + padding, (_memory_huge_pages ? MEM_LARGE_PAGES : 0) | MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); + if (!ptr) { + assert(!"Failed to map virtual memory block"); + return 0; + } +#else + int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED; +# if defined(__APPLE__) + int fd = (int)VM_MAKE_TAG(240U); + if (_memory_huge_pages) + fd |= VM_FLAGS_SUPERPAGE_SIZE_2MB; + void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, fd, 0); +# elif defined(MAP_HUGETLB) + void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, (_memory_huge_pages ? MAP_HUGETLB : 0) | flags, -1, 0); +# else + void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, -1, 0); +# endif + if ((ptr == MAP_FAILED) || !ptr) { + assert("Failed to map virtual memory block" == 0); + return 0; + } +#endif +#if ENABLE_STATISTICS + atomic_add32(&_mapped_pages_os, (int32_t)((size + padding) >> _memory_page_size_shift)); +#endif + if (padding) { + size_t final_padding = padding - ((uintptr_t)ptr & ~_memory_span_mask); + assert(final_padding <= _memory_span_size); + assert(final_padding <= padding); + assert(!(final_padding % 8)); + ptr = pointer_offset(ptr, final_padding); + *offset = final_padding >> 3; + } + assert((size < _memory_span_size) || !((uintptr_t)ptr & ~_memory_span_mask)); + return ptr; +} + +//! Unmap pages from virtual memory +static void +_memory_unmap_os(void* address, size_t size, size_t offset, size_t release) { + assert(release || (offset == 0)); + assert(!release || (release >= _memory_page_size)); + assert(size >= _memory_page_size); + if (release && offset) { + offset <<= 3; + address = pointer_offset(address, -(int32_t)offset); +#if PLATFORM_POSIX + //Padding is always one span size + release += _memory_span_size; +#endif + } +#if !DISABLE_UNMAP +#if PLATFORM_WINDOWS + if (!VirtualFree(address, release ? 0 : size, release ? 
MEM_RELEASE : MEM_DECOMMIT)) { + assert(!"Failed to unmap virtual memory block"); + } +#else + if (release) { + if (munmap(address, release)) { + assert("Failed to unmap virtual memory block" == 0); + } + } + else { +#if defined(POSIX_MADV_FREE) + if (posix_madvise(address, size, POSIX_MADV_FREE)) +#endif +#if defined(POSIX_MADV_DONTNEED) + if (posix_madvise(address, size, POSIX_MADV_DONTNEED)) { + assert("Failed to madvise virtual memory block as free" == 0); + } +#endif + } +#endif +#endif +#if ENABLE_STATISTICS + if (release) + atomic_add32(&_mapped_pages_os, -(int32_t)(release >> _memory_page_size_shift)); +#endif +} + +// Extern interface + +TRACY_API RPMALLOC_ALLOCATOR void* +rpmalloc(size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return 0; + } +#endif + heap_t* heap = get_thread_heap(); + return _memory_allocate(heap, size); +} + +TRACY_API void +rpfree(void* ptr) { + _memory_deallocate(ptr); +} + +extern inline RPMALLOC_ALLOCATOR void* +rpcalloc(size_t num, size_t size) { + size_t total; +#if ENABLE_VALIDATE_ARGS +#if PLATFORM_WINDOWS + int err = SizeTMult(num, size, &total); + if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#else + int err = __builtin_umull_overflow(num, size, &total); + if (err || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#endif +#else + total = num * size; +#endif + heap_t* heap = get_thread_heap(); + void* block = _memory_allocate(heap, total); + memset(block, 0, total); + return block; +} + +TRACY_API RPMALLOC_ALLOCATOR void* +rprealloc(void* ptr, size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return ptr; + } +#endif + return _memory_reallocate(ptr, size, 0, 0); +} + +extern RPMALLOC_ALLOCATOR void* +rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize, + unsigned int flags) { +#if ENABLE_VALIDATE_ARGS + if ((size + alignment < size) || (alignment > _memory_page_size)) { + errno = EINVAL; + return 0; + } +#endif + void* block; + if (alignment > 32) { + size_t usablesize = _memory_usable_size(ptr); + if ((usablesize >= size) && (size >= (usablesize / 2)) && !((uintptr_t)ptr & (alignment - 1))) + return ptr; + + block = rpaligned_alloc(alignment, size); + if (ptr) { + if (!oldsize) + oldsize = usablesize; + if (!(flags & RPMALLOC_NO_PRESERVE)) + memcpy(block, ptr, oldsize < size ? oldsize : size); + rpfree(ptr); + } + //Mark as having aligned blocks + span_t* span = (span_t*)((uintptr_t)block & _memory_span_mask); + span->flags |= SPAN_FLAG_ALIGNED_BLOCKS; + } else { + block = _memory_reallocate(ptr, size, oldsize, flags); + } + return block; +} + +extern RPMALLOC_ALLOCATOR void* +rpaligned_alloc(size_t alignment, size_t size) { + if (alignment <= 16) + return rpmalloc(size); + +#if ENABLE_VALIDATE_ARGS + if ((size + alignment) < size) { + errno = EINVAL; + return 0; + } + if (alignment & (alignment - 1)) { + errno = EINVAL; + return 0; + } +#endif + + void* ptr = 0; + size_t align_mask = alignment - 1; + if (alignment < _memory_page_size) { + ptr = rpmalloc(size + alignment); + if ((uintptr_t)ptr & align_mask) + ptr = (void*)(((uintptr_t)ptr & ~(uintptr_t)align_mask) + alignment); + //Mark as having aligned blocks + span_t* span = (span_t*)((uintptr_t)ptr & _memory_span_mask); + span->flags |= SPAN_FLAG_ALIGNED_BLOCKS; + return ptr; + } + + // Fallback to mapping new pages for this request. 
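For alignments below the page size, the branch just above over-allocates by `alignment` and rounds the returned pointer up to the next boundary. The arithmetic in isolation (this uses the standard round-up formula; rpmalloc's variant rounds down and adds `alignment`, which produces the same result for a misaligned pointer):

```cpp
#include <cstdint>
#include <cassert>

// Round ptr up to the next multiple of alignment (alignment must be a power of two).
static void* AlignUp(void* ptr, uintptr_t alignment)
{
    uintptr_t mask = alignment - 1;
    return (void*)(((uintptr_t)ptr + mask) & ~mask);
}

int main()
{
    alignas(16) char buffer[256 + 64]; // stand-in for rpmalloc(size + alignment)
    void* p = AlignUp(buffer + 3, 64); // deliberately misaligned interior pointer
    assert(((uintptr_t)p & 63) == 0);  // now on a 64-byte boundary
    return 0;
}
```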
Since pointers passed + // to rpfree must be able to reach the start of the span by bitmasking of + // the address with the span size, the returned aligned pointer from this + // function must be within a span size of the start of the mapped area. + // In the worst case this requires us to loop and map pages until we get a + // suitable memory address. It also means we can never align to span size + // or greater, since the span header will push alignment more than one + // span size away from span start (thus causing pointer mask to give us + // an invalid span start on free) + if (alignment & align_mask) { + errno = EINVAL; + return 0; + } + if (alignment >= _memory_span_size) { + errno = EINVAL; + return 0; + } + + size_t extra_pages = alignment / _memory_page_size; + + // Since each span has a header, we will at least need one extra memory page + size_t num_pages = 1 + (size / _memory_page_size); + if (size & (_memory_page_size - 1)) + ++num_pages; + + if (extra_pages > num_pages) + num_pages = 1 + extra_pages; + + size_t original_pages = num_pages; + size_t limit_pages = (_memory_span_size / _memory_page_size) * 2; + if (limit_pages < (original_pages * 2)) + limit_pages = original_pages * 2; + + size_t mapped_size, align_offset; + span_t* span; + +retry: + align_offset = 0; + mapped_size = num_pages * _memory_page_size; + + span = (span_t*)_memory_map(mapped_size, &align_offset); + if (!span) { + errno = ENOMEM; + return 0; + } + ptr = pointer_offset(span, SPAN_HEADER_SIZE); + + if ((uintptr_t)ptr & align_mask) + ptr = (void*)(((uintptr_t)ptr & ~(uintptr_t)align_mask) + alignment); + + if (((size_t)pointer_diff(ptr, span) >= _memory_span_size) || + (pointer_offset(ptr, size) > pointer_offset(span, mapped_size)) || + (((uintptr_t)ptr & _memory_span_mask) != (uintptr_t)span)) { + _memory_unmap(span, mapped_size, align_offset, mapped_size); + ++num_pages; + if (num_pages > limit_pages) { + errno = EINVAL; + return 0; + } + goto retry; + } + + //Store page count in span_count + span->size_class = (uint32_t)-1; + span->span_count = (uint32_t)num_pages; + span->align_offset = (uint32_t)align_offset; + _memory_statistics_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak); + + return ptr; +} + +extern inline RPMALLOC_ALLOCATOR void* +rpmemalign(size_t alignment, size_t size) { + return rpaligned_alloc(alignment, size); +} + +extern inline int +rpposix_memalign(void **memptr, size_t alignment, size_t size) { + if (memptr) + *memptr = rpaligned_alloc(alignment, size); + else + return EINVAL; + return *memptr ? 0 : ENOMEM; +} + +extern inline size_t +rpmalloc_usable_size(void* ptr) { + return (ptr ?
_memory_usable_size(ptr) : 0); +} + +extern inline void +rpmalloc_thread_collect(void) { +} + +void +rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats) { + memset(stats, 0, sizeof(rpmalloc_thread_statistics_t)); + heap_t* heap = get_thread_heap_raw(); + if (!heap) + return; + + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + size_class_t* size_class = _memory_size_class + iclass; + heap_class_t* heap_class = heap->span_class + iclass; + span_t* span = heap_class->partial_span; + while (span) { + atomic_thread_fence_acquire(); + size_t free_count = span->list_size; + if (span->state == SPAN_STATE_PARTIAL) + free_count += (size_class->block_count - span->used_count); + stats->sizecache = free_count * size_class->block_size; + span = span->next; + } + } + +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + if (heap->span_cache[iclass]) + stats->spancache = (size_t)heap->span_cache[iclass]->list_size * (iclass + 1) * _memory_span_size; + span_t* deferred_list = !iclass ? (span_t*)atomic_load_ptr(&heap->span_cache_deferred) : 0; + //TODO: Incorrect, for deferred lists the size is NOT stored in list_size + if (deferred_list) + stats->spancache = (size_t)deferred_list->list_size * (iclass + 1) * _memory_span_size; + } +#endif +#if ENABLE_STATISTICS + stats->thread_to_global = heap->thread_to_global; + stats->global_to_thread = heap->global_to_thread; + + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + stats->span_use[iclass].current = (size_t)atomic_load32(&heap->span_use[iclass].current); + stats->span_use[iclass].peak = (size_t)heap->span_use[iclass].high; + stats->span_use[iclass].to_global = (size_t)heap->span_use[iclass].spans_to_global; + stats->span_use[iclass].from_global = (size_t)heap->span_use[iclass].spans_from_global; + stats->span_use[iclass].to_cache = (size_t)heap->span_use[iclass].spans_to_cache; + stats->span_use[iclass].from_cache = (size_t)heap->span_use[iclass].spans_from_cache; + stats->span_use[iclass].to_reserved = (size_t)heap->span_use[iclass].spans_to_reserved; + stats->span_use[iclass].from_reserved = (size_t)heap->span_use[iclass].spans_from_reserved; + stats->span_use[iclass].map_calls = (size_t)heap->span_use[iclass].spans_map_calls; + } + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + stats->size_use[iclass].alloc_current = (size_t)atomic_load32(&heap->size_class_use[iclass].alloc_current); + stats->size_use[iclass].alloc_peak = (size_t)heap->size_class_use[iclass].alloc_peak; + stats->size_use[iclass].alloc_total = (size_t)heap->size_class_use[iclass].alloc_total; + stats->size_use[iclass].free_total = (size_t)atomic_load32(&heap->size_class_use[iclass].free_total); + stats->size_use[iclass].spans_to_cache = (size_t)heap->size_class_use[iclass].spans_to_cache; + stats->size_use[iclass].spans_from_cache = (size_t)heap->size_class_use[iclass].spans_from_cache; + stats->size_use[iclass].spans_from_reserved = (size_t)heap->size_class_use[iclass].spans_from_reserved; + stats->size_use[iclass].map_calls = (size_t)heap->size_class_use[iclass].spans_map_calls; + } +#endif +} + +void +rpmalloc_global_statistics(rpmalloc_global_statistics_t* stats) { + memset(stats, 0, sizeof(rpmalloc_global_statistics_t)); +#if ENABLE_STATISTICS + stats->mapped = (size_t)atomic_load32(&_mapped_pages) * _memory_page_size; + stats->mapped_peak = (size_t)_mapped_pages_peak * _memory_page_size; + stats->mapped_total = (size_t)atomic_load32(&_mapped_total) * _memory_page_size; + 
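A hedged consumption sketch for these statistics queries (the reporting helper is illustrative, and most counters stay zero unless the allocator is built with ENABLE_STATISTICS=1, as the struct comments in tracy_rpmalloc.hpp note):

```cpp
#include <cstdio>
#include "tracy_rpmalloc.hpp" // adjust path to the vendored header as needed

static void ReportAllocatorStats()
{
    tracy::rpmalloc_thread_statistics_t ts;
    tracy::rpmalloc_thread_statistics(&ts);
    printf("thread size-class cache: %zu bytes, span cache: %zu bytes\n", ts.sizecache, ts.spancache);

    tracy::rpmalloc_global_statistics_t gs;
    tracy::rpmalloc_global_statistics(&gs);
    printf("mapped: %zu bytes (peak %zu), global span cache: %zu bytes\n", gs.mapped, gs.mapped_peak, gs.cached);
}
```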
stats->unmapped_total = (size_t)atomic_load32(&_unmapped_total) * _memory_page_size; + stats->huge_alloc = (size_t)atomic_load32(&_huge_pages_current) * _memory_page_size; + stats->huge_alloc_peak = (size_t)_huge_pages_peak * _memory_page_size; +#endif +#if ENABLE_GLOBAL_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + stats->cached += (size_t)atomic_load32(&_memory_span_cache[iclass].size) * (iclass + 1) * _memory_span_size; + } +#endif +} + +void +rpmalloc_dump_statistics(void* file) { +#if ENABLE_STATISTICS + //If you hit this assert, you still have active threads or forgot to finalize some thread(s) + assert(atomic_load32(&_memory_active_heaps) == 0); + + for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { + heap_t* heap = atomic_load_ptr(&_memory_heaps[list_idx]); + while (heap) { + fprintf(file, "Heap %d stats:\n", heap->id); + fprintf(file, "Class CurAlloc PeakAlloc TotAlloc TotFree BlkSize BlkCount SpansCur SpansPeak PeakAllocMiB ToCacheMiB FromCacheMiB FromReserveMiB MmapCalls\n"); + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + if (!heap->size_class_use[iclass].alloc_total) { + assert(!atomic_load32(&heap->size_class_use[iclass].free_total)); + assert(!heap->size_class_use[iclass].spans_map_calls); + continue; + } + fprintf(file, "%3u: %10u %10u %10u %10u %8u %8u %8d %9d %13zu %11zu %12zu %14zu %9u\n", (uint32_t)iclass, + atomic_load32(&heap->size_class_use[iclass].alloc_current), + heap->size_class_use[iclass].alloc_peak, + heap->size_class_use[iclass].alloc_total, + atomic_load32(&heap->size_class_use[iclass].free_total), + _memory_size_class[iclass].block_size, + _memory_size_class[iclass].block_count, + heap->size_class_use[iclass].spans_current, + heap->size_class_use[iclass].spans_peak, + ((size_t)heap->size_class_use[iclass].alloc_peak * (size_t)_memory_size_class[iclass].block_size) / (size_t)(1024 * 1024), + ((size_t)heap->size_class_use[iclass].spans_to_cache * _memory_span_size) / (size_t)(1024 * 1024), + ((size_t)heap->size_class_use[iclass].spans_from_cache * _memory_span_size) / (size_t)(1024 * 1024), + ((size_t)heap->size_class_use[iclass].spans_from_reserved * _memory_span_size) / (size_t)(1024 * 1024), + heap->size_class_use[iclass].spans_map_calls); + } + fprintf(file, "Spans Current Peak PeakMiB Cached ToCacheMiB FromCacheMiB ToReserveMiB FromReserveMiB ToGlobalMiB FromGlobalMiB MmapCalls\n"); + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + if (!heap->span_use[iclass].high && !heap->span_use[iclass].spans_map_calls) + continue; + fprintf(file, "%4u: %8d %8u %8zu %7u %11zu %12zu %12zu %14zu %11zu %13zu %10u\n", (uint32_t)(iclass + 1), + atomic_load32(&heap->span_use[iclass].current), + heap->span_use[iclass].high, + ((size_t)heap->span_use[iclass].high * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024), + heap->span_cache[iclass] ? 
heap->span_cache[iclass]->list_size : 0, + ((size_t)heap->span_use[iclass].spans_to_cache * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), + ((size_t)heap->span_use[iclass].spans_from_cache * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), + ((size_t)heap->span_use[iclass].spans_to_reserved * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), + ((size_t)heap->span_use[iclass].spans_from_reserved * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), + ((size_t)heap->span_use[iclass].spans_to_global * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024), + ((size_t)heap->span_use[iclass].spans_from_global * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024), + heap->span_use[iclass].spans_map_calls); + } + fprintf(file, "ThreadToGlobalMiB GlobalToThreadMiB\n"); + fprintf(file, "%17zu %17zu\n", (size_t)heap->thread_to_global / (size_t)(1024 * 1024), (size_t)heap->global_to_thread / (size_t)(1024 * 1024)); + heap = heap->next_heap; + } + } + + fprintf(file, "Global stats:\n"); + size_t huge_current = (size_t)atomic_load32(&_huge_pages_current) * _memory_page_size; + size_t huge_peak = (size_t)_huge_pages_peak * _memory_page_size; + fprintf(file, "HugeCurrentMiB HugePeakMiB\n"); + fprintf(file, "%14zu %11zu\n", huge_current / (size_t)(1024 * 1024), huge_peak / (size_t)(1024 * 1024)); + + size_t mapped = (size_t)atomic_load32(&_mapped_pages) * _memory_page_size; + size_t mapped_os = (size_t)atomic_load32(&_mapped_pages_os) * _memory_page_size; + size_t mapped_peak = (size_t)_mapped_pages_peak * _memory_page_size; + size_t mapped_total = (size_t)atomic_load32(&_mapped_total) * _memory_page_size; + size_t unmapped_total = (size_t)atomic_load32(&_unmapped_total) * _memory_page_size; + size_t reserved_total = (size_t)atomic_load32(&_reserved_spans) * _memory_span_size; + fprintf(file, "MappedMiB MappedOSMiB MappedPeakMiB MappedTotalMiB UnmappedTotalMiB ReservedTotalMiB\n"); + fprintf(file, "%9zu %11zu %13zu %14zu %16zu %16zu\n", + mapped / (size_t)(1024 * 1024), + mapped_os / (size_t)(1024 * 1024), + mapped_peak / (size_t)(1024 * 1024), + mapped_total / (size_t)(1024 * 1024), + unmapped_total / (size_t)(1024 * 1024), + reserved_total / (size_t)(1024 * 1024)); + + fprintf(file, "\n"); +#else + (void)sizeof(file); +#endif +} + +} + +#endif diff --git a/Source/ThirdParty/tracy/client/tracy_rpmalloc.hpp b/Source/ThirdParty/tracy/client/tracy_rpmalloc.hpp new file mode 100644 index 000000000..3e8c4f1b5 --- /dev/null +++ b/Source/ThirdParty/tracy/client/tracy_rpmalloc.hpp @@ -0,0 +1,261 @@ +/* rpmalloc.h - Memory allocator - Public Domain - 2016 Mattias Jansson + * + * This library provides a cross-platform lock free thread caching malloc implementation in C11. + * The latest source code is always available at + * + * https://github.com/mjansson/rpmalloc + * + * This library is put in the public domain; you can redistribute it and/or modify it without any restrictions. 
+ * + */ + +#pragma once + +#include <stddef.h> +#include "../common/TracySystem.hpp" + +namespace tracy +{ + +#if defined(__clang__) || defined(__GNUC__) +# define RPMALLOC_EXPORT __attribute__((visibility("default"))) +# define RPMALLOC_ALLOCATOR +# define RPMALLOC_ATTRIB_MALLOC __attribute__((__malloc__)) +# if defined(__clang_major__) && (__clang_major__ < 4) +# define RPMALLOC_ATTRIB_ALLOC_SIZE(size) +# define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size) +# else +# define RPMALLOC_ATTRIB_ALLOC_SIZE(size) __attribute__((alloc_size(size))) +# define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size) __attribute__((alloc_size(count, size))) +# endif +# define RPMALLOC_CDECL +#elif defined(_MSC_VER) +# define RPMALLOC_EXPORT +# define RPMALLOC_ALLOCATOR __declspec(allocator) __declspec(restrict) +# define RPMALLOC_ATTRIB_MALLOC +# define RPMALLOC_ATTRIB_ALLOC_SIZE(size) +# define RPMALLOC_ATTRIB_ALLOC_SIZE2(count,size) +# define RPMALLOC_CDECL __cdecl +#else +# define RPMALLOC_EXPORT +# define RPMALLOC_ALLOCATOR +# define RPMALLOC_ATTRIB_MALLOC +# define RPMALLOC_ATTRIB_ALLOC_SIZE(size) +# define RPMALLOC_ATTRIB_ALLOC_SIZE2(count,size) +# define RPMALLOC_CDECL +#endif + +//! Define RPMALLOC_CONFIGURABLE to enable configuring sizes +#ifndef RPMALLOC_CONFIGURABLE +#define RPMALLOC_CONFIGURABLE 0 +#endif + +//! Flag to rpaligned_realloc to not preserve content in reallocation +#define RPMALLOC_NO_PRESERVE 1 + +typedef struct rpmalloc_global_statistics_t { + //! Current amount of virtual memory mapped, all of which might not have been committed (only if ENABLE_STATISTICS=1) + size_t mapped; + //! Peak amount of virtual memory mapped, all of which might not have been committed (only if ENABLE_STATISTICS=1) + size_t mapped_peak; + //! Current amount of memory in global caches for small and medium sizes (<32KiB) + size_t cached; + //! Current amount of memory allocated in huge allocations, i.e. larger than LARGE_SIZE_LIMIT which is 2MiB by default (only if ENABLE_STATISTICS=1) + size_t huge_alloc; + //! Peak amount of memory allocated in huge allocations, i.e. larger than LARGE_SIZE_LIMIT which is 2MiB by default (only if ENABLE_STATISTICS=1) + size_t huge_alloc_peak; + //! Total amount of memory mapped since initialization (only if ENABLE_STATISTICS=1) + size_t mapped_total; + //! Total amount of memory unmapped since initialization (only if ENABLE_STATISTICS=1) + size_t unmapped_total; +} rpmalloc_global_statistics_t; + +typedef struct rpmalloc_thread_statistics_t { + //! Current number of bytes available in thread size class caches for small and medium sizes (<32KiB) + size_t sizecache; + //! Current number of bytes available in thread span caches for small and medium sizes (<32KiB) + size_t spancache; + //! Total number of bytes transitioned from thread cache to global cache (only if ENABLE_STATISTICS=1) + size_t thread_to_global; + //! Total number of bytes transitioned from global cache to thread cache (only if ENABLE_STATISTICS=1) + size_t global_to_thread; + //! Per span count statistics (only if ENABLE_STATISTICS=1) + struct { + //! Currently used number of spans + size_t current; + //! High water mark of spans used + size_t peak; + //! Number of spans transitioned to global cache + size_t to_global; + //! Number of spans transitioned from global cache + size_t from_global; + //! Number of spans transitioned to thread cache + size_t to_cache; + //! Number of spans transitioned from thread cache + size_t from_cache; + //! Number of spans transitioned to reserved state + size_t to_reserved; + //!
Number of spans transitioned from reserved state + size_t from_reserved; + //! Number of raw memory map calls (not hitting the reserve spans but resulting in actual OS mmap calls) + size_t map_calls; + } span_use[32]; + //! Per size class statistics (only if ENABLE_STATISTICS=1) + struct { + //! Current number of allocations + size_t alloc_current; + //! Peak number of allocations + size_t alloc_peak; + //! Total number of allocations + size_t alloc_total; + //! Total number of frees + size_t free_total; + //! Number of spans transitioned to cache + size_t spans_to_cache; + //! Number of spans transitioned from cache + size_t spans_from_cache; + //! Number of spans transitioned from reserved state + size_t spans_from_reserved; + //! Number of raw memory map calls (not hitting the reserve spans but resulting in actual OS mmap calls) + size_t map_calls; + } size_use[128]; +} rpmalloc_thread_statistics_t; + +typedef struct rpmalloc_config_t { + //! Map memory pages for the given number of bytes. The returned address MUST be + // aligned to the rpmalloc span size, which will always be a power of two. + // Optionally the function can store an alignment offset in the offset variable + // in case it performs alignment and the returned pointer is offset from the + // actual start of the memory region due to this alignment. The alignment offset + // will be passed to the memory unmap function. The alignment offset MUST NOT be + // larger than 65535 (storable in an uint16_t), if it is you must use natural + // alignment to shift it into 16 bits. If you set a memory_map function, you + // must also set a memory_unmap function or else the default implementation will + // be used for both. + void* (*memory_map)(size_t size, size_t* offset); + //! Unmap the memory pages starting at address and spanning the given number of bytes. + // If release is set to non-zero, the unmap is for an entire span range as returned by + // a previous call to memory_map and that the entire range should be released. The + // release argument holds the size of the entire span range. If release is set to 0, + // the unmap is a partial decommit of a subset of the mapped memory range. + // If you set a memory_unmap function, you must also set a memory_map function or + // else the default implementation will be used for both. + void (*memory_unmap)(void* address, size_t size, size_t offset, size_t release); + //! Size of memory pages. The page size MUST be a power of two. All memory mapping + // requests to memory_map will be made with size set to a multiple of the page size. + // Used if RPMALLOC_CONFIGURABLE is defined to 1, otherwise system page size is used. + size_t page_size; + //! Size of a span of memory blocks. MUST be a power of two, and in [4096,262144] + // range (unless 0 - set to 0 to use the default span size). Used if RPMALLOC_CONFIGURABLE + // is defined to 1. + size_t span_size; + //! Number of spans to map at each request to map new virtual memory blocks. This can + // be used to minimize the system call overhead at the cost of virtual memory address + // space. The extra mapped pages will not be written until actually used, so physical + // committed memory should not be affected in the default implementation. Will be + // aligned to a multiple of spans that match memory page size in case of huge pages. + size_t span_map_count; + //! Enable use of large/huge pages. If this flag is set to non-zero and page size is + // zero, the allocator will try to enable huge pages and auto detect the configuration. 
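The two function pointers documented above are the entire OS interface of the allocator, so swapping in a custom mapper is a one-struct affair. A POSIX-only sketch of wiring one up through `rpmalloc_initialize_config`; the bodies are deliberately minimal, and keeping `*offset` at 0 assumes the OS returns suitably aligned memory, which a production mapper must instead guarantee per the span-alignment contract above:

```cpp
#include <sys/mman.h>
#include "tracy_rpmalloc.hpp" // adjust path to the vendored header as needed

static void* MyMap(size_t size, size_t* offset)
{
    *offset = 0; // simplification: assume no realignment was needed
    void* ptr = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    return ptr == MAP_FAILED ? nullptr : ptr;
}

static void MyUnmap(void* address, size_t size, size_t offset, size_t release)
{
    (void)offset;
    if (release)
        munmap(address, release); // entire span range is released
    else
        madvise(address, size, MADV_DONTNEED); // partial decommit
}

int main()
{
    tracy::rpmalloc_config_t config = {}; // zeroed fields fall back to defaults
    config.memory_map = MyMap;
    config.memory_unmap = MyUnmap;
    tracy::rpmalloc_initialize_config(&config);
    void* p = tracy::rpmalloc(1024);
    tracy::rpfree(p);
    tracy::rpmalloc_finalize();
    return 0;
}
```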
+ // If this is set to non-zero and page_size is also non-zero, the allocator will + // assume huge pages have been configured and enabled prior to initializing the + // allocator. + // For Windows, see https://docs.microsoft.com/en-us/windows/desktop/memory/large-page-support + // For Linux, see https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt + int enable_huge_pages; +} rpmalloc_config_t; + +//! Initialize allocator with default configuration +TRACY_API int +rpmalloc_initialize(void); + +//! Initialize allocator with given configuration +RPMALLOC_EXPORT int +rpmalloc_initialize_config(const rpmalloc_config_t* config); + +//! Get allocator configuration +RPMALLOC_EXPORT const rpmalloc_config_t* +rpmalloc_config(void); + +//! Finalize allocator +TRACY_API void +rpmalloc_finalize(void); + +//! Initialize allocator for calling thread +TRACY_API void +rpmalloc_thread_initialize(void); + +//! Finalize allocator for calling thread +TRACY_API void +rpmalloc_thread_finalize(void); + +//! Perform deferred deallocations pending for the calling thread heap +RPMALLOC_EXPORT void +rpmalloc_thread_collect(void); + +//! Query if allocator is initialized for calling thread +RPMALLOC_EXPORT int +rpmalloc_is_thread_initialized(void); + +//! Get per-thread statistics +RPMALLOC_EXPORT void +rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats); + +//! Get global statistics +RPMALLOC_EXPORT void +rpmalloc_global_statistics(rpmalloc_global_statistics_t* stats); + +//! Dump all statistics in human readable format to file (should be a FILE*) +RPMALLOC_EXPORT void +rpmalloc_dump_statistics(void* file); + +//! Allocate a memory block of at least the given size +TRACY_API RPMALLOC_ALLOCATOR void* +rpmalloc(size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(1); + +//! Free the given memory block +TRACY_API void +rpfree(void* ptr); + +//! Allocate a memory block of at least the given size and zero initialize it +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* +rpcalloc(size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE2(1, 2); + +//! Reallocate the given block to at least the given size +TRACY_API RPMALLOC_ALLOCATOR void* +rprealloc(void* ptr, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(2); + +//! Reallocate the given block to at least the given size and alignment, +// with optional control flags (see RPMALLOC_NO_PRESERVE). +// Alignment must be a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. A caveat of rpmalloc +// internals is that this must also be strictly less than the span size (default 64KiB) +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* +rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize, unsigned int flags) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(3); + +//! Allocate a memory block of at least the given size and alignment. +// Alignment must be a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. A caveat of rpmalloc +// internals is that this must also be strictly less than the span size (default 64KiB) +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* +rpaligned_alloc(size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(2); + +//! Allocate a memory block of at least the given size and alignment. +// Alignment must be a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. 
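The alignment preconditions stated in the doc comments below recur on every aligned entry point, so they are worth condensing into one checked call (an illustrative wrapper, not part of the rpmalloc API; the span size is assumed to be the 64KiB default):

```cpp
#include <cstddef>
#include "tracy_rpmalloc.hpp" // adjust path to the vendored header as needed

static void* CheckedAlignedAlloc(size_t alignment, size_t size)
{
    const size_t spanSize = 64 * 1024; // assumed default span size
    const bool powerOfTwo = alignment && !(alignment & (alignment - 1));
    const bool ptrMultiple = (alignment % sizeof(void*)) == 0;
    if (!powerOfTwo || !ptrMultiple || alignment >= spanSize)
        return nullptr; // violates the documented preconditions
    return tracy::rpaligned_alloc(alignment, size);
}
```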
A caveat of rpmalloc +// internals is that this must also be strictly less than the span size (default 64KiB) +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* +rpmemalign(size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(2); + +//! Allocate a memory block of at least the given size and alignment. +// Alignment must be a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. A caveat of rpmalloc +// internals is that this must also be strictly less than the span size (default 64KiB) +RPMALLOC_EXPORT int +rpposix_memalign(void **memptr, size_t alignment, size_t size); + +//! Query the usable size of the given memory block (from given pointer to the end of block) +RPMALLOC_EXPORT size_t +rpmalloc_usable_size(void* ptr); + +} diff --git a/Source/ThirdParty/tracy/common/TracyAlign.hpp b/Source/ThirdParty/tracy/common/TracyAlign.hpp new file mode 100644 index 000000000..730342df0 --- /dev/null +++ b/Source/ThirdParty/tracy/common/TracyAlign.hpp @@ -0,0 +1,25 @@ +#ifndef __TRACYALIGN_HPP__ +#define __TRACYALIGN_HPP__ + +#include <string.h> + +namespace tracy +{ + +template<typename T> +tracy_force_inline T MemRead( const void* ptr ) +{ + T val; + memcpy( &val, ptr, sizeof( T ) ); + return val; +} + +template<typename T> +tracy_force_inline void MemWrite( void* ptr, T val ) +{ + memcpy( ptr, &val, sizeof( T ) ); +} + +} + +#endif diff --git a/Source/ThirdParty/tracy/common/TracyAlloc.hpp b/Source/ThirdParty/tracy/common/TracyAlloc.hpp new file mode 100644 index 000000000..a3cbec057 --- /dev/null +++ b/Source/ThirdParty/tracy/common/TracyAlloc.hpp @@ -0,0 +1,42 @@ +#ifndef __TRACYALLOC_HPP__ +#define __TRACYALLOC_HPP__ + +#include <stdlib.h> + +#ifdef TRACY_ENABLE +# include "../client/tracy_rpmalloc.hpp" +#endif + +namespace tracy +{ + +static inline void* tracy_malloc( size_t size ) +{ +#ifdef TRACY_ENABLE + return rpmalloc( size ); +#else + return malloc( size ); +#endif +} + +static inline void tracy_free( void* ptr ) +{ +#ifdef TRACY_ENABLE + rpfree( ptr ); +#else + free( ptr ); +#endif +} + +static inline void* tracy_realloc( void* ptr, size_t size ) +{ +#ifdef TRACY_ENABLE + return rprealloc( ptr, size ); +#else + return realloc( ptr, size ); +#endif +} + +} + +#endif diff --git a/Source/ThirdParty/tracy/common/TracyMutex.hpp b/Source/ThirdParty/tracy/common/TracyMutex.hpp new file mode 100644 index 000000000..57fb01a0c --- /dev/null +++ b/Source/ThirdParty/tracy/common/TracyMutex.hpp @@ -0,0 +1,24 @@ +#ifndef __TRACYMUTEX_HPP__ +#define __TRACYMUTEX_HPP__ + +#if defined _MSC_VER + +# include <shared_mutex> + +namespace tracy +{ +using TracyMutex = std::shared_mutex; +} + +#else + +#include <mutex> + +namespace tracy +{ +using TracyMutex = std::mutex; +} + +#endif + +#endif diff --git a/Source/ThirdParty/tracy/common/TracyProtocol.hpp b/Source/ThirdParty/tracy/common/TracyProtocol.hpp new file mode 100644 index 000000000..2326a7f32 --- /dev/null +++ b/Source/ThirdParty/tracy/common/TracyProtocol.hpp @@ -0,0 +1,128 @@ +#ifndef __TRACYPROTOCOL_HPP__ +#define __TRACYPROTOCOL_HPP__ + +#include <limits> +#include <stdint.h> + +namespace tracy +{ + +constexpr unsigned Lz4CompressBound( unsigned isize ) { return isize + ( isize / 255 ) + 16; } + +enum : uint32_t { ProtocolVersion = 46 }; +enum : uint16_t { BroadcastVersion = 2 }; + +using lz4sz_t = uint32_t; + +enum { TargetFrameSize = 256 * 1024 }; +enum { LZ4Size = Lz4CompressBound( TargetFrameSize ) }; +static_assert( LZ4Size <= std::numeric_limits<lz4sz_t>::max(), "LZ4Size greater than lz4sz_t" ); +static_assert( TargetFrameSize * 2 >= 64 * 1024, "Not enough space for LZ4
stream buffer" ); + +enum { HandshakeShibbolethSize = 8 }; +static const char HandshakeShibboleth[HandshakeShibbolethSize] = { 'T', 'r', 'a', 'c', 'y', 'P', 'r', 'f' }; + +enum HandshakeStatus : uint8_t +{ + HandshakePending, + HandshakeWelcome, + HandshakeProtocolMismatch, + HandshakeNotAvailable, + HandshakeDropped +}; + +enum { WelcomeMessageProgramNameSize = 64 }; +enum { WelcomeMessageHostInfoSize = 1024 }; + +#pragma pack( 1 ) + +// Must increase left query space after handling! +enum ServerQuery : uint8_t +{ + ServerQueryTerminate, + ServerQueryString, + ServerQueryThreadString, + ServerQuerySourceLocation, + ServerQueryPlotName, + ServerQueryCallstackFrame, + ServerQueryFrameName, + ServerQueryDisconnect, + ServerQueryExternalName, + ServerQueryParameter, + ServerQuerySymbol, + ServerQuerySymbolCode, + ServerQueryCodeLocation, + ServerQuerySourceCode, + ServerQueryDataTransfer, + ServerQueryDataTransferPart +}; + +struct ServerQueryPacket +{ + ServerQuery type; + uint64_t ptr; + uint32_t extra; +}; + +enum { ServerQueryPacketSize = sizeof( ServerQueryPacket ) }; + + +enum CpuArchitecture : uint8_t +{ + CpuArchUnknown, + CpuArchX86, + CpuArchX64, + CpuArchArm32, + CpuArchArm64 +}; + + +struct WelcomeMessage +{ + double timerMul; + int64_t initBegin; + int64_t initEnd; + uint64_t delay; + uint64_t resolution; + uint64_t epoch; + uint64_t exectime; + uint64_t pid; + int64_t samplingPeriod; + uint8_t onDemand; + uint8_t isApple; + uint8_t cpuArch; + uint8_t codeTransfer; + char cpuManufacturer[12]; + uint32_t cpuId; + char programName[WelcomeMessageProgramNameSize]; + char hostInfo[WelcomeMessageHostInfoSize]; +}; + +enum { WelcomeMessageSize = sizeof( WelcomeMessage ) }; + + +struct OnDemandPayloadMessage +{ + uint64_t frames; + uint64_t currentTime; +}; + +enum { OnDemandPayloadMessageSize = sizeof( OnDemandPayloadMessage ) }; + + +struct BroadcastMessage +{ + uint16_t broadcastVersion; + uint16_t listenPort; + uint32_t protocolVersion; + int32_t activeTime; // in seconds + char programName[WelcomeMessageProgramNameSize]; +}; + +enum { BroadcastMessageSize = sizeof( BroadcastMessage ) }; + +#pragma pack() + +} + +#endif diff --git a/Source/ThirdParty/tracy/common/TracyQueue.hpp b/Source/ThirdParty/tracy/common/TracyQueue.hpp new file mode 100644 index 000000000..d99945013 --- /dev/null +++ b/Source/ThirdParty/tracy/common/TracyQueue.hpp @@ -0,0 +1,678 @@ +#ifndef __TRACYQUEUE_HPP__ +#define __TRACYQUEUE_HPP__ + +#include + +namespace tracy +{ + +enum class QueueType : uint8_t +{ + ZoneText, + ZoneName, + Message, + MessageColor, + MessageCallstack, + MessageColorCallstack, + MessageAppInfo, + ZoneBeginAllocSrcLoc, + ZoneBeginAllocSrcLocCallstack, + CallstackSerial, + Callstack, + CallstackAlloc, + CallstackSample, + FrameImage, + ZoneBegin, + ZoneBeginCallstack, + ZoneEnd, + LockWait, + LockObtain, + LockRelease, + LockSharedWait, + LockSharedObtain, + LockSharedRelease, + LockName, + MemAlloc, + MemAllocNamed, + MemFree, + MemFreeNamed, + MemAllocCallstack, + MemAllocCallstackNamed, + MemFreeCallstack, + MemFreeCallstackNamed, + GpuZoneBegin, + GpuZoneBeginCallstack, + GpuZoneBeginAllocSrcLoc, + GpuZoneBeginAllocSrcLocCallstack, + GpuZoneEnd, + GpuZoneBeginSerial, + GpuZoneBeginCallstackSerial, + GpuZoneBeginAllocSrcLocSerial, + GpuZoneBeginAllocSrcLocCallstackSerial, + GpuZoneEndSerial, + PlotData, + ContextSwitch, + ThreadWakeup, + GpuTime, + GpuContextName, + Terminate, + KeepAlive, + ThreadContext, + GpuCalibration, + Crash, + CrashReport, + ZoneValidation, + ZoneColor, + 
ZoneValue, + FrameMarkMsg, + FrameMarkMsgStart, + FrameMarkMsgEnd, + SourceLocation, + LockAnnounce, + LockTerminate, + LockMark, + MessageLiteral, + MessageLiteralColor, + MessageLiteralCallstack, + MessageLiteralColorCallstack, + GpuNewContext, + CallstackFrameSize, + CallstackFrame, + SymbolInformation, + CodeInformation, + SysTimeReport, + TidToPid, + PlotConfig, + ParamSetup, + AckServerQueryNoop, + AckSourceCodeNotAvailable, + CpuTopology, + SingleStringData, + SecondStringData, + MemNamePayload, + StringData, + ThreadName, + PlotName, + SourceLocationPayload, + CallstackPayload, + CallstackAllocPayload, + FrameName, + FrameImageData, + ExternalName, + ExternalThreadName, + SymbolCode, + SourceCode, + NUM_TYPES +}; + +#pragma pack( 1 ) + +struct QueueThreadContext +{ + uint64_t thread; +}; + +struct QueueZoneBeginLean +{ + int64_t time; +}; + +struct QueueZoneBegin : public QueueZoneBeginLean +{ + uint64_t srcloc; // ptr +}; + +struct QueueZoneEnd +{ + int64_t time; +}; + +struct QueueZoneValidation +{ + uint32_t id; +}; + +struct QueueZoneColor +{ + uint8_t r; + uint8_t g; + uint8_t b; +}; + +struct QueueZoneValue +{ + uint64_t value; +}; + +struct QueueStringTransfer +{ + uint64_t ptr; +}; + +struct QueueFrameMark +{ + int64_t time; + uint64_t name; // ptr +}; + +struct QueueFrameImage +{ + uint32_t frame; + uint16_t w; + uint16_t h; + uint8_t flip; +}; + +struct QueueFrameImageFat : public QueueFrameImage +{ + uint64_t image; // ptr +}; + +struct QueueSourceLocation +{ + uint64_t name; + uint64_t function; // ptr + uint64_t file; // ptr + uint32_t line; + uint8_t r; + uint8_t g; + uint8_t b; +}; + +struct QueueZoneTextFat +{ + uint64_t text; // ptr + uint16_t size; +}; + +enum class LockType : uint8_t +{ + Lockable, + SharedLockable +}; + +struct QueueLockAnnounce +{ + uint32_t id; + int64_t time; + uint64_t lckloc; // ptr + LockType type; +}; + +struct QueueLockTerminate +{ + uint32_t id; + int64_t time; +}; + +struct QueueLockWait +{ + uint64_t thread; + uint32_t id; + int64_t time; +}; + +struct QueueLockObtain +{ + uint64_t thread; + uint32_t id; + int64_t time; +}; + +struct QueueLockRelease +{ + uint64_t thread; + uint32_t id; + int64_t time; +}; + +struct QueueLockMark +{ + uint64_t thread; + uint32_t id; + uint64_t srcloc; // ptr +}; + +struct QueueLockName +{ + uint32_t id; +}; + +struct QueueLockNameFat : public QueueLockName +{ + uint64_t name; // ptr + uint16_t size; +}; + +enum class PlotDataType : uint8_t +{ + Float, + Double, + Int +}; + +struct QueuePlotData +{ + uint64_t name; // ptr + int64_t time; + PlotDataType type; + union + { + double d; + float f; + int64_t i; + } data; +}; + +struct QueueMessage +{ + int64_t time; +}; + +struct QueueMessageColor : public QueueMessage +{ + uint8_t r; + uint8_t g; + uint8_t b; +}; + +struct QueueMessageLiteral : public QueueMessage +{ + uint64_t text; // ptr +}; + +struct QueueMessageColorLiteral : public QueueMessageColor +{ + uint64_t text; // ptr +}; + +struct QueueMessageFat : public QueueMessage +{ + uint64_t text; // ptr + uint16_t size; +}; + +struct QueueMessageColorFat : public QueueMessageColor +{ + uint64_t text; // ptr + uint16_t size; +}; + +// Don't change order, only add new entries at the end, this is also used on trace dumps! 
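Everything in this queue sits behind `#pragma pack( 1 )`, which is why the transport code reads fields through the `MemRead`/`MemWrite` helpers from TracyAlign.hpp rather than dereferencing pointers into the byte stream. A self-contained illustration with a stand-in packed struct (the `Event` type is hypothetical; `MemRead` is reproduced here in the same shape as the TracyAlign.hpp version):

```cpp
#include <cstdint>
#include <cstring>
#include <cstdio>

template<typename T>
static T MemRead(const void* ptr) // memcpy-based read, as in TracyAlign.hpp
{
    T val;
    memcpy(&val, ptr, sizeof(T));
    return val;
}

#pragma pack(1)
struct Event
{
    uint8_t type;
    uint64_t timestamp; // at offset 1: misaligned on purpose
};
#pragma pack()

int main()
{
    Event e{ 7, 123456789ull };
    // Casting the byte at offset 1 to uint64_t* and dereferencing would be
    // undefined behavior on strict-alignment targets; memcpy is always well-defined.
    uint64_t ts = MemRead<uint64_t>(reinterpret_cast<const char*>(&e) + 1);
    printf("type=%u ts=%llu\n", (unsigned)e.type, (unsigned long long)ts);
    return 0;
}
```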
+enum class GpuContextType : uint8_t +{ + Invalid, + OpenGl, + Vulkan, + OpenCL, + Direct3D12 +}; + +enum GpuContextFlags : uint8_t +{ + GpuContextCalibration = 1 << 0 +}; + +struct QueueGpuNewContext +{ + int64_t cpuTime; + int64_t gpuTime; + uint64_t thread; + float period; + uint8_t context; + GpuContextFlags flags; + GpuContextType type; +}; + +struct QueueGpuZoneBeginLean +{ + int64_t cpuTime; + uint64_t thread; + uint16_t queryId; + uint8_t context; +}; + +struct QueueGpuZoneBegin : public QueueGpuZoneBeginLean +{ + uint64_t srcloc; +}; + +struct QueueGpuZoneEnd +{ + int64_t cpuTime; + uint64_t thread; + uint16_t queryId; + uint8_t context; +}; + +struct QueueGpuTime +{ + int64_t gpuTime; + uint16_t queryId; + uint8_t context; +}; + +struct QueueGpuCalibration +{ + int64_t gpuTime; + int64_t cpuTime; + int64_t cpuDelta; + uint8_t context; +}; + +struct QueueGpuContextName +{ + uint8_t context; +}; + +struct QueueGpuContextNameFat : public QueueGpuContextName +{ + uint64_t ptr; + uint16_t size; +}; + +struct QueueMemNamePayload +{ + uint64_t name; +}; + +struct QueueMemAlloc +{ + int64_t time; + uint64_t thread; + uint64_t ptr; + char size[6]; +}; + +struct QueueMemFree +{ + int64_t time; + uint64_t thread; + uint64_t ptr; +}; + +struct QueueCallstackFat +{ + uint64_t ptr; +}; + +struct QueueCallstackAllocFat +{ + uint64_t ptr; + uint64_t nativePtr; +}; + +struct QueueCallstackSample +{ + int64_t time; + uint64_t thread; +}; + +struct QueueCallstackSampleFat : public QueueCallstackSample +{ + uint64_t ptr; +}; + +struct QueueCallstackFrameSize +{ + uint64_t ptr; + uint8_t size; +}; + +struct QueueCallstackFrame +{ + uint32_t line; + uint64_t symAddr; + uint32_t symLen; +}; + +struct QueueSymbolInformation +{ + uint32_t line; + uint64_t symAddr; +}; + +struct QueueCodeInformation +{ + uint64_t ptr; + uint32_t line; +}; + +struct QueueCrashReport +{ + int64_t time; + uint64_t text; // ptr +}; + +struct QueueSysTime +{ + int64_t time; + float sysTime; +}; + +struct QueueContextSwitch +{ + int64_t time; + uint64_t oldThread; + uint64_t newThread; + uint8_t cpu; + uint8_t reason; + uint8_t state; +}; + +struct QueueThreadWakeup +{ + int64_t time; + uint64_t thread; +}; + +struct QueueTidToPid +{ + uint64_t tid; + uint64_t pid; +}; + +struct QueuePlotConfig +{ + uint64_t name; // ptr + uint8_t type; +}; + +struct QueueParamSetup +{ + uint32_t idx; + uint64_t name; // ptr + uint8_t isBool; + int32_t val; +}; + +struct QueueCpuTopology +{ + uint32_t package; + uint32_t core; + uint32_t thread; +}; + +struct QueueHeader +{ + union + { + QueueType type; + uint8_t idx; + }; +}; + +struct QueueItem +{ + QueueHeader hdr; + union + { + QueueThreadContext threadCtx; + QueueZoneBegin zoneBegin; + QueueZoneBeginLean zoneBeginLean; + QueueZoneEnd zoneEnd; + QueueZoneValidation zoneValidation; + QueueZoneColor zoneColor; + QueueZoneValue zoneValue; + QueueStringTransfer stringTransfer; + QueueFrameMark frameMark; + QueueFrameImage frameImage; + QueueFrameImageFat frameImageFat; + QueueSourceLocation srcloc; + QueueZoneTextFat zoneTextFat; + QueueLockAnnounce lockAnnounce; + QueueLockTerminate lockTerminate; + QueueLockWait lockWait; + QueueLockObtain lockObtain; + QueueLockRelease lockRelease; + QueueLockMark lockMark; + QueueLockName lockName; + QueueLockNameFat lockNameFat; + QueuePlotData plotData; + QueueMessage message; + QueueMessageColor messageColor; + QueueMessageLiteral messageLiteral; + QueueMessageColorLiteral messageColorLiteral; + QueueMessageFat messageFat; + QueueMessageColorFat 
messageColorFat; + QueueGpuNewContext gpuNewContext; + QueueGpuZoneBegin gpuZoneBegin; + QueueGpuZoneBeginLean gpuZoneBeginLean; + QueueGpuZoneEnd gpuZoneEnd; + QueueGpuTime gpuTime; + QueueGpuCalibration gpuCalibration; + QueueGpuContextName gpuContextName; + QueueGpuContextNameFat gpuContextNameFat; + QueueMemAlloc memAlloc; + QueueMemFree memFree; + QueueMemNamePayload memName; + QueueCallstackFat callstackFat; + QueueCallstackAllocFat callstackAllocFat; + QueueCallstackSample callstackSample; + QueueCallstackSampleFat callstackSampleFat; + QueueCallstackFrameSize callstackFrameSize; + QueueCallstackFrame callstackFrame; + QueueSymbolInformation symbolInformation; + QueueCodeInformation codeInformation; + QueueCrashReport crashReport; + QueueSysTime sysTime; + QueueContextSwitch contextSwitch; + QueueThreadWakeup threadWakeup; + QueueTidToPid tidToPid; + QueuePlotConfig plotConfig; + QueueParamSetup paramSetup; + QueueCpuTopology cpuTopology; + }; +}; +#pragma pack() + + +enum { QueueItemSize = sizeof( QueueItem ) }; + +static constexpr size_t QueueDataSize[] = { + sizeof( QueueHeader ), // zone text + sizeof( QueueHeader ), // zone name + sizeof( QueueHeader ) + sizeof( QueueMessage ), + sizeof( QueueHeader ) + sizeof( QueueMessageColor ), + sizeof( QueueHeader ) + sizeof( QueueMessage ), // callstack + sizeof( QueueHeader ) + sizeof( QueueMessageColor ), // callstack + sizeof( QueueHeader ) + sizeof( QueueMessage ), // app info + sizeof( QueueHeader ) + sizeof( QueueZoneBeginLean ), // allocated source location + sizeof( QueueHeader ) + sizeof( QueueZoneBeginLean ), // allocated source location, callstack + sizeof( QueueHeader ), // callstack memory + sizeof( QueueHeader ), // callstack + sizeof( QueueHeader ), // callstack alloc + sizeof( QueueHeader ) + sizeof( QueueCallstackSample ), + sizeof( QueueHeader ) + sizeof( QueueFrameImage ), + sizeof( QueueHeader ) + sizeof( QueueZoneBegin ), + sizeof( QueueHeader ) + sizeof( QueueZoneBegin ), // callstack + sizeof( QueueHeader ) + sizeof( QueueZoneEnd ), + sizeof( QueueHeader ) + sizeof( QueueLockWait ), + sizeof( QueueHeader ) + sizeof( QueueLockObtain ), + sizeof( QueueHeader ) + sizeof( QueueLockRelease ), + sizeof( QueueHeader ) + sizeof( QueueLockWait ), // shared + sizeof( QueueHeader ) + sizeof( QueueLockObtain ), // shared + sizeof( QueueHeader ) + sizeof( QueueLockRelease ), // shared + sizeof( QueueHeader ) + sizeof( QueueLockName ), + sizeof( QueueHeader ) + sizeof( QueueMemAlloc ), + sizeof( QueueHeader ) + sizeof( QueueMemAlloc ), // named + sizeof( QueueHeader ) + sizeof( QueueMemFree ), + sizeof( QueueHeader ) + sizeof( QueueMemFree ), // named + sizeof( QueueHeader ) + sizeof( QueueMemAlloc ), // callstack + sizeof( QueueHeader ) + sizeof( QueueMemAlloc ), // callstack, named + sizeof( QueueHeader ) + sizeof( QueueMemFree ), // callstack + sizeof( QueueHeader ) + sizeof( QueueMemFree ), // callstack, named + sizeof( QueueHeader ) + sizeof( QueueGpuZoneBegin ), + sizeof( QueueHeader ) + sizeof( QueueGpuZoneBegin ), // callstack + sizeof( QueueHeader ) + sizeof( QueueGpuZoneBeginLean ),// allocated source location + sizeof( QueueHeader ) + sizeof( QueueGpuZoneBeginLean ),// allocated source location, callstack + sizeof( QueueHeader ) + sizeof( QueueGpuZoneEnd ), + sizeof( QueueHeader ) + sizeof( QueueGpuZoneBegin ), // serial + sizeof( QueueHeader ) + sizeof( QueueGpuZoneBegin ), // serial, callstack + sizeof( QueueHeader ) + sizeof( QueueGpuZoneBeginLean ),// serial, allocated source location + sizeof( QueueHeader ) + 
sizeof( QueueGpuZoneBeginLean ),// serial, allocated source location, callstack + sizeof( QueueHeader ) + sizeof( QueueGpuZoneEnd ), // serial + sizeof( QueueHeader ) + sizeof( QueuePlotData ), + sizeof( QueueHeader ) + sizeof( QueueContextSwitch ), + sizeof( QueueHeader ) + sizeof( QueueThreadWakeup ), + sizeof( QueueHeader ) + sizeof( QueueGpuTime ), + sizeof( QueueHeader ) + sizeof( QueueGpuContextName ), + // above items must be first + sizeof( QueueHeader ), // terminate + sizeof( QueueHeader ), // keep alive + sizeof( QueueHeader ) + sizeof( QueueThreadContext ), + sizeof( QueueHeader ) + sizeof( QueueGpuCalibration ), + sizeof( QueueHeader ), // crash + sizeof( QueueHeader ) + sizeof( QueueCrashReport ), + sizeof( QueueHeader ) + sizeof( QueueZoneValidation ), + sizeof( QueueHeader ) + sizeof( QueueZoneColor ), + sizeof( QueueHeader ) + sizeof( QueueZoneValue ), + sizeof( QueueHeader ) + sizeof( QueueFrameMark ), // continuous frames + sizeof( QueueHeader ) + sizeof( QueueFrameMark ), // start + sizeof( QueueHeader ) + sizeof( QueueFrameMark ), // end + sizeof( QueueHeader ) + sizeof( QueueSourceLocation ), + sizeof( QueueHeader ) + sizeof( QueueLockAnnounce ), + sizeof( QueueHeader ) + sizeof( QueueLockTerminate ), + sizeof( QueueHeader ) + sizeof( QueueLockMark ), + sizeof( QueueHeader ) + sizeof( QueueMessageLiteral ), + sizeof( QueueHeader ) + sizeof( QueueMessageColorLiteral ), + sizeof( QueueHeader ) + sizeof( QueueMessageLiteral ), // callstack + sizeof( QueueHeader ) + sizeof( QueueMessageColorLiteral ), // callstack + sizeof( QueueHeader ) + sizeof( QueueGpuNewContext ), + sizeof( QueueHeader ) + sizeof( QueueCallstackFrameSize ), + sizeof( QueueHeader ) + sizeof( QueueCallstackFrame ), + sizeof( QueueHeader ) + sizeof( QueueSymbolInformation ), + sizeof( QueueHeader ) + sizeof( QueueCodeInformation ), + sizeof( QueueHeader ) + sizeof( QueueSysTime ), + sizeof( QueueHeader ) + sizeof( QueueTidToPid ), + sizeof( QueueHeader ) + sizeof( QueuePlotConfig ), + sizeof( QueueHeader ) + sizeof( QueueParamSetup ), + sizeof( QueueHeader ), // server query acknowledgement + sizeof( QueueHeader ), // source code not available + sizeof( QueueHeader ) + sizeof( QueueCpuTopology ), + sizeof( QueueHeader ), // single string data + sizeof( QueueHeader ), // second string data + sizeof( QueueHeader ) + sizeof( QueueMemNamePayload ), + // keep all QueueStringTransfer below + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // string data + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // thread name + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // plot name + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // allocated source location payload + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // callstack payload + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // callstack alloc payload + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // frame name + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // frame image data + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // external name + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // external thread name + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // symbol code + sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // source code +}; + +static_assert( QueueItemSize == 32, "Queue item size not 32 bytes" ); +static_assert( sizeof( QueueDataSize ) / sizeof( size_t ) == (uint8_t)QueueType::NUM_TYPES, "QueueDataSize mismatch" ); +static_assert( sizeof( 
void* ) <= sizeof( uint64_t ), "Pointer size > 8 bytes" ); +static_assert( sizeof( void* ) == sizeof( uintptr_t ), "Pointer size != uintptr_t" ); + +} + +#endif diff --git a/Source/ThirdParty/tracy/common/TracySocket.cpp b/Source/ThirdParty/tracy/common/TracySocket.cpp new file mode 100644 index 000000000..f16569b06 --- /dev/null +++ b/Source/ThirdParty/tracy/common/TracySocket.cpp @@ -0,0 +1,748 @@ +#include <assert.h> +#include <inttypes.h> +#include <new> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/types.h> + +#include "TracyAlloc.hpp" +#include "TracySocket.hpp" + +#ifdef _WIN32 +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include <winsock2.h> +# include <ws2tcpip.h> +# ifdef _MSC_VER +# pragma warning(disable:4244) +# pragma warning(disable:4267) +# endif +# define poll WSAPoll +#else +# include <arpa/inet.h> +# include <sys/socket.h> +# include <sys/param.h> +# include <errno.h> +# include <fcntl.h> +# include <netinet/in.h> +# include <netdb.h> +# include <unistd.h> +# include <poll.h> +#endif + +#ifndef MSG_NOSIGNAL +# define MSG_NOSIGNAL 0 +#endif + +namespace tracy +{ + +#ifdef _WIN32 +typedef SOCKET socket_t; +#else +typedef int socket_t; +#endif + +#ifdef _WIN32 +struct __wsinit +{ + __wsinit() + { + WSADATA wsaData; + if( WSAStartup( MAKEWORD( 2, 2 ), &wsaData ) != 0 ) + { + fprintf( stderr, "Cannot init winsock.\n" ); + exit( 1 ); + } + } +}; + +void InitWinSock() +{ + static __wsinit init; +} +#endif + + +enum { BufSize = 128 * 1024 }; + +Socket::Socket() + : m_buf( (char*)tracy_malloc( BufSize ) ) + , m_bufPtr( nullptr ) + , m_sock( -1 ) + , m_bufLeft( 0 ) + , m_ptr( nullptr ) +{ +#ifdef _WIN32 + InitWinSock(); +#endif +} + +Socket::Socket( int sock ) + : m_buf( (char*)tracy_malloc( BufSize ) ) + , m_bufPtr( nullptr ) + , m_sock( sock ) + , m_bufLeft( 0 ) + , m_ptr( nullptr ) +{ +} + +Socket::~Socket() +{ + tracy_free( m_buf ); + if( m_sock.load( std::memory_order_relaxed ) != -1 ) + { + Close(); + } + if( m_ptr ) + { + freeaddrinfo( m_res ); +#ifdef _WIN32 + closesocket( m_connSock ); +#else + close( m_connSock ); +#endif + } +} + +bool Socket::Connect( const char* addr, uint16_t port ) +{ + assert( !IsValid() ); + + if( m_ptr ) + { + const auto c = connect( m_connSock, m_ptr->ai_addr, m_ptr->ai_addrlen ); + if( c == -1 ) + { +#if defined _WIN32 + const auto err = WSAGetLastError(); + if( err == WSAEALREADY || err == WSAEINPROGRESS ) return false; + if( err != WSAEISCONN ) + { + freeaddrinfo( m_res ); + closesocket( m_connSock ); + m_ptr = nullptr; + return false; + } +#else + const auto err = errno; + if( err == EALREADY || err == EINPROGRESS ) return false; + if( err != EISCONN ) + { + freeaddrinfo( m_res ); + close( m_connSock ); + m_ptr = nullptr; + return false; + } +#endif + } + +#if defined _WIN32 + u_long nonblocking = 0; + ioctlsocket( m_connSock, FIONBIO, &nonblocking ); +#else + int flags = fcntl( m_connSock, F_GETFL, 0 ); + fcntl( m_connSock, F_SETFL, flags & ~O_NONBLOCK ); +#endif + m_sock.store( m_connSock, std::memory_order_relaxed ); + freeaddrinfo( m_res ); + m_ptr = nullptr; + return true; + } + + struct addrinfo hints; + struct addrinfo *res, *ptr; + + memset( &hints, 0, sizeof( hints ) ); + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + + char portbuf[32]; + sprintf( portbuf, "%" PRIu16, port ); + + if( getaddrinfo( addr, portbuf, &hints, &res ) != 0 ) return false; + int sock = 0; + for( ptr = res; ptr; ptr = ptr->ai_next ) + { + if( ( sock = socket( ptr->ai_family, ptr->ai_socktype, ptr->ai_protocol ) ) == -1 ) continue; +#if defined __APPLE__ + int val = 1; + setsockopt( sock, SOL_SOCKET, SO_NOSIGPIPE, &val, sizeof( val ) ); +#endif +#if defined _WIN32 + u_long nonblocking = 1; + ioctlsocket( sock, 
FIONBIO, &nonblocking ); +#else + int flags = fcntl( sock, F_GETFL, 0 ); + fcntl( sock, F_SETFL, flags | O_NONBLOCK ); +#endif + if( connect( sock, ptr->ai_addr, ptr->ai_addrlen ) == 0 ) + { + break; + } + else + { +#if defined _WIN32 + const auto err = WSAGetLastError(); + if( err != WSAEWOULDBLOCK ) + { + closesocket( sock ); + continue; + } +#else + if( errno != EINPROGRESS ) + { + close( sock ); + continue; + } +#endif + } + m_res = res; + m_ptr = ptr; + m_connSock = sock; + return false; + } + freeaddrinfo( res ); + if( !ptr ) return false; + +#if defined _WIN32 + u_long nonblocking = 0; + ioctlsocket( sock, FIONBIO, &nonblocking ); +#else + int flags = fcntl( sock, F_GETFL, 0 ); + fcntl( sock, F_SETFL, flags & ~O_NONBLOCK ); +#endif + + m_sock.store( sock, std::memory_order_relaxed ); + return true; +} + +bool Socket::ConnectBlocking( const char* addr, uint16_t port ) +{ + assert( !IsValid() ); + assert( !m_ptr ); + + struct addrinfo hints; + struct addrinfo *res, *ptr; + + memset( &hints, 0, sizeof( hints ) ); + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + + char portbuf[32]; + sprintf( portbuf, "%" PRIu16, port ); + + if( getaddrinfo( addr, portbuf, &hints, &res ) != 0 ) return false; + int sock = 0; + for( ptr = res; ptr; ptr = ptr->ai_next ) + { + if( ( sock = socket( ptr->ai_family, ptr->ai_socktype, ptr->ai_protocol ) ) == -1 ) continue; +#if defined __APPLE__ + int val = 1; + setsockopt( sock, SOL_SOCKET, SO_NOSIGPIPE, &val, sizeof( val ) ); +#endif + if( connect( sock, ptr->ai_addr, ptr->ai_addrlen ) == -1 ) + { +#ifdef _WIN32 + closesocket( sock ); +#else + close( sock ); +#endif + continue; + } + break; + } + freeaddrinfo( res ); + if( !ptr ) return false; + + m_sock.store( sock, std::memory_order_relaxed ); + return true; +} + +void Socket::Close() +{ + const auto sock = m_sock.load( std::memory_order_relaxed ); + assert( sock != -1 ); +#ifdef _WIN32 + closesocket( sock ); +#else + close( sock ); +#endif + m_sock.store( -1, std::memory_order_relaxed ); +} + +int Socket::Send( const void* _buf, int len ) +{ + const auto sock = m_sock.load( std::memory_order_relaxed ); + auto buf = (const char*)_buf; + assert( sock != -1 ); + auto start = buf; + while( len > 0 ) + { + auto ret = send( sock, buf, len, MSG_NOSIGNAL ); + if( ret == -1 ) return -1; + len -= ret; + buf += ret; + } + return int( buf - start ); +} + +int Socket::GetSendBufSize() +{ + const auto sock = m_sock.load( std::memory_order_relaxed ); + int bufSize; +#if defined _WIN32 + int sz = sizeof( bufSize ); + getsockopt( sock, SOL_SOCKET, SO_SNDBUF, (char*)&bufSize, &sz ); +#else + socklen_t sz = sizeof( bufSize ); + getsockopt( sock, SOL_SOCKET, SO_SNDBUF, &bufSize, &sz ); +#endif + return bufSize; +} + +int Socket::RecvBuffered( void* buf, int len, int timeout ) +{ + if( len <= m_bufLeft ) + { + memcpy( buf, m_bufPtr, len ); + m_bufPtr += len; + m_bufLeft -= len; + return len; + } + + if( m_bufLeft > 0 ) + { + memcpy( buf, m_bufPtr, m_bufLeft ); + const auto ret = m_bufLeft; + m_bufLeft = 0; + return ret; + } + + if( len >= BufSize ) return Recv( buf, len, timeout ); + + m_bufLeft = Recv( m_buf, BufSize, timeout ); + if( m_bufLeft <= 0 ) return m_bufLeft; + + const auto sz = len < m_bufLeft ? 
len : m_bufLeft; + memcpy( buf, m_buf, sz ); + m_bufPtr = m_buf + sz; + m_bufLeft -= sz; + return sz; +} + +int Socket::Recv( void* _buf, int len, int timeout ) +{ + const auto sock = m_sock.load( std::memory_order_relaxed ); + auto buf = (char*)_buf; + + struct pollfd fd; + fd.fd = (socket_t)sock; + fd.events = POLLIN; + + if( poll( &fd, 1, timeout ) > 0 ) + { + return recv( sock, buf, len, 0 ); + } + else + { + return -1; + } +} + +int Socket::ReadUpTo( void* _buf, int len, int timeout ) +{ + const auto sock = m_sock.load( std::memory_order_relaxed ); + auto buf = (char*)_buf; + + int rd = 0; + while( len > 0 ) + { + const auto res = recv( sock, buf, len, 0 ); + if( res == 0 ) break; + if( res == -1 ) return -1; + len -= res; + rd += res; + buf += res; + } + return rd; +} + +bool Socket::Read( void* buf, int len, int timeout ) +{ + auto cbuf = (char*)buf; + while( len > 0 ) + { + if( !ReadImpl( cbuf, len, timeout ) ) return false; + } + return true; +} + +bool Socket::ReadImpl( char*& buf, int& len, int timeout ) +{ + const auto sz = RecvBuffered( buf, len, timeout ); + switch( sz ) + { + case 0: + return false; + case -1: +#ifdef _WIN32 + { + auto err = WSAGetLastError(); + if( err == WSAECONNABORTED || err == WSAECONNRESET ) return false; + } +#endif + break; + default: + len -= sz; + buf += sz; + break; + } + return true; +} + +bool Socket::ReadRaw( void* _buf, int len, int timeout ) +{ + auto buf = (char*)_buf; + while( len > 0 ) + { + const auto sz = Recv( buf, len, timeout ); + if( sz <= 0 ) return false; + len -= sz; + buf += sz; + } + return true; +} + +bool Socket::HasData() +{ + const auto sock = m_sock.load( std::memory_order_relaxed ); + if( m_bufLeft > 0 ) return true; + + struct pollfd fd; + fd.fd = (socket_t)sock; + fd.events = POLLIN; + + return poll( &fd, 1, 0 ) > 0; +} + +bool Socket::IsValid() const +{ + return m_sock.load( std::memory_order_relaxed ) >= 0; +} + + +ListenSocket::ListenSocket() + : m_sock( -1 ) +{ +#ifdef _WIN32 + InitWinSock(); +#endif +} + +ListenSocket::~ListenSocket() +{ + if( m_sock != -1 ) Close(); +} + +static int addrinfo_and_socket_for_family( uint16_t port, int ai_family, struct addrinfo** res ) +{ + struct addrinfo hints; + memset( &hints, 0, sizeof( hints ) ); + hints.ai_family = ai_family; + hints.ai_socktype = SOCK_STREAM; +#ifndef TRACY_ONLY_LOCALHOST + const char* onlyLocalhost = getenv( "TRACY_ONLY_LOCALHOST" ); + if( !onlyLocalhost || onlyLocalhost[0] != '1' ) + { + hints.ai_flags = AI_PASSIVE; + } +#endif + char portbuf[32]; + sprintf( portbuf, "%" PRIu16, port ); + if( getaddrinfo( nullptr, portbuf, &hints, res ) != 0 ) return -1; + int sock = socket( (*res)->ai_family, (*res)->ai_socktype, (*res)->ai_protocol ); + if (sock == -1) freeaddrinfo( *res ); + return sock; +} + +bool ListenSocket::Listen( uint16_t port, int backlog ) +{ + assert( m_sock == -1 ); + + struct addrinfo* res = nullptr; + +#if !defined TRACY_ONLY_IPV4 && !defined TRACY_ONLY_LOCALHOST + const char* onlyIPv4 = getenv( "TRACY_ONLY_IPV4" ); + if( !onlyIPv4 || onlyIPv4[0] != '1' ) + { + m_sock = addrinfo_and_socket_for_family( port, AF_INET6, &res ); + } +#endif + if (m_sock == -1) + { + // IPV6 protocol may not be available/is disabled. 
Try to create a socket + // with the IPV4 protocol + m_sock = addrinfo_and_socket_for_family( port, AF_INET, &res ); + if( m_sock == -1 ) return false; + } +#if defined _WIN32 || defined __CYGWIN__ + unsigned long val = 0; + setsockopt( m_sock, IPPROTO_IPV6, IPV6_V6ONLY, (const char*)&val, sizeof( val ) ); +#elif defined BSD + int val = 0; + setsockopt( m_sock, IPPROTO_IPV6, IPV6_V6ONLY, (const char*)&val, sizeof( val ) ); + val = 1; + setsockopt( m_sock, SOL_SOCKET, SO_REUSEADDR, &val, sizeof( val ) ); +#else + int val = 1; + setsockopt( m_sock, SOL_SOCKET, SO_REUSEADDR, &val, sizeof( val ) ); +#endif + if( bind( m_sock, res->ai_addr, res->ai_addrlen ) == -1 ) { freeaddrinfo( res ); Close(); return false; } + if( listen( m_sock, backlog ) == -1 ) { freeaddrinfo( res ); Close(); return false; } + freeaddrinfo( res ); + return true; +} + +Socket* ListenSocket::Accept() +{ + struct sockaddr_storage remote; + socklen_t sz = sizeof( remote ); + + struct pollfd fd; + fd.fd = (socket_t)m_sock; + fd.events = POLLIN; + + if( poll( &fd, 1, 10 ) > 0 ) + { + int sock = accept( m_sock, (sockaddr*)&remote, &sz); + if( sock == -1 ) return nullptr; + +#if defined __APPLE__ + int val = 1; + setsockopt( sock, SOL_SOCKET, SO_NOSIGPIPE, &val, sizeof( val ) ); +#endif + + auto ptr = (Socket*)tracy_malloc( sizeof( Socket ) ); + new(ptr) Socket( sock ); + return ptr; + } + else + { + return nullptr; + } +} + +void ListenSocket::Close() +{ + assert( m_sock != -1 ); +#ifdef _WIN32 + closesocket( m_sock ); +#else + close( m_sock ); +#endif + m_sock = -1; +} + +UdpBroadcast::UdpBroadcast() + : m_sock( -1 ) +{ +#ifdef _WIN32 + InitWinSock(); +#endif +} + +UdpBroadcast::~UdpBroadcast() +{ + if( m_sock != -1 ) Close(); +} + +bool UdpBroadcast::Open( const char* addr, uint16_t port ) +{ + assert( m_sock == -1 ); + + struct addrinfo hints; + struct addrinfo *res, *ptr; + + memset( &hints, 0, sizeof( hints ) ); + hints.ai_family = AF_INET; + hints.ai_socktype = SOCK_DGRAM; + + char portbuf[32]; + sprintf( portbuf, "%" PRIu16, port ); + + if( getaddrinfo( addr, portbuf, &hints, &res ) != 0 ) return false; + int sock = 0; + for( ptr = res; ptr; ptr = ptr->ai_next ) + { + if( ( sock = socket( ptr->ai_family, ptr->ai_socktype, ptr->ai_protocol ) ) == -1 ) continue; +#if defined __APPLE__ + int val = 1; + setsockopt( sock, SOL_SOCKET, SO_NOSIGPIPE, &val, sizeof( val ) ); +#endif +#if defined _WIN32 + unsigned long broadcast = 1; + if( setsockopt( sock, SOL_SOCKET, SO_BROADCAST, (const char*)&broadcast, sizeof( broadcast ) ) == -1 ) +#else + int broadcast = 1; + if( setsockopt( sock, SOL_SOCKET, SO_BROADCAST, &broadcast, sizeof( broadcast ) ) == -1 ) +#endif + { +#ifdef _WIN32 + closesocket( sock ); +#else + close( sock ); +#endif + continue; + } + break; + } + freeaddrinfo( res ); + if( !ptr ) return false; + + m_sock = sock; + inet_pton( AF_INET, addr, &m_addr ); + return true; +} + +void UdpBroadcast::Close() +{ + assert( m_sock != -1 ); +#ifdef _WIN32 + closesocket( m_sock ); +#else + close( m_sock ); +#endif + m_sock = -1; +} + +int UdpBroadcast::Send( uint16_t port, const void* data, int len ) +{ + assert( m_sock != -1 ); + struct sockaddr_in addr; + addr.sin_family = AF_INET; + addr.sin_port = htons( port ); + addr.sin_addr.s_addr = m_addr; + return sendto( m_sock, (const char*)data, len, MSG_NOSIGNAL, (sockaddr*)&addr, sizeof( addr ) ); +} + +IpAddress::IpAddress() + : m_number( 0 ) +{ + *m_text = '\0'; +} + +IpAddress::~IpAddress() +{ +} + +void IpAddress::Set( const struct sockaddr& addr ) +{ +#if defined _WIN32 && ( 
!defined NTDDI_WIN10 || NTDDI_VERSION < NTDDI_WIN10 ) + struct sockaddr_in tmp; + memcpy( &tmp, &addr, sizeof( tmp ) ); + auto ai = &tmp; +#else + auto ai = (const struct sockaddr_in*)&addr; +#endif + inet_ntop( AF_INET, &ai->sin_addr, m_text, 17 ); + m_number = ai->sin_addr.s_addr; +} + +UdpListen::UdpListen() + : m_sock( -1 ) +{ +#ifdef _WIN32 + InitWinSock(); +#endif +} + +UdpListen::~UdpListen() +{ + if( m_sock != -1 ) Close(); +} + +bool UdpListen::Listen( uint16_t port ) +{ + assert( m_sock == -1 ); + + int sock; + if( ( sock = socket( AF_INET, SOCK_DGRAM, 0 ) ) == -1 ) return false; + +#if defined __APPLE__ + int val = 1; + setsockopt( sock, SOL_SOCKET, SO_NOSIGPIPE, &val, sizeof( val ) ); +#endif +#if defined _WIN32 + unsigned long reuse = 1; + setsockopt( sock, SOL_SOCKET, SO_REUSEADDR, (const char*)&reuse, sizeof( reuse ) ); +#else + int reuse = 1; + setsockopt( sock, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof( reuse ) ); +#endif +#if defined _WIN32 + unsigned long broadcast = 1; + if( setsockopt( sock, SOL_SOCKET, SO_BROADCAST, (const char*)&broadcast, sizeof( broadcast ) ) == -1 ) +#else + int broadcast = 1; + if( setsockopt( sock, SOL_SOCKET, SO_BROADCAST, &broadcast, sizeof( broadcast ) ) == -1 ) +#endif + { +#ifdef _WIN32 + closesocket( sock ); +#else + close( sock ); +#endif + return false; + } + + struct sockaddr_in addr; + addr.sin_family = AF_INET; + addr.sin_port = htons( port ); + addr.sin_addr.s_addr = INADDR_ANY; + + if( bind( sock, (sockaddr*)&addr, sizeof( addr ) ) == -1 ) + { +#ifdef _WIN32 + closesocket( sock ); +#else + close( sock ); +#endif + return false; + } + + m_sock = sock; + return true; +} + +void UdpListen::Close() +{ + assert( m_sock != -1 ); +#ifdef _WIN32 + closesocket( m_sock ); +#else + close( m_sock ); +#endif + m_sock = -1; +} + +const char* UdpListen::Read( size_t& len, IpAddress& addr, int timeout ) +{ + static char buf[2048]; + + struct pollfd fd; + fd.fd = (socket_t)m_sock; + fd.events = POLLIN; + if( poll( &fd, 1, timeout ) <= 0 ) return nullptr; + + sockaddr sa; + socklen_t salen = sizeof( struct sockaddr ); + len = (size_t)recvfrom( m_sock, buf, 2048, 0, &sa, &salen ); + addr.Set( sa ); + + return buf; +} + +}
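The UdpBroadcast and UdpListen pair above implements Tracy's client discovery: the profiled application periodically broadcasts a small announcement datagram, and a listening server receives it together with the sender's address. A minimal server-side sketch of that loop, assuming upstream Tracy's broadcast port (8086) and an arbitrary 100 ms poll interval; both values are illustrative assumptions, not taken from this patch:

    #include <stdio.h>
    #include "TracySocket.hpp"

    void DiscoverClients()
    {
        tracy::UdpListen listen;
        if( !listen.Listen( 8086 ) ) return; // assumed broadcast port
        for( ;; )
        {
            size_t len;
            tracy::IpAddress addr;
            // Polls for up to 100 ms; returns nullptr when nothing arrived
            const char* msg = listen.Read( len, addr, 100 );
            if( !msg ) continue;
            // addr now holds the announcing client's IPv4 address
            printf( "announce (%d bytes) from %s\n", (int)len, addr.GetText() );
        }
    }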
diff --git a/Source/ThirdParty/tracy/common/TracySocket.hpp b/Source/ThirdParty/tracy/common/TracySocket.hpp new file mode 100644 index 000000000..4fbb3278a --- /dev/null +++ b/Source/ThirdParty/tracy/common/TracySocket.hpp @@ -0,0 +1,154 @@ +#ifndef __TRACYSOCKET_HPP__ +#define __TRACYSOCKET_HPP__ + +#include <atomic> +#include <stdint.h> + +struct addrinfo; +struct sockaddr; + +namespace tracy +{ + +#ifdef _WIN32 +void InitWinSock(); +#endif + +class Socket +{ +public: + Socket(); + Socket( int sock ); + ~Socket(); + + bool Connect( const char* addr, uint16_t port ); + bool ConnectBlocking( const char* addr, uint16_t port ); + void Close(); + + int Send( const void* buf, int len ); + int GetSendBufSize(); + + int ReadUpTo( void* buf, int len, int timeout ); + bool Read( void* buf, int len, int timeout ); + + template<typename ShouldExit> + bool Read( void* buf, int len, int timeout, ShouldExit exitCb ) + { + auto cbuf = (char*)buf; + while( len > 0 ) + { + if( exitCb() ) return false; + if( !ReadImpl( cbuf, len, timeout ) ) return false; + } + return true; + } + + bool ReadRaw( void* buf, int len, int timeout ); + bool HasData(); + bool IsValid() const; + + Socket( const Socket& ) = delete; + Socket( Socket&& ) = delete; + Socket& operator=( const Socket& ) = delete; + Socket& operator=( Socket&& ) = delete; + +private: + int RecvBuffered( void* buf, int len, int timeout ); + int Recv( void* buf, int len, int timeout ); + + bool ReadImpl( char*& buf, int& len, int timeout ); + + char* m_buf; + char* m_bufPtr; + std::atomic<int> m_sock; + int m_bufLeft; + + struct addrinfo *m_res; + struct addrinfo *m_ptr; + int m_connSock; +}; + +class ListenSocket +{ +public: + ListenSocket(); + ~ListenSocket(); + + bool Listen( uint16_t port, int backlog ); + Socket* Accept(); + void Close(); + + ListenSocket( const ListenSocket& ) = delete; + ListenSocket( ListenSocket&& ) = delete; + ListenSocket& operator=( const ListenSocket& ) = delete; + ListenSocket& operator=( ListenSocket&& ) = delete; + +private: + int m_sock; +}; + +class UdpBroadcast +{ +public: + UdpBroadcast(); + ~UdpBroadcast(); + + bool Open( const char* addr, uint16_t port ); + void Close(); + + int Send( uint16_t port, const void* data, int len ); + + UdpBroadcast( const UdpBroadcast& ) = delete; + UdpBroadcast( UdpBroadcast&& ) = delete; + UdpBroadcast& operator=( const UdpBroadcast& ) = delete; + UdpBroadcast& operator=( UdpBroadcast&& ) = delete; + +private: + int m_sock; + uint32_t m_addr; +}; + +class IpAddress +{ +public: + IpAddress(); + ~IpAddress(); + + void Set( const struct sockaddr& addr ); + + uint32_t GetNumber() const { return m_number; } + const char* GetText() const { return m_text; } + + IpAddress( const IpAddress& ) = delete; + IpAddress( IpAddress&& ) = delete; + IpAddress& operator=( const IpAddress& ) = delete; + IpAddress& operator=( IpAddress&& ) = delete; + +private: + uint32_t m_number; + char m_text[17]; +}; + +class UdpListen +{ +public: + UdpListen(); + ~UdpListen(); + + bool Listen( uint16_t port ); + void Close(); + + const char* Read( size_t& len, IpAddress& addr, int timeout ); + + UdpListen( const UdpListen& ) = delete; + UdpListen( UdpListen&& ) = delete; + UdpListen& operator=( const UdpListen& ) = delete; + UdpListen& operator=( UdpListen&& ) = delete; + +private: + int m_sock; +}; + +} + +#endif
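Socket::Read above is the workhorse of the protocol layer: it loops over the internal buffered receive until exactly len bytes have arrived or the peer disconnects, so callers can treat the TCP stream as a sequence of fixed-size records. A sketch of how a server might accept one connection and read such a record; the port, record size, and timeout are illustrative assumptions:

    #include "TracyAlloc.hpp"
    #include "TracySocket.hpp"

    bool ReadOneRecord( uint16_t port )
    {
        tracy::ListenSocket listen;
        if( !listen.Listen( port, 4 ) ) return false;
        tracy::Socket* sock = nullptr;
        while( !sock ) sock = listen.Accept(); // Accept() polls in 10 ms slices
        char record[64];
        // Returns false if the connection drops before all 64 bytes arrive
        const bool ok = sock->Read( record, sizeof( record ), 10000 );
        // Accept() allocates the Socket with tracy_malloc, so release it the same way
        sock->~Socket();
        tracy_free( sock );
        return ok;
    }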
diff --git a/Source/ThirdParty/tracy/common/TracySystem.cpp b/Source/ThirdParty/tracy/common/TracySystem.cpp new file mode 100644 index 000000000..25ccf9f8a --- /dev/null +++ b/Source/ThirdParty/tracy/common/TracySystem.cpp @@ -0,0 +1,239 @@ +#if defined _MSC_VER || defined __CYGWIN__ || defined _WIN32 +# ifndef WIN32_LEAN_AND_MEAN +# define WIN32_LEAN_AND_MEAN +# endif +# ifndef NOMINMAX +# define NOMINMAX +# endif +#endif +#ifdef _MSC_VER +# pragma warning(disable:4996) +#endif +#if defined _WIN32 || defined __CYGWIN__ +# include <windows.h> +#else +# include <pthread.h> +# include <string.h> +# include <unistd.h> +#endif + +#ifdef __linux__ +# ifdef __ANDROID__ +# include <sys/types.h> +# else +# include <sys/syscall.h> +# endif +# include <fcntl.h> +#elif defined __FreeBSD__ +# include <sys/thr.h> +#elif defined __NetBSD__ || defined __DragonFly__ +# include <sys/lwp.h> +#endif + +#ifdef __MINGW32__ +# define __STDC_FORMAT_MACROS +#endif +#include <inttypes.h> +#include <stdio.h> +#include <string.h> + +#include "TracySystem.hpp" + +#if defined _WIN32 || defined __CYGWIN__ +extern "C" typedef HRESULT (WINAPI *t_SetThreadDescription)( HANDLE, PCWSTR ); +extern "C" typedef HRESULT (WINAPI *t_GetThreadDescription)( HANDLE, PWSTR* ); +#endif + +#ifdef TRACY_ENABLE +# include <atomic> +# include "TracyAlloc.hpp" +#endif + +namespace tracy +{ + +namespace detail +{ + +TRACY_API uint64_t GetThreadHandleImpl() +{ +#if defined _WIN32 || defined __CYGWIN__ + static_assert( sizeof( decltype( GetCurrentThreadId() ) ) <= sizeof( uint64_t ), "Thread handle too big to fit in protocol" ); + return uint64_t( GetCurrentThreadId() ); +#elif defined __APPLE__ + uint64_t id; + pthread_threadid_np( pthread_self(), &id ); + return id; +#elif defined __ANDROID__ + return (uint64_t)gettid(); +#elif defined __linux__ + return (uint64_t)syscall( SYS_gettid ); +#elif defined __FreeBSD__ + long id; + thr_self( &id ); + return id; +#elif defined __NetBSD__ + return _lwp_self(); +#elif defined __DragonFly__ + return lwp_gettid(); +#elif defined __OpenBSD__ + return getthrid(); +#else + static_assert( sizeof( decltype( pthread_self() ) ) <= sizeof( uint64_t ), "Thread handle too big to fit in protocol" ); + return uint64_t( pthread_self() ); +#endif + +} + +} + +#ifdef TRACY_ENABLE +struct ThreadNameData +{ + uint64_t id; + const char* name; + ThreadNameData* next; +}; +std::atomic<ThreadNameData*>& GetThreadNameData(); +TRACY_API void InitRPMallocThread(); +#endif + +TRACY_API void SetThreadName( const char* name ) +{ +#if defined _WIN32 || defined __CYGWIN__ + static auto _SetThreadDescription = (t_SetThreadDescription)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "SetThreadDescription" ); + if( _SetThreadDescription ) + { + wchar_t buf[256]; + mbstowcs( buf, name, 256 ); + _SetThreadDescription( GetCurrentThread(), buf ); + } + else + { +# if defined _MSC_VER + const DWORD MS_VC_EXCEPTION=0x406D1388; +# pragma pack( push, 8 ) + struct THREADNAME_INFO + { + DWORD dwType; + LPCSTR szName; + DWORD dwThreadID; + DWORD dwFlags; + }; +# pragma pack(pop) + + DWORD ThreadId = GetCurrentThreadId(); + THREADNAME_INFO info; + info.dwType = 0x1000; + info.szName = name; + info.dwThreadID = ThreadId; + info.dwFlags = 0; + + __try + { + RaiseException( MS_VC_EXCEPTION, 0, sizeof(info)/sizeof(ULONG_PTR), (ULONG_PTR*)&info ); + } + __except(EXCEPTION_EXECUTE_HANDLER) + { + } +# endif + } +#elif defined _GNU_SOURCE && !defined __EMSCRIPTEN__ && !defined __CYGWIN__ + { + const auto sz = strlen( name ); + if( sz <= 15 ) + { + pthread_setname_np( pthread_self(), name ); + } + else + { + char buf[16]; + memcpy( buf, name, 15 ); + buf[15] = '\0'; + pthread_setname_np( pthread_self(), buf ); + } + } +#endif +#ifdef TRACY_ENABLE + { + InitRPMallocThread(); + const auto sz = strlen( name ); + char* buf = (char*)tracy_malloc( sz+1 ); + memcpy( buf, name, sz ); + buf[sz] = '\0'; + auto data = (ThreadNameData*)tracy_malloc( sizeof( ThreadNameData ) ); + data->id = detail::GetThreadHandleImpl(); + data->name = buf; + data->next = GetThreadNameData().load( std::memory_order_relaxed ); + while( !GetThreadNameData().compare_exchange_weak( data->next, data, std::memory_order_release, std::memory_order_relaxed ) ) {} + } +#endif +}
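The compare_exchange_weak loop that closes SetThreadName is a lock-free list prepend: data->next is seeded with the current head, and the CAS publishes the node only if no other thread has moved the head in the meantime; on failure the intrinsic reloads data->next with the new head and the loop retries. Concurrent registrations therefore cannot lose entries, and GetThreadName below can walk the list without taking a lock. The same pattern in isolation, with illustrative names:

    #include <atomic>

    struct Node { int value; Node* next; };
    static std::atomic<Node*> g_head { nullptr };

    void Push( Node* n )
    {
        n->next = g_head.load( std::memory_order_relaxed );
        // Retry until the head still equals n->next at the moment of the swap
        while( !g_head.compare_exchange_weak( n->next, n,
            std::memory_order_release, std::memory_order_relaxed ) ) {}
    }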
 +TRACY_API const char* GetThreadName( uint64_t id ) +{ + static char buf[256]; +#ifdef TRACY_ENABLE + auto ptr = GetThreadNameData().load( std::memory_order_relaxed ); + while( ptr ) + { + if( ptr->id == id ) + { + return ptr->name; + } + ptr = ptr->next; + } +#else +# if defined _WIN32 || defined __CYGWIN__ + static auto _GetThreadDescription = (t_GetThreadDescription)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "GetThreadDescription" ); + if( _GetThreadDescription ) + { + auto hnd = OpenThread( THREAD_QUERY_LIMITED_INFORMATION, FALSE, (DWORD)id ); + if( hnd != 0 ) + { + PWSTR tmp; + _GetThreadDescription( hnd, &tmp ); + auto ret = wcstombs( buf, tmp, 256 ); + CloseHandle( hnd ); + if( ret != 0 ) + { + return buf; + } + } + } +# elif defined __linux__ + int cs, fd; + char path[32]; +# ifdef __ANDROID__ + int tid = gettid(); +# else + int tid = (int) syscall( SYS_gettid ); +# endif + snprintf( path, sizeof( path ), "/proc/self/task/%d/comm", tid ); + sprintf( buf, "%" PRIu64, id ); +# ifndef __ANDROID__ + pthread_setcancelstate( PTHREAD_CANCEL_DISABLE, &cs ); +# endif + if ( ( fd = open( path, O_RDONLY ) ) > 0) { + int len = read( fd, buf, 255 ); + if( len > 0 ) + { + buf[len] = 0; + if( len > 1 && buf[len-1] == '\n' ) + { + buf[len-1] = 0; + } + } + close( fd ); + } +# ifndef __ANDROID__ + pthread_setcancelstate( cs, 0 ); +# endif + return buf; +# endif +#endif + sprintf( buf, "%" PRIu64, id ); + return buf; +} + +} diff --git a/Source/ThirdParty/tracy/common/TracySystem.hpp b/Source/ThirdParty/tracy/common/TracySystem.hpp new file mode 100644 index 000000000..f285b762a --- /dev/null +++ b/Source/ThirdParty/tracy/common/TracySystem.hpp @@ -0,0 +1,95 @@ +#ifndef __TRACYSYSTEM_HPP__ +#define __TRACYSYSTEM_HPP__ + +#include <stdint.h> + +// Tracy -> Flax integration: +// - use LZ4 from Flax +// - use engine symbols export +// - use engine types and macros +// - remove AddVectoredExceptionHandler from win32 to prevent messing with Flax crashes reporting +// - hide implementation from includers to reduce compilation overhead +// - optimize includes (faster compilation) +// - remove some features (colors, frame image, dxt1 compression) +#include "Engine/Core/Types/BaseTypes.h" +#define TRACY_API FLAXENGINE_API +#define tracy_force_inline FORCE_INLINE +#define tracy_no_inline FORCE_NOINLINE + +#ifndef TracyConcat +# define TracyConcat(x,y) TracyConcatIndirect(x,y) +#endif +#ifndef TracyConcatIndirect +# define TracyConcatIndirect(x,y) x##y +#endif + +namespace tracy +{ +enum class PlotFormatType : uint8_t +{ + Number, + Memory, + Percentage +}; + +typedef void(*ParameterCallback)( uint32_t idx, int32_t val ); + +struct TRACY_API SourceLocationData +{ + const char* name; + const char* function; + const char* file; + uint32_t line; + uint32_t color; +}; + +class TRACY_API ScopedZone +{ +public: + ScopedZone( const ScopedZone& ) = delete; + ScopedZone( ScopedZone&& ) = delete; + ScopedZone& operator=( const ScopedZone& ) = delete; + ScopedZone& operator=( ScopedZone&& ) = delete; + + ScopedZone( const SourceLocationData* srcloc, bool is_active = true ); + ScopedZone( const SourceLocationData* srcloc, int depth, bool is_active = true ); + ScopedZone( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, bool is_active = true ); + ScopedZone( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, int depth, bool is_active = true ); + + ~ScopedZone(); + + void Text( const char* txt, size_t size ); + void Name( const char* txt, size_t size ); + void Name( const Char* txt, size_t size ); + void Color( uint32_t color ); + void Value( uint64_t value ); + bool IsActive() const; + +private: + const bool m_active; + +#ifdef TRACY_ON_DEMAND + uint64_t m_connectionId; +#endif +}; + +namespace detail +{ +TRACY_API uint64_t GetThreadHandleImpl(); +} + +#ifdef TRACY_ENABLE +TRACY_API uint64_t GetThreadHandle(); +#else +static inline uint64_t GetThreadHandle() +{ + return detail::GetThreadHandleImpl(); +} +#endif + +TRACY_API void SetThreadName( const char* name ); +TRACY_API const char* GetThreadName( uint64_t id ); + +} + +#endif
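ScopedZone above is the RAII type behind Tracy's zone macros: the constructor emits a zone-begin event tied to a static SourceLocationData, the destructor emits the matching zone-end, and Name/Text/Value attach metadata while the zone is open. A hand-written equivalent of what such a macro generates; this is a sketch, and the function and value are made up for illustration:

    void UpdatePhysics()
    {
        static constexpr tracy::SourceLocationData srcloc
            { "UpdatePhysics", __FUNCTION__, __FILE__, __LINE__, 0 };
        tracy::ScopedZone zone( &srcloc );
        zone.Value( 128 ); // e.g. number of bodies stepped this frame
        // ... measured work; the end event is emitted by ~ScopedZone()
    }

The source location struct is static so its address stays valid for the whole profiling session; the protocol sends only the pointer and resolves the strings once.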
diff --git a/Source/ThirdParty/tracy/tracy.Build.cs b/Source/ThirdParty/tracy/tracy.Build.cs new file mode 100644 index 000000000..aa10c514f --- /dev/null +++ b/Source/ThirdParty/tracy/tracy.Build.cs @@ -0,0 +1,47 @@ +// Copyright (c) 2012-2021 Wojciech Figat. All rights reserved. + +using System.Collections.Generic; +using System.IO; +using Flax.Build; +using Flax.Build.NativeCpp; + +/// <summary> +/// https://github.com/wolfpld/tracy +/// </summary> +public class tracy : ThirdPartyModule +{ + /// <inheritdoc /> + public override void Init() + { + base.Init(); + + LicenseType = LicenseTypes.BSD3Clause; + LicenseFilePath = "LICENSE"; + + // Merge third-party modules into engine binary + BinaryModuleName = "FlaxEngine"; + } + + /// <inheritdoc /> + public override void Setup(BuildOptions options) + { + base.Setup(options); + + options.SourcePaths.Clear(); + options.SourceFiles.Clear(); + options.SourceFiles.Add(Path.Combine(FolderPath, "Tracy.h")); + options.SourceFiles.Add(Path.Combine(FolderPath, "TracyClient.cpp")); + + options.PublicDefinitions.Add("TRACY_ENABLE"); + } + + /// <inheritdoc /> + public override void GetFilesToDeploy(List<string> files) + { + base.GetFilesToDeploy(files); + + files.Add(Path.Combine(FolderPath, "Tracy.h")); + files.Add(Path.Combine(FolderPath, "common", "TracySystem.hpp")); + files.Add(Path.Combine(FolderPath, "client", "TracyCallstack.h")); + } +}
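Because Setup() publishes TRACY_ENABLE via PublicDefinitions, every module that links against this one sees the define and can guard its instrumentation with the same preprocessor check used throughout this patch. A minimal consumer-side sketch; the include path is an assumption based on the deployed Tracy.h listed above:

    #if TRACY_ENABLE
    #include <ThirdParty/tracy/Tracy.h> // assumed include path for the deployed header
    #endif

    void DoWork()
    {
    #if TRACY_ENABLE
        ZoneScoped; // compiles away entirely when the definition is absent
    #endif
        // ... actual work ...
    }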