From d654d2d0acf827f8638efba5bd6fc8a23084b810 Mon Sep 17 00:00:00 2001 From: Wojtek Figat Date: Sat, 28 Aug 2021 12:49:02 +0200 Subject: [PATCH 1/7] Add `PROFILE_CPU_ASSET` for asset related profiler scoped zone --- Source/Engine/Content/Asset.cpp | 6 +----- Source/Engine/Profiler/ProfilerCPU.h | 2 ++ 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/Source/Engine/Content/Asset.cpp b/Source/Engine/Content/Asset.cpp index 03bb7f75f..9a68d6a17 100644 --- a/Source/Engine/Content/Asset.cpp +++ b/Source/Engine/Content/Asset.cpp @@ -480,11 +480,7 @@ bool Asset::onLoad(LoadAssetTask* task) // Load asset LoadResult result; { -#if TRACY_ENABLE - ZoneScoped; - const StringView name(GetPath()); - ZoneName(*name, name.Length()); -#endif + PROFILE_CPU_ASSET(this); result = loadAsset(); } const bool isLoaded = result == LoadResult::Ok; diff --git a/Source/Engine/Profiler/ProfilerCPU.h b/Source/Engine/Profiler/ProfilerCPU.h index 7ffdc85eb..565ee3697 100644 --- a/Source/Engine/Profiler/ProfilerCPU.h +++ b/Source/Engine/Profiler/ProfilerCPU.h @@ -402,8 +402,10 @@ struct TIsPODType #ifdef TRACY_ENABLE #define PROFILE_CPU_SRC_LOC(srcLoc) tracy::ScopedZone ___tracy_scoped_zone( (tracy::SourceLocationData*)&(srcLoc) ); ScopeProfileBlockCPU ProfileBlockCPU((srcLoc).name) +#define PROFILE_CPU_ASSET(asset) ZoneScoped; const StringView __tracy_asset_name((asset)->GetPath()); ZoneName(*__tracy_asset_name, __tracy_asset_name.Length()) #else #define PROFILE_CPU_SRC_LOC(srcLoc) ScopeProfileBlockCPU ProfileBlockCPU((srcLoc).name) +#define PROFILE_CPU_ASSET(asset) #endif #else From 2e5491604bc811fe087c0b5d1b815da4556f404c Mon Sep 17 00:00:00 2001 From: Wojtek Figat Date: Mon, 30 Aug 2021 20:24:38 +0200 Subject: [PATCH 2/7] Add more profiler events and naming for particles/animations jobs events --- Source/Engine/Animations/Animations.cpp | 4 +++ .../Animations/Graph/AnimGroup.Animation.cpp | 3 +- ...rticleEmitterGraph.CPU.ParticleModules.cpp | 33 +++++++++++++++++++ 
.../Graph/CPU/ParticleEmitterGraph.CPU.cpp | 11 +++++++ Source/Engine/Particles/Particles.cpp | 5 +++ 5 files changed, 54 insertions(+), 2 deletions(-) diff --git a/Source/Engine/Animations/Animations.cpp b/Source/Engine/Animations/Animations.cpp index 359334da2..6959211b9 100644 --- a/Source/Engine/Animations/Animations.cpp +++ b/Source/Engine/Animations/Animations.cpp @@ -56,6 +56,10 @@ void AnimationsSystem::Job(int32 index) auto animatedModel = UpdateList[index]; auto skinnedModel = animatedModel->SkinnedModel.Get(); auto graph = animatedModel->AnimationGraph.Get(); +#if COMPILE_WITH_PROFILER && TRACY_ENABLE + if (graph) // graph may be null here (it is only validated by the condition below) + { const StringView graphName(graph->GetPath()); ZoneName(*graphName, graphName.Length()); } +#endif if (graph && graph->IsLoaded() && graph->Graph.CanUseWithSkeleton(skinnedModel) #if USE_EDITOR && graph->Graph.Parameters.Count() == animatedModel->GraphInstance.Parameters.Count() // It may happen in editor so just add safe check to prevent any crashes diff --git a/Source/Engine/Animations/Graph/AnimGroup.Animation.cpp b/Source/Engine/Animations/Graph/AnimGroup.Animation.cpp index ecdaf2aa9..5b1f1a401 100644 --- a/Source/Engine/Animations/Graph/AnimGroup.Animation.cpp +++ b/Source/Engine/Animations/Graph/AnimGroup.Animation.cpp @@ -149,6 +149,7 @@ Variant AnimGraphExecutor::SampleAnimation(AnimGraphNode* node, bool loop, float // Skip if animation is not ready to use if (anim == nullptr || !anim->IsLoaded()) return Value::Null; + PROFILE_CPU_ASSET(anim); // Calculate actual time position within the animation node (defined by length and loop mode) const float pos = GetAnimPos(newTimePos, startTimePos, loop, length); @@ -607,8 +608,6 @@ void AnimGraphExecutor::ProcessGroupAnimation(Box* boxBase, Node* nodeBase, Valu // Animation case 0: { - ANIM_GRAPH_PROFILE_EVENT("Sample"); - const float length = anim ?
anim->GetLength() : 0.0f; // Calculate new time position diff --git a/Source/Engine/Particles/Graph/CPU/ParticleEmitterGraph.CPU.ParticleModules.cpp b/Source/Engine/Particles/Graph/CPU/ParticleEmitterGraph.CPU.ParticleModules.cpp index 3f2c58df8..24a584852 100644 --- a/Source/Engine/Particles/Graph/CPU/ParticleEmitterGraph.CPU.ParticleModules.cpp +++ b/Source/Engine/Particles/Graph/CPU/ParticleEmitterGraph.CPU.ParticleModules.cpp @@ -14,6 +14,15 @@ #define RAND3 Vector3(RAND, RAND, RAND) #define RAND4 Vector4(RAND, RAND, RAND, RAND) +// Enable to insert CPU profiler events for particles modules +#define PARTICLE_EMITTER_MODULES_PROFILE 0 +#if PARTICLE_EMITTER_MODULES_PROFILE +#include "Engine/Profiler/ProfilerCPU.h" +#define PARTICLE_EMITTER_MODULE(name) PROFILE_CPU_NAMED(name) +#else +#define PARTICLE_EMITTER_MODULE(name) +#endif + namespace { FORCE_INLINE Vector4 Mod289(Vector4 x) @@ -181,6 +190,7 @@ void ParticleEmitterGraphCPUExecutor::ProcessModule(ParticleEmitterGraphCPUNode* case 201: case 303: { + PARTICLE_EMITTER_MODULE("Orient Sprite"); auto spriteFacingMode = node->Values[2].AsInt; { auto& attribute = context.Data->Buffer->Layout->Attributes[node->Attributes[0]]; @@ -223,6 +233,7 @@ void ParticleEmitterGraphCPUExecutor::ProcessModule(ParticleEmitterGraphCPUNode* case 213: case 309: { + PARTICLE_EMITTER_MODULE("Orient Model"); auto modelFacingMode = node->Values[2].AsInt; { auto& attribute = context.Data->Buffer->Layout->Attributes[node->Attributes[0]]; @@ -238,6 +249,7 @@ void ParticleEmitterGraphCPUExecutor::ProcessModule(ParticleEmitterGraphCPUNode* // Update Age case 300: { + PARTICLE_EMITTER_MODULE("Update Age"); auto& attribute = context.Data->Buffer->Layout->Attributes[node->Attributes[0]]; byte* agePtr = start + attribute.Offset; for (int32 particleIndex = particlesStart; particleIndex < particlesEnd; particleIndex++) @@ -251,6 +263,7 @@ void ParticleEmitterGraphCPUExecutor::ProcessModule(ParticleEmitterGraphCPUNode* case 301: case 304: { + 
PARTICLE_EMITTER_MODULE("Gravity/Force"); auto& attribute = context.Data->Buffer->Layout->Attributes[node->Attributes[0]]; byte* velocityPtr = start + attribute.Offset; auto box = node->GetBox(0); @@ -278,6 +291,7 @@ void ParticleEmitterGraphCPUExecutor::ProcessModule(ParticleEmitterGraphCPUNode* // Conform to Sphere case 305: { + PARTICLE_EMITTER_MODULE("Conform to Sphere"); auto& position = context.Data->Buffer->Layout->Attributes[node->Attributes[0]]; auto& velocity = context.Data->Buffer->Layout->Attributes[node->Attributes[1]]; auto& mass = context.Data->Buffer->Layout->Attributes[node->Attributes[2]]; @@ -340,6 +354,7 @@ void ParticleEmitterGraphCPUExecutor::ProcessModule(ParticleEmitterGraphCPUNode* // Kill (sphere) case 306: { + PARTICLE_EMITTER_MODULE("Kill"); auto& position = context.Data->Buffer->Layout->Attributes[node->Attributes[0]]; byte* positionPtr = start + position.Offset; @@ -388,6 +403,7 @@ void ParticleEmitterGraphCPUExecutor::ProcessModule(ParticleEmitterGraphCPUNode* // Kill (box) case 307: { + PARTICLE_EMITTER_MODULE("Kill"); auto& position = context.Data->Buffer->Layout->Attributes[node->Attributes[0]]; byte* positionPtr = start + position.Offset; @@ -441,6 +457,7 @@ void ParticleEmitterGraphCPUExecutor::ProcessModule(ParticleEmitterGraphCPUNode* // Kill (custom) case 308: { + PARTICLE_EMITTER_MODULE("Kill (custom)"); auto killBox = node->GetBox(0); #define INPUTS_FETCH() \ @@ -478,6 +495,7 @@ void ParticleEmitterGraphCPUExecutor::ProcessModule(ParticleEmitterGraphCPUNode* // Linear Drag case 310: { + PARTICLE_EMITTER_MODULE("Linear Drag"); auto box = node->GetBox(0); const bool useSpriteSize = node->Values[3].AsBool; @@ -523,6 +541,7 @@ void ParticleEmitterGraphCPUExecutor::ProcessModule(ParticleEmitterGraphCPUNode* // Turbulence case 311: { + PARTICLE_EMITTER_MODULE("Turbulence"); auto& position = context.Data->Buffer->Layout->Attributes[node->Attributes[0]]; auto& velocity = context.Data->Buffer->Layout->Attributes[node->Attributes[1]]; 
auto& mass = context.Data->Buffer->Layout->Attributes[node->Attributes[2]]; @@ -583,6 +602,7 @@ void ParticleEmitterGraphCPUExecutor::ProcessModule(ParticleEmitterGraphCPUNode* case 200: case 302: { + PARTICLE_EMITTER_MODULE("Set Attribute"); auto& attribute = context.Data->Buffer->Layout->Attributes[node->Attributes[0]]; byte* dataPtr = start + attribute.Offset; int32 dataSize = attribute.GetSize(); @@ -639,6 +659,7 @@ void ParticleEmitterGraphCPUExecutor::ProcessModule(ParticleEmitterGraphCPUNode* case 362: case 363: { + PARTICLE_EMITTER_MODULE("Set"); auto& attribute = context.Data->Buffer->Layout->Attributes[node->Attributes[0]]; byte* dataPtr = start + attribute.Offset; int32 dataSize = attribute.GetSize(); @@ -668,6 +689,7 @@ void ParticleEmitterGraphCPUExecutor::ProcessModule(ParticleEmitterGraphCPUNode* // Position (sphere surface) case 202: { + PARTICLE_EMITTER_MODULE("Position"); auto& positionAttr = context.Data->Buffer->Layout->Attributes[node->Attributes[0]]; byte* positionPtr = start + positionAttr.Offset; @@ -713,6 +735,7 @@ void ParticleEmitterGraphCPUExecutor::ProcessModule(ParticleEmitterGraphCPUNode* // Position (plane) case 203: { + PARTICLE_EMITTER_MODULE("Position"); auto& positionAttr = context.Data->Buffer->Layout->Attributes[node->Attributes[0]]; byte* positionPtr = start + positionAttr.Offset; @@ -751,6 +774,7 @@ void ParticleEmitterGraphCPUExecutor::ProcessModule(ParticleEmitterGraphCPUNode* // Position (circle) case 204: { + PARTICLE_EMITTER_MODULE("Position"); auto& positionAttr = context.Data->Buffer->Layout->Attributes[node->Attributes[0]]; byte* positionPtr = start + positionAttr.Offset; @@ -794,6 +818,7 @@ void ParticleEmitterGraphCPUExecutor::ProcessModule(ParticleEmitterGraphCPUNode* // Position (disc) case 205: { + PARTICLE_EMITTER_MODULE("Position"); auto& positionAttr = context.Data->Buffer->Layout->Attributes[node->Attributes[0]]; byte* positionPtr = start + positionAttr.Offset; @@ -837,6 +862,7 @@ void 
ParticleEmitterGraphCPUExecutor::ProcessModule(ParticleEmitterGraphCPUNode* // Position (box surface) case 206: { + PARTICLE_EMITTER_MODULE("Position"); auto& positionAttr = context.Data->Buffer->Layout->Attributes[node->Attributes[0]]; byte* positionPtr = start + positionAttr.Offset; @@ -887,6 +913,7 @@ void ParticleEmitterGraphCPUExecutor::ProcessModule(ParticleEmitterGraphCPUNode* // Position (box volume) case 207: { + PARTICLE_EMITTER_MODULE("Position"); auto& positionAttr = context.Data->Buffer->Layout->Attributes[node->Attributes[0]]; byte* positionPtr = start + positionAttr.Offset; @@ -925,6 +952,7 @@ void ParticleEmitterGraphCPUExecutor::ProcessModule(ParticleEmitterGraphCPUNode* // Position (cylinder) case 208: { + PARTICLE_EMITTER_MODULE("Position"); auto& positionAttr = context.Data->Buffer->Layout->Attributes[node->Attributes[0]]; byte* positionPtr = start + positionAttr.Offset; @@ -970,6 +998,7 @@ void ParticleEmitterGraphCPUExecutor::ProcessModule(ParticleEmitterGraphCPUNode* // Position (line) case 209: { + PARTICLE_EMITTER_MODULE("Position"); auto& positionAttr = context.Data->Buffer->Layout->Attributes[node->Attributes[0]]; byte* positionPtr = start + positionAttr.Offset; @@ -1008,6 +1037,7 @@ void ParticleEmitterGraphCPUExecutor::ProcessModule(ParticleEmitterGraphCPUNode* // Position (torus) case 210: { + PARTICLE_EMITTER_MODULE("Position"); auto& positionAttr = context.Data->Buffer->Layout->Attributes[node->Attributes[0]]; byte* positionPtr = start + positionAttr.Offset; @@ -1072,6 +1102,7 @@ void ParticleEmitterGraphCPUExecutor::ProcessModule(ParticleEmitterGraphCPUNode* // Position (sphere volume) case 211: { + PARTICLE_EMITTER_MODULE("Position"); auto& positionAttr = context.Data->Buffer->Layout->Attributes[node->Attributes[0]]; byte* positionPtr = start + positionAttr.Offset; @@ -1123,6 +1154,7 @@ void ParticleEmitterGraphCPUExecutor::ProcessModule(ParticleEmitterGraphCPUNode* // Position (spiral) case 214: { + 
PARTICLE_EMITTER_MODULE("Position"); auto& positionAttr = context.Data->Buffer->Layout->Attributes[node->Attributes[0]]; auto& velocityAttr = context.Data->Buffer->Layout->Attributes[node->Attributes[1]]; @@ -1173,6 +1205,7 @@ void ParticleEmitterGraphCPUExecutor::ProcessModule(ParticleEmitterGraphCPUNode* // Helper macros for collision modules to share the code #define COLLISION_BEGIN() \ + PARTICLE_EMITTER_MODULE("Collision"); \ auto& positionAttr = context.Data->Buffer->Layout->Attributes[node->Attributes[0]]; \ auto& velocityAttr = context.Data->Buffer->Layout->Attributes[node->Attributes[1]]; \ auto& ageAttr = context.Data->Buffer->Layout->Attributes[node->Attributes[2]]; \ diff --git a/Source/Engine/Particles/Graph/CPU/ParticleEmitterGraph.CPU.cpp b/Source/Engine/Particles/Graph/CPU/ParticleEmitterGraph.CPU.cpp index 0de97633b..ba22564bd 100644 --- a/Source/Engine/Particles/Graph/CPU/ParticleEmitterGraph.CPU.cpp +++ b/Source/Engine/Particles/Graph/CPU/ParticleEmitterGraph.CPU.cpp @@ -6,6 +6,7 @@ #include "Engine/Renderer/RenderList.h" #include "Engine/Particles/ParticleEffect.h" #include "Engine/Engine/Time.h" +#include "Engine/Profiler/ProfilerCPU.h" ThreadLocal ParticleEmitterGraphCPUExecutor::Context; @@ -433,6 +434,7 @@ void ParticleEmitterGraphCPUExecutor::Update(ParticleEmitter* emitter, ParticleE // Update particles if (cpu.Count > 0) { + PROFILE_CPU_NAMED("Update"); for (int32 i = 0; i < _graph.UpdateModules.Count(); i++) { ProcessModule(_graph.UpdateModules[i], 0, cpu.Count); @@ -442,6 +444,7 @@ void ParticleEmitterGraphCPUExecutor::Update(ParticleEmitter* emitter, ParticleE // Dead particles removal if (_graph._attrAge != -1 && _graph._attrLifetime != -1) { + PROFILE_CPU_NAMED("Age kill"); byte* agePtr = cpu.Buffer.Get() + data.Buffer->Layout->Attributes[_graph._attrAge].Offset; byte* lifetimePtr = cpu.Buffer.Get() + data.Buffer->Layout->Attributes[_graph._attrLifetime].Offset; for (int32 particleIndex = 0; particleIndex < cpu.Count; 
particleIndex++) @@ -477,6 +480,7 @@ void ParticleEmitterGraphCPUExecutor::Update(ParticleEmitter* emitter, ParticleE // Euler integration if (_graph._attrPosition != -1 && _graph._attrVelocity != -1) { + PROFILE_CPU_NAMED("Euler Integration"); byte* positionPtr = cpu.Buffer.Get() + data.Buffer->Layout->Attributes[_graph._attrPosition].Offset; byte* velocityPtr = cpu.Buffer.Get() + data.Buffer->Layout->Attributes[_graph._attrVelocity].Offset; for (int32 particleIndex = 0; particleIndex < cpu.Count; particleIndex++) @@ -490,6 +494,7 @@ void ParticleEmitterGraphCPUExecutor::Update(ParticleEmitter* emitter, ParticleE // Angular Euler Integration if (_graph._attrRotation != -1 && _graph._attrAngularVelocity != -1) { + PROFILE_CPU_NAMED("Angular Euler Integration"); byte* rotationPtr = cpu.Buffer.Get() + data.Buffer->Layout->Attributes[_graph._attrRotation].Offset; byte* angularVelocityPtr = cpu.Buffer.Get() + data.Buffer->Layout->Attributes[_graph._attrAngularVelocity].Offset; for (int32 particleIndex = 0; particleIndex < cpu.Count; particleIndex++) @@ -504,6 +509,7 @@ void ParticleEmitterGraphCPUExecutor::Update(ParticleEmitter* emitter, ParticleE int32 spawnCount = 0; if (canSpawn) { + PROFILE_CPU_NAMED("Spawn"); for (int32 i = 0; i < _graph.SpawnModules.Count(); i++) { spawnCount += ProcessSpawnModule(i); @@ -514,6 +520,8 @@ void ParticleEmitterGraphCPUExecutor::Update(ParticleEmitter* emitter, ParticleE spawnCount = countAfter - countBefore; if (spawnCount != 0) { + PROFILE_CPU_NAMED("Init"); + // Spawn particles data.Buffer->CPU.Count = countAfter; @@ -533,6 +541,7 @@ void ParticleEmitterGraphCPUExecutor::Update(ParticleEmitter* emitter, ParticleE if (_graph.RibbonRenderingModules.HasItems()) { // Sort ribbon particles + PROFILE_CPU_NAMED("Ribbon"); if (cpu.RibbonOrder.IsEmpty()) { cpu.RibbonOrder.Resize(_graph.RibbonRenderingModules.Count() * data.Buffer->Capacity); @@ -559,6 +568,8 @@ void ParticleEmitterGraphCPUExecutor::Update(ParticleEmitter* emitter, 
ParticleE int32 ParticleEmitterGraphCPUExecutor::UpdateSpawn(ParticleEmitter* emitter, ParticleEffect* effect, ParticleEmitterInstance& data, float dt) { + PROFILE_CPU_NAMED("Spawn"); + // Prepare data auto& context = Context.Get(); Init(emitter, effect, data, dt); diff --git a/Source/Engine/Particles/Particles.cpp b/Source/Engine/Particles/Particles.cpp index 9b4bf66a5..bd6d856d1 100644 --- a/Source/Engine/Particles/Particles.cpp +++ b/Source/Engine/Particles/Particles.cpp @@ -1211,6 +1211,10 @@ void ParticlesSystem::Job(int32 index) } if (anyEmitterNotReady) return; +#if COMPILE_WITH_PROFILER && TRACY_ENABLE + const StringView particleSystemName(particleSystem->GetPath()); + ZoneName(*particleSystemName, particleSystemName.Length()); +#endif // Prepare instance data instance.Sync(particleSystem); @@ -1287,6 +1291,7 @@ void ParticlesSystem::Job(int32 index) auto& data = instance.Emitters[track.AsEmitter.Index]; ASSERT(emitter && emitter->IsLoaded()); ASSERT(emitter->Capacity != 0 && emitter->Graph.Layout.Size != 0); + PROFILE_CPU_ASSET(emitter); // Calculate new time position const float startTime = (float)track.AsEmitter.StartFrame / fps; From ab8e0fdd4678edd18f8cd5a0c8422a7bd1e162a6 Mon Sep 17 00:00:00 2001 From: Wojtek Figat Date: Mon, 30 Aug 2021 20:27:43 +0200 Subject: [PATCH 3/7] Fixes for `SIMD.h` --- Source/Engine/Core/SIMD.h | 46 ++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/Source/Engine/Core/SIMD.h b/Source/Engine/Core/SIMD.h index c71f8ee04..a5e97d581 100644 --- a/Source/Engine/Core/SIMD.h +++ b/Source/Engine/Core/SIMD.h @@ -18,9 +18,14 @@ typedef __m128 SimdVector4; namespace SIMD { + FORCE_INLINE SimdVector4 Load(float xyzw) + { + return _mm_set1_ps(xyzw); + } + FORCE_INLINE SimdVector4 Load(float x, float y, float z, float w) { - return _mm_set_ps(x, y, z, w); + return _mm_set_ps(w, z, y, x); } FORCE_INLINE SimdVector4 Load(const void* src) @@ -91,34 +96,39 @@ namespace SIMD #else -struct 
SimdFloat4 +struct SimdVector4 { float X, Y, Z, W; }; namespace SIMD { - FORCE_INLINE SimdFloat4 Load(float x, float y, float z, float w) + FORCE_INLINE SimdVector4 Load(float xyzw) + { + return { xyzw, xyzw, xyzw, xyzw }; + } + + FORCE_INLINE SimdVector4 Load(float x, float y, float z, float w) { return { x, y, z, w }; } - FORCE_INLINE SimdFloat4 Load(const void* src) + FORCE_INLINE SimdVector4 Load(const void* src) { - return *(const SimdFloat4*)src; + return *(const SimdVector4*)src; } - FORCE_INLINE SimdFloat4 Splat(float value) + FORCE_INLINE SimdVector4 Splat(float value) { return { value, value, value, value }; } - FORCE_INLINE void Store(void* dst, SimdFloat4 src) + FORCE_INLINE void Store(void* dst, SimdVector4 src) { - (*(SimdFloat4*)dst) = src; + (*(SimdVector4*)dst) = src; } - FORCE_INLINE int MoveMask(SimdFloat4 a) + FORCE_INLINE int MoveMask(SimdVector4 a) { return (a.W < 0 ? (1 << 3) : 0) | (a.Z < 0 ? (1 << 2) : 0) | @@ -126,7 +136,7 @@ namespace SIMD (a.X < 0 ? 1 : 0); } - FORCE_INLINE SimdFloat4 Add(SimdFloat4 a, SimdFloat4 b) + FORCE_INLINE SimdVector4 Add(SimdVector4 a, SimdVector4 b) { return { @@ -137,7 +147,7 @@ namespace SIMD }; } - FORCE_INLINE SimdFloat4 Sub(SimdFloat4 a, SimdFloat4 b) + FORCE_INLINE SimdVector4 Sub(SimdVector4 a, SimdVector4 b) { return { @@ -148,7 +158,7 @@ namespace SIMD }; } - FORCE_INLINE SimdFloat4 Mul(SimdFloat4 a, SimdFloat4 b) + FORCE_INLINE SimdVector4 Mul(SimdVector4 a, SimdVector4 b) { return { @@ -159,7 +169,7 @@ namespace SIMD }; } - FORCE_INLINE SimdFloat4 Div(SimdFloat4 a, SimdFloat4 b) + FORCE_INLINE SimdVector4 Div(SimdVector4 a, SimdVector4 b) { return { @@ -170,7 +180,7 @@ namespace SIMD }; } - FORCE_INLINE SimdFloat4 Rcp(SimdFloat4 a) + FORCE_INLINE SimdVector4 Rcp(SimdVector4 a) { return { @@ -181,7 +191,7 @@ namespace SIMD }; } - FORCE_INLINE SimdFloat4 Sqrt(SimdFloat4 a) + FORCE_INLINE SimdVector4 Sqrt(SimdVector4 a) { return { @@ -192,7 +202,7 @@ namespace SIMD }; } - FORCE_INLINE SimdFloat4 
Rsqrt(SimdFloat4 a) + FORCE_INLINE SimdVector4 Rsqrt(SimdVector4 a) { return { @@ -203,7 +213,7 @@ namespace SIMD }; } - FORCE_INLINE SimdFloat4 Min(SimdFloat4 a, SimdFloat4 b) + FORCE_INLINE SimdVector4 Min(SimdVector4 a, SimdVector4 b) { return { @@ -214,7 +224,7 @@ namespace SIMD }; } - FORCE_INLINE SimdFloat4 Max(SimdFloat4 a, SimdFloat4 b) + FORCE_INLINE SimdVector4 Max(SimdVector4 a, SimdVector4 b) { return { From 074aa8c3fbac4ab13b675c4d801d1f4c034841f5 Mon Sep 17 00:00:00 2001 From: Wojtek Figat Date: Mon, 30 Aug 2021 20:29:58 +0200 Subject: [PATCH 4/7] Optimize ThreadLocal by removing size check --- Source/Engine/Threading/ThreadLocal.h | 1 - 1 file changed, 1 deletion(-) diff --git a/Source/Engine/Threading/ThreadLocal.h b/Source/Engine/Threading/ThreadLocal.h index a20d31930..391ce16ac 100644 --- a/Source/Engine/Threading/ThreadLocal.h +++ b/Source/Engine/Threading/ThreadLocal.h @@ -81,7 +81,6 @@ protected: FORCE_INLINE int32 GetIndex() { - ASSERT(Count() < MaxThreads); int64 key = (int64)Platform::GetCurrentThreadID(); auto index = Hash(key); while (true) From e284a88da41f03da0140e80085cf1aaad8818ab0 Mon Sep 17 00:00:00 2001 From: Wojtek Figat Date: Mon, 30 Aug 2021 20:30:22 +0200 Subject: [PATCH 5/7] Optimize utilities in `AnimationUtils` --- Source/Engine/Animations/AnimationUtils.h | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/Source/Engine/Animations/AnimationUtils.h b/Source/Engine/Animations/AnimationUtils.h index 121eaee57..3bae847fc 100644 --- a/Source/Engine/Animations/AnimationUtils.h +++ b/Source/Engine/Animations/AnimationUtils.h @@ -89,10 +89,19 @@ namespace AnimationUtils result = (T)(a + t * (b - a)); } + template<> + FORCE_INLINE void Interpolate(const Vector2& a, const Vector2& b, float t, Vector2& result) + { + result.X = Math::Lerp(a.X, b.X, t); + result.Y = Math::Lerp(a.Y, b.Y, t); + } + template<> FORCE_INLINE void Interpolate(const Vector3& a, const Vector3& b, float t, Vector3& result) { - 
Vector3::Lerp(a, b, t, result); + result.X = Math::Lerp(a.X, b.X, t); + result.Y = Math::Lerp(a.Y, b.Y, t); + result.Z = Math::Lerp(a.Z, b.Z, t); } template<> @@ -191,7 +200,7 @@ namespace AnimationUtils const float uu = u * u; const float uuu = uu * u; const float ttt = tt * t; - result = uuu * p0 + 3 * uu * t * p1 + 3 * u * tt * p2 + ttt * p3; + result = uuu * p0 + (3 * uu * t) * p1 + (3 * u * tt) * p2 + ttt * p3; } template<> @@ -202,7 +211,7 @@ namespace AnimationUtils const float uu = u * u; const float uuu = uu * u; const float ttt = tt * t; - result = uuu * p0 + 3 * uu * t * p1 + 3 * u * tt * p2 + ttt * p3; + result = uuu * p0 + (3 * uu * t) * p1 + (3 * u * tt) * p2 + ttt * p3; } template<> From e3b98c902bd60a0f96c16d859abc4baa859a4dc6 Mon Sep 17 00:00:00 2001 From: Wojtek Figat Date: Mon, 30 Aug 2021 20:30:52 +0200 Subject: [PATCH 6/7] Optimize CPU particles impl parts --- .../CPU/ParticleEmitterGraph.CPU.ParticleModules.cpp | 6 ++++-- .../Graph/CPU/ParticleEmitterGraph.CPU.Particles.cpp | 2 +- .../Particles/Graph/CPU/ParticleEmitterGraph.CPU.cpp | 8 ++++---- .../Engine/Particles/Graph/CPU/ParticleEmitterGraph.CPU.h | 3 ++- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/Source/Engine/Particles/Graph/CPU/ParticleEmitterGraph.CPU.ParticleModules.cpp b/Source/Engine/Particles/Graph/CPU/ParticleEmitterGraph.CPU.ParticleModules.cpp index 24a584852..de06e6435 100644 --- a/Source/Engine/Particles/Graph/CPU/ParticleEmitterGraph.CPU.ParticleModules.cpp +++ b/Source/Engine/Particles/Graph/CPU/ParticleEmitterGraph.CPU.ParticleModules.cpp @@ -610,10 +610,11 @@ void ParticleEmitterGraphCPUExecutor::ProcessModule(ParticleEmitterGraphCPUNode* ValueType type(GetVariantType(attribute.ValueType)); if (node->UsePerParticleDataResolve()) { + Value value; for (int32 particleIndex = particlesStart; particleIndex < particlesEnd; particleIndex++) { context.ParticleIndex = particleIndex; - const Value value = GetValue(box, 4).Cast(type); + value = GetValue(box,
4).Cast(type); Platform::MemoryCopy(dataPtr, &value.AsPointer, dataSize); dataPtr += stride; } @@ -667,10 +668,11 @@ void ParticleEmitterGraphCPUExecutor::ProcessModule(ParticleEmitterGraphCPUNode* ValueType type(GetVariantType(attribute.ValueType)); if (node->UsePerParticleDataResolve()) { + Value value; for (int32 particleIndex = particlesStart; particleIndex < particlesEnd; particleIndex++) { context.ParticleIndex = particleIndex; - const Value value = GetValue(box, 2).Cast(type); + value = GetValue(box, 2).Cast(type); Platform::MemoryCopy(dataPtr, &value.AsPointer, dataSize); dataPtr += stride; } diff --git a/Source/Engine/Particles/Graph/CPU/ParticleEmitterGraph.CPU.Particles.cpp b/Source/Engine/Particles/Graph/CPU/ParticleEmitterGraph.CPU.Particles.cpp index 6d9ba3722..ee71f8d09 100644 --- a/Source/Engine/Particles/Graph/CPU/ParticleEmitterGraph.CPU.Particles.cpp +++ b/Source/Engine/Particles/Graph/CPU/ParticleEmitterGraph.CPU.Particles.cpp @@ -419,7 +419,7 @@ void ParticleEmitterGraphCPUExecutor::ProcessGroupFunction(Box* box, Node* node, Node* functionCallNode = nullptr; ASSERT(context.GraphStack.Count() >= 2); Graph* graph; - for (int32 i = context.CallStack.Count() - 1; i >= 0; i--) + for (int32 i = context.CallStackSize - 1; i >= 0; i--) { if (context.CallStack[i]->Type == GRAPH_NODE_MAKE_TYPE(14, 300) && context.Functions.TryGet(context.CallStack[i], graph) && context.GraphStack[context.GraphStack.Count() - 1] == graph) { diff --git a/Source/Engine/Particles/Graph/CPU/ParticleEmitterGraph.CPU.cpp b/Source/Engine/Particles/Graph/CPU/ParticleEmitterGraph.CPU.cpp index ba22564bd..0c459ec96 100644 --- a/Source/Engine/Particles/Graph/CPU/ParticleEmitterGraph.CPU.cpp +++ b/Source/Engine/Particles/Graph/CPU/ParticleEmitterGraph.CPU.cpp @@ -131,7 +131,7 @@ void ParticleEmitterGraphCPUExecutor::Init(ParticleEmitter* emitter, ParticleEff context.DeltaTime = dt; context.ParticleIndex = 0; context.ViewTask = effect->GetRenderTask(); - context.CallStack.Clear(); + 
context.CallStackSize = 0; context.Functions.Clear(); } @@ -588,7 +588,7 @@ VisjectExecutor::Value ParticleEmitterGraphCPUExecutor::eatBox(Node* caller, Box { // Check if graph is looped or is too deep auto& context = Context.Get(); - if (context.CallStack.Count() >= PARTICLE_EMITTER_MAX_CALL_STACK) + if (context.CallStackSize >= PARTICLE_EMITTER_MAX_CALL_STACK) { OnError(caller, box, TEXT("Graph is looped or too deep!")); return Value::Zero; @@ -602,7 +602,7 @@ VisjectExecutor::Value ParticleEmitterGraphCPUExecutor::eatBox(Node* caller, Box #endif // Add to the calling stack - context.CallStack.Add(caller); + context.CallStack[context.CallStackSize++] = caller; // Call per group custom processing event Value value; @@ -611,7 +611,7 @@ VisjectExecutor::Value ParticleEmitterGraphCPUExecutor::eatBox(Node* caller, Box (this->*func)(box, parentNode, value); // Remove from the calling stack - context.CallStack.RemoveLast(); + context.CallStackSize--; return value; } diff --git a/Source/Engine/Particles/Graph/CPU/ParticleEmitterGraph.CPU.h b/Source/Engine/Particles/Graph/CPU/ParticleEmitterGraph.CPU.h index 34834a2cd..a1e06f850 100644 --- a/Source/Engine/Particles/Graph/CPU/ParticleEmitterGraph.CPU.h +++ b/Source/Engine/Particles/Graph/CPU/ParticleEmitterGraph.CPU.h @@ -120,9 +120,10 @@ struct ParticleEmitterGraphCPUContext ParticleEmitter* Emitter; ParticleEffect* Effect; class SceneRenderTask* ViewTask; - Array> CallStack; Array> GraphStack; Dictionary Functions; + int32 CallStackSize = 0; + VisjectExecutor::Node* CallStack[PARTICLE_EMITTER_MAX_CALL_STACK]; }; /// From fab7bd48c5ce1d59b08ba00c9a7ece013c843cb6 Mon Sep 17 00:00:00 2001 From: Wojtek Figat Date: Mon, 30 Aug 2021 20:31:40 +0200 Subject: [PATCH 7/7] Optimize atomic and interlocked memory operations on Win32Platform (Windows and Xbox) by inlining --- .../Engine/Platform/Win32/Win32Platform.cpp | 60 +----------------- Source/Engine/Platform/Win32/Win32Platform.h | 61 +++++++++++++++---- 2 files changed, 52 
insertions(+), 69 deletions(-) diff --git a/Source/Engine/Platform/Win32/Win32Platform.cpp b/Source/Engine/Platform/Win32/Win32Platform.cpp index 38564cceb..14f370dce 100644 --- a/Source/Engine/Platform/Win32/Win32Platform.cpp +++ b/Source/Engine/Platform/Win32/Win32Platform.cpp @@ -20,6 +20,8 @@ #include #pragma comment(lib, "Iphlpapi.lib") +static_assert(sizeof(int32) == sizeof(long), "Invalid long size for Interlocked and Atomic operations in Win32Platform."); + namespace { Guid DeviceId; @@ -239,59 +241,6 @@ void Win32Platform::MemoryBarrier() #endif } -int64 Win32Platform::InterlockedExchange(int64 volatile* dst, int64 exchange) -{ - return InterlockedExchange64(dst, exchange); -} - -int32 Win32Platform::InterlockedCompareExchange(int32 volatile* dst, int32 exchange, int32 comperand) -{ - static_assert(sizeof(int32) == sizeof(LONG), "Invalid LONG size."); - return _InterlockedCompareExchange((LONG volatile*)dst, exchange, comperand); -} - -int64 Win32Platform::InterlockedCompareExchange(int64 volatile* dst, int64 exchange, int64 comperand) -{ - return InterlockedCompareExchange64(dst, exchange, comperand); -} - -int64 Win32Platform::InterlockedIncrement(int64 volatile* dst) -{ - return InterlockedIncrement64(dst); -} - -int64 Win32Platform::InterlockedDecrement(int64 volatile* dst) -{ - return InterlockedDecrement64(dst); -} - -int64 Win32Platform::InterlockedAdd(int64 volatile* dst, int64 value) -{ - return InterlockedExchangeAdd64(dst, value); -} - -int32 Win32Platform::AtomicRead(int32 volatile* dst) -{ - static_assert(sizeof(int32) == sizeof(LONG), "Invalid LONG size."); - return _InterlockedCompareExchange((LONG volatile*)dst, 0, 0); -} - -int64 Win32Platform::AtomicRead(int64 volatile* dst) -{ - return InterlockedCompareExchange64(dst, 0, 0); -} - -void Win32Platform::AtomicStore(int32 volatile* dst, int32 value) -{ - static_assert(sizeof(int32) == sizeof(LONG), "Invalid LONG size."); - _InterlockedExchange((LONG volatile*)dst, value); -} - -void 
Win32Platform::AtomicStore(int64 volatile* dst, int64 value) -{ - InterlockedExchange64(dst, value); -} - void Win32Platform::Prefetch(void const* ptr) { _mm_prefetch((char const*)ptr, _MM_HINT_T0); @@ -387,11 +336,6 @@ uint64 Win32Platform::GetCurrentProcessId() return ::GetCurrentProcessId(); } -uint64 Win32Platform::GetCurrentThreadID() -{ - return ::GetCurrentThreadId(); -} - void Win32Platform::SetThreadPriority(ThreadPriority priority) { int32 winPriority; diff --git a/Source/Engine/Platform/Win32/Win32Platform.h b/Source/Engine/Platform/Win32/Win32Platform.h index 02a7d4045..4ed5dd616 100644 --- a/Source/Engine/Platform/Win32/Win32Platform.h +++ b/Source/Engine/Platform/Win32/Win32Platform.h @@ -5,6 +5,12 @@ #if PLATFORM_WIN32 #include "Engine/Platform/Base/PlatformBase.h" +#if _MSC_VER <= 1900 +#include <intrin.h> +#else +#include <intrin0.h> +#endif +extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId(void); /// <summary> /// The Win32 platform implementation and application management utilities.
@@ -17,16 +23,46 @@ public: static bool Init(); static void Exit(); static void MemoryBarrier(); - static int64 InterlockedExchange(int64 volatile* dst, int64 exchange); - static int32 InterlockedCompareExchange(int32 volatile* dst, int32 exchange, int32 comperand); - static int64 InterlockedCompareExchange(int64 volatile* dst, int64 exchange, int64 comperand); - static int64 InterlockedIncrement(int64 volatile* dst); - static int64 InterlockedDecrement(int64 volatile* dst); - static int64 InterlockedAdd(int64 volatile* dst, int64 value); - static int32 AtomicRead(int32 volatile* dst); - static int64 AtomicRead(int64 volatile* dst); - static void AtomicStore(int32 volatile* dst, int32 value); - static void AtomicStore(int64 volatile* dst, int64 value); + static int64 InterlockedExchange(int64 volatile* dst, int64 exchange) + { + return _InterlockedExchange64(dst, exchange); + } + static int32 InterlockedCompareExchange(int32 volatile* dst, int32 exchange, int32 comperand) + { + return _InterlockedCompareExchange((long volatile*)dst, exchange, comperand); + } + static int64 InterlockedCompareExchange(int64 volatile* dst, int64 exchange, int64 comperand) + { + return _InterlockedCompareExchange64(dst, exchange, comperand); + } + static int64 InterlockedIncrement(int64 volatile* dst) + { + return _InterlockedExchangeAdd64(dst, 1) + 1; + } + static int64 InterlockedDecrement(int64 volatile* dst) + { + return _InterlockedExchangeAdd64(dst, -1) - 1; + } + static int64 InterlockedAdd(int64 volatile* dst, int64 value) + { + return _InterlockedExchangeAdd64(dst, value); + } + static int32 AtomicRead(int32 volatile* dst) + { + return (int32)_InterlockedCompareExchange((long volatile*)dst, 0, 0); + } + static int64 AtomicRead(int64 volatile* dst) + { + return _InterlockedCompareExchange64(dst, 0, 0); + } + static void AtomicStore(int32 volatile* dst, int32 value) + { + _InterlockedExchange((long volatile*)dst, value); + } + static void AtomicStore(int64 volatile* dst, int64 
value) + { + _InterlockedExchange64(dst, value); + } static void Prefetch(void const* ptr); static void* Allocate(uint64 size, uint64 alignment); static void Free(void* ptr); @@ -38,7 +74,10 @@ public: static MemoryStats GetMemoryStats(); static ProcessMemoryStats GetProcessMemoryStats(); static uint64 GetCurrentProcessId(); - static uint64 GetCurrentThreadID(); + static uint64 GetCurrentThreadID() + { + return GetCurrentThreadId(); + } static void SetThreadPriority(ThreadPriority priority); static void SetThreadAffinityMask(uint64 affinityMask); static void Sleep(int32 milliseconds);