Merge branch 'master' into Improve-HighlightedPopUpColor

2026-01-12 22:56:49 +01:00
parent b171071893 3cfc5db54a
commit ec0877004c
112 changed files with 7452 additions and 2382 deletions
--- a/Content/Editor/Camera/M_Camera.flax
+++ b/Content/Editor/Camera/M_Camera.flax
--- a/Content/Editor/CubeTexturePreviewMaterial.flax
+++ b/Content/Editor/CubeTexturePreviewMaterial.flax
--- a/Content/Editor/DebugMaterials/DDGIDebugProbes.flax
+++ b/Content/Editor/DebugMaterials/DDGIDebugProbes.flax
--- a/Content/Editor/DebugMaterials/SingleColor/Decal.flax
+++ b/Content/Editor/DebugMaterials/SingleColor/Decal.flax
--- a/Content/Editor/DebugMaterials/SingleColor/Particle.flax
+++ b/Content/Editor/DebugMaterials/SingleColor/Particle.flax
--- a/Content/Editor/DebugMaterials/SingleColor/Surface.flax
+++ b/Content/Editor/DebugMaterials/SingleColor/Surface.flax
--- a/Content/Editor/DebugMaterials/SingleColor/SurfaceAdditive.flax
+++ b/Content/Editor/DebugMaterials/SingleColor/SurfaceAdditive.flax
--- a/Content/Editor/DebugMaterials/SingleColor/Terrain.flax
+++ b/Content/Editor/DebugMaterials/SingleColor/Terrain.flax
--- a/Content/Editor/DefaultFontMaterial.flax
+++ b/Content/Editor/DefaultFontMaterial.flax
--- a/Content/Editor/Gizmo/FoliageBrushMaterial.flax
+++ b/Content/Editor/Gizmo/FoliageBrushMaterial.flax
--- a/Content/Editor/Gizmo/Material.flax
+++ b/Content/Editor/Gizmo/Material.flax
--- a/Content/Editor/Gizmo/MaterialWire.flax
+++ b/Content/Editor/Gizmo/MaterialWire.flax
--- a/Content/Editor/Gizmo/SelectionOutlineMaterial.flax
+++ b/Content/Editor/Gizmo/SelectionOutlineMaterial.flax
--- a/Content/Editor/Gizmo/VertexColorsPreviewMaterial.flax
+++ b/Content/Editor/Gizmo/VertexColorsPreviewMaterial.flax
--- a/Content/Editor/Highlight
+++ b/Content/Editor/Highlight
--- a/Content/Editor/Icons/IconsMaterial.flax
+++ b/Content/Editor/Icons/IconsMaterial.flax
--- a/Content/Editor/IesProfilePreviewMaterial.flax
+++ b/Content/Editor/IesProfilePreviewMaterial.flax
--- a/Content/Editor/Particles/Particle
+++ b/Content/Editor/Particles/Particle
--- a/Content/Editor/Particles/Smoke
+++ b/Content/Editor/Particles/Smoke
--- a/Content/Editor/SpriteMaterial.flax
+++ b/Content/Editor/SpriteMaterial.flax
--- a/Content/Editor/Terrain/Circle
+++ b/Content/Editor/Terrain/Circle
--- a/Content/Editor/Terrain/Highlight
+++ b/Content/Editor/Terrain/Highlight
--- a/Content/Editor/TexturePreviewMaterial.flax
+++ b/Content/Editor/TexturePreviewMaterial.flax
--- a/Content/Editor/Wires
+++ b/Content/Editor/Wires
--- a/Content/Engine/DefaultDeformableMaterial.flax
+++ b/Content/Engine/DefaultDeformableMaterial.flax
--- a/Content/Engine/DefaultMaterial.flax
+++ b/Content/Engine/DefaultMaterial.flax
--- a/Content/Engine/DefaultRadialMenu.flax
+++ b/Content/Engine/DefaultRadialMenu.flax
--- a/Content/Engine/DefaultTerrainMaterial.flax
+++ b/Content/Engine/DefaultTerrainMaterial.flax
--- a/Content/Engine/SingleColorMaterial.flax
+++ b/Content/Engine/SingleColorMaterial.flax
--- a/Content/Engine/SkyboxMaterial.flax
+++ b/Content/Engine/SkyboxMaterial.flax
--- a/Content/Shaders/GI/DDGI.flax
+++ b/Content/Shaders/GI/DDGI.flax
--- a/Content/Shaders/GI/GlobalSurfaceAtlas.flax
+++ b/Content/Shaders/GI/GlobalSurfaceAtlas.flax
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0f34bf867df5f4296ca66ac691c2bca4efa168fb9e21ca4e613e8086669575cf
-size 13296
+oid sha256:615dff65b01507be6c4de722e126324aba20fc197f8e12dafaa94a05e46cba6e
+size 13222
--- a/Content/Shaders/GlobalSignDistanceField.flax
+++ b/Content/Shaders/GlobalSignDistanceField.flax
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:064f54786958f109222c49cbc0358ff4f345b30010fcd5e8cc1fab7bdc68c4fe
-size 13349
+oid sha256:1f07ebb16820897e8598ae7a0627cb75b3d28e9dceea3ad4bd9ff543d5cdd01c
+size 13979
--- a/Flax.flaxproj
+++ b/Flax.flaxproj
@@ -4,7 +4,7 @@
    "Major": 1,
    "Minor": 11,
    "Revision": 0,
-    "Build": 6805
+    "Build": 6806
  },
  "Company": "Flax",
  "Copyright": "Copyright (c) 2012-2025 Wojciech Figat. All rights reserved.",
--- a/Source/Editor/CustomEditors/Dedicated/ScriptsEditor.cs
+++ b/Source/Editor/CustomEditors/Dedicated/ScriptsEditor.cs
@@ -909,7 +909,8 @@ namespace FlaxEditor.CustomEditors.Dedicated
                settingsButton.Tag = script;
                settingsButton.Clicked += OnSettingsButtonClicked;

-                group.Panel.HeaderTextMargin = new Margin(scriptDrag.Right - 12, 15, 2, 2);
+                // Adjust margin to not overlap with other ui elements in the header
+                group.Panel.HeaderTextMargin = group.Panel.HeaderTextMargin with { Left = scriptDrag.Right - 12, Right = settingsButton.Width + Utilities.Constants.UIMargin };
                group.Object(values, editor);
                // Remove drop down arrows and containment lines if no objects in the group
                if (group.Children.Count == 0)
--- a/Source/Editor/CustomEditors/Editors/CollectionEditor.cs
+++ b/Source/Editor/CustomEditors/Editors/CollectionEditor.cs
@@ -450,6 +450,7 @@ namespace FlaxEditor.CustomEditors.Editors
        protected bool NotNullItems;

        private IntValueBox _sizeBox;
+        private Label _label;
        private Color _background;
        private int _elementsCount, _minCount, _maxCount;
        private bool _readOnly;
@@ -566,7 +567,7 @@ namespace FlaxEditor.CustomEditors.Editors
                    Parent = dropPanel,
                };

-                var label = new Label
+                _label = new Label
                {
                    Text = "Size",
                    AnchorPreset = AnchorPresets.TopRight,
@@ -672,8 +673,10 @@ namespace FlaxEditor.CustomEditors.Editors
                    Resize(Count + 1);
                };
            }
-        }

+            Layout.ContainerControl.SizeChanged += OnLayoutSizeChanged;
+        }
+        
        private void OnSetupContextMenu(ContextMenu menu, DropPanel panel)
        {
            if (menu.Items.Any(x => x is ContextMenuButton b && b.Text.Equals("Open All", StringComparison.Ordinal)))
@@ -696,10 +699,24 @@ namespace FlaxEditor.CustomEditors.Editors
            });
        }

+        /// <summary>
+        /// Hides the "Size" label (by making its text transparent) while the header title is wide enough to reach it.
+        /// </summary>
+        private void OnLayoutSizeChanged(Control control)
+        {
+            // _label is only assigned inside a nested branch of the setup code, so it can still be unset here
+            if (_label == null || Layout.ContainerControl is not DropPanel dropPanel)
+                return;
+            var headerTextSize = dropPanel.HeaderTextFont.GetFont().MeasureText(dropPanel.HeaderText);
+            var overlaps = headerTextSize.X + DropPanel.DropDownIconSize >= _label.Left;
+            _label.TextColor = _label.TextColorHighlighted = overlaps ? Color.Transparent : FlaxEngine.GUI.Style.Current.Foreground;
+        }
+
        /// <inheritdoc />
        protected override void Deinitialize()
        {
            _sizeBox = null;
+            Layout.ContainerControl.SizeChanged -= OnLayoutSizeChanged;

            base.Deinitialize();
        }
--- a/Source/Editor/CustomEditors/Elements/Container/GroupElement.cs
+++ b/Source/Editor/CustomEditors/Elements/Container/GroupElement.cs
@@ -44,7 +44,8 @@ namespace FlaxEditor.CustomEditors.Elements
        {
            var style = Style.Current;
            var settingsButtonSize = Panel.HeaderHeight;
-            return new Image
+            Panel.HeaderTextMargin = Panel.HeaderTextMargin with { Right = settingsButtonSize + Utilities.Constants.UIMargin }; // Reserve room right of the header text for the settings button
+            return new Image
            {
                TooltipText = "Settings",
                AutoFocus = true,
--- a/Source/Editor/GUI/Input/ValueBox.cs
+++ b/Source/Editor/GUI/Input/ValueBox.cs
@@ -99,6 +99,11 @@ namespace FlaxEditor.GUI.Input
        /// </summary>
        public event Action SlidingEnd;

+        /// <summary>
+        /// If enabled, pressing the arrow up or down key increments/decrements the value.
+        /// </summary>
+        public bool ArrowKeysIncrement = true;
+
        /// <summary>
        /// Gets or sets the slider speed. Use value 0 to disable and hide slider UI.
        /// </summary>
@@ -239,6 +244,27 @@ namespace FlaxEditor.GUI.Input
            ResetViewOffset();
        }

+        /// <inheritdoc />
+        public override bool OnKeyDown(KeyboardKeys key)
+        {
+            // Anything other than an up/down arrow press with the feature enabled goes to the base implementation
+            bool isArrowUp = key == KeyboardKeys.ArrowUp;
+            if (!ArrowKeysIncrement || !(isArrowUp || key == KeyboardKeys.ArrowDown))
+                return base.OnKeyDown(key);
+
+            // Step size by modifier: Alt takes priority over Shift, and Shift over Control
+            bool alt = Root.GetKey(KeyboardKeys.Alt), shift = Root.GetKey(KeyboardKeys.Shift), control = Root.GetKey(KeyboardKeys.Control);
+            float magnitude = alt ? 0.1f : shift ? 10f : control ? 100f : 1f;
+            float slideDelta = isArrowUp ? magnitude : -magnitude;
+
+            // Apply the step through the sliding path, starting from the current value
+            _startSlideValue = Value;
+            ApplySliding(slideDelta);
+            EndSliding();
+            Focus();
+            return true;
+        }
+
        /// <inheritdoc />
        public override bool OnMouseDown(Float2 location, MouseButton button)
        {
--- a/Source/Editor/Modules/UIModule.cs
+++ b/Source/Editor/Modules/UIModule.cs
@@ -125,6 +125,7 @@ namespace FlaxEditor.Modules
        private ContextMenuButton _menuToolsProfilerWindow;
        private ContextMenuButton _menuToolsSetTheCurrentSceneViewAsDefault;
        private ContextMenuButton _menuToolsTakeScreenshot;
+        private ContextMenuButton _menuToolsOpenLocalFolder;
        private ContextMenuChildMenu _menuWindowApplyWindowLayout;

        private ToolStripButton _toolStripSaveAll;
@@ -725,6 +726,16 @@ namespace FlaxEditor.Modules
            _menuToolsTakeScreenshot = cm.AddButton("Take screenshot", inputOptions.TakeScreenshot, Editor.Windows.TakeScreenshot);
            cm.AddSeparator();
            cm.AddButton("Plugins", () => Editor.Windows.PluginsWin.Show());
+            cm.AddSeparator();
+            var childMenu = cm.AddChildMenu("Open Product Local folder");
+            childMenu.ContextMenu.AddButton("Editor", () => FileSystem.ShowFileExplorer(Globals.ProductLocalFolder));
+            _menuToolsOpenLocalFolder = childMenu.ContextMenu.AddButton("Game", () =>
+            {
+                string localAppData = Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData);
+                GameSettings settings = GameSettings.Load<GameSettings>();
+                string path = Path.Combine(localAppData, settings.CompanyName, settings.ProductName);
+                FileSystem.ShowFileExplorer(path);
+            });

            // Window
            MenuWindow = MainMenu.AddButton("Window");
@@ -1062,6 +1073,10 @@ namespace FlaxEditor.Modules
            _menuToolsBuildNavMesh.Enabled = canEdit;
            _menuToolsCancelBuilding.Enabled = GameCooker.IsRunning;
            _menuToolsSetTheCurrentSceneViewAsDefault.Enabled = Level.ScenesCount > 0;
+            string localAppData = Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData);
+            GameSettings settings = GameSettings.Load<GameSettings>();
+            string path = Path.Combine(localAppData, settings.CompanyName, settings.ProductName);
+            _menuToolsOpenLocalFolder.Enabled = Directory.Exists(path);

            c.PerformLayout();
        }
--- a/Source/Editor/Options/InputOptions.cs
+++ b/Source/Editor/Options/InputOptions.cs
@@ -571,6 +571,10 @@ namespace FlaxEditor.Options
        [EditorDisplay("View Flags"), EditorOrder(3260)]
        public InputBinding DebugDraw = new InputBinding(KeyboardKeys.Alpha4, KeyboardKeys.Control, KeyboardKeys.Shift);

+        [DefaultValue(typeof(InputBinding), "None")]
+        [EditorDisplay("View Flags"), EditorOrder(3270)]
+        public InputBinding Particles = new InputBinding(KeyboardKeys.None);
+
        #endregion

        #region Interface
--- a/Source/Editor/SceneGraph/Actors/BoxColliderNode.cs
+++ b/Source/Editor/SceneGraph/Actors/BoxColliderNode.cs
@@ -42,6 +42,7 @@ namespace FlaxEditor.SceneGraph.Actors
                if (value is BoxCollider collider)
                    collider.AutoResize(!_keepLocalOrientation);
            }
+            Presenter.OnModified();
        }
    }

--- a/Source/Editor/Utilities/ShuntingYardParser.cs
+++ b/Source/Editor/Utilities/ShuntingYardParser.cs
@@ -444,6 +444,9 @@ namespace FlaxEditor.Utilities
        /// <returns>The result value.</returns>
        public static double Parse(string text)
        {
+            // Hack to allow parsing numbers while using "_" as a separator (like this: 1_000)
+            text = text.Replace("_", string.Empty);
+
            var tokens = Tokenize(text);
            var rpn = OrderTokens(tokens);
            return EvaluateRPN(rpn);
--- a/Source/Editor/Viewport/EditorViewport.cs
+++ b/Source/Editor/Viewport/EditorViewport.cs
@@ -1063,6 +1063,7 @@ namespace FlaxEditor.Viewport
            InputActions.Add(options => options.Fog, () => Task.ViewFlags ^= ViewFlags.Fog);
            InputActions.Add(options => options.SpecularLight, () => Task.ViewFlags ^= ViewFlags.SpecularLight);
            InputActions.Add(options => options.Decals, () => Task.ViewFlags ^= ViewFlags.Decals);
+            InputActions.Add(options => options.Particles, () => Task.ViewFlags ^= ViewFlags.Particles);
            InputActions.Add(options => options.CustomPostProcess, () => Task.ViewFlags ^= ViewFlags.CustomPostProcess);
            InputActions.Add(options => options.Bloom, () => Task.ViewFlags ^= ViewFlags.Bloom);
            InputActions.Add(options => options.ToneMapping, () => Task.ViewFlags ^= ViewFlags.ToneMapping);
@@ -2115,6 +2116,7 @@ namespace FlaxEditor.Viewport
            new ViewFlagOptions(ViewFlags.Fog, "Fog", Editor.Instance.Options.Options.Input.Fog),
            new ViewFlagOptions(ViewFlags.SpecularLight, "Specular Light", Editor.Instance.Options.Options.Input.SpecularLight),
            new ViewFlagOptions(ViewFlags.Decals, "Decals", Editor.Instance.Options.Options.Input.Decals),
+            new ViewFlagOptions(ViewFlags.Particles, "Particles", Editor.Instance.Options.Options.Input.Particles),
            new ViewFlagOptions(ViewFlags.CustomPostProcess, "Custom Post Process", Editor.Instance.Options.Options.Input.CustomPostProcess),
            new ViewFlagOptions(ViewFlags.Bloom, "Bloom", Editor.Instance.Options.Options.Input.Bloom),
            new ViewFlagOptions(ViewFlags.ToneMapping, "Tone Mapping", Editor.Instance.Options.Options.Input.ToneMapping),
@@ -2134,12 +2136,13 @@ namespace FlaxEditor.Viewport
            if (cm.Visible == false)
                return;
            var ccm = (ContextMenu)cm;
+            var flags = Task.View.Flags;
            foreach (var e in ccm.Items)
            {
                if (e is ContextMenuButton b && b.Tag != null)
                {
                    var v = (ViewFlags)b.Tag;
-                    b.Icon = (Task.View.Flags & v) != 0 ? Style.Current.CheckBoxTick : SpriteHandle.Invalid;
+                    b.Icon = (flags & v) != 0 ? Style.Current.CheckBoxTick : SpriteHandle.Invalid;
                }
            }
        }
--- a/Source/Editor/Windows/EditorOptionsWindow.cs
+++ b/Source/Editor/Windows/EditorOptionsWindow.cs
@@ -45,7 +45,7 @@ namespace FlaxEditor.Windows
            {
                Parent = this
            };
-            _saveButton = (ToolStripButton)toolstrip.AddButton(editor.Icons.Save64, SaveData).LinkTooltip("Save");
+            _saveButton = (ToolStripButton)toolstrip.AddButton(editor.Icons.Save64, SaveData).LinkTooltip("Save.");
            _saveButton.Enabled = false;

            _tabs = new Tabs
@@ -104,6 +104,8 @@ namespace FlaxEditor.Windows
            {
                _saveButton.Enabled = true;
                _isDataDirty = true;
+                if (!Title.EndsWith('*'))
+                    Title += "*";
            }
        }

@@ -113,6 +115,8 @@ namespace FlaxEditor.Windows
            {
                _saveButton.Enabled = false;
                _isDataDirty = false;
+                if (Title.EndsWith('*'))
+                    Title = Title.Remove(Title.Length - 1);
            }
        }

--- a/Source/Engine/Content/Assets/Material.cpp
+++ b/Source/Engine/Content/Assets/Material.cpp
@@ -41,6 +41,35 @@ bool Material::IsMaterialInstance() const
    return false;
 }

+#if USE_EDITOR
+
+void Material::GetReferences(Array<Guid>& assets, Array<String>& files) const
+{
+    // Start with the references gathered by the base shader asset type
+    ShaderAssetTypeBase<MaterialBase>::GetReferences(assets, files);
+
+    // The remaining references come from the material graph, which needs the loaded asset and its surface chunk
+    // (each of the calls below is only treated as successful when it returns false)
+    if (WaitForLoaded() || !HasChunk(SHADER_FILE_CHUNK_VISJECT_SURFACE))
+        return;
+
+    // Hold the asset lock while the chunk data is loaded and read
+    ScopeLock lock(Locker);
+    if (LoadChunks(GET_CHUNK_FLAG(SHADER_FILE_CHUNK_VISJECT_SURFACE)))
+        return;
+    const auto surfaceChunk = GetChunk(SHADER_FILE_CHUNK_VISJECT_SURFACE);
+    if (!surfaceChunk)
+        return;
+
+    // Deserialize the graph from the chunk memory and collect the assets it references
+    MemoryReadStream stream(surfaceChunk->Get(), surfaceChunk->Size());
+    MaterialGraph graph;
+    if (!graph.Load(&stream, false))
+        graph.GetReferences(assets);
+}
+
+#endif
+
 const MaterialInfo& Material::GetInfo() const
 {
    if (_materialShader)
--- a/Source/Engine/Content/Assets/Material.h
+++ b/Source/Engine/Content/Assets/Material.h
@@ -38,6 +38,9 @@ public:
 public:
    // [MaterialBase]
    bool IsMaterialInstance() const override;
+#if USE_EDITOR
+    void GetReferences(Array<Guid>& assets, Array<String>& files) const override;
+#endif

    // [IMaterial]
    const MaterialInfo& GetInfo() const override;
--- a/Source/Engine/Debug/DebugDraw.cpp
+++ b/Source/Engine/Debug/DebugDraw.cpp
@@ -490,6 +490,18 @@ FORCE_INLINE DebugTriangle* AppendTriangles(int32 count, float duration, bool de
    return list->Get() + startIndex;
 }

+FORCE_INLINE DebugTriangle* AppendWireTriangles(int32 count, float duration, bool depthTest)
+{
+    // Pick the list: depth-tested or default draw data first, then the timed or one-frame wire triangles
+    Array<DebugTriangle>* target = depthTest
+        ? (duration > 0 ? &Context->DebugDrawDepthTest.DefaultWireTriangles : &Context->DebugDrawDepthTest.OneFrameWireTriangles)
+        : (duration > 0 ? &Context->DebugDrawDefault.DefaultWireTriangles : &Context->DebugDrawDefault.OneFrameWireTriangles);
+    // Append uninitialized slots and hand back a pointer to the first new one for the caller to fill
+    const int32 firstNew = target->Count();
+    target->AddUninitialized(count);
+    return target->Get() + firstNew;
+}
+
 inline void DrawText3D(const DebugText3D& t, const RenderContext& renderContext, const Float3& viewUp, const Matrix& f, const Matrix& vp, const Viewport& viewport, GPUContext* context, GPUTextureView* target, GPUTextureView* depthBuffer)
 {
    Matrix w, fw, m;
@@ -1714,7 +1726,7 @@ void DebugDraw::DrawWireTriangles(const Span<Float3>& vertices, const Color& col
    DebugTriangle t;
    t.Color = Color32(color);
    t.TimeLeft = duration;
-    auto dst = AppendTriangles(vertices.Length() / 3, duration, depthTest);
+    auto dst = AppendWireTriangles(vertices.Length() / 3, duration, depthTest);
    const Float3 origin = Context->Origin;
    for (int32 i = 0; i < vertices.Length();)
    {
@@ -1736,7 +1748,7 @@ void DebugDraw::DrawWireTriangles(const Span<Float3>& vertices, const Span<int32
    DebugTriangle t;
    t.Color = Color32(color);
    t.TimeLeft = duration;
-    auto dst = AppendTriangles(indices.Length() / 3, duration, depthTest);
+    auto dst = AppendWireTriangles(indices.Length() / 3, duration, depthTest);
    const Float3 origin = Context->Origin;
    for (int32 i = 0; i < indices.Length();)
    {
@@ -1758,7 +1770,7 @@ void DebugDraw::DrawWireTriangles(const Span<Double3>& vertices, const Color& co
    DebugTriangle t;
    t.Color = Color32(color);
    t.TimeLeft = duration;
-    auto dst = AppendTriangles(vertices.Length() / 3, duration, depthTest);
+    auto dst = AppendWireTriangles(vertices.Length() / 3, duration, depthTest);
    const Double3 origin = Context->Origin;
    for (int32 i = 0; i < vertices.Length();)
    {
@@ -1780,7 +1792,7 @@ void DebugDraw::DrawWireTriangles(const Span<Double3>& vertices, const Span<int3
    DebugTriangle t;
    t.Color = Color32(color);
    t.TimeLeft = duration;
-    auto dst = AppendTriangles(indices.Length() / 3, duration, depthTest);
+    auto dst = AppendWireTriangles(indices.Length() / 3, duration, depthTest);
    const Double3 origin = Context->Origin;
    for (int32 i = 0; i < indices.Length();)
    {
--- a/Source/Engine/Graphics/Enums.h
+++ b/Source/Engine/Graphics/Enums.h
@@ -1075,20 +1075,25 @@ API_ENUM(Attributes="Flags") enum class ViewFlags : uint64
    /// </summary>
    LightsDebug = 1 << 27,

+    /// <summary>
+    /// Shows/hides particle effects.
+    /// </summary>
+    Particles = 1 << 28,
+
    /// <summary>
    /// Default flags for Game.
    /// </summary>
-    DefaultGame = Reflections | DepthOfField | Fog | Decals | MotionBlur | SSR | AO | GI | DirectionalLights | PointLights | SpotLights | SkyLights | Shadows | SpecularLight | AntiAliasing | CustomPostProcess | Bloom | ToneMapping | EyeAdaptation | CameraArtifacts | LensFlares | ContactShadows | GlobalSDF | Sky,
+    DefaultGame = Reflections | DepthOfField | Fog | Decals | MotionBlur | SSR | AO | GI | DirectionalLights | PointLights | SpotLights | SkyLights | Shadows | SpecularLight | AntiAliasing | CustomPostProcess | Bloom | ToneMapping | EyeAdaptation | CameraArtifacts | LensFlares | ContactShadows | GlobalSDF | Sky | Particles,

    /// <summary>
    /// Default flags for Editor.
    /// </summary>
-    DefaultEditor = Reflections | Fog | Decals | DebugDraw | SSR | AO | GI | DirectionalLights | PointLights | SpotLights | SkyLights | Shadows | SpecularLight | AntiAliasing | CustomPostProcess | Bloom | ToneMapping | EyeAdaptation | CameraArtifacts | LensFlares | EditorSprites | ContactShadows | GlobalSDF | Sky,
+    DefaultEditor = Reflections | Fog | Decals | DebugDraw | SSR | AO | GI | DirectionalLights | PointLights | SpotLights | SkyLights | Shadows | SpecularLight | AntiAliasing | CustomPostProcess | Bloom | ToneMapping | EyeAdaptation | CameraArtifacts | LensFlares | EditorSprites | ContactShadows | GlobalSDF | Sky | Particles,

    /// <summary>
    /// Default flags for materials/models previews generating.
    /// </summary>
-    DefaultAssetPreview = Reflections | Decals | DirectionalLights | PointLights | SpotLights | SkyLights | SpecularLight | AntiAliasing | Bloom | ToneMapping | EyeAdaptation | CameraArtifacts | LensFlares | ContactShadows | Sky,
+    DefaultAssetPreview = Reflections | Decals | DirectionalLights | PointLights | SpotLights | SkyLights | SpecularLight | AntiAliasing | Bloom | ToneMapping | EyeAdaptation | CameraArtifacts | LensFlares | ContactShadows | Sky | Particles,
 };

 DECLARE_ENUM_OPERATORS(ViewFlags);
--- a/Source/Engine/Graphics/Materials/MaterialShader.h
+++ b/Source/Engine/Graphics/Materials/MaterialShader.h
@@ -10,7 +10,7 @@
 /// <summary>
 /// Current materials shader version.
 /// </summary>
-#define MATERIAL_GRAPH_VERSION 178
+#define MATERIAL_GRAPH_VERSION 179

 class Material;
 class GPUShader;
--- a/Source/Engine/Graphics/Materials/MaterialShaderFeatures.cpp
+++ b/Source/Engine/Graphics/Materials/MaterialShaderFeatures.cpp
@@ -191,7 +191,7 @@ bool GlobalIlluminationFeature::Bind(MaterialShader::BindParameters& params, Spa
    {
        // Unbind SRVs to prevent issues
        data.DDGI.CascadesCount = 0;
-        data.DDGI.FallbackIrradiance = Float3::Zero;
+        data.DDGI.FallbackIrradiance = Float4::Zero;
        params.GPUContext->UnBindSR(srv + 0);
        params.GPUContext->UnBindSR(srv + 1);
        params.GPUContext->UnBindSR(srv + 2);
--- a/Source/Engine/Graphics/Models/MeshAccessor.h
+++ b/Source/Engine/Graphics/Models/MeshAccessor.h
@@ -17,7 +17,7 @@ public:
    /// <summary>
    /// Mesh data stream.
    /// </summary>
-    struct Stream
+    struct FLAXENGINE_API Stream
    {
        friend MeshAccessor;

--- a/Source/Engine/Graphics/PostProcessSettings.h
+++ b/Source/Engine/Graphics/PostProcessSettings.h
@@ -378,7 +378,7 @@ API_STRUCT() struct FLAXENGINE_API GlobalIlluminationSettings : ISerializable
    /// The irradiance lighting outside the GI range used as a fallback to prevent pure-black scene outside the Global Illumination range.
    /// </summary>
    API_FIELD(Attributes="EditorOrder(40), PostProcessSetting((int)GlobalIlluminationSettingsOverride.FallbackIrradiance)")
-    Color FallbackIrradiance = Color::Black;
+    Color FallbackIrradiance = Color::Transparent;

 public:
    /// <summary>
--- a/Source/Engine/Graphics/RenderTools.cpp
+++ b/Source/Engine/Graphics/RenderTools.cpp
@@ -620,6 +620,40 @@ void RenderTools::ComputeSphereModelDrawMatrix(const RenderView& view, const Flo
    resultIsViewInside = Float3::DistanceSquared(view.Position, position) < Math::Square(radius * 1.1f); // Manually tweaked bias
 }

+Float3 RenderTools::GetColorQuantizationError(PixelFormat format)
+{
+    Float3 mantissaBits; // Per-channel (R, G, B) bit counts; the returned error is 0.5^bits for each channel
+    switch (format)
+    {
+    case PixelFormat::R11G11B10_Float:
+        mantissaBits = Float3(6, 6, 5);
+        break;
+    case PixelFormat::R10G10B10A2_UNorm:
+        mantissaBits = Float3(10, 10, 10);
+        break;
+    case PixelFormat::R16G16B16A16_Float:
+        mantissaBits = Float3(16, 16, 16); // NOTE(review): a 16-bit half float has a 10-bit mantissa - confirm 16 is intended
+        break;
+    case PixelFormat::R32G32B32A32_Float:
+        mantissaBits = Float3(23, 23, 23);
+        break;
+    case PixelFormat::R9G9B9E5_SharedExp:
+        mantissaBits = Float3(5, 6, 5); // NOTE(review): R9G9B9E5 stores 9 mantissa bits per channel - confirm (5, 6, 5) is intended
+        break;
+    case PixelFormat::R8G8B8A8_UNorm:
+    case PixelFormat::B8G8R8A8_UNorm:
+        mantissaBits = Float3(8, 8, 8);
+        break;
+    default:
+        return Float3::Zero; // Formats not listed above report zero error
+    }
+    return { // 0.5^bits is 2^-bits, evaluated separately for each channel
+        Math::Pow(0.5f, mantissaBits.X),
+        Math::Pow(0.5f, mantissaBits.Y),
+        Math::Pow(0.5f, mantissaBits.Z)
+    };
+}
+
 int32 MipLevelsCount(int32 width)
 {
    int32 result = 1;
--- a/Source/Engine/Graphics/RenderTools.h
+++ b/Source/Engine/Graphics/RenderTools.h
@@ -140,6 +140,9 @@ public:
    static void CalculateTangentFrame(Float3& resultNormal, Float4& resultTangent, const Float3& normal, const Float3& tangent);

    static void ComputeSphereModelDrawMatrix(const RenderView& view, const Float3& position, float radius, Matrix& resultWorld, bool& resultIsViewInside);
+
+    // Calculates error for a given render target format to reduce floating-point precision artifacts via QuantizeColor (from Noise.hlsl).
+    static Float3 GetColorQuantizationError(PixelFormat format);
 };

 // Calculates mip levels count for a texture 1D.
--- a/Source/Engine/Input/Input.cpp
+++ b/Source/Engine/Input/Input.cpp
@@ -80,6 +80,8 @@ Delegate<const Float2&, MouseButton> Input::MouseDoubleClick;
 Delegate<const Float2&, float> Input::MouseWheel;
 Delegate<const Float2&> Input::MouseMove;
 Action Input::MouseLeave;
+Delegate<InputGamepadIndex, GamepadButton> Input::GamepadButtonDown;
+Delegate<InputGamepadIndex, GamepadButton> Input::GamepadButtonUp;
 Delegate<const Float2&, int32> Input::TouchDown;
 Delegate<const Float2&, int32> Input::TouchMove;
 Delegate<const Float2&, int32> Input::TouchUp;
@@ -1027,6 +1029,19 @@ void InputService::Update()
            break;
        }
    }
+    // TODO: route gamepad button events into global InputEvents queue to improve processing
+    for (int32 i = 0; i < Input::Gamepads.Count(); i++)
+    {
+        auto gamepad = Input::Gamepads[i];
+        for (int32 buttonIdx = 1; buttonIdx < (int32)GamepadButton::MAX; buttonIdx++)
+        {
+            GamepadButton button = (GamepadButton)buttonIdx;
+            if (gamepad->GetButtonDown(button))
+                Input::GamepadButtonDown((InputGamepadIndex)i, button);
+            else if (gamepad->GetButtonUp(button))
+                Input::GamepadButtonUp((InputGamepadIndex)i, button);
+        }
+    }

    // Update all actions
    for (int32 i = 0; i < Input::ActionMappings.Count(); i++)
--- a/Source/Engine/Input/Input.h
+++ b/Source/Engine/Input/Input.h
@@ -113,6 +113,16 @@ public:
    /// </summary>
    API_EVENT() static Action MouseLeave;

+    /// <summary>
+    /// Event fired when gamepad button goes down.
+    /// </summary>
+    API_EVENT() static Delegate<InputGamepadIndex, GamepadButton> GamepadButtonDown;
+
+    /// <summary>
+    /// Event fired when gamepad button goes up.
+    /// </summary>
+    API_EVENT() static Delegate<InputGamepadIndex, GamepadButton> GamepadButtonUp;
+
    /// <summary>
    /// Event fired when touch action begins.
    /// </summary>
--- a/Source/Engine/Particles/ParticleEffect.cpp
+++ b/Source/Engine/Particles/ParticleEffect.cpp
@@ -601,7 +601,9 @@ bool ParticleEffect::HasContentLoaded() const

 void ParticleEffect::Draw(RenderContext& renderContext)
 {
-    if (renderContext.View.Pass == DrawPass::GlobalSDF || renderContext.View.Pass == DrawPass::GlobalSurfaceAtlas)
+    if (renderContext.View.Pass == DrawPass::GlobalSDF || 
+        renderContext.View.Pass == DrawPass::GlobalSurfaceAtlas ||
+        EnumHasNoneFlags(renderContext.View.Flags, ViewFlags::Particles))
        return;
    _lastMinDstSqr = Math::Min(_lastMinDstSqr, Vector3::DistanceSquared(GetPosition(), renderContext.View.WorldPosition));
    RenderContextBatch renderContextBatch(renderContext);
@@ -610,10 +612,12 @@ void ParticleEffect::Draw(RenderContext& renderContext)

 void ParticleEffect::Draw(RenderContextBatch& renderContextBatch)
 {
+    const RenderView& mainView = renderContextBatch.GetMainContext().View;
+    if (EnumHasNoneFlags(mainView.Flags, ViewFlags::Particles))
+        return;
    Particles::DrawParticles(renderContextBatch, this);

    // Cull again against the main context (if using multiple ones) to skip caching draw distance from shadow projections
-    const RenderView& mainView = renderContextBatch.GetMainContext().View;
    const BoundingSphere bounds(_sphere.Center - mainView.Origin, _sphere.Radius);
    if (renderContextBatch.Contexts.Count() > 1 && !mainView.CullingFrustum.Intersects(bounds))
        return;
--- a/Source/Engine/Physics/Colliders/BoxCollider.cpp
+++ b/Source/Engine/Physics/Colliders/BoxCollider.cpp
@@ -23,15 +23,15 @@ void BoxCollider::SetSize(const Float3& value)
 void BoxCollider::AutoResize(bool globalOrientation = true)
 {
    Actor* parent = GetParent();
-    if (Cast<Scene>(parent))
+    if (parent == nullptr || Cast<Scene>(parent))
        return;

    // Get bounds of all siblings (excluding itself)
    const Vector3 parentScale = parent->GetScale();
    if (parentScale.IsAnyZero())
-        return; // Avoid division by zero
+        return;

-    // Hacky way to get unrotated bounded box of parent.
+    // Hacky way to get unrotated bounded box of parent
    const Quaternion parentOrientation = parent->GetOrientation();
    parent->SetOrientation(Quaternion::Identity);
    BoundingBox parentBox = parent->GetBox();
--- a/Source/Engine/Platform/Windows/WindowsPlatform.cpp
+++ b/Source/Engine/Platform/Windows/WindowsPlatform.cpp
@@ -543,7 +543,6 @@ void WindowsPlatform::ReleaseMutex()
    }
 }

-PRAGMA_DISABLE_OPTIMIZATION;
 void CheckInstructionSet()
 {
 #if PLATFORM_ARCH_X86 || PLATFORM_ARCH_X64
--- a/Source/Engine/Renderer/ColorGradingPass.cpp
+++ b/Source/Engine/Renderer/ColorGradingPass.cpp
@@ -37,8 +37,45 @@ GPU_CB_STRUCT(Data {

    Float3 Dummy;
    float LutWeight;
+
+    void Init(const PostProcessSettings& settings, GPUTexture*& lut)
+    {
+        Dummy = Float2::Zero;
+        auto& toneMapping = settings.ToneMapping;
+        auto& colorGrading = settings.ColorGrading;
+        // White Balance
+        WhiteTemp = toneMapping.WhiteTemperature;
+        WhiteTint = toneMapping.WhiteTint;
+        // Shadows
+        ColorSaturationShadows = colorGrading.ColorSaturationShadows * colorGrading.ColorSaturation;
+        ColorContrastShadows = colorGrading.ColorContrastShadows * colorGrading.ColorContrast;
+        ColorGammaShadows = colorGrading.ColorGammaShadows * colorGrading.ColorGamma;
+        ColorGainShadows = colorGrading.ColorGainShadows * colorGrading.ColorGain;
+        ColorOffsetShadows = colorGrading.ColorOffsetShadows + colorGrading.ColorOffset;
+        ColorCorrectionShadowsMax = colorGrading.ShadowsMax;
+        // Midtones
+        ColorSaturationMidtones = colorGrading.ColorSaturationMidtones * colorGrading.ColorSaturation;
+        ColorContrastMidtones = colorGrading.ColorContrastMidtones * colorGrading.ColorContrast;
+        ColorGammaMidtones = colorGrading.ColorGammaMidtones * colorGrading.ColorGamma;
+        ColorGainMidtones = colorGrading.ColorGainMidtones * colorGrading.ColorGain;
+        ColorOffsetMidtones = colorGrading.ColorOffsetMidtones + colorGrading.ColorOffset;
+        // Highlights
+        ColorSaturationHighlights = colorGrading.ColorSaturationHighlights * colorGrading.ColorSaturation;
+        ColorContrastHighlights = colorGrading.ColorContrastHighlights * colorGrading.ColorContrast;
+        ColorGammaHighlights = colorGrading.ColorGammaHighlights * colorGrading.ColorGamma;
+        ColorGainHighlights = colorGrading.ColorGainHighlights * colorGrading.ColorGain;
+        ColorOffsetHighlights = colorGrading.ColorOffsetHighlights + colorGrading.ColorOffset;
+        ColorCorrectionHighlightsMin = colorGrading.HighlightsMin;
+        // LUT texture (used only when it is loaded, has resident mips and its weight is above ZeroTolerance; otherwise weight is 0 and lut is null)
+        Texture* lutTexture = colorGrading.LutTexture.Get();
+        const bool useLut = lutTexture && lutTexture->IsLoaded() && lutTexture->GetResidentMipLevels() > 0 && colorGrading.LutWeight > ZeroTolerance;
+        LutWeight = useLut ? colorGrading.LutWeight : 0.0f;
+        lut = useLut ? lutTexture->GetTexture() : nullptr;
+    }
    });

+Data DefaultData;
+
 // Custom render buffer for caching Color Grading LUT.
 class ColorGradingCustomBuffer : public RenderBuffers::CustomBuffer
 {
@@ -46,7 +83,7 @@ public:
    GPUTexture* LUT = nullptr;
    Data CachedData;
    ToneMappingMode Mode = ToneMappingMode::None;
-    Texture* LutTexture = nullptr;
+    GPUTexture* LutTexture = nullptr;
 #if COMPILE_WITH_DEV_ENV
    uint64 FrameRendered = 0;
 #endif
@@ -82,6 +119,9 @@ bool ColorGradingPass::Init()
 #if COMPILE_WITH_DEV_ENV
    _shader.Get()->OnReloading.Bind<ColorGradingPass, &ColorGradingPass::OnShaderReloading>(this);
 #endif
+    PostProcessSettings defaultSettings;
+    GPUTexture* defaultLut;
+    DefaultData.Init(defaultSettings, defaultLut);
    return false;
 }

@@ -125,6 +165,18 @@ GPUTexture* ColorGradingPass::RenderLUT(RenderContext& renderContext)
 {
    PROFILE_CPU();

+    // Prepare the parameters
+    Data data;
+    GPUTexture* lutTexture;
+    auto& toneMapping = renderContext.List->Settings.ToneMapping;
+    data.Init(renderContext.List->Settings, lutTexture);
+
+    // Skip if color grading is unused
+    if (Platform::MemoryCompare(&DefaultData, &data, sizeof(Data)) == 0 && 
+        lutTexture == nullptr && 
+        toneMapping.Mode == ToneMappingMode::None)
+        return nullptr;
+
    // Check if can use volume texture (3D) for a LUT (faster on modern platforms, requires geometry shader)
    const auto device = GPUDevice::Instance;
    bool use3D = GPU_ALLOW_GEOMETRY_SHADERS && Graphics::PostProcessing::ColorGradingVolumeLUT;
@@ -172,41 +224,8 @@ GPUTexture* ColorGradingPass::RenderLUT(RenderContext& renderContext)
        RENDER_TARGET_POOL_SET_NAME(colorGradingBuffer.LUT, "ColorGrading.LUT");
    }

-    // Prepare the parameters
-    Data data;
-    data.Dummy = Float2::Zero;
-    auto& toneMapping = renderContext.List->Settings.ToneMapping;
-    auto& colorGrading = renderContext.List->Settings.ColorGrading;
-    // White Balance
-    data.WhiteTemp = toneMapping.WhiteTemperature;
-    data.WhiteTint = toneMapping.WhiteTint;
-    // Shadows
-    data.ColorSaturationShadows = colorGrading.ColorSaturationShadows * colorGrading.ColorSaturation;
-    data.ColorContrastShadows = colorGrading.ColorContrastShadows * colorGrading.ColorContrast;
-    data.ColorGammaShadows = colorGrading.ColorGammaShadows * colorGrading.ColorGamma;
-    data.ColorGainShadows = colorGrading.ColorGainShadows * colorGrading.ColorGain;
-    data.ColorOffsetShadows = colorGrading.ColorOffsetShadows + colorGrading.ColorOffset;
-    data.ColorCorrectionShadowsMax = colorGrading.ShadowsMax;
-    // Midtones
-    data.ColorSaturationMidtones = colorGrading.ColorSaturationMidtones * colorGrading.ColorSaturation;
-    data.ColorContrastMidtones = colorGrading.ColorContrastMidtones * colorGrading.ColorContrast;
-    data.ColorGammaMidtones = colorGrading.ColorGammaMidtones * colorGrading.ColorGamma;
-    data.ColorGainMidtones = colorGrading.ColorGainMidtones * colorGrading.ColorGain;
-    data.ColorOffsetMidtones = colorGrading.ColorOffsetMidtones + colorGrading.ColorOffset;
-    // Highlights
-    data.ColorSaturationHighlights = colorGrading.ColorSaturationHighlights * colorGrading.ColorSaturation;
-    data.ColorContrastHighlights = colorGrading.ColorContrastHighlights * colorGrading.ColorContrast;
-    data.ColorGammaHighlights = colorGrading.ColorGammaHighlights * colorGrading.ColorGamma;
-    data.ColorGainHighlights = colorGrading.ColorGainHighlights * colorGrading.ColorGain;
-    data.ColorOffsetHighlights = colorGrading.ColorOffsetHighlights + colorGrading.ColorOffset;
-    data.ColorCorrectionHighlightsMin = colorGrading.HighlightsMin;
-    //
-    Texture* lutTexture = colorGrading.LutTexture.Get();
-    const bool useLut = lutTexture && lutTexture->IsLoaded() && lutTexture->GetResidentMipLevels() > 0 && colorGrading.LutWeight > ZeroTolerance;
-    data.LutWeight = useLut ? colorGrading.LutWeight : 0.0f;
-
    // Check if LUT parameter hasn't been changed since the last time
-    if (Platform::MemoryCompare(&colorGradingBuffer.CachedData , &data, sizeof(Data)) == 0 &&
+    if (Platform::MemoryCompare(&colorGradingBuffer.CachedData, &data, sizeof(Data)) == 0 &&
        colorGradingBuffer.Mode == toneMapping.Mode &&
 #if COMPILE_WITH_DEV_ENV
        colorGradingBuffer.FrameRendered > _reloadedFrame &&
@@ -232,7 +251,7 @@ GPUTexture* ColorGradingPass::RenderLUT(RenderContext& renderContext)
    context->BindCB(0, cb);
    context->SetViewportAndScissors((float)lutDesc.Width, (float)lutDesc.Height);
    context->SetState(_psLut.Get((int32)toneMapping.Mode));
-    context->BindSR(0, useLut ? lutTexture->GetTexture() : nullptr);
+    context->BindSR(0, lutTexture);
 #if GPU_ALLOW_GEOMETRY_SHADERS
    if (use3D)
    {
--- a/Source/Engine/Renderer/GI/DynamicDiffuseGlobalIllumination.cpp
+++ b/Source/Engine/Renderer/GI/DynamicDiffuseGlobalIllumination.cpp
@@ -11,6 +11,7 @@
 #include "Engine/Core/Math/Quaternion.h"
 #include "Engine/Core/Config/GraphicsSettings.h"
 #include "Engine/Engine/Engine.h"
+#include "Engine/Engine/Units.h"
 #include "Engine/Content/Content.h"
 #include "Engine/Debug/DebugDraw.h"
 #include "Engine/Graphics/GPUContext.h"
@@ -41,6 +42,7 @@
 #define DDGI_PROBE_RESOLUTION_DISTANCE 14 // Resolution (in texels) for probe distance data (excluding 1px padding on each side)
 #define DDGI_PROBE_UPDATE_BORDERS_GROUP_SIZE 8
 #define DDGI_PROBE_CLASSIFY_GROUP_SIZE 32
+#define DDGI_PROBE_EMPTY_AREA_DENSITY 8 // Spacing (in probe grid) between fallback probes placed into empty areas to provide valid GI for nearby dynamic objects or transparency
 #define DDGI_DEBUG_STATS 0 // Enables additional GPU-driven stats for probe/rays count
 #define DDGI_DEBUG_INSTABILITY 0 // Enables additional probe irradiance instability debugging

@@ -68,11 +70,14 @@ GPU_CB_STRUCT(Data0 {
    Int4 ProbeScrollClears[4];
    Float3 ViewDir;
    float Padding1;
+    Float3 QuantizationError;
+    int32 FrameIndexMod8;
    });

 GPU_CB_STRUCT(Data1 {
    // TODO: use push constants on Vulkan or root signature data on DX12 to reduce overhead of changing single DWORD
-    Float2 Padding2;
+    float Padding2;
+    int32 StepSize;
    uint32 CascadeIndex;
    uint32 ProbeIndexOffset;
    });
@@ -214,6 +219,7 @@ bool DynamicDiffuseGlobalIlluminationPass::setupResources()
        return true;
    _csClassify = shader->GetCS("CS_Classify");
    _csUpdateProbesInitArgs = shader->GetCS("CS_UpdateProbesInitArgs");
+    _csUpdateInactiveProbes = shader->GetCS("CS_UpdateInactiveProbes");
    _csTraceRays[0] = shader->GetCS("CS_TraceRays", 0);
    _csTraceRays[1] = shader->GetCS("CS_TraceRays", 1);
    _csTraceRays[2] = shader->GetCS("CS_TraceRays", 2);
@@ -245,6 +251,7 @@ void DynamicDiffuseGlobalIlluminationPass::OnShaderReloading(Asset* obj)
    LastFrameShaderReload = Engine::FrameCount;
    _csClassify = nullptr;
    _csUpdateProbesInitArgs = nullptr;
+    _csUpdateInactiveProbes = nullptr;
    _csTraceRays[0] = nullptr;
    _csTraceRays[1] = nullptr;
    _csTraceRays[2] = nullptr;
@@ -322,7 +329,6 @@ bool DynamicDiffuseGlobalIlluminationPass::RenderInner(RenderContext& renderCont
    const float indirectLightingIntensity = settings.Intensity;
    const float probeHistoryWeight = Math::Clamp(settings.TemporalResponse, 0.0f, 0.98f);
    const float distance = settings.Distance;
-    const Color fallbackIrradiance = settings.FallbackIrradiance;

    // Automatically calculate amount of cascades to cover the GI distance at the current probes spacing
    const int32 idealProbesCount = 20; // Ideal amount of probes per-cascade to try to fit in order to cover whole distance
@@ -335,7 +341,7 @@ bool DynamicDiffuseGlobalIlluminationPass::RenderInner(RenderContext& renderCont
    }

    // Calculate the probes count based on the amount of cascades and the distance to cover
-    const float cascadesDistanceScales[] = { 1.0f, 3.0f, 6.0f, 10.0f }; // Scales each cascade further away from the camera origin
+    const float cascadesDistanceScales[] = { 1.0f, 3.0f, 5.0f, 10.0f }; // Scales each cascade further away from the camera origin
    const float distanceExtent = distance / cascadesDistanceScales[cascadesCount - 1];
    const float verticalRangeScale = 0.8f; // Scales the probes volume size at Y axis (horizontal aspect ratio makes the DDGI use less probes vertically to cover whole screen)
    Int3 probesCounts(Float3::Ceil(Float3(distanceExtent, distanceExtent * verticalRangeScale, distanceExtent) / probesSpacing));
@@ -351,6 +357,7 @@ bool DynamicDiffuseGlobalIlluminationPass::RenderInner(RenderContext& renderCont
    // Initialize cascades
    float probesSpacings[4];
    Float3 viewOrigins[4];
+    Float3 blendOrigins[4];
    for (int32 cascadeIndex = 0; cascadeIndex < cascadesCount; cascadeIndex++)
    {
        // Each cascade has higher spacing between probes
@@ -361,14 +368,15 @@ bool DynamicDiffuseGlobalIlluminationPass::RenderInner(RenderContext& renderCont
        // Calculate view origin for cascade by shifting it towards the view direction to account for better view frustum coverage
        Float3 viewOrigin = renderContext.View.Position;
        Float3 viewDirection = renderContext.View.Direction;
-        const Float3 probesDistance = Float3(probesCounts) * cascadeProbesSpacing;
+        const Float3 probesDistance = Float3(probesCounts - 1) * cascadeProbesSpacing;
        const float probesDistanceMax = probesDistance.MaxValue();
        const Float3 viewRayHit = CollisionsHelper::LineHitsBox(viewOrigin, viewOrigin + viewDirection * (probesDistanceMax * 2.0f), viewOrigin - probesDistance, viewOrigin + probesDistance);
        const float viewOriginOffset = viewRayHit.Y * probesDistanceMax * 0.6f;
        viewOrigin += viewDirection * viewOriginOffset;
+        //viewOrigin = Float3::Zero;
+        blendOrigins[cascadeIndex] = viewOrigin;
        const float viewOriginSnapping = cascadeProbesSpacing;
        viewOrigin = Float3::Floor(viewOrigin / viewOriginSnapping) * viewOriginSnapping;
-        //viewOrigin = Float3::Zero;
        viewOrigins[cascadeIndex] = viewOrigin;
    }

@@ -500,6 +508,7 @@ bool DynamicDiffuseGlobalIlluminationPass::RenderInner(RenderContext& renderCont
        {
            auto& cascade = ddgiData.Cascades[cascadeIndex];
            ddgiData.Result.Constants.ProbesOriginAndSpacing[cascadeIndex] = Float4(cascade.ProbesOrigin, cascade.ProbesSpacing);
+            ddgiData.Result.Constants.BlendOrigin[cascadeIndex] = Float4(blendOrigins[cascadeIndex], 0.0f);
            ddgiData.Result.Constants.ProbesScrollOffsets[cascadeIndex] = Int4(cascade.ProbeScrollOffsets, 0);
        }
        ddgiData.Result.Constants.RayMaxDistance = distance;
@@ -508,7 +517,7 @@ bool DynamicDiffuseGlobalIlluminationPass::RenderInner(RenderContext& renderCont
        ddgiData.Result.Constants.ProbeHistoryWeight = probeHistoryWeight;
        ddgiData.Result.Constants.IrradianceGamma = 1.5f;
        ddgiData.Result.Constants.IndirectLightingIntensity = indirectLightingIntensity;
-        ddgiData.Result.Constants.FallbackIrradiance = fallbackIrradiance.ToFloat3() * fallbackIrradiance.A;
+        ddgiData.Result.Constants.FallbackIrradiance = settings.FallbackIrradiance.ToFloat4();
        ddgiData.Result.ProbesData = ddgiData.ProbesData->View();
        ddgiData.Result.ProbesDistance = ddgiData.ProbesDistance->View();
        ddgiData.Result.ProbesIrradiance = ddgiData.ProbesIrradiance->View();
@@ -535,6 +544,8 @@ bool DynamicDiffuseGlobalIlluminationPass::RenderInner(RenderContext& renderCont
        data.TemporalTime = renderContext.List->Setup.UseTemporalAAJitter ? RenderTools::ComputeTemporalTime() : 0.0f;
        data.ViewDir = renderContext.View.Direction;
        data.SkyboxIntensity = renderContext.List->Sky ? renderContext.List->Sky->GetIndirectLightingIntensity() : 1.0f;
+        data.QuantizationError = RenderTools::GetColorQuantizationError(ddgiData.ProbesIrradiance->Format());
+        data.FrameIndexMod8 = (int32)(Engine::FrameCount % 8);
        GBufferPass::SetInputs(renderContext.View, data.GBuffer);
        context->UpdateCB(_cb0, &data);
        context->BindCB(0, _cb0);
@@ -581,6 +592,23 @@ bool DynamicDiffuseGlobalIlluminationPass::RenderInner(RenderContext& renderCont
                context->ResetUA();
            }

+            // For inactive probes, search nearby ones to find the closest valid for quick fallback when sampling irradiance
+            {
+                PROFILE_GPU_CPU_NAMED("Update Inactive Probes");
+                // TODO: this could run within GPUComputePass during Trace Rays or Update Probes to overlap compute works
+                context->BindUA(0, ddgiData.Result.ProbesData);
+                Data1 data;
+                data.CascadeIndex = cascadeIndex;
+                int32 iterations = Math::CeilToInt(Math::Log2((float)Math::Min(probesCounts.MaxValue(), DDGI_PROBE_EMPTY_AREA_DENSITY) + 1.0f));
+                for (int32 i = iterations - 1; i >= 0; i--)
+                {
+                    data.StepSize = Math::FloorToInt(Math::Pow(2, (float)i) + 0.5f); // Jump Flood step size
+                    context->UpdateCB(_cb1, &data);
+                    context->Dispatch(_csUpdateInactiveProbes, threadGroupsX, 1, 1);
+                }
+                context->ResetUA();
+            }
+
            // Update probes in batches so ProbesTrace texture can be smaller
            uint32 arg = 0;
            // TODO: use rays allocator to dispatch raytracing in packets (eg. 8 threads in a group instead of hardcoded limit)
--- a/Source/Engine/Renderer/GI/DynamicDiffuseGlobalIllumination.h
+++ b/Source/Engine/Renderer/GI/DynamicDiffuseGlobalIllumination.h
@@ -15,7 +15,8 @@ public:
    // Constant buffer data for DDGI access on a GPU.
    GPU_CB_STRUCT(ConstantsData {
        Float4 ProbesOriginAndSpacing[4];
-        Int4 ProbesScrollOffsets[4];
+        Float4 BlendOrigin[4]; // w is unused
+        Int4 ProbesScrollOffsets[4]; // w is unused
        uint32 ProbesCounts[3];
        uint32 CascadesCount;
        float IrradianceGamma;
@@ -24,8 +25,7 @@ public:
        float IndirectLightingIntensity;
        Float3 ViewPos;
        uint32 RaysCount;
-        Float3 FallbackIrradiance;
-        float Padding0;
+        Float4 FallbackIrradiance;
        });

    // Binding data for the GPU.
@@ -44,6 +44,7 @@ private:
    GPUConstantBuffer* _cb1 = nullptr;
    GPUShaderProgramCS* _csClassify;
    GPUShaderProgramCS* _csUpdateProbesInitArgs;
+    GPUShaderProgramCS* _csUpdateInactiveProbes;
    GPUShaderProgramCS* _csTraceRays[4];
    GPUShaderProgramCS* _csUpdateProbesIrradiance;
    GPUShaderProgramCS* _csUpdateProbesDistance;
--- a/Source/Engine/Renderer/GI/GlobalSurfaceAtlasPass.cpp
+++ b/Source/Engine/Renderer/GI/GlobalSurfaceAtlasPass.cpp
@@ -428,6 +428,7 @@ public:
            // Write to objects buffer (this must match unpacking logic in HLSL)
            uint32 objectAddress = ObjectsBuffer.Data.Count() / sizeof(Float4);
            ObjectsListBuffer.Write(objectAddress);
+            ObjectsBuffer.Data.EnsureCapacity(ObjectsBuffer.Data.Count() + sizeof(Float4) * (GLOBAL_SURFACE_ATLAS_OBJECT_DATA_STRIDE + 6 * GLOBAL_SURFACE_ATLAS_TILE_DATA_STRIDE));
            auto* objectData = ObjectsBuffer.WriteReserve<Float4>(GLOBAL_SURFACE_ATLAS_OBJECT_DATA_STRIDE);
            objectData[0] = Float4(object.Position, object.Radius);
            objectData[1] = Float4::Zero;
@@ -511,6 +512,7 @@ public:
            {
                // Dirty object to redraw
                object->LastFrameUpdated = 0;
+                return;
            }
            GlobalSurfaceAtlasLight* light = Lights.TryGet(a->GetID());
            if (light)
--- a/Source/Engine/Renderer/PostProcessingPass.cpp
+++ b/Source/Engine/Renderer/PostProcessingPass.cpp
@@ -269,7 +269,7 @@ void PostProcessingPass::Render(RenderContext& renderContext, GPUTexture* input,
    int32 bloomMipCount = CalculateBloomMipCount(w1, h1);

    // Ensure to have valid data and if at least one effect should be applied
-    if (!(useBloom || useToneMapping || useCameraArtifacts) || checkIfSkipPass() || w8 <= 1 || h8 <= 1)
+    if (!(useBloom || useToneMapping || useCameraArtifacts || colorGradingLUT) || checkIfSkipPass() || w8 <= 1 || h8 <= 1)
    {
        // Resources are missing. Do not perform rendering. Just copy raw frame
        context->SetViewportAndScissors((float)output->Width(), (float)output->Height());
--- a/Source/Engine/Renderer/Renderer.cpp
+++ b/Source/Engine/Renderer/Renderer.cpp
@@ -402,6 +402,8 @@ void RenderInner(SceneRenderTask* task, RenderContext& renderContext, RenderCont
        case ViewMode::MaterialComplexity:
        case ViewMode::Wireframe:
        case ViewMode::NoPostFx:
+        case ViewMode::VertexColors:
+        case ViewMode::QuadOverdraw:
            setup.UseTemporalAAJitter = false;
            break;
        }
--- a/Source/Engine/Scripting/Scripting.cs
+++ b/Source/Engine/Scripting/Scripting.cs
@@ -137,8 +137,8 @@ namespace FlaxEngine
            {
                Debug.LogError($"Unhandled Exception: {exception.Message}");
                Debug.LogException(exception);
-                if (e.IsTerminating && !System.Diagnostics.Debugger.IsAttached)
-                    Platform.Fatal($"Unhandled Exception: {exception}");
+                //if (e.IsTerminating && !System.Diagnostics.Debugger.IsAttached)
+                //    Platform.Fatal($"Unhandled Exception: {exception}");
            }
        }

--- a/Source/Engine/UI/GUI/Panels/DropPanel.cs
+++ b/Source/Engine/UI/GUI/Panels/DropPanel.cs
@@ -11,6 +11,11 @@ namespace FlaxEngine.GUI
    [ActorToolbox("GUI")]
    public class DropPanel : ContainerControl
    {
+        /// <summary>
+        /// Size of the drop down icon area (the header text is offset by this amount when the icon is enabled).
+        /// </summary>
+        public const float DropDownIconSize = 14.0f;
+
        /// <summary>
        /// The header height.
        /// </summary>
@@ -368,7 +373,7 @@ namespace FlaxEngine.GUI
            var style = Style.Current;
            var enabled = EnabledInHierarchy;

-            // Paint Background
+            // Draw Background
            var backgroundColor = BackgroundColor;
            if (backgroundColor.A > 0.0f)
            {
@@ -386,7 +391,7 @@ namespace FlaxEngine.GUI
            float textLeft = 0;
            if (EnableDropDownIcon)
            {
-                textLeft += 14;
+                textLeft += DropDownIconSize;
                var dropDownRect = new Rectangle(2, (HeaderHeight - 12) / 2, 12, 12);
                var arrowColor = _mouseOverHeader ? style.Foreground : style.ForegroundGrey;
                if (_isClosed)
@@ -395,7 +400,7 @@ namespace FlaxEngine.GUI
                    ArrowImageOpened?.Draw(dropDownRect, arrowColor);
            }

-            // Text
+            // Header text
            var textRect = new Rectangle(textLeft, 0, Width - textLeft, HeaderHeight);
            _headerTextMargin.ShrinkRectangle(ref textRect);
            var textColor = HeaderTextColor;
@@ -404,7 +409,9 @@ namespace FlaxEngine.GUI
                textColor *= 0.6f;
            }

+            Render2D.PushClip(textRect);
            Render2D.DrawText(HeaderTextFont.GetFont(), HeaderTextMaterial, HeaderText, textRect, textColor, TextAlignment.Near, TextAlignment.Center);
+            Render2D.PopClip();

            if (!_isClosed && EnableContainmentLines)
            {
--- a/Source/Shaders/GI/DDGI.hlsl
+++ b/Source/Shaders/GI/DDGI.hlsl
@@ -20,17 +20,23 @@
 #define DDGI_PROBE_ATTENTION_MAX 0.98f // Maximum probe attention value that still makes it active (but not activated which is 1.0f).
 #define DDGI_PROBE_RESOLUTION_IRRADIANCE 6 // Resolution (in texels) for probe irradiance data (excluding 1px padding on each side)
 #define DDGI_PROBE_RESOLUTION_DISTANCE 14 // Resolution (in texels) for probe distance data (excluding 1px padding on each side)
-#define DDGI_CASCADE_BLEND_SIZE 2.5f // Distance in probes over which cascades blending happens
+#define DDGI_CASCADE_BLEND_SIZE 2.0f // Distance in probes over which cascades blending happens
 #ifndef DDGI_CASCADE_BLEND_SMOOTH
 #define DDGI_CASCADE_BLEND_SMOOTH 0 // Enables smooth cascade blending, otherwise dithering will be used
 #endif
 #define DDGI_SRGB_BLENDING 1 // Enables blending in sRGB color space, otherwise irradiance blending is done in linear space
+#define DDGI_DEFAULT_BIAS 0.2f // Default value for DDGI sampling bias
+#define DDGI_FALLBACK_COORDS_ENCODE(coord) ((float3)(coord + 1) / 128.0f)
+#define DDGI_FALLBACK_COORDS_DECODE(data) (uint3)(data.xyz * 128.0f - 1)
+#define DDGI_FALLBACK_COORDS_VALID(data) (length(data.xyz) > 0)
+//#define DDGI_DEBUG_CASCADE 0 // Forces a specific cascade to be only in use (for debugging)

 // DDGI data for a constant buffer
 struct DDGIData
 {
    float4 ProbesOriginAndSpacing[4];
-    int4 ProbesScrollOffsets[4]; // w unused
+    float4 BlendOrigin[4]; // w is unused
+    int4 ProbesScrollOffsets[4]; // w is unused
    uint3 ProbesCounts;
    uint CascadesCount;
    float IrradianceGamma;
@@ -39,8 +45,7 @@ struct DDGIData
    float IndirectLightingIntensity;
    float3 ViewPos;
    uint RaysCount;
-    float3 FallbackIrradiance;
-    float Padding0;
+    float4 FallbackIrradiance;
 };

 uint GetDDGIProbeIndex(DDGIData data, uint3 probeCoords)
@@ -159,6 +164,8 @@ float2 GetDDGIProbeUV(DDGIData data, uint cascadeIndex, uint probeIndex, float2

 float3 SampleDDGIIrradianceCascade(DDGIData data, Texture2D<snorm float4> probesData, Texture2D<float4> probesDistance, Texture2D<float4> probesIrradiance, float3 worldPosition, float3 worldNormal, uint cascadeIndex, float3 probesOrigin, float3 probesExtent, float probesSpacing, float3 biasedWorldPosition)
 {
+    bool invalidCascade = cascadeIndex >= data.CascadesCount;
+    cascadeIndex = min(cascadeIndex, data.CascadesCount - 1);
    uint3 probeCoordsEnd = data.ProbesCounts - uint3(1, 1, 1);
    uint3 baseProbeCoords = clamp(uint3((worldPosition - probesOrigin + probesExtent) / probesSpacing), uint3(0, 0, 0), probeCoordsEnd);

@@ -168,7 +175,6 @@ float3 SampleDDGIIrradianceCascade(DDGIData data, Texture2D<snorm float4> probes

    // Loop over the closest probes to accumulate their contributions
    float4 irradiance = float4(0, 0, 0, 0);
-    const int3 SearchAxisMasks[3] = { int3(1, 0, 0), int3(0, 1, 0), int3(0, 0, 1) };
    for (uint i = 0; i < 8; i++)
    {
        uint3 probeCoordsOffset = uint3(i, i >> 1, i >> 2) & 1;
@@ -178,33 +184,23 @@ float3 SampleDDGIIrradianceCascade(DDGIData data, Texture2D<snorm float4> probes
        // Load probe position and state
        float4 probeData = LoadDDGIProbeData(data, probesData, cascadeIndex, probeIndex);
        uint probeState = DecodeDDGIProbeState(probeData);
+        uint useVisibility = true;
+        float minWight = 0.000001f;
        if (probeState == DDGI_PROBE_STATE_INACTIVE)
        {
-            // Search nearby probes to find any nearby GI sample
-            for (int searchDistance = 1; searchDistance < 3 && probeState == DDGI_PROBE_STATE_INACTIVE; searchDistance++)
-                for (uint searchAxis = 0; searchAxis < 3; searchAxis++)
-                {
-                    int searchAxisDir = probeCoordsOffset[searchAxis] ? 1 : -1;
-                    int3 searchCoordsOffset = SearchAxisMasks[searchAxis] * searchAxisDir * searchDistance;
-                    uint3 searchCoords = clamp((int3)probeCoords + searchCoordsOffset, int3(0, 0, 0), (int3)probeCoordsEnd);
-                    uint searchIndex = GetDDGIScrollingProbeIndex(data, cascadeIndex, searchCoords);
-                    float4 searchData = LoadDDGIProbeData(data, probesData, cascadeIndex, searchIndex);
-                    uint searchState = DecodeDDGIProbeState(searchData);
-                    if (searchState != DDGI_PROBE_STATE_INACTIVE)
-                    {
-                        // Use nearby probe as a fallback (visibility test might ignore it but with smooth gradient)
-                        probeCoords = searchCoords;
-                        probeIndex = searchIndex;
-                        probeData = searchData;
-                        probeState = searchState;
-                        break;
-                    }
-                }
-            if (probeState == DDGI_PROBE_STATE_INACTIVE)
-                continue;
+            // Use fallback probe that is closest to this one
+            uint3 fallbackCoords = DDGI_FALLBACK_COORDS_DECODE(probeData);
+            float fallbackToProbeDist = length((float3)probeCoords - (float3)fallbackCoords);
+            useVisibility = fallbackToProbeDist <= 1.0f; // Skip visibility test that blocks too far probes due to limiting max distance to 1.5 of probe spacing
+            if (fallbackToProbeDist > 2.0f) minWight = 1.0f;
+            probeCoords = fallbackCoords;
+            probeIndex = GetDDGIScrollingProbeIndex(data, cascadeIndex, fallbackCoords);
+            probeData = LoadDDGIProbeData(data, probesData, cascadeIndex, probeIndex);
+            //if (DecodeDDGIProbeState(probeData) == DDGI_PROBE_STATE_INACTIVE) continue;
        }
-        float3 probeBasePosition = baseProbeWorldPosition + ((probeCoords - baseProbeCoords) * probesSpacing);
-        float3 probePosition = probeBasePosition + probeData.xyz * probesSpacing; // Probe offset is [-1;1] within probes spacing
+
+        // Calculate probe position
+        float3 probePosition = baseProbeWorldPosition + (((float3)probeCoords - (float3)baseProbeCoords) * probesSpacing) + probeData.xyz * probesSpacing;

        // Calculate the distance and direction from the (biased and non-biased) shading point and the probe
        float3 worldPosToProbe = normalize(probePosition - worldPosition);
@@ -213,6 +209,7 @@ float3 SampleDDGIIrradianceCascade(DDGIData data, Texture2D<snorm float4> probes

        // Smooth backface test
        float weight = Square(dot(worldPosToProbe, worldNormal) * 0.5f + 0.5f);
+        weight = max(weight, 0.1f);

        // Sample distance texture
        float2 octahedralCoords = GetOctahedralCoords(-biasedPosToProbe);
@@ -220,24 +217,23 @@ float3 SampleDDGIIrradianceCascade(DDGIData data, Texture2D<snorm float4> probes
        float2 probeDistance = probesDistance.SampleLevel(SamplerLinearClamp, uv, 0).rg * 2.0f;

        // Visibility weight (Chebyshev)
-        if (biasedPosToProbeDist > probeDistance.x)
+        if (biasedPosToProbeDist > probeDistance.x && useVisibility)
        {
            float variance = abs(Square(probeDistance.x) - probeDistance.y);
            float visibilityWeight = variance / (variance + Square(biasedPosToProbeDist - probeDistance.x));
-            weight *= max(visibilityWeight * visibilityWeight * visibilityWeight, 0.05f);
+            weight *= max(visibilityWeight * visibilityWeight * visibilityWeight, 0.0f);
        }

        // Avoid a weight of zero
-        weight = max(weight, 0.000001f);
+        weight = max(weight, minWight);

        // Adjust weight curve to inject a small portion of light
        const float minWeightThreshold = 0.2f;
-        if (weight < minWeightThreshold)
-            weight *= Square(weight) / Square(minWeightThreshold);
+        if (weight < minWeightThreshold) weight *= (weight * weight) * (1.0f / (minWeightThreshold * minWeightThreshold));

        // Calculate trilinear weights based on the distance to each probe to smoothly transition between grid of 8 probes
        float3 trilinear = lerp(1.0f - biasAlpha, biasAlpha, (float3)probeCoordsOffset);
-        weight *= max(trilinear.x * trilinear.y * trilinear.z, 0.001f);
+        weight *= saturate(trilinear.x * trilinear.y * trilinear.z * 2.0f);

        // Sample irradiance texture
        octahedralCoords = GetOctahedralCoords(worldNormal);
@@ -269,7 +265,9 @@ float3 SampleDDGIIrradianceCascade(DDGIData data, Texture2D<snorm float4> probes
    if (irradiance.a > 0.0f)
    {
        // Normalize irradiance
-        irradiance.rgb /= irradiance.a;
+        //irradiance.rgb /= irradiance.a;
+        //irradiance.rgb /= lerp(1, irradiance.a, saturate(irradiance.a * irradiance.a + 0.9f));
+        irradiance.rgb /= invalidCascade ? irradiance.a : lerp(1, irradiance.a, saturate(irradiance.a * irradiance.a + 0.9f));
 #if DDGI_SRGB_BLENDING
        irradiance.rgb *= irradiance.rgb;
 #endif
@@ -281,22 +279,34 @@ float3 SampleDDGIIrradianceCascade(DDGIData data, Texture2D<snorm float4> probes
 float3 GetDDGISurfaceBias(float3 viewDir, float probesSpacing, float3 worldNormal, float bias)
 {
    // Bias the world-space position to reduce artifacts
-    return (worldNormal * 0.2f + viewDir * 0.8f) * (0.75f * probesSpacing * bias);
+    return (worldNormal * 0.2f + viewDir * 0.8f) * (0.6f * probesSpacing * bias);
+}
+
+// Signed distance from point p to a rounded box with half-extents b and corner radius r (negative inside) [Inigo Quilez, https://iquilezles.org/articles/distfunctions/]
+float sdRoundBox(float3 p, float3 b, float r)
+{
+    float3 q = abs(p) - b + r;
+    return length(max(q, 0.0f)) + min(max(q.x, max(q.y, q.z)), 0.0f) - r;
 }

 // Samples DDGI probes volume at the given world-space position and returns the irradiance.
 // bias - scales the bias vector to the initial sample point to reduce self-shading artifacts
 // dither - randomized per-pixel value in range 0-1, used to smooth dithering for cascades blending
-float3 SampleDDGIIrradiance(DDGIData data, Texture2D<snorm float4> probesData, Texture2D<float4> probesDistance, Texture2D<float4> probesIrradiance, float3 worldPosition, float3 worldNormal, float bias = 0.2f, float dither = 0.0f)
+float3 SampleDDGIIrradiance(DDGIData data, Texture2D<snorm float4> probesData, Texture2D<float4> probesDistance, Texture2D<float4> probesIrradiance, float3 worldPosition, float3 worldNormal, float bias = DDGI_DEFAULT_BIAS, float dither = 0.0f)
 {
    // Select the highest cascade that contains the sample location
-    uint cascadeIndex = 0;
    float probesSpacing = 0, cascadeWeight = 0;
    float3 probesOrigin = (float3)0, probesExtent = (float3)0, biasedWorldPosition = (float3)0;
    float3 viewDir = normalize(data.ViewPos - worldPosition);
 #if DDGI_CASCADE_BLEND_SMOOTH
    dither = 0.0f;
 #endif
+#ifdef DDGI_DEBUG_CASCADE
+    uint cascadeIndex = DDGI_DEBUG_CASCADE;
+#else
+    uint cascadeIndex = 0;
+    if (data.CascadesCount == 0)
+        return float3(0, 0, 0);
    for (; cascadeIndex < data.CascadesCount; cascadeIndex++)
    {
        // Get cascade data
@@ -306,26 +316,21 @@ float3 SampleDDGIIrradiance(DDGIData data, Texture2D<snorm float4> probesData, T
        biasedWorldPosition = worldPosition + GetDDGISurfaceBias(viewDir, probesSpacing, worldNormal, bias);

        // Calculate cascade blending weight (use input bias to smooth transition)
-        float cascadeBlendSmooth = frac(max(distance(data.ViewPos, worldPosition) - probesExtent.x, 0) / probesSpacing) * 0.1f;
-        float3 cascadeBlendPoint = worldPosition - probesOrigin - cascadeBlendSmooth * probesSpacing;
        float fadeDistance = probesSpacing * DDGI_CASCADE_BLEND_SIZE;
-#if DDGI_CASCADE_BLEND_SMOOTH
-        fadeDistance *= 2.0f; // Make it even smoother when using linear blending
-#endif
-        cascadeWeight = saturate(Min3(probesExtent - abs(cascadeBlendPoint)) / fadeDistance);
+        float3 blendPos = worldPosition - data.BlendOrigin[cascadeIndex].xyz;
+        cascadeWeight = sdRoundBox(blendPos, probesExtent - probesSpacing, probesSpacing * 2) + fadeDistance;
+        cascadeWeight = 1 - saturate(cascadeWeight / fadeDistance);
        if (cascadeWeight > dither)
            break;
    }
-    if (cascadeIndex == data.CascadesCount)
-        return data.FallbackIrradiance;
+#endif

    // Sample cascade
    float3 result = SampleDDGIIrradianceCascade(data, probesData, probesDistance, probesIrradiance, worldPosition, worldNormal, cascadeIndex, probesOrigin, probesExtent, probesSpacing, biasedWorldPosition);

    // Blend with the next cascade (or fallback irradiance outside the volume)
+#if DDGI_CASCADE_BLEND_SMOOTH && !defined(DDGI_DEBUG_CASCADE)
    cascadeIndex++;
-#if DDGI_CASCADE_BLEND_SMOOTH
-    result *= cascadeWeight;
    if (cascadeIndex < data.CascadesCount && cascadeWeight < 0.99f)
    {
        probesSpacing = data.ProbesOriginAndSpacing[cascadeIndex].w;
@@ -333,18 +338,16 @@ float3 SampleDDGIIrradiance(DDGIData data, Texture2D<snorm float4> probesData, T
        probesExtent = (data.ProbesCounts - 1) * (probesSpacing * 0.5f);
        biasedWorldPosition = worldPosition + GetDDGISurfaceBias(viewDir, probesSpacing, worldNormal, bias);
        float3 resultNext = SampleDDGIIrradianceCascade(data, probesData, probesDistance, probesIrradiance, worldPosition, worldNormal, cascadeIndex, probesOrigin, probesExtent, probesSpacing, biasedWorldPosition);
+        result *= cascadeWeight;
        result += resultNext * (1 - cascadeWeight);
    }
-    else
-    {
-        result += data.FallbackIrradiance * (1 - cascadeWeight);
-    }
-#else
-    if (cascadeIndex == data.CascadesCount)
-    {
-        result += data.FallbackIrradiance * (1 - cascadeWeight);
-    }
 #endif
+    if (cascadeIndex >= data.CascadesCount)
+    {
+        // Blend between the last cascade and the fallback irradiance
+        float fallbackWeight = (1 - cascadeWeight) * data.FallbackIrradiance.a;
+        result = lerp(result, data.FallbackIrradiance.rgb, fallbackWeight);
+    }

    return result;
 }
--- a/Source/Shaders/GI/DDGI.shader
+++ b/Source/Shaders/GI/DDGI.shader
@@ -13,6 +13,7 @@
 #include "./Flax/Math.hlsl"
 #include "./Flax/Noise.hlsl"
 #include "./Flax/Quaternion.hlsl"
+#include "./Flax/MonteCarlo.hlsl"
 #include "./Flax/GlobalSignDistanceField.hlsl"
 #include "./Flax/GI/GlobalSurfaceAtlas.hlsl"
 #include "./Flax/GI/DDGI.hlsl"
@@ -26,6 +27,7 @@
 #define DDGI_PROBE_CLASSIFY_GROUP_SIZE 32
 #define DDGI_PROBE_RELOCATE_ITERATIVE 1 // If true, probes relocation algorithm tries to move them in additive way, otherwise all nearby locations are checked to find the best position
 #define DDGI_PROBE_RELOCATE_FIND_BEST 1 // If true, probes relocation algorithm tries to move to the best matching location within nearby area
+#define DDGI_PROBE_EMPTY_AREA_DENSITY 8 // Spacing (in probe grid) between fallback probes placed into empty areas to provide valid GI for nearby dynamic objects or transparency
 #define DDGI_DEBUG_STATS 0 // Enables additional GPU-driven stats for probe/rays count
 #define DDGI_DEBUG_INSTABILITY 0 // Enables additional probe irradiance instability debugging

@@ -42,10 +44,13 @@ float TemporalTime;
 int4 ProbeScrollClears[4];
 float3 ViewDir;
 float Padding1;
+float3 QuantizationError;
+uint FrameIndexMod8;
 META_CB_END

 META_CB_BEGIN(1, Data1)
-float2 Padding2;
+float Padding2;
+int StepSize;
 uint CascadeIndex;
 uint ProbeIndexOffset;
 META_CB_END
@@ -98,6 +103,11 @@ float3 Remap(float3 value, float3 fromMin, float3 fromMax, float3 toMin, float3
    return (value - fromMin) / (fromMax - fromMin) * (toMax - toMin) + toMin;
 }

+// Checks whether the probe lies on the outer layer of the probes grid (any coordinate equal to 0 or to the last index along that axis).
+bool IsProbeAtBorder(uint3 probeCoords)
+{
+    uint3 lastCoords = DDGI.ProbesCounts - 1;
+    bool touchesMin = any(probeCoords == 0);
+    bool touchesMax = any(probeCoords == lastCoords);
+    return touchesMin || touchesMax;
+}
+
 // Compute shader for updating probes state between active and inactive and performing probes relocation.
 META_CS(true, FEATURE_LEVEL_SM5)
 [numthreads(DDGI_PROBE_CLASSIFY_GROUP_SIZE, 1, 1)]
@@ -112,6 +122,14 @@ void CS_Classify(uint3 DispatchThreadId : SV_DispatchThreadID)
    float probesSpacing = DDGI.ProbesOriginAndSpacing[CascadeIndex].w;
    float3 probeBasePosition = GetDDGIProbeWorldPosition(DDGI, CascadeIndex, probeCoords);

+#ifdef DDGI_DEBUG_CASCADE
+    // Single cascade-only debugging
+    if (CascadeIndex != DDGI_DEBUG_CASCADE)
+    {
+        RWProbesData[probeDataCoords] = EncodeDDGIProbeData(float3(0, 0, 0), DDGI_PROBE_STATE_INACTIVE, 0.0f);
+        return;
+    }
+#else
     // Disable probes that are in the range of the higher-quality cascade
    if (CascadeIndex > 0)
    {
@@ -119,15 +137,15 @@ void CS_Classify(uint3 DispatchThreadId : SV_DispatchThreadID)
        float prevProbesSpacing = DDGI.ProbesOriginAndSpacing[prevCascade].w;
        float3 prevProbesOrigin = DDGI.ProbesScrollOffsets[prevCascade].xyz * prevProbesSpacing + DDGI.ProbesOriginAndSpacing[prevCascade].xyz;
        float3 prevProbesExtent = (DDGI.ProbesCounts - 1) * (prevProbesSpacing * 0.5f);
-        prevProbesExtent -= probesSpacing * ceil(DDGI_CASCADE_BLEND_SIZE); // Apply safe margin to allow probes on cascade edges
+        prevProbesExtent -= probesSpacing * ceil(DDGI_CASCADE_BLEND_SIZE) * 2; // Apply safe margin to allow probes on cascade edges
        float prevCascadeWeight = Min3(prevProbesExtent - abs(probeBasePosition - prevProbesOrigin));
        if (prevCascadeWeight > 0.1f)
        {
-            // Disable probe
            RWProbesData[probeDataCoords] = EncodeDDGIProbeData(float3(0, 0, 0), DDGI_PROBE_STATE_INACTIVE, 0.0f);
            return;
        }
    }
+#endif

    // Check if probe was scrolled
    int3 probeScrollClears = ProbeScrollClears[CascadeIndex].xyz;
@@ -171,9 +189,29 @@ void CS_Classify(uint3 DispatchThreadId : SV_DispatchThreadID)
    float voxelLimit = GlobalSDF.CascadeVoxelSize[CascadeIndex] * 0.8f;
    float distanceLimit = probesSpacing * ProbesDistanceLimits[CascadeIndex];
    float relocateLimit = probesSpacing * ProbesRelocateLimits[CascadeIndex];
-    if (sdfDst > distanceLimit + length(probeOffset)) // Probe is too far from geometry (or deep inside)
+#ifdef DDGI_PROBE_EMPTY_AREA_DENSITY
+    uint3 probeCoordsStable = GetDDGIProbeCoords(DDGI, probeIndex);
+    if (sdf > probesSpacing * DDGI.ProbesCounts.x * 0.3f
+#if DDGI_PROBE_EMPTY_AREA_DENSITY > 1
+        && (
+            // Low-density grid
+            (probeCoordsStable.x % DDGI_PROBE_EMPTY_AREA_DENSITY == 0 && probeCoordsStable.y % DDGI_PROBE_EMPTY_AREA_DENSITY == 0 && probeCoordsStable.z % DDGI_PROBE_EMPTY_AREA_DENSITY == 0)
+            // Edge probes at the last cascade (for good fallback irradiance outside the GI distance)
+            //|| (CascadeIndex + 1 == DDGI.CascadesCount && IsProbeAtBorder(probeCoords))
+        )
+#endif
+    )
    {
-        // Disable it
+        // Add some fallback probes in empty areas to provide valid GI for nearby dynamic objects or transparency
+        probeOffset = float3(0, 0, 0);
+        probeState = wasScrolled || probeStateOld == DDGI_PROBE_STATE_INACTIVE ? DDGI_PROBE_STATE_ACTIVATED : DDGI_PROBE_STATE_ACTIVE;
+        probeAttention = DDGI_PROBE_ATTENTION_MIN;
+    }
+    else 
+#endif
+    if (sdfDst > distanceLimit + length(probeOffset))
+    {
+        // Probe is too far from geometry (or deep inside) so disable it
        probeOffset = float3(0, 0, 0);
        probeState = DDGI_PROBE_STATE_INACTIVE;
        probeAttention = 0.0f;
@@ -194,6 +232,7 @@ void CS_Classify(uint3 DispatchThreadId : SV_DispatchThreadID)
        probeAttention = clamp(probeAttention, DDGI_PROBE_ATTENTION_MIN, DDGI_PROBE_ATTENTION_MAX);

        // Relocate only if probe location is not good enough
+        BRANCH
        if (sdf <= voxelLimit)
        {
 #if DDGI_PROBE_RELOCATE_ITERATIVE
@@ -265,6 +304,7 @@ void CS_Classify(uint3 DispatchThreadId : SV_DispatchThreadID)
        bool wasActivated = probeStateOld == DDGI_PROBE_STATE_INACTIVE;
        bool wasRelocated = distance(probeOffset, probeOffsetOld) > 2.0f;
 #if DDGI_PROBE_RELOCATE_FIND_BEST || DDGI_PROBE_RELOCATE_ITERATIVE
+        BRANCH
        if (wasRelocated && !wasActivated)
        {
            // If probe was relocated but the previous location is visible from the new one, then don't re-activate it for smoother blend
@@ -323,6 +363,78 @@ void CS_UpdateProbesInitArgs()

 #endif

+#ifdef _CS_UpdateInactiveProbes
+
+RWTexture2D<snorm float4> RWProbesData : register(u0);
+
+// Tests a single neighbor of an inactive probe as a fallback candidate and keeps it if it is closer than the best one found so far.
+// fallbackCoords - [in/out] grid coordinates of the best fallback probe found so far
+// probeState - [in/out] set to DDGI_PROBE_STATE_ACTIVE when a candidate gets accepted (the caller uses it as a 'fallback found' flag)
+// minDistance - [in/out] distance (in probe grid units) to the best fallback probe found so far
+// probeCoords - grid coordinates of the probe that searches for a fallback
+// probeCoordsEnd - the last valid grid coordinates (inclusive), used to clamp the neighbor location
+// offset - offset (in probe grid units) from the probe to the neighbor to check
+void CheckNearbyProbe(inout uint3 fallbackCoords, inout uint probeState, inout float minDistance, uint3 probeCoords, int3 probeCoordsEnd, int3 offset)
+{
+    // Clamp the neighbor location to the grid bounds (at the borders it can end up being the probe itself)
+    uint3 nearbyCoords = (uint3)clamp(((int3)probeCoords + offset), int3(0, 0, 0), probeCoordsEnd);
+    uint nearbyIndex = GetDDGIScrollingProbeIndex(DDGI, CascadeIndex, nearbyCoords);
+    float4 nearbyData = RWProbesData[GetDDGIProbeTexelCoords(DDGI, CascadeIndex, nearbyIndex)];
+    float nearbyDist = distance((float3)nearbyCoords, (float3)probeCoords);
+    if (DecodeDDGIProbeState(nearbyData) != DDGI_PROBE_STATE_INACTIVE && nearbyDist < minDistance)
+    {
+        // Use nearby probe
+        fallbackCoords = nearbyCoords;
+        probeState = DDGI_PROBE_STATE_ACTIVE;
+        minDistance = nearbyDist;
+        return;
+    }
+    // Neighbor was not taken directly, so try the fallback location stored inside its data instead
+    nearbyCoords = DDGI_FALLBACK_COORDS_DECODE(nearbyData);
+    nearbyDist = distance((float3)nearbyCoords, (float3)probeCoords);
+    if (DDGI_FALLBACK_COORDS_VALID(nearbyData) && nearbyDist < minDistance)
+    {
+        // Use fallback probe
+        fallbackCoords = nearbyCoords;
+        probeState = DDGI_PROBE_STATE_ACTIVE;
+        minDistance = nearbyDist;
+    }
+}
+
+// Compute shader to store closest valid probe coords inside inactive probes data for quick fallback lookup when sampling irradiance.
+// Uses Jump Flood algorithm.
+// Each invocation checks the 3x3x3 neighborhood of a probe at StepSize distance (presumably the caller dispatches it several times with a shrinking StepSize - TODO confirm).
+META_CS(true, FEATURE_LEVEL_SM5)
+[numthreads(DDGI_PROBE_CLASSIFY_GROUP_SIZE, 1, 1)]
+void CS_UpdateInactiveProbes(uint3 DispatchThreadId : SV_DispatchThreadID)
+{
+    // Clamp the index (instead of early-out) so threads beyond the probes count still reach the barrier below
+    uint probeIndex = min(DispatchThreadId.x, ProbesCount - 1);
+    // Sentinel value that means 'no fallback found' (checked before the write at the end)
+    uint3 fallbackCoords = uint3(1000, 1000, 1000);
+
+    // Load probe data for the current thread
+    uint3 probeCoords = GetDDGIProbeCoords(DDGI, probeIndex);
+    probeIndex = GetDDGIScrollingProbeIndex(DDGI, CascadeIndex, probeCoords);
+    int2 probeDataCoords = GetDDGIProbeTexelCoords(DDGI, CascadeIndex, probeIndex);
+    float4 probeData = RWProbesData[probeDataCoords];
+    uint probeState = DecodeDDGIProbeState(probeData);
+    BRANCH
+    if (probeState == DDGI_PROBE_STATE_INACTIVE)
+    {
+        // Find the closest active probe (Jump Flood)
+        int3 probeCoordsEnd = (int3)DDGI.ProbesCounts - int3(1, 1, 1);
+        float minDistance = 1e27f;
+        UNROLL for (int z = -1; z <= 1; z++)
+        UNROLL for (int y = -1; y <= 1; y++)
+        UNROLL for (int x = -1; x <= 1; x++)
+        {
+            int3 offset = int3(x, y, z) * StepSize;
+            CheckNearbyProbe(fallbackCoords, probeState, minDistance, probeCoords, probeCoordsEnd, offset);
+        }
+    }
+
+    // Ensure all threads got proper data before writing back to the same memory
+    // NOTE(review): AllMemoryBarrierWithGroupSync synchronizes threads of a single thread group only (not the whole dispatch) - confirm that reads done by other groups racing with the writes below are acceptable
+    AllMemoryBarrierWithGroupSync();
+
+    // Write modified probe data back (remain inactive)
+    // probeState differs from inactive here both for probes that were active on load and for inactive ones that found a fallback; the sentinel check keeps only the latter
+    BRANCH
+    if (probeState != DDGI_PROBE_STATE_INACTIVE && DispatchThreadId.x < ProbesCount && fallbackCoords.x != 1000)
+    {
+        RWProbesData[probeDataCoords] = EncodeDDGIProbeData(DDGI_FALLBACK_COORDS_ENCODE(fallbackCoords), DDGI_PROBE_STATE_INACTIVE, 0.0f);
+    }
+}
+
+#endif
+
 #ifdef _CS_TraceRays

 RWTexture2D<float4> RWProbesTrace : register(u0);
@@ -392,6 +504,8 @@ void CS_TraceRays(uint3 DispatchThreadId : SV_DispatchThreadID)

            // Add some bias to prevent self occlusion artifacts in Chebyshev due to Global SDF being very incorrect in small scale
            radiance.w = max(radiance.w + GlobalSDF.CascadeVoxelSize[hit.HitCascade] * 0.5f, 0);
+            float probesSpacing = DDGI.ProbesOriginAndSpacing[CascadeIndex].w;
+            radiance.w += probesSpacing * 0.05f;
        }
    }
    else
@@ -639,7 +753,7 @@ void CS_UpdateProbes(uint3 GroupThreadId : SV_GroupThreadID, uint3 GroupId : SV_

        // Add distance (R), distance^2 (G) and weight (A)
        float rayDistance = CachedProbesTraceDistance[rayIndex];
-        result += float4(rayDistance * rayWeight, (rayDistance * rayDistance) * rayWeight, 0.0f, rayWeight);
+        result += float4(rayDistance, rayDistance * rayDistance, 0.0f, 1.0f) * rayWeight;
 #endif
    }

@@ -700,13 +814,17 @@ void CS_UpdateProbes(uint3 GroupThreadId : SV_GroupThreadID, uint3 GroupId : SV_
        //result.rgb = previous + (irradianceDelta * 0.25f);
    }
    result = float4(lerp(result.rgb, previous.rgb, historyWeight), 1.0f);
+
+    // Apply quantization error to reduce yellowish artifacts due to R11G11B10 format
+    float noise = InterleavedGradientNoise(octahedralCoords, FrameIndexMod8);
+    result.rgb = QuantizeColor(result.rgb, noise, QuantizationError);
 #else
    result = float4(lerp(result.rg, previous.rg, historyWeight), 0.0f, 1.0f);
 #endif

    RWOutput[outputCoords] = result;
-
    GroupMemoryBarrierWithGroupSync();
+
    uint2 baseCoords = GetDDGIProbeTexelCoords(DDGI, CascadeIndex, probeIndex) * (DDGI_PROBE_RESOLUTION + 2);

 #if DDGI_PROBE_UPDATE_MODE == 0
@@ -786,10 +904,10 @@ void PS_IndirectLighting(Quad_VS2PS input, out float4 output : SV_Target0)
    }

    // Sample irradiance
-    float bias = 0.2f;
    float dither = RandN2(input.TexCoord + TemporalTime).x;
-    float3 irradiance = SampleDDGIIrradiance(DDGI, ProbesData, ProbesDistance, ProbesIrradiance, gBuffer.WorldPos, gBuffer.Normal, bias, dither);
-    
+    float3 samplePos = gBuffer.WorldPos + gBuffer.Normal * (dither * 0.1f + 0.1f);
+    float3 irradiance = SampleDDGIIrradiance(DDGI, ProbesData, ProbesDistance, ProbesIrradiance, samplePos, gBuffer.Normal, DDGI_DEFAULT_BIAS, dither);
+
    // Calculate lighting
    float3 diffuseColor = GetDiffuseColor(gBuffer);
    float3 diffuse = Diffuse_Lambert(diffuseColor);
--- a/Source/Shaders/GI/GlobalSurfaceAtlas.shader
+++ b/Source/Shaders/GI/GlobalSurfaceAtlas.shader
@@ -328,7 +328,6 @@ float4 PS_Debug(Quad_VS2PS input) : SV_Target
 	float3 viewRay = lerp(lerp(ViewFrustumWorldRays[3], ViewFrustumWorldRays[0], input.TexCoord.x), lerp(ViewFrustumWorldRays[2], ViewFrustumWorldRays[1], input.TexCoord.x), 1 - input.TexCoord.y).xyz;
 	viewRay = normalize(viewRay - ViewWorldPos);
 	trace.Init(ViewWorldPos, viewRay, ViewNearPlane, ViewFarPlane);
-	trace.NeedsHitNormal = true;
 	GlobalSDFHit hit = RayTraceGlobalSDF(GlobalSDF, GlobalSDFTex, GlobalSDFMip, trace);

    float3 color;
@@ -337,7 +336,6 @@ float4 PS_Debug(Quad_VS2PS input) : SV_Target
        // Sample Global Surface Atlas at the hit location
        float surfaceThreshold = GetGlobalSurfaceAtlasThreshold(GlobalSDF, hit);
        color = SampleGlobalSurfaceAtlas(GlobalSurfaceAtlas, GlobalSurfaceAtlasChunks, GlobalSurfaceAtlasCulledObjects, GlobalSurfaceAtlasObjects, GlobalSurfaceAtlasDepth, GlobalSurfaceAtlasTex, hit.GetHitPosition(trace), -viewRay, surfaceThreshold).rgb;
-	    //color = hit.HitNormal * 0.5f + 0.5f;
    }
    else
    {
--- a/Source/Shaders/GlobalSignDistanceField.hlsl
+++ b/Source/Shaders/GlobalSignDistanceField.hlsl
@@ -32,17 +32,13 @@ struct GlobalSDFTrace
    float MinDistance;
    float3 WorldDirection;
    float MaxDistance;
-    float StepScale;
-    bool NeedsHitNormal;

-    void Init(float3 worldPosition, float3 worldDirection, float minDistance, float maxDistance, float stepScale = 1.0f)
+    void Init(float3 worldPosition, float3 worldDirection, float minDistance, float maxDistance)
    {
        WorldPosition = worldPosition;
        WorldDirection = worldDirection;
        MinDistance = minDistance;
        MaxDistance = maxDistance;
-        StepScale = stepScale;
-        NeedsHitNormal = false;
    }
 };

@@ -75,12 +71,23 @@ void GetGlobalSDFCascadeUV(const GlobalSDFData data, uint cascade, float3 worldP
    textureUV = float3(((float)cascade + cascadeUV.x) / (float)data.CascadesCount, cascadeUV.y, cascadeUV.z); // Cascades are placed next to each other on X axis
 }

-// Clamps Global SDF cascade UV to ensure it can be sued for gradient sampling (clamps first and last pixels).
+// Calculates the Global SDF cascade UVs for the given world-space position, including separate UVs for sampling the mip texture.
+// cascadeUV - normalized location within the cascade volume (clamped to 0-1 range)
+// textureUV - location within the texture that contains all cascades
+// textureMipUV - textureUV shifted by half of the mip texel
+void GetGlobalSDFCascadeUV(const GlobalSDFData data, uint cascade, float3 worldPosition, out float3 cascadeUV, out float3 textureUV, out float3 textureMipUV)
+{
+    float4 cascadePosDistance = data.CascadePosDistance[cascade];
+    float3 posInCascade = worldPosition - cascadePosDistance.xyz;
+    float cascadeSize = cascadePosDistance.w * 2;
+    cascadeUV = saturate(posInCascade / cascadeSize + 0.5f);
+    textureUV = float3(((float)cascade + cascadeUV.x) / (float)data.CascadesCount, cascadeUV.y, cascadeUV.z); // Cascades are placed next to each other on X axis
+    float halfTexelOffsetMip = (GLOBAL_SDF_RASTERIZE_MIP_FACTOR * 0.5f) / data.Resolution;
+    textureMipUV = textureUV + float3(halfTexelOffsetMip / (float)data.CascadesCount, halfTexelOffsetMip, halfTexelOffsetMip); // Mipmaps are offset by half texel to sample correctly
+}
+
+// Clamps Global SDF cascade UV to ensure it can be used for gradient sampling (clamps first and last pixels).
 void ClampGlobalSDFTextureGradientUV(const GlobalSDFData data, uint cascade, float texelOffset, inout float3 textureUV)
 {
    float cascadeSizeUV = 1.0f / data.CascadesCount;
-    float cascadeUVStart = cascadeSizeUV * cascade + texelOffset;
-    float cascadeUVEnd = cascadeUVStart + cascadeSizeUV - texelOffset * 3;
+    float cascadeUVStart = cascadeSizeUV * cascade + texelOffset * 2;
+    float cascadeUVEnd = cascadeUVStart + cascadeSizeUV - texelOffset * 4;
    textureUV.x = clamp(textureUV.x, cascadeUVStart, cascadeUVEnd);
 }

@@ -144,13 +151,13 @@ float SampleGlobalSDF(const GlobalSDFData data, Texture3D<snorm float> tex, Text
    startCascade = min(startCascade, data.CascadesCount - 1);
    for (uint cascade = startCascade; cascade < data.CascadesCount; cascade++)
    {
-        float3 cascadeUV, textureUV;
-        GetGlobalSDFCascadeUV(data, cascade, worldPosition, cascadeUV, textureUV);
+        float3 cascadeUV, textureUV, textureMipUV;
+        GetGlobalSDFCascadeUV(data, cascade, worldPosition, cascadeUV, textureUV, textureMipUV);
        float voxelSize = data.CascadeVoxelSize[cascade];
        float chunkSize = voxelSize * GLOBAL_SDF_RASTERIZE_CHUNK_SIZE;
        float chunkMargin = voxelSize * (GLOBAL_SDF_CHUNK_MARGIN_SCALE * GLOBAL_SDF_RASTERIZE_CHUNK_MARGIN);
        float maxDistanceMip = data.CascadeMaxDistanceMip[cascade];
-        float distanceMip = mip.SampleLevel(GLOBAL_SDF_SAMPLER, textureUV, 0);
+        float distanceMip = mip.SampleLevel(GLOBAL_SDF_SAMPLER, textureMipUV, 0);
        if (distanceMip < chunkSize && all(cascadeUV > 0) && all(cascadeUV < 1))
        {
            distance = distanceMip * maxDistanceMip;
@@ -208,13 +215,13 @@ float3 SampleGlobalSDFGradient(const GlobalSDFData data, Texture3D<snorm float>
    startCascade = min(startCascade, data.CascadesCount - 1);
    for (uint cascade = startCascade; cascade < data.CascadesCount; cascade++)
    {
-        float3 cascadeUV, textureUV;
-        GetGlobalSDFCascadeUV(data, cascade, worldPosition, cascadeUV, textureUV);
+        float3 cascadeUV, textureUV, textureMipUV;
+        GetGlobalSDFCascadeUV(data, cascade, worldPosition, cascadeUV, textureUV, textureMipUV);
        float voxelSize = data.CascadeVoxelSize[cascade];
        float chunkSize = voxelSize * GLOBAL_SDF_RASTERIZE_CHUNK_SIZE;
        float chunkMargin = voxelSize * (GLOBAL_SDF_CHUNK_MARGIN_SCALE * GLOBAL_SDF_RASTERIZE_CHUNK_MARGIN);
        float maxDistanceMip = data.CascadeMaxDistanceMip[cascade];
-        float distanceMip = mip.SampleLevel(GLOBAL_SDF_SAMPLER, textureUV, 0) * maxDistanceMip;
+        float distanceMip = mip.SampleLevel(GLOBAL_SDF_SAMPLER, textureMipUV, 0) * maxDistanceMip;
        if (distanceMip < chunkSize && all(cascadeUV > 0) && all(cascadeUV < 1))
        {
            float maxDistanceTex = data.CascadeMaxDistanceTex[cascade];
@@ -236,13 +243,13 @@ float3 SampleGlobalSDFGradient(const GlobalSDFData data, Texture3D<snorm float>
            {
                distance = distanceMip;
                float texelOffset = (float)GLOBAL_SDF_RASTERIZE_MIP_FACTOR / data.Resolution;
-                ClampGlobalSDFTextureGradientUV(data, cascade, texelOffset, textureUV);
-                float xp = mip.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureUV.x + texelOffset, textureUV.y, textureUV.z), 0).x;
-                float xn = mip.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureUV.x - texelOffset, textureUV.y, textureUV.z), 0).x;
-                float yp = mip.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureUV.x, textureUV.y + texelOffset, textureUV.z), 0).x;
-                float yn = mip.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureUV.x, textureUV.y - texelOffset, textureUV.z), 0).x;
-                float zp = mip.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureUV.x, textureUV.y, textureUV.z + texelOffset), 0).x;
-                float zn = mip.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureUV.x, textureUV.y, textureUV.z - texelOffset), 0).x;
+                ClampGlobalSDFTextureGradientUV(data, cascade, texelOffset, textureMipUV);
+                float xp = mip.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureMipUV.x + texelOffset, textureMipUV.y, textureMipUV.z), 0).x;
+                float xn = mip.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureMipUV.x - texelOffset, textureMipUV.y, textureMipUV.z), 0).x;
+                float yp = mip.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureMipUV.x, textureMipUV.y + texelOffset, textureMipUV.z), 0).x;
+                float yn = mip.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureMipUV.x, textureMipUV.y - texelOffset, textureMipUV.z), 0).x;
+                float zp = mip.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureMipUV.x, textureMipUV.y, textureMipUV.z + texelOffset), 0).x;
+                float zn = mip.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureMipUV.x, textureMipUV.y, textureMipUV.z - texelOffset), 0).x;
                gradient = float3(xp - xn, yp - yn, zp - zn) * maxDistanceMip;
            }
            break;
@@ -290,59 +297,32 @@ GlobalSDFHit RayTraceGlobalSDF(const GlobalSDFData data, Texture3D<snorm float>
            float maxDistanceTex = data.CascadeMaxDistanceTex[cascade];
            float maxDistanceMip = data.CascadeMaxDistanceMip[cascade];
            LOOP
-            for (; step < 250 && stepTime < intersections.y && hit.HitTime < 0.0f; step++)
+            for (; step < 100 && stepTime < intersections.y && hit.HitTime < 0.0f; step++)
            {
                float3 stepPosition = trace.WorldPosition + trace.WorldDirection * stepTime;
-                float stepScale = trace.StepScale;

                // Sample SDF
-                float stepDistance, voxelSizeScale = (float)GLOBAL_SDF_RASTERIZE_MIP_FACTOR;
-                float3 cascadeUV, textureUV;
-                GetGlobalSDFCascadeUV(data, cascade, stepPosition, cascadeUV, textureUV);
-                float distanceMip = mip.SampleLevel(GLOBAL_SDF_SAMPLER, textureUV, 0) * maxDistanceMip;
-                if (distanceMip < chunkSize)
-                {
-                    stepDistance = distanceMip;
-                    float distanceTex = tex.SampleLevel(GLOBAL_SDF_SAMPLER, textureUV, 0) * maxDistanceTex;
-                    if (distanceTex < chunkMargin)
-                    {
-                        stepDistance = distanceTex;
-                        voxelSizeScale = 1.0f;
-                        stepScale *= 0.63f; // Perform smaller steps nearby geometry
-                    }
-                }
-                else
-                {
-                    // Assume no SDF nearby so perform a jump to the next chunk
-                    stepDistance = chunkSize;
-                    voxelSizeScale = 1.0f;
-                }
+                float stepDistance;
+                float3 cascadeUV, textureUV, textureMipUV;
+                GetGlobalSDFCascadeUV(data, cascade, stepPosition, cascadeUV, textureUV, textureMipUV);
+                stepDistance = min(mip.SampleLevel(GLOBAL_SDF_SAMPLER, textureMipUV, 0) * maxDistanceMip, chunkSize);
+                float distanceTex = tex.SampleLevel(GLOBAL_SDF_SAMPLER, textureUV, 0) * maxDistanceTex;
+                FLATTEN
+                if (distanceTex < chunkMargin)
+                    stepDistance = distanceTex;

                // Detect surface hit
-                float minSurfaceThickness = voxelSizeScale * voxelExtent * saturate(stepTime / voxelSize);
+                float minSurfaceThickness = voxelExtent * saturate(stepTime / voxelSize);
                if (stepDistance < minSurfaceThickness)
                {
                    // Surface hit
                    hit.HitTime = max(stepTime + stepDistance - minSurfaceThickness, 0.0f);
                    hit.HitCascade = cascade;
                    hit.HitSDF = stepDistance;
-                    if (trace.NeedsHitNormal)
-                    {
-                        // Calculate hit normal from SDF gradient
-                        float texelOffset = 1.0f / data.Resolution;
-                        ClampGlobalSDFTextureGradientUV(data, cascade, texelOffset, textureUV);
-                        float xp = tex.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureUV.x + texelOffset, textureUV.y, textureUV.z), 0).x;
-                        float xn = tex.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureUV.x - texelOffset, textureUV.y, textureUV.z), 0).x;
-                        float yp = tex.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureUV.x, textureUV.y + texelOffset, textureUV.z), 0).x;
-                        float yn = tex.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureUV.x, textureUV.y - texelOffset, textureUV.z), 0).x;
-                        float zp = tex.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureUV.x, textureUV.y, textureUV.z + texelOffset), 0).x;
-                        float zn = tex.SampleLevel(GLOBAL_SDF_SAMPLER, float3(textureUV.x, textureUV.y, textureUV.z - texelOffset), 0).x;
-                        hit.HitNormal = normalize(float3(xp - xn, yp - yn, zp - zn));
-                    }
                }

                // Move forward
-                stepTime += max(stepDistance * stepScale, voxelSize);
+                stepTime += max(stepDistance, voxelSize);
            }
            hit.StepsCount += step;
        }
--- a/Source/Shaders/GlobalSignDistanceField.shader
+++ b/Source/Shaders/GlobalSignDistanceField.shader
@@ -311,26 +311,39 @@ float4 PS_Debug(Quad_VS2PS input) : SV_Target
 	float3 viewRay = lerp(lerp(ViewFrustumWorldRays[3], ViewFrustumWorldRays[0], input.TexCoord.x), lerp(ViewFrustumWorldRays[2], ViewFrustumWorldRays[1], input.TexCoord.x), 1 - input.TexCoord.y).xyz;
 	viewRay = normalize(viewRay - ViewWorldPos);
 	trace.Init(ViewWorldPos, viewRay, ViewNearPlane, ViewFarPlane);
-	trace.NeedsHitNormal = true;
 	GlobalSDFHit hit = RayTraceGlobalSDF(GlobalSDF, GlobalSDFTex, GlobalSDFMip, trace);

 	// Debug draw
-	float3 color = saturate(hit.StepsCount / 80.0f).xxx;
-	if (!hit.IsHit())
-		color.rg *= 0.4f;
-#if 0
-	else
-	{
+	float3 color = saturate(hit.StepsCount / 50.0f).xxx;
+	if (hit.IsHit())
+    {
+#if 1
+        float3 hitPosition = hit.GetHitPosition(trace);
+        float hitSDF;
+        float3 hitNormal = SampleGlobalSDFGradient(GlobalSDF, GlobalSDFTex, GlobalSDFMip, hitPosition, hitSDF, hit.HitCascade);
+#if 1
+        // Composite step count with SDF normals
+		//color.rgb *= saturate(normalize(hitNormal) * 0.5f + 0.7f) + 0.3f;
+		color = lerp(normalize(hitNormal) * 0.5f + 0.5f, 1 - color, saturate(hit.StepsCount / 80.0f));
+#else
 		// Debug draw SDF normals
-		color.rgb = normalize(hit.HitNormal) * 0.5f + 0.5f;
-	}
-#elif 1
+		color = normalize(hitNormal) * 0.5f + 0.5f;
+#endif
+#else
+        // Heatmap with step count
+        if (hit.StepsCount > 40)
+            color = float3(saturate(hit.StepsCount / 80.0f), 0, 0);
+        else if (hit.StepsCount > 20)
+            color = float3(saturate(hit.StepsCount / 40.0f).xx, 0);
+        else
+            color = float3(0, saturate(hit.StepsCount / 20.0f), 0);
+#endif
+    }
    else
    {
-        // Composite with SDF normals
-		color.rgb *= saturate(normalize(hit.HitNormal) * 0.5f + 0.7f) + 0.1f;
+        // Bluish sky
+		color.rg *= 0.4f;
    }
-#endif
 	return float4(color, 1);
 }

--- a/Source/Shaders/Noise.hlsl
+++ b/Source/Shaders/Noise.hlsl
@@ -54,6 +54,26 @@ float2 PerlinNoiseFade(float2 t)
    return t * t * t * (t * (t * 6.0 - 15.0) + 10.0);
 }

+// Generates pseudo-random noise value in range [0-1) from the given 2D location, animated over frames.
+// "Next Generation Post Processing in Call of Duty: Advanced Warfare"
+// http://advances.realtimerendering.com/s2014/index.html
+// uv - input location (presumably in pixels/texels rather than normalized 0-1 coordinates - TODO confirm against callers)
+// frameCount - frame counter used to shift the pattern every frame
+float InterleavedGradientNoise(float2 uv, uint frameCount)
+{
+    // Shift the sampling location by a per-frame offset so the pattern changes between frames
+    const float2 magicFrameScale = float2(47, 17) * 0.695;
+    uv += frameCount * magicFrameScale;
+    const float3 magic = float3(0.06711056, 0.00583715, 52.9829189);
+    return frac(magic.z * frac(dot(uv, magic.xy)));
+}
+
+// Removes error from the color to properly store it in lower precision formats (error = 2^(-mantissaBits))
+// color - input color to dither before storing it
+// noise - per-pixel noise value that scales the applied offset
+// error - per-channel relative quantization step of the target format
+float3 QuantizeColor(float3 color, float noise, float3 error)
+{
+    float3 delta = color * error;
+    // Clear the 23 mantissa bits of each 32-bit float (keeps sign and exponent) to round the step down to a power of two
+    delta.x = asfloat(asuint(delta.x) & ~0x007fffff);
+    delta.y = asfloat(asuint(delta.y) & ~0x007fffff);
+    delta.z = asfloat(asuint(delta.z) & ~0x007fffff);
+    return color + delta * noise;
+}
+
 float rand2dTo1d(float2 value, float2 dotDir = float2(12.9898, 78.233))
 {
    // https://www.ronja-tutorials.com/post/024-white-noise/
--- a/Source/ThirdParty/meshoptimizer/allocator.cpp
+++ b/Source/ThirdParty/meshoptimizer/allocator.cpp
@@ -1,8 +1,17 @@
 // This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
 #include "meshoptimizer.h"

-void meshopt_setAllocator(void*(MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t), void(MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*))
+#ifdef MESHOPTIMIZER_ALLOC_EXPORT
+meshopt_Allocator::Storage& meshopt_Allocator::storage()
 {
-	meshopt_Allocator::Storage::allocate = allocate;
-	meshopt_Allocator::Storage::deallocate = deallocate;
+	static Storage s = {::operator new, ::operator delete };
+	return s;
+}
+#endif
+
+// Overrides the allocation callbacks held by the shared meshopt_Allocator storage.
+// Both callbacks are stored as-is (no null checks are performed here).
+void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*))
+{
+	meshopt_Allocator::Storage& s = meshopt_Allocator::storage();
+	s.allocate = allocate;
+	s.deallocate = deallocate;
 }
--- a/Source/ThirdParty/meshoptimizer/clusterizer.cpp
+++ b/Source/ThirdParty/meshoptimizer/clusterizer.cpp
--- a/Source/ThirdParty/meshoptimizer/vcacheanalyzer.cpp
+++ b/Source/ThirdParty/meshoptimizer/vcacheanalyzer.cpp
@@ -71,3 +71,56 @@ meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const unsigned int* ind

 	return result;
 }
+
+meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size)
+{ // replays the index stream through a simulated direct-mapped cache and reports bytes fetched plus the overfetch ratio
+	assert(index_count % 3 == 0);
+	assert(vertex_size > 0 && vertex_size <= 256);
+
+	meshopt_Allocator allocator;
+
+	meshopt_VertexFetchStatistics result = {};
+
+	unsigned char* vertex_visited = allocator.allocate<unsigned char>(vertex_count); // one flag per vertex, set to 1 once any index references it
+	memset(vertex_visited, 0, vertex_count);
+
+	const size_t kCacheLine = 64; // simulated cache line size, in bytes
+	const size_t kCacheSize = 128 * 1024; // simulated cache capacity, in bytes
+
+	// simple direct mapped cache; on typical mesh data this is close to 4-way cache, and this model is a gross approximation anyway
+	size_t cache[kCacheSize / kCacheLine] = {}; // one slot per cache line (2048 slots); 0 means the slot is empty
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices[i];
+		assert(index < vertex_count);
+
+		vertex_visited[index] = 1;
+
+		size_t start_address = index * vertex_size; // the vertex occupies bytes [start_address, end_address)
+		size_t end_address = start_address + vertex_size;
+
+		size_t start_tag = start_address / kCacheLine;
+		size_t end_tag = (end_address + kCacheLine - 1) / kCacheLine; // exclusive; rounded up so a partially covered line is included
+
+		assert(start_tag < end_tag);
+
+		for (size_t tag = start_tag; tag < end_tag; ++tag)
+		{
+			size_t line = tag % (sizeof(cache) / sizeof(cache[0])); // slot index is the tag modulo the slot count
+
+			// we store +1 since cache is filled with 0 by default
+			result.bytes_fetched += (cache[line] != tag + 1) * kCacheLine; // a tag mismatch is a miss and costs one full cache line
+			cache[line] = tag + 1;
+		}
+	}
+
+	size_t unique_vertex_count = 0;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+		unique_vertex_count += vertex_visited[i];
+
+	result.overfetch = unique_vertex_count == 0 ? 0 : float(result.bytes_fetched) / float(unique_vertex_count * vertex_size); // fetched bytes relative to the bytes of all referenced vertices; 0 when nothing is referenced
+
+	return result;
+}
--- a/Source/ThirdParty/meshoptimizer/indexcodec.cpp
+++ b/Source/ThirdParty/meshoptimizer/indexcodec.cpp
@@ -14,6 +14,7 @@ const unsigned char kIndexHeader = 0xe0;
 const unsigned char kSequenceHeader = 0xd0;

 static int gEncodeIndexVersion = 1;
+const int kDecodeIndexVersion = 1;

 typedef unsigned int VertexFifo[16];
 typedef unsigned int EdgeFifo[16][2];
@@ -209,6 +210,7 @@ size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, cons

 		if (fer >= 0 && (fer >> 2) < 15)
 		{
+			// note: getEdgeFifo implicitly rotates triangles by matching a/b to existing edge
 			const unsigned int* order = kTriangleIndexOrder[fer & 3];

 			unsigned int a = indices[i + order[0]], b = indices[i + order[1]], c = indices[i + order[2]];
@@ -266,6 +268,7 @@ size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, cons
 			int fc = getVertexFifo(vertexfifo, c, vertexfifooffset);

 			// after rotation, a is almost always equal to next, so we don't waste bits on FIFO encoding for a
+			// note: decoder implicitly assumes that if feb=fec=0, then fea=0 (reset code); this is enforced by rotation
 			int fea = (a == next) ? (next++, 0) : 15;
 			int feb = (fb >= 0 && fb < 14) ? fb + 1 : (b == next ? (next++, 0) : 15);
 			int fec = (fc >= 0 && fc < 14) ? fc + 1 : (c == next ? (next++, 0) : 15);
@@ -354,11 +357,28 @@ size_t meshopt_encodeIndexBufferBound(size_t index_count, size_t vertex_count)

 void meshopt_encodeIndexVersion(int version)
 {
-	assert(unsigned(version) <= 1);
+	assert(unsigned(version) <= unsigned(meshopt::kDecodeIndexVersion)); // unsigned cast also rejects negative versions; the bound is the newest version the decoders accept

 	meshopt::gEncodeIndexVersion = version;
 }

+int meshopt_decodeIndexVersion(const unsigned char* buffer, size_t buffer_size)
+{ // returns the format version stored in the header byte, or -1 if the header is missing, unrecognized, or too new
+	if (buffer_size < 1) // need at least the header byte
+		return -1;
+
+	unsigned char header = buffer[0];
+
+	if ((header & 0xf0) != meshopt::kIndexHeader && (header & 0xf0) != meshopt::kSequenceHeader) // high nibble must match the index buffer or index sequence header
+		return -1;
+
+	int version = header & 0x0f; // low nibble carries the format version
+	if (version > meshopt::kDecodeIndexVersion) // newer than this decoder supports
+		return -1;
+
+	return version;
+}
+
 int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size)
 {
 	using namespace meshopt;
@@ -374,7 +394,7 @@ int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t inde
 		return -1;

 	int version = buffer[0] & 0x0f;
-	if (version > 1)
+	if (version > kDecodeIndexVersion)
 		return -1;

 	EdgeFifo edgefifo;
@@ -415,6 +435,7 @@ int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t inde
 			// fifo reads are wrapped around 16 entry buffer
 			unsigned int a = edgefifo[(edgefifooffset - 1 - fe) & 15][0];
 			unsigned int b = edgefifo[(edgefifooffset - 1 - fe) & 15][1];
+			unsigned int c = 0;

 			int fec = codetri & 15;

@@ -424,37 +445,30 @@ int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t inde
 			{
 				// fifo reads are wrapped around 16 entry buffer
 				unsigned int cf = vertexfifo[(vertexfifooffset - 1 - fec) & 15];
-				unsigned int c = (fec == 0) ? next : cf;
+				c = (fec == 0) ? next : cf;

 				int fec0 = fec == 0;
 				next += fec0;

-				// output triangle
-				writeTriangle(destination, i, index_size, a, b, c);
-
-				// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
+				// push vertex fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
 				pushVertexFifo(vertexfifo, c, vertexfifooffset, fec0);
-
-				pushEdgeFifo(edgefifo, c, b, edgefifooffset);
-				pushEdgeFifo(edgefifo, a, c, edgefifooffset);
 			}
 			else
 			{
-				unsigned int c = 0;
-
 				// fec - (fec ^ 3) decodes 13, 14 into -1, 1
 				// note that we need to update the last index since free indices are delta-encoded
 				last = c = (fec != 15) ? last + (fec - (fec ^ 3)) : decodeIndex(data, last);

-				// output triangle
-				writeTriangle(destination, i, index_size, a, b, c);
-
 				// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
 				pushVertexFifo(vertexfifo, c, vertexfifooffset);
-
-				pushEdgeFifo(edgefifo, c, b, edgefifooffset);
-				pushEdgeFifo(edgefifo, a, c, edgefifooffset);
 			}
+
+			// push edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
+			pushEdgeFifo(edgefifo, c, b, edgefifooffset);
+			pushEdgeFifo(edgefifo, a, c, edgefifooffset);
+
+			// output triangle
+			writeTriangle(destination, i, index_size, a, b, c);
 		}
 		else
 		{
@@ -627,7 +641,7 @@ int meshopt_decodeIndexSequence(void* destination, size_t index_count, size_t in
 		return -1;

 	int version = buffer[0] & 0x0f;
-	if (version > 1)
+	if (version > kDecodeIndexVersion)
 		return -1;

 	const unsigned char* data = buffer + 1;
--- a/Source/ThirdParty/meshoptimizer/indexgenerator.cpp
+++ b/Source/ThirdParty/meshoptimizer/indexgenerator.cpp
@@ -5,7 +5,9 @@
 #include <string.h>

 // This work is based on:
+// Matthias Teschner, Bruno Heidelberger, Matthias Mueller, Danat Pomeranets, Markus Gross. Optimized Spatial Hashing for Collision Detection of Deformable Objects. 2003
 // John McDonald, Mark Kilgard. Crack-Free Point-Normal Triangles using Adjacent Edge Normals. 2010
+// John Hable. Variable Rate Shading with Visibility Buffer Rendering. 2024
 namespace meshopt
 {

@@ -85,6 +87,46 @@ struct VertexStreamHasher
 	}
 };

+struct VertexCustomHasher // hash/equality policy keyed on the float3 position, with an optional caller-supplied equality callback
+{
+	const float* vertex_positions;
+	size_t vertex_stride_float; // distance between consecutive vertices, counted in floats (not bytes)
+
+	int (*callback)(void*, unsigned int, unsigned int); // optional; invoked as callback(context, lhs, rhs) after positions match
+	void* context;
+
+	size_t hash(unsigned int index) const
+	{
+		const unsigned int* key = reinterpret_cast<const unsigned int*>(vertex_positions + index * vertex_stride_float); // read the three position floats as raw 32-bit patterns
+
+		unsigned int x = key[0], y = key[1], z = key[2];
+
+		// replace negative zero with zero so that -0.0f and +0.0f (which compare equal in equal()) produce the same hash
+		x = (x == 0x80000000) ? 0 : x;
+		y = (y == 0x80000000) ? 0 : y;
+		z = (z == 0x80000000) ? 0 : z;
+
+		// scramble bits to make sure that integer coordinates have entropy in lower bits
+		x ^= x >> 17;
+		y ^= y >> 17;
+		z ^= z >> 17;
+
+		// Optimized Spatial Hashing for Collision Detection of Deformable Objects
+		return (x * 73856093) ^ (y * 19349663) ^ (z * 83492791);
+	}
+
+	bool equal(unsigned int lhs, unsigned int rhs) const
+	{
+		const float* lp = vertex_positions + lhs * vertex_stride_float;
+		const float* rp = vertex_positions + rhs * vertex_stride_float;
+
+		if (lp[0] != rp[0] || lp[1] != rp[1] || lp[2] != rp[2]) // positions are compared as floats, component by component
+			return false;
+
+		return callback ? callback(context, lhs, rhs) : true; // without a callback, matching positions are sufficient
+	}
+};
+
 struct EdgeHasher
 {
 	const unsigned int* remap;
@@ -182,6 +224,43 @@ static void buildPositionRemap(unsigned int* remap, const float* vertex_position
 	allocator.deallocate(vertex_table);
 }

+template <typename Hash> // Hash supplies the hashing/equality policy that hashLookup uses to compare vertices
+static size_t generateVertexRemap(unsigned int* remap, const unsigned int* indices, size_t index_count, size_t vertex_count, const Hash& hash, meshopt_Allocator& allocator)
+{ // fills remap with compact ids in first-reference order and returns how many distinct ids were handed out
+	memset(remap, -1, vertex_count * sizeof(unsigned int)); // every byte 0xff, so each entry reads as ~0u (unassigned)
+
+	size_t table_size = hashBuckets(vertex_count);
+	unsigned int* table = allocator.allocate<unsigned int>(table_size);
+	memset(table, -1, table_size * sizeof(unsigned int)); // ~0u marks an empty table slot
+
+	unsigned int next_vertex = 0;
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices ? indices[i] : unsigned(i); // without an index buffer, position i refers to vertex i
+		assert(index < vertex_count);
+
+		if (remap[index] != ~0u) // already assigned by an earlier reference
+			continue;
+
+		unsigned int* entry = hashLookup(table, table_size, hash, index, ~0u); // presumably the slot of an equivalent vertex, or an empty slot; hashLookup is defined outside this hunk
+
+		if (*entry == ~0u)
+		{
+			*entry = index; // first vertex of its kind: record it and hand out the next compact id
+			remap[index] = next_vertex++;
+		}
+		else
+		{
+			assert(remap[*entry] != ~0u);
+			remap[index] = remap[*entry]; // equivalent vertex seen before: share its id
+		}
+	}
+
+	assert(next_vertex <= vertex_count);
+	return next_vertex;
+}
+
 template <size_t BlockSize>
 static void remapVertices(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap)
 {
@@ -196,6 +275,35 @@ static void remapVertices(void* destination, const void* vertices, size_t vertex
 		}
 }

+template <typename Hash> // Hash supplies the hashing/equality policy that hashLookup uses to compare vertices
+static void generateShadowBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const Hash& hash, meshopt_Allocator& allocator)
+{ // writes an index buffer in which every index is replaced by the first-seen vertex that the hash policy treats as equal
+	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
+	memset(remap, -1, vertex_count * sizeof(unsigned int)); // ~0u marks a vertex that has not been resolved yet
+
+	size_t table_size = hashBuckets(vertex_count);
+	unsigned int* table = allocator.allocate<unsigned int>(table_size);
+	memset(table, -1, table_size * sizeof(unsigned int)); // ~0u marks an empty table slot
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices[i];
+		assert(index < vertex_count);
+
+		if (remap[index] == ~0u)
+		{
+			unsigned int* entry = hashLookup(table, table_size, hash, index, ~0u); // presumably the slot of an equivalent vertex, or an empty slot; hashLookup is defined outside this hunk
+
+			if (*entry == ~0u)
+				*entry = index; // first vertex of its kind becomes the representative
+
+			remap[index] = *entry; // stores an original vertex index, not a compacted id
+		}
+
+		destination[i] = remap[index];
+	}
+}
+
 } // namespace meshopt

 size_t meshopt_generateVertexRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size)
@@ -207,44 +315,9 @@ size_t meshopt_generateVertexRemap(unsigned int* destination, const unsigned int
 	assert(vertex_size > 0 && vertex_size <= 256);

 	meshopt_Allocator allocator;
-
-	memset(destination, -1, vertex_count * sizeof(unsigned int));
-
 	VertexHasher hasher = {static_cast<const unsigned char*>(vertices), vertex_size, vertex_size};

-	size_t table_size = hashBuckets(vertex_count);
-	unsigned int* table = allocator.allocate<unsigned int>(table_size);
-	memset(table, -1, table_size * sizeof(unsigned int));
-
-	unsigned int next_vertex = 0;
-
-	for (size_t i = 0; i < index_count; ++i)
-	{
-		unsigned int index = indices ? indices[i] : unsigned(i);
-		assert(index < vertex_count);
-
-		if (destination[index] == ~0u)
-		{
-			unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
-
-			if (*entry == ~0u)
-			{
-				*entry = index;
-
-				destination[index] = next_vertex++;
-			}
-			else
-			{
-				assert(destination[*entry] != ~0u);
-
-				destination[index] = destination[*entry];
-			}
-		}
-	}
-
-	assert(next_vertex <= vertex_count);
-
-	return next_vertex;
+	return generateVertexRemap(destination, indices, index_count, vertex_count, hasher, allocator);
 }

 size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count)
@@ -262,44 +335,24 @@ size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigne
 	}

 	meshopt_Allocator allocator;
-
-	memset(destination, -1, vertex_count * sizeof(unsigned int));
-
 	VertexStreamHasher hasher = {streams, stream_count};

-	size_t table_size = hashBuckets(vertex_count);
-	unsigned int* table = allocator.allocate<unsigned int>(table_size);
-	memset(table, -1, table_size * sizeof(unsigned int));
+	return generateVertexRemap(destination, indices, index_count, vertex_count, hasher, allocator);
+}

-	unsigned int next_vertex = 0;
+size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, int (*callback)(void*, unsigned int, unsigned int), void* context)
+{ // builds a remap table treating vertices as equal when positions match and the optional callback agrees; returns the unique vertex count
+	using namespace meshopt;

-	for (size_t i = 0; i < index_count; ++i)
-	{
-		unsigned int index = indices ? indices[i] : unsigned(i);
-		assert(index < vertex_count);
+	assert(indices || index_count == vertex_count); // unindexed input must supply exactly one implicit index per vertex
+	assert(!indices || index_count % 3 == 0);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); // at least three floats (12 bytes) per vertex
+	assert(vertex_positions_stride % sizeof(float) == 0);

-		if (destination[index] == ~0u)
-		{
-			unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
+	meshopt_Allocator allocator;
+	VertexCustomHasher hasher = {vertex_positions, vertex_positions_stride / sizeof(float), callback, context}; // stride is converted from bytes to floats

-			if (*entry == ~0u)
-			{
-				*entry = index;
-
-				destination[index] = next_vertex++;
-			}
-			else
-			{
-				assert(destination[*entry] != ~0u);
-
-				destination[index] = destination[*entry];
-			}
-		}
-	}
-
-	assert(next_vertex <= vertex_count);
-
-	return next_vertex;
+	return generateVertexRemap(destination, indices, index_count, vertex_count, hasher, allocator);
 }

 void meshopt_remapVertexBuffer(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap)
@@ -361,33 +414,9 @@ void meshopt_generateShadowIndexBuffer(unsigned int* destination, const unsigned
 	assert(vertex_size <= vertex_stride);

 	meshopt_Allocator allocator;
-
-	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
-	memset(remap, -1, vertex_count * sizeof(unsigned int));
-
 	VertexHasher hasher = {static_cast<const unsigned char*>(vertices), vertex_size, vertex_stride};

-	size_t table_size = hashBuckets(vertex_count);
-	unsigned int* table = allocator.allocate<unsigned int>(table_size);
-	memset(table, -1, table_size * sizeof(unsigned int));
-
-	for (size_t i = 0; i < index_count; ++i)
-	{
-		unsigned int index = indices[i];
-		assert(index < vertex_count);
-
-		if (remap[index] == ~0u)
-		{
-			unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
-
-			if (*entry == ~0u)
-				*entry = index;
-
-			remap[index] = *entry;
-		}
-
-		destination[i] = remap[index];
-	}
+	generateShadowBuffer(destination, indices, index_count, vertex_count, hasher, allocator);
 }

 void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count)
@@ -405,32 +434,33 @@ void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const uns
 	}

 	meshopt_Allocator allocator;
-
-	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
-	memset(remap, -1, vertex_count * sizeof(unsigned int));
-
 	VertexStreamHasher hasher = {streams, stream_count};

+	generateShadowBuffer(destination, indices, index_count, vertex_count, hasher, allocator);
+}
+
+void meshopt_generatePositionRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{ // maps every vertex to the first vertex (in index order) that has the same position
+	using namespace meshopt;
+
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); // at least three floats (12 bytes) per vertex
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	meshopt_Allocator allocator;
+	VertexCustomHasher hasher = {vertex_positions, vertex_positions_stride / sizeof(float), NULL, NULL}; // NULL callback: equality is decided by position alone
+
 	size_t table_size = hashBuckets(vertex_count);
 	unsigned int* table = allocator.allocate<unsigned int>(table_size);
 	memset(table, -1, table_size * sizeof(unsigned int));

-	for (size_t i = 0; i < index_count; ++i)
+	for (size_t i = 0; i < vertex_count; ++i)
 	{
-		unsigned int index = indices[i];
-		assert(index < vertex_count);
+		unsigned int* entry = hashLookup(table, table_size, hasher, unsigned(i), ~0u); // presumably the slot of a vertex with the same position, or an empty slot; hashLookup is defined outside this hunk

-		if (remap[index] == ~0u)
-		{
-			unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
+		if (*entry == ~0u)
+			*entry = unsigned(i); // first vertex at this position becomes the representative

-			if (*entry == ~0u)
-				*entry = index;
-
-			remap[index] = *entry;
-		}
-
-		destination[i] = remap[index];
+		destination[i] = *entry; // an existing vertex index, not a compacted id
 	}
 }

@@ -576,3 +606,99 @@ void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const un
 		memcpy(destination + i * 4, patch, sizeof(patch));
 	}
 }
+
+size_t meshopt_generateProvokingIndexBuffer(unsigned int* destination, unsigned int* reorder, const unsigned int* indices, size_t index_count, size_t vertex_count)
+{ // rewrites the index buffer so each triangle's first index is unique to it, fills reorder with the original vertex for every new index, and returns the reorder length
+	assert(index_count % 3 == 0);
+
+	meshopt_Allocator allocator;
+
+	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
+	memset(remap, -1, vertex_count * sizeof(unsigned int)); // ~0u marks an original vertex that has no new index yet
+
+	// compute vertex valence; this is used to prioritize least used corner
+	// note: we use 8-bit counters for performance; for outlier vertices the valence is incorrect but that just affects the heuristic
+	unsigned char* valence = allocator.allocate<unsigned char>(vertex_count);
+	memset(valence, 0, vertex_count);
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices[i];
+		assert(index < vertex_count);
+
+		valence[index]++; // wraps past 255 for heavily shared vertices (see note above)
+	}
+
+	unsigned int reorder_offset = 0; // next free slot in reorder, which is also the next new index to hand out
+
+	// assign provoking vertices; leave the rest for the next pass
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
+		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+		// try to rotate triangle such that provoking vertex hasn't been seen before
+		// if multiple vertices are new, prioritize the one with least valence
+		// this reduces the risk that a future triangle will have all three vertices seen
+		unsigned int va = remap[a] == ~0u ? valence[a] : ~0u; // ~0u acts as "already used", so it never wins the comparisons below
+		unsigned int vb = remap[b] == ~0u ? valence[b] : ~0u;
+		unsigned int vc = remap[c] == ~0u ? valence[c] : ~0u;
+
+		if (vb != ~0u && vb <= va && vb <= vc)
+		{
+			// abc -> bca
+			unsigned int t = a;
+			a = b, b = c, c = t;
+		}
+		else if (vc != ~0u && vc <= va && vc <= vb)
+		{
+			// abc -> cab
+			unsigned int t = c;
+			c = b, b = a, a = t;
+		}
+
+		unsigned int newidx = reorder_offset;
+
+		// now remap[a] = ~0u or all three vertices are old
+		// recording remap[a] makes it possible to remap future references to the same index, conserving space
+		if (remap[a] == ~0u)
+			remap[a] = newidx;
+
+		// we need to clone the provoking vertex to get a unique index
+		// if all three are used the choice is arbitrary since no future triangle will be able to reuse any of these
+		reorder[reorder_offset++] = a;
+
+		// note: first vertex is final, the other two will be fixed up in next pass
+		destination[i + 0] = newidx;
+		destination[i + 1] = b;
+		destination[i + 2] = c;
+
+		// update vertex valences for corner heuristic
+		valence[a]--;
+		valence[b]--;
+		valence[c]--;
+	}
+
+	// remap or clone non-provoking vertices (iterating to skip provoking vertices)
+	int step = 1;
+
+	for (size_t i = 1; i < index_count; i += step, step ^= 3) // step alternates 1, 2 so i visits 1, 2, 4, 5, 7, 8, ... (second and third corner of each triangle)
+	{
+		unsigned int index = destination[i]; // still an original vertex index at this point
+
+		if (remap[index] == ~0u)
+		{
+			// we haven't seen the vertex before as a provoking vertex
+			// to maintain the reference to the original vertex we need to clone it
+			unsigned int newidx = reorder_offset;
+
+			remap[index] = newidx;
+			reorder[reorder_offset++] = index;
+		}
+
+		destination[i] = remap[index];
+	}
+
+	assert(reorder_offset <= vertex_count + index_count / 3); // first pass appends one entry per triangle; second pass appends at most one per original vertex
+	return reorder_offset;
+}
--- a/Source/ThirdParty/meshoptimizer/meshoptimizer.h
+++ b/Source/ThirdParty/meshoptimizer/meshoptimizer.h
@@ -1,7 +1,7 @@
 /**
- * meshoptimizer - version 0.21
+ * meshoptimizer - version 1.0
 *
- * Copyright (C) 2016-2024, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+ * Copyright (C) 2016-2025, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
 * Report bugs and download new versions at https://github.com/zeux/meshoptimizer
 *
 * This library is distributed under the MIT License. See notice at the end of this file.
@@ -12,7 +12,7 @@
 #include <stddef.h>

 /* Version macro; major * 1000 + minor * 10 + patch */
-#define MESHOPTIMIZER_VERSION 210 /* 0.21 */
+#define MESHOPTIMIZER_VERSION 1000 /* 1.0 */

 /* If no API is defined, assume default */
 #ifndef MESHOPTIMIZER_API
@@ -29,11 +29,14 @@
 #endif

 /* Experimental APIs have unstable interface and might have implementation that's not fully tested or optimized */
+#ifndef MESHOPTIMIZER_EXPERIMENTAL
 #define MESHOPTIMIZER_EXPERIMENTAL MESHOPTIMIZER_API
+#endif

 /* C interface */
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif

 /**
@@ -71,6 +74,19 @@ MESHOPTIMIZER_API size_t meshopt_generateVertexRemap(unsigned int* destination,
 */
 MESHOPTIMIZER_API size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count);

+/**
+ * Generates a vertex remap table from the vertex buffer and an optional index buffer and returns number of unique vertices
+ * As a result, all vertices that are equivalent map to the same (new) location, with no gaps in the resulting sequence.
+ * Equivalence is checked in two steps: vertex positions are compared for equality, and then the user-specified equality function is called (if provided).
+ * Resulting remap table maps old vertices to new vertices and can be used in meshopt_remapVertexBuffer/meshopt_remapIndexBuffer.
+ *
+ * destination must contain enough space for the resulting remap table (vertex_count elements)
+ * indices can be NULL if the input is unindexed
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * callback can be NULL if no additional equality check is needed; otherwise, it should return 1 if vertices with specified indices are equivalent and 0 if they are not
+ */
+MESHOPTIMIZER_API size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, int (*callback)(void*, unsigned int, unsigned int), void* context);
+
 /**
 * Generates vertex buffer from the source vertex buffer and remap table generated by meshopt_generateVertexRemap
 *
@@ -108,6 +124,16 @@ MESHOPTIMIZER_API void meshopt_generateShadowIndexBuffer(unsigned int* destinati
 */
 MESHOPTIMIZER_API void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count);

+/**
+ * Generates a remap table that maps all vertices with the same position to the same (existing) index.
+ * Similarly to meshopt_generateShadowIndexBuffer, this can be helpful to pre-process meshes for position-only rendering.
+ * This can also be used to implement algorithms that require positional-only connectivity, such as hierarchical simplification.
+ *
+ * destination must contain enough space for the resulting remap table (vertex_count elements)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ */
+MESHOPTIMIZER_API void meshopt_generatePositionRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+
 /**
 * Generate index buffer that can be used as a geometry shader input with triangle adjacency topology
 * Each triangle is converted into a 6-vertex patch with the following layout:
@@ -137,10 +163,23 @@ MESHOPTIMIZER_API void meshopt_generateAdjacencyIndexBuffer(unsigned int* destin
 */
 MESHOPTIMIZER_API void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);

+/**
+ * Generates an index buffer that can be used for visibility buffer rendering and returns the size of the reorder table
+ * Each triangle's provoking vertex index is equal to primitive id; this allows passing it to the fragment shader using flat/nointerpolation attribute.
+ * This is important for performance on hardware where primitive id can't be accessed efficiently in fragment shader.
+ * The reorder table stores the original vertex id for each vertex in the new index buffer, and should be used in the vertex shader to load vertex data.
+ * The provoking vertex is assumed to be the first vertex in the triangle; if this is not the case (OpenGL), rotate each triangle (abc -> bca) before rendering.
+ * For maximum efficiency the input index buffer should be optimized for vertex cache first.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ * reorder must contain enough space for the worst case reorder table (vertex_count + index_count/3 elements)
+ */
+MESHOPTIMIZER_API size_t meshopt_generateProvokingIndexBuffer(unsigned int* destination, unsigned int* reorder, const unsigned int* indices, size_t index_count, size_t vertex_count);
+
 /**
 * Vertex transform cache optimizer
 * Reorders indices to reduce the number of GPU vertex shader invocations
- * If index buffer contains multiple ranges for multiple draw calls, this functions needs to be called on each range individually.
+ * If index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually.
 *
 * destination must contain enough space for the resulting index buffer (index_count elements)
 */
@@ -159,7 +198,7 @@ MESHOPTIMIZER_API void meshopt_optimizeVertexCacheStrip(unsigned int* destinatio
 * Vertex transform cache optimizer for FIFO caches
 * Reorders indices to reduce the number of GPU vertex shader invocations
 * Generally takes ~3x less time to optimize meshes but produces inferior results compared to meshopt_optimizeVertexCache
- * If index buffer contains multiple ranges for multiple draw calls, this functions needs to be called on each range individually.
+ * If index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually.
 *
 * destination must contain enough space for the resulting index buffer (index_count elements)
 * cache_size should be less than the actual GPU cache size to avoid cache thrashing
@@ -169,7 +208,7 @@ MESHOPTIMIZER_API void meshopt_optimizeVertexCacheFifo(unsigned int* destination
 /**
 * Overdraw optimizer
 * Reorders indices to reduce the number of GPU vertex shader invocations and the pixel overdraw
- * If index buffer contains multiple ranges for multiple draw calls, this functions needs to be called on each range individually.
+ * If index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually.
 *
 * destination must contain enough space for the resulting index buffer (index_count elements)
 * indices must contain index data that is the result of meshopt_optimizeVertexCache (*not* the original mesh indices!)
@@ -182,7 +221,7 @@ MESHOPTIMIZER_API void meshopt_optimizeOverdraw(unsigned int* destination, const
 * Vertex fetch cache optimizer
 * Reorders vertices and changes indices to reduce the amount of GPU memory fetches during vertex processing
 * Returns the number of unique vertices, which is the same as input vertex count unless some vertices are unused
- * This functions works for a single vertex stream; for multiple vertex streams, use meshopt_optimizeVertexFetchRemap + meshopt_remapVertexBuffer for each stream.
+ * This function works for a single vertex stream; for multiple vertex streams, use meshopt_optimizeVertexFetchRemap + meshopt_remapVertexBuffer for each stream.
 *
 * destination must contain enough space for the resulting vertex buffer (vertex_count elements)
 * indices is used both as an input and as an output index buffer
@@ -212,7 +251,8 @@ MESHOPTIMIZER_API size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t
 MESHOPTIMIZER_API size_t meshopt_encodeIndexBufferBound(size_t index_count, size_t vertex_count);

 /**
- * Set index encoder format version
+ * Set index encoder format version (defaults to 1)
+ *
 * version must specify the data format version to encode; valid values are 0 (decodable by all library versions) and 1 (decodable by 0.14+)
 */
 MESHOPTIMIZER_API void meshopt_encodeIndexVersion(int version);
@@ -227,6 +267,13 @@ MESHOPTIMIZER_API void meshopt_encodeIndexVersion(int version);
 */
 MESHOPTIMIZER_API int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size);

+/**
+ * Get encoded index format version
+ * Returns format version of the encoded index buffer/sequence, or -1 if the buffer header is invalid
+ * Note that a non-negative value doesn't guarantee that the buffer will be decoded correctly if the input is malformed.
+ */
+MESHOPTIMIZER_API int meshopt_decodeIndexVersion(const unsigned char* buffer, size_t buffer_size);
+
 /**
 * Index sequence encoder
 * Encodes index sequence into an array of bytes that is generally smaller and compresses better compared to original.
@@ -254,15 +301,31 @@ MESHOPTIMIZER_API int meshopt_decodeIndexSequence(void* destination, size_t inde
 * Returns encoded data size on success, 0 on error; the only error condition is if buffer doesn't have enough space
 * This function works for a single vertex stream; for multiple vertex streams, call meshopt_encodeVertexBuffer for each stream.
 * Note that all vertex_size bytes of each vertex are encoded verbatim, including padding which should be zero-initialized.
+ * For maximum efficiency the vertex buffer being encoded has to be quantized and optimized for locality of reference (cache/fetch) first.
 *
 * buffer must contain enough space for the encoded vertex buffer (use meshopt_encodeVertexBufferBound to compute worst case size)
+ * vertex_size must be a multiple of 4 (and <= 256)
 */
 MESHOPTIMIZER_API size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size);
 MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size);

 /**
- * Set vertex encoder format version
- * version must specify the data format version to encode; valid values are 0 (decodable by all library versions)
+ * Vertex buffer encoder
+ * Encodes vertex data just like meshopt_encodeVertexBuffer, but allows overriding the compression level.
+ * For compression level to take effect, the vertex encoding version must be set to 1.
+ * The default compression level implied by meshopt_encodeVertexBuffer is 2.
+ *
+ * buffer must contain enough space for the encoded vertex buffer (use meshopt_encodeVertexBufferBound to compute worst case size)
+ * vertex_size must be a multiple of 4 (and <= 256)
+ * level should be in the range [0, 3] with 0 being the fastest and 3 being the slowest and producing the best compression ratio.
+ * version should be -1 to use the default version (specified via meshopt_encodeVertexVersion), or 0/1 to override the version; per above, level won't take effect if version is 0.
+ */
+MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level, int version);
+
+/**
+ * Set vertex encoder format version (defaults to 1)
+ *
+ * version must specify the data format version to encode; valid values are 0 (decodable by all library versions) and 1 (decodable by 0.23+)
 */
 MESHOPTIMIZER_API void meshopt_encodeVertexVersion(int version);

@@ -273,32 +336,44 @@ MESHOPTIMIZER_API void meshopt_encodeVertexVersion(int version);
 * The decoder is safe to use for untrusted input, but it may produce garbage data.
 *
 * destination must contain enough space for the resulting vertex buffer (vertex_count * vertex_size bytes)
+ * vertex_size must be a multiple of 4 (and <= 256)
 */
 MESHOPTIMIZER_API int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size);

+/**
+ * Get encoded vertex format version
+ * Returns format version of the encoded vertex buffer, or -1 if the buffer header is invalid
+ * Note that a non-negative value doesn't guarantee that the buffer will be decoded correctly if the input is malformed.
+ */
+MESHOPTIMIZER_API int meshopt_decodeVertexVersion(const unsigned char* buffer, size_t buffer_size);
+
 /**
 * Vertex buffer filters
 * These functions can be used to filter output of meshopt_decodeVertexBuffer in-place.
 *
- * meshopt_decodeFilterOct decodes octahedral encoding of a unit vector with K-bit (K <= 16) signed X/Y as an input; Z must store 1.0f.
+ * meshopt_decodeFilterOct decodes octahedral encoding of a unit vector with K-bit signed X/Y as an input; Z must store 1.0f.
 * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8. W is preserved as is.
 *
- * meshopt_decodeFilterQuat decodes 3-component quaternion encoding with K-bit (4 <= K <= 16) component encoding and a 2-bit component index indicating which component to reconstruct.
+ * meshopt_decodeFilterQuat decodes 3-component quaternion encoding with K-bit component encoding and a 2-bit component index indicating which component to reconstruct.
 * Each component is stored as an 16-bit integer; stride must be equal to 8.
 *
 * meshopt_decodeFilterExp decodes exponential encoding of floating-point data with 8-bit exponent and 24-bit integer mantissa as 2^E*M.
 * Each 32-bit component is decoded in isolation; stride must be divisible by 4.
+ *
+ * meshopt_decodeFilterColor decodes RGBA colors from YCoCg (+A) color encoding where RGB is converted to YCoCg space with K-bit component encoding, and A is stored using K-1 bits.
+ * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8.
 */
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride);
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterQuat(void* buffer, size_t count, size_t stride);
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride);
+MESHOPTIMIZER_API void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride);
+MESHOPTIMIZER_API void meshopt_decodeFilterQuat(void* buffer, size_t count, size_t stride);
+MESHOPTIMIZER_API void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride);
+MESHOPTIMIZER_API void meshopt_decodeFilterColor(void* buffer, size_t count, size_t stride);

 /**
 * Vertex buffer filter encoders
 * These functions can be used to encode data in a format that meshopt_decodeFilter can decode
 *
- * meshopt_encodeFilterOct encodes unit vectors with K-bit (K <= 16) signed X/Y as an output.
- * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8. W is preserved as is.
+ * meshopt_encodeFilterOct encodes unit vectors with K-bit (2 <= K <= 16) signed X/Y as an output.
+ * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8. Z will store 1.0f, W is preserved as is.
 * Input data must contain 4 floats for every vector (count*4 total).
 *
 * meshopt_encodeFilterQuat encodes unit quaternions with K-bit (4 <= K <= 16) component encoding.
@@ -308,6 +383,10 @@ MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterExp(void* buffer, size_t cou
 * meshopt_encodeFilterExp encodes arbitrary (finite) floating-point data with 8-bit exponent and K-bit integer mantissa (1 <= K <= 24).
 * Exponent can be shared between all components of a given vector as defined by stride or all values of a given component; stride must be divisible by 4.
 * Input data must contain stride/4 floats for every vector (count*stride/4 total).
+ *
+ * meshopt_encodeFilterColor encodes RGBA color data by converting RGB to YCoCg color space with K-bit (2 <= K <= 16) component encoding; A is stored using K-1 bits.
+ * Each component is stored as an 8-bit or 16-bit integer; stride must be equal to 4 or 8.
+ * Input data must contain 4 floats for every color (count*4 total).
 */
 enum meshopt_EncodeExpMode
 {
@@ -317,11 +396,14 @@ enum meshopt_EncodeExpMode
 	meshopt_EncodeExpSharedVector,
 	/* When encoding exponents, use shared value for each component of all vectors (best compression) */
 	meshopt_EncodeExpSharedComponent,
+	/* When encoding exponents, use separate values for each component, but clamp to 0 (good quality if very small values are not important) */
+	meshopt_EncodeExpClamped,
 };

-MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data);
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterQuat(void* destination, size_t count, size_t stride, int bits, const float* data);
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterExp(void* destination, size_t count, size_t stride, int bits, const float* data, enum meshopt_EncodeExpMode mode);
+MESHOPTIMIZER_API void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data);
+MESHOPTIMIZER_API void meshopt_encodeFilterQuat(void* destination, size_t count, size_t stride, int bits, const float* data);
+MESHOPTIMIZER_API void meshopt_encodeFilterExp(void* destination, size_t count, size_t stride, int bits, const float* data, enum meshopt_EncodeExpMode mode);
+MESHOPTIMIZER_API void meshopt_encodeFilterColor(void* destination, size_t count, size_t stride, int bits, const float* data);

 /**
 * Simplification options
@@ -334,16 +416,34 @@ enum
 	meshopt_SimplifySparse = 1 << 1,
 	/* Treat error limit and resulting error as absolute instead of relative to mesh extents. */
 	meshopt_SimplifyErrorAbsolute = 1 << 2,
+	/* Remove disconnected parts of the mesh during simplification incrementally, regardless of the topological restrictions inside components. */
+	meshopt_SimplifyPrune = 1 << 3,
+	/* Produce more regular triangle sizes and shapes during simplification, at some cost to geometric and attribute quality. */
+	meshopt_SimplifyRegularize = 1 << 4,
+	/* Experimental: Allow collapses across attribute discontinuities, except for vertices that are tagged with meshopt_SimplifyVertex_Protect in vertex_lock. */
+	meshopt_SimplifyPermissive = 1 << 5,
+};
+
+/**
+ * Experimental: Simplification vertex flags/locks, for use in `vertex_lock` arrays in simplification APIs
+ */
+enum
+{
+	/* Do not move this vertex. */
+	meshopt_SimplifyVertex_Lock = 1 << 0,
+	/* Protect attribute discontinuity at this vertex; must be used together with meshopt_SimplifyPermissive option. */
+	meshopt_SimplifyVertex_Protect = 1 << 1,
 };

 /**
 * Mesh simplifier
 * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible
 * The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error.
- * If not all attributes from the input mesh are required, it's recommended to reindex the mesh using meshopt_generateShadowIndexBuffer prior to simplification.
+ * If not all attributes from the input mesh are needed, it's recommended to reindex the mesh without them prior to simplification.
 * Returns the number of indices after simplification, with destination containing new index data
+ *
 * The resulting index buffer references vertices from the original vertex buffer.
- * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
 *
 * destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)!
 * vertex_positions should have float3 position in the first 12 bytes of each vertex
@@ -354,45 +454,94 @@ enum
 MESHOPTIMIZER_API size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* result_error);

 /**
- * Experimental: Mesh simplifier with attribute metric
- * The algorithm ehnahces meshopt_simplify by incorporating attribute values into the error metric used to prioritize simplification order; see meshopt_simplify documentation for details.
- * Note that the number of attributes affects memory requirements and running time; this algorithm requires ~1.5x more memory and time compared to meshopt_simplify when using 4 scalar attributes.
+ * Mesh simplifier with attribute metric
+ * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible.
+ * Similar to meshopt_simplify, but incorporates attribute values into the error metric used to prioritize simplification order.
+ * The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error.
+ * If not all attributes from the input mesh are needed, it's recommended to reindex the mesh without them prior to simplification.
+ * Returns the number of indices after simplification, with destination containing new index data
 *
+ * The resulting index buffer references vertices from the original vertex buffer.
+ * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ * Note that the number of attributes with non-zero weights affects memory requirements and running time.
+ *
+ * destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)!
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
 * vertex_attributes should have attribute_count floats for each vertex
- * attribute_weights should have attribute_count floats in total; the weights determine relative priority of attributes between each other and wrt position. The recommended weight range is [1e-3..1e-1], assuming attribute data is in [0..1] range.
- * attribute_count must be <= 16
+ * attribute_weights should have attribute_count floats in total; the weights determine relative priority of attributes between each other and wrt position
+ * attribute_count must be <= 32
 * vertex_lock can be NULL; when it's not NULL, it should have a value for each vertex; 1 denotes vertices that can't be moved
- * TODO target_error/result_error currently use combined distance+attribute error; this may change in the future
+ * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1]
+ * options must be a bitmask composed of meshopt_SimplifyX options; 0 is a safe default
+ * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification
 */
-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error);
+MESHOPTIMIZER_API size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error);

 /**
- * Experimental: Mesh simplifier (sloppy)
+ * Mesh simplifier with position/attribute update
+ * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible.
+ * Similar to meshopt_simplifyWithAttributes, but destructively updates positions and attribute values for optimal appearance.
+ * The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error.
+ * If not all attributes from the input mesh are needed, it's recommended to reindex the mesh without them prior to simplification.
+ * Returns the number of indices after simplification; indices are destructively updated with new index data
+ *
+ * The updated index buffer references vertices from the original vertex buffer, however the vertex positions and attributes are updated in-place.
+ * Creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended; if the original vertex data is needed, it should be copied before simplification.
+ * Note that the number of attributes with non-zero weights affects memory requirements and running time. Attributes with zero weights are not updated.
+ *
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * vertex_attributes should have attribute_count floats for each vertex
+ * attribute_weights should have attribute_count floats in total; the weights determine relative priority of attributes between each other and wrt position
+ * attribute_count must be <= 32
+ * vertex_lock can be NULL; when it's not NULL, it should have a value for each vertex; 1 denotes vertices that can't be moved
+ * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1]
+ * options must be a bitmask composed of meshopt_SimplifyX options; 0 is a safe default
+ * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification
+ */
+MESHOPTIMIZER_API size_t meshopt_simplifyWithUpdate(unsigned int* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error);
+
+/**
+ * Mesh simplifier (sloppy)
 * Reduces the number of triangles in the mesh, sacrificing mesh appearance for simplification performance
 * The algorithm doesn't preserve mesh topology but can stop short of the target goal based on target error.
 * Returns the number of indices after simplification, with destination containing new index data
 * The resulting index buffer references vertices from the original vertex buffer.
- * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
 *
 * destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)!
 * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * vertex_lock can be NULL; when it's not NULL, it should have a value for each vertex; vertices that can't be moved should set 1 consistently for all indices with the same position
 * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1]
 * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification
 */
-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error);
+MESHOPTIMIZER_API size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error);

 /**
- * Experimental: Point cloud simplifier
+ * Mesh simplifier (pruner)
+ * Reduces the number of triangles in the mesh by removing small isolated parts of the mesh
+ * Returns the number of indices after simplification, with destination containing new index data
+ * The resulting index buffer references vertices from the original vertex buffer.
+ * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ *
+ * destination must contain enough space for the target index buffer, worst case is index_count elements
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1]
+ */
+MESHOPTIMIZER_API size_t meshopt_simplifyPrune(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error);
+
+/**
+ * Point cloud simplifier
 * Reduces the number of points in the cloud to reach the given target
 * Returns the number of points after simplification, with destination containing new index data
 * The resulting index buffer references vertices from the original vertex buffer.
- * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
 *
 * destination must contain enough space for the target index buffer (target_vertex_count elements)
 * vertex_positions should have float3 position in the first 12 bytes of each vertex
- * vertex_colors should can be NULL; when it's not NULL, it should have float3 color in the first 12 bytes of each vertex
+ * vertex_colors can be NULL; when it's not NULL, it should have float3 color in the first 12 bytes of each vertex
+ * color_weight determines relative priority of color wrt position; 1.0 is a safe default
 */
-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_colors, size_t vertex_colors_stride, float color_weight, size_t target_vertex_count);
+MESHOPTIMIZER_API size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_colors, size_t vertex_colors_stride, float color_weight, size_t target_vertex_count);

 /**
 * Returns the error scaling factor used by the simplifier to convert between absolute and relative extents
@@ -440,6 +589,19 @@ struct meshopt_VertexCacheStatistics
 */
 MESHOPTIMIZER_API struct meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size);

+struct meshopt_VertexFetchStatistics
+{
+	unsigned int bytes_fetched;
+	float overfetch; /* fetched bytes / vertex buffer size; best case 1.0 (each byte is fetched once) */
+};
+
+/**
+ * Vertex fetch cache analyzer
+ * Returns cache hit statistics using a simplified direct mapped model
+ * Results may not match actual GPU performance
+ */
+MESHOPTIMIZER_API struct meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size);
+
 struct meshopt_OverdrawStatistics
 {
 	unsigned int pixels_covered;
@@ -456,26 +618,34 @@ struct meshopt_OverdrawStatistics
 */
 MESHOPTIMIZER_API struct meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);

-struct meshopt_VertexFetchStatistics
+struct meshopt_CoverageStatistics
 {
-	unsigned int bytes_fetched;
-	float overfetch; /* fetched bytes / vertex buffer size; best case 1.0 (each byte is fetched once) */
+	float coverage[3];
+	float extent; /* viewport size in mesh coordinates */
 };

 /**
- * Vertex fetch cache analyzer
- * Returns cache hit statistics using a simplified direct mapped model
- * Results may not match actual GPU performance
+ * Coverage analyzer
+ * Returns coverage statistics (ratio of viewport pixels covered from each axis) using a software rasterizer
+ *
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
 */
-MESHOPTIMIZER_API struct meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size);
+MESHOPTIMIZER_API struct meshopt_CoverageStatistics meshopt_analyzeCoverage(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);

+/**
+ * Meshlet is a small mesh cluster (subset) that consists of:
+ * - triangles, an 8-bit micro triangle (index) buffer, that for each triangle specifies three local vertices to use;
+ * - vertices, a 32-bit vertex indirection buffer, that for each local vertex specifies which mesh vertex to fetch vertex attributes from.
+ *
+ * For efficiency, meshlet triangles and vertices are packed into two large arrays; this structure contains offsets and counts to access the data.
+ */
 struct meshopt_Meshlet
 {
 	/* offsets within meshlet_vertices and meshlet_triangles arrays with meshlet data */
 	unsigned int vertex_offset;
 	unsigned int triangle_offset;

-	/* number of vertices and triangles used in the meshlet; data is stored in consecutive range defined by offset and count */
+	/* number of vertices and triangles used in the meshlet; data is stored in consecutive range [offset..offset+count) for vertices and [offset..offset+count*3) for triangles */
 	unsigned int vertex_count;
 	unsigned int triangle_count;
 };
@@ -484,14 +654,15 @@ struct meshopt_Meshlet
 * Meshlet builder
 * Splits the mesh into a set of meshlets where each meshlet has a micro index buffer indexing into meshlet vertices that refer to the original vertex buffer
 * The resulting data can be used to render meshes using NVidia programmable mesh shading pipeline, or in other cluster-based renderers.
+ * When targeting mesh shading hardware, for maximum efficiency meshlets should be further optimized using meshopt_optimizeMeshlet.
 * When using buildMeshlets, vertex positions need to be provided to minimize the size of the resulting clusters.
 * When using buildMeshletsScan, for maximum efficiency the index buffer being converted has to be optimized for vertex cache first.
 *
 * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound
- * meshlet_vertices must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_vertices
- * meshlet_triangles must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_triangles * 3
+ * meshlet_vertices must contain enough space for all meshlets, worst case is index_count elements (*not* vertex_count!)
+ * meshlet_triangles must contain enough space for all meshlets, worst case is index_count elements
 * vertex_positions should have float3 position in the first 12 bytes of each vertex
- * max_vertices and max_triangles must not exceed implementation limits (max_vertices <= 255 - not 256!, max_triangles <= 512; max_triangles must be divisible by 4)
+ * max_vertices and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512)
 * cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency
 */
 MESHOPTIMIZER_API size_t meshopt_buildMeshlets(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight);
@@ -499,14 +670,41 @@ MESHOPTIMIZER_API size_t meshopt_buildMeshletsScan(struct meshopt_Meshlet* meshl
 MESHOPTIMIZER_API size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles);

 /**
- * Experimental: Meshlet optimizer
- * Reorders meshlet vertices and triangles to maximize locality to improve rasterizer throughput
+ * Meshlet builder with flexible cluster sizes
+ * Splits the mesh into a set of meshlets, similarly to meshopt_buildMeshlets, but allows specifying the minimum and maximum number of triangles per meshlet.
+ * Clusters between min and max triangle counts are split when the cluster size would have exceeded the expected cluster size by more than split_factor.
 *
- * meshlet_triangles and meshlet_vertices must refer to meshlet triangle and vertex index data; when buildMeshlets* is used, these
- * need to be computed from meshlet's vertex_offset and triangle_offset
- * triangle_count and vertex_count must not exceed implementation limits (vertex_count <= 255 - not 256!, triangle_count <= 512)
+ * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound using min_triangles (*not* max!)
+ * meshlet_vertices must contain enough space for all meshlets, worst case is index_count elements (*not* vertex_count!)
+ * meshlet_triangles must contain enough space for all meshlets, worst case is index_count elements
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * max_vertices, min_triangles and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; min_triangles <= max_triangles)
+ * cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency
+ * split_factor should be set to a non-negative value; when greater than 0, clusters that have large bounds may be split unless they are under the min_triangles threshold
 */
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t triangle_count, size_t vertex_count);
+MESHOPTIMIZER_API size_t meshopt_buildMeshletsFlex(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor);
+
+/**
+ * Meshlet builder that produces clusters optimized for raytracing
+ * Splits the mesh into a set of meshlets, similarly to meshopt_buildMeshlets, but optimizes cluster subdivision for raytracing and allows specifying the minimum and maximum number of triangles per meshlet.
+ *
+ * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound using min_triangles (*not* max!)
+ * meshlet_vertices must contain enough space for all meshlets, worst case is index_count elements (*not* vertex_count!)
+ * meshlet_triangles must contain enough space for all meshlets, worst case is index_count elements
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * max_vertices, min_triangles and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; min_triangles <= max_triangles)
+ * fill_weight allows prioritizing clusters that are closer to maximum size at some cost to SAH quality; 0.5 is a safe default
+ */
+MESHOPTIMIZER_API size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight);
+
+/**
+ * Meshlet optimizer
+ * Reorders meshlet vertices and triangles to maximize locality which can improve rasterizer throughput or ray tracing performance when using fast-build modes.
+ *
+ * meshlet_triangles and meshlet_vertices must refer to meshlet data; when buildMeshlets* is used, these need to be computed from meshlet's vertex_offset and triangle_offset
+ * triangle_count and vertex_count must not exceed implementation limits (vertex_count <= 256, triangle_count <= 512)
+ */
+MESHOPTIMIZER_API void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t triangle_count, size_t vertex_count);

 struct meshopt_Bounds
 {
@@ -544,11 +742,35 @@ struct meshopt_Bounds
 * Real-Time Rendering 4th Edition, section 19.3).
 *
 * vertex_positions should have float3 position in the first 12 bytes of each vertex
- * index_count/3 should be less than or equal to 512 (the function assumes clusters of limited size)
+ * vertex_count should specify the number of vertices in the entire mesh, not cluster or meshlet
+ * index_count/3 and triangle_count must not exceed implementation limits (<= 512)
 */
 MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices, const unsigned char* meshlet_triangles, size_t triangle_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);

+/**
+ * Sphere bounds generator
+ * Creates bounding sphere around a set of points or a set of spheres; returns the center and radius of the sphere, with other fields of the result set to 0.
+ *
+ * positions should have float3 position in the first 12 bytes of each element
+ * radii can be NULL; when it's not NULL, it should have a non-negative float radius in the first 4 bytes of each element
+ */
+MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeSphereBounds(const float* positions, size_t count, size_t positions_stride, const float* radii, size_t radii_stride);
+
+/**
+ * Cluster partitioner
+ * Partitions clusters into groups of similar size, prioritizing grouping clusters that share vertices or are close to each other.
+ * When vertex positions are not provided, only clusters that share vertices will be grouped together, which may result in small partitions for some inputs.
+ *
+ * destination must contain enough space for the resulting partition data (cluster_count elements)
+ * destination[i] will contain the partition id for cluster i, with the total number of partitions returned by the function
+ * cluster_indices should have the vertex indices referenced by each cluster, stored sequentially
+ * cluster_index_counts should have the number of indices in each cluster; sum of all cluster_index_counts must be equal to total_index_count
+ * vertex_positions can be NULL; when it's not NULL, it should have float3 position in the first 12 bytes of each vertex
+ * target_partition_size is a target size for each partition, in clusters; the resulting partitions may be smaller or larger (up to target + target/3)
+ */
+MESHOPTIMIZER_API size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size);
+
 /**
 * Spatial sorter
 * Generates a remap table that can be used to reorder points for spatial locality.
@@ -560,13 +782,44 @@ MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeMeshletBounds(const unsig
 MESHOPTIMIZER_API void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);

 /**
- * Experimental: Spatial sorter
+ * Spatial sorter
 * Reorders triangles for spatial locality, and generates a new index buffer. The resulting index buffer can be used with other functions like optimizeVertexCache.
 *
 * destination must contain enough space for the resulting index buffer (index_count elements)
 * vertex_positions should have float3 position in the first 12 bytes of each vertex
 */
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+MESHOPTIMIZER_API void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+
+/**
+ * Spatial clusterizer
+ * Reorders points into clusters optimized for spatial locality, and generates a new index buffer.
+ * Ensures the output can be split into cluster_size chunks where each chunk has good positional locality. Only the last chunk may be smaller than cluster_size.
+ *
+ * destination must contain enough space for the resulting index buffer (vertex_count elements)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ */
+MESHOPTIMIZER_API void meshopt_spatialClusterPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t cluster_size);
+
+/**
+ * Quantize a float into half-precision (as defined by IEEE-754 fp16) floating point value
+ * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
+ * Representable magnitude range: [6e-5; 65504]
+ * Maximum relative reconstruction error: 5e-4
+ */
+MESHOPTIMIZER_API unsigned short meshopt_quantizeHalf(float v);
+
+/**
+ * Quantize a float into a floating point value with a limited number of significant mantissa bits, preserving the IEEE-754 fp32 binary representation
+ * Preserves infinities/NaN, flushes denormals to zero, rounds to nearest
+ * Assumes N is in a valid mantissa precision range, which is 1..23
+ */
+MESHOPTIMIZER_API float meshopt_quantizeFloat(float v, int N);
+
+/**
+ * Reverse quantization of a half-precision (as defined by IEEE-754 fp16) floating point value
+ * Preserves Inf/NaN, flushes denormals to zero
+ */
+MESHOPTIMIZER_API float meshopt_dequantizeHalf(unsigned short h);

 /**
 * Set allocation callbacks
@@ -574,13 +827,13 @@ MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortTriangles(unsigned int* desti
 * Note that all algorithms only allocate memory for temporary use.
 * allocate/deallocate are always called in a stack-like order - last pointer to be allocated is deallocated first.
 */
-MESHOPTIMIZER_API void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV *allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV *deallocate)(void*));
+MESHOPTIMIZER_API void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*));

 #ifdef __cplusplus
 } /* extern "C" */
 #endif

-/* Quantization into commonly supported data formats */
+/* Quantization into fixed point normalized formats; these are only available as inline C++ functions */
 #ifdef __cplusplus
 /**
 * Quantize a float in [0..1] range into an N-bit fixed point unorm value
@@ -595,27 +848,6 @@ inline int meshopt_quantizeUnorm(float v, int N);
 * Maximum reconstruction error: 1/2^N
 */
 inline int meshopt_quantizeSnorm(float v, int N);
-
-/**
- * Quantize a float into half-precision (as defined by IEEE-754 fp16) floating point value
- * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
- * Representable magnitude range: [6e-5; 65504]
- * Maximum relative reconstruction error: 5e-4
- */
-MESHOPTIMIZER_API unsigned short meshopt_quantizeHalf(float v);
-
-/**
- * Quantize a float into a floating point value with a limited number of significant mantissa bits, preserving the IEEE-754 fp32 binary representation
- * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
- * Assumes N is in a valid mantissa precision range, which is 1..23
- */
-MESHOPTIMIZER_API float meshopt_quantizeFloat(float v, int N);
-
-/**
- * Reverse quantization of a half-precision (as defined by IEEE-754 fp16) floating point value
- * Preserves Inf/NaN, flushes denormals to zero
- */
-MESHOPTIMIZER_API float meshopt_dequantizeHalf(unsigned short h);
 #endif

 /**
@@ -631,6 +863,10 @@ template <typename T>
 inline size_t meshopt_generateVertexRemap(unsigned int* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size);
 template <typename T>
 inline size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count);
+template <typename F>
+inline size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, F callback);
+template <typename T, typename F>
+inline size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, F callback);
 template <typename T>
 inline void meshopt_remapIndexBuffer(T* destination, const T* indices, size_t index_count, const unsigned int* remap);
 template <typename T>
@@ -642,6 +878,8 @@ inline void meshopt_generateAdjacencyIndexBuffer(T* destination, const T* indice
 template <typename T>
 inline void meshopt_generateTessellationIndexBuffer(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 template <typename T>
+inline size_t meshopt_generateProvokingIndexBuffer(T* destination, unsigned int* reorder, const T* indices, size_t index_count, size_t vertex_count);
+template <typename T>
 inline void meshopt_optimizeVertexCache(T* destination, const T* indices, size_t index_count, size_t vertex_count);
 template <typename T>
 inline void meshopt_optimizeVertexCacheStrip(T* destination, const T* indices, size_t index_count, size_t vertex_count);
@@ -661,29 +899,44 @@ template <typename T>
 inline size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count);
 template <typename T>
 inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size);
+inline size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level);
 template <typename T>
 inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = NULL);
 template <typename T>
 inline size_t meshopt_simplifyWithAttributes(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = NULL);
 template <typename T>
+inline size_t meshopt_simplifyWithUpdate(T* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = NULL);
+template <typename T>
 inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error = NULL);
 template <typename T>
+inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error = NULL);
+template <typename T>
+inline size_t meshopt_simplifyPrune(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error);
+template <typename T>
 inline size_t meshopt_stripify(T* destination, const T* indices, size_t index_count, size_t vertex_count, T restart_index);
 template <typename T>
 inline size_t meshopt_unstripify(T* destination, const T* indices, size_t index_count, T restart_index);
 template <typename T>
-inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int buffer_size);
+inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size);
+template <typename T>
+inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size);
 template <typename T>
 inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 template <typename T>
-inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size);
+inline meshopt_CoverageStatistics meshopt_analyzeCoverage(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 template <typename T>
 inline size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight);
 template <typename T>
 inline size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles);
 template <typename T>
+inline size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor);
+template <typename T>
+inline size_t meshopt_buildMeshletsSpatial(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight);
+template <typename T>
 inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 template <typename T>
+inline size_t meshopt_partitionClusters(unsigned int* destination, const T* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size);
+template <typename T>
 inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 #endif

@@ -717,31 +970,39 @@ inline int meshopt_quantizeSnorm(float v, int N)
 class meshopt_Allocator
 {
 public:
-	template <typename T>
-	struct StorageT
+	struct Storage
 	{
-		static void* (MESHOPTIMIZER_ALLOC_CALLCONV *allocate)(size_t);
-		static void (MESHOPTIMIZER_ALLOC_CALLCONV *deallocate)(void*);
+		void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t);
+		void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*);
 	};

-	typedef StorageT<void> Storage;
+#ifdef MESHOPTIMIZER_ALLOC_EXPORT
+	MESHOPTIMIZER_API static Storage& storage();
+#else
+	static Storage& storage()
+	{
+		static Storage s = {::operator new, ::operator delete };
+		return s;
+	}
+#endif

 	meshopt_Allocator()
-		: blocks()
-		, count(0)
+	    : blocks()
+	    , count(0)
 	{
 	}

 	~meshopt_Allocator()
 	{
 		for (size_t i = count; i > 0; --i)
-			Storage::deallocate(blocks[i - 1]);
+			storage().deallocate(blocks[i - 1]);
 	}

-	template <typename T> T* allocate(size_t size)
+	template <typename T>
+	T* allocate(size_t size)
 	{
 		assert(count < sizeof(blocks) / sizeof(blocks[0]));
-		T* result = static_cast<T*>(Storage::allocate(size > size_t(-1) / sizeof(T) ? size_t(-1) : size * sizeof(T)));
+		T* result = static_cast<T*>(storage().allocate(size > size_t(-1) / sizeof(T) ? size_t(-1) : size * sizeof(T)));
 		blocks[count++] = result;
 		return result;
 	}
@@ -749,7 +1010,7 @@ public:
 	void deallocate(void* ptr)
 	{
 		assert(count > 0 && blocks[count - 1] == ptr);
-		Storage::deallocate(ptr);
+		storage().deallocate(ptr);
 		count--;
 	}

@@ -757,10 +1018,6 @@ private:
 	void* blocks[24];
 	size_t count;
 };
-
-// This makes sure that allocate/deallocate are lazily generated in translation units that need them and are deduplicated by the linker
-template <typename T> void* (MESHOPTIMIZER_ALLOC_CALLCONV *meshopt_Allocator::StorageT<T>::allocate)(size_t) = operator new;
-template <typename T> void (MESHOPTIMIZER_ALLOC_CALLCONV *meshopt_Allocator::StorageT<T>::deallocate)(void*) = operator delete;
 #endif

 /* Inline implementation for C++ templated wrappers */
@@ -782,7 +1039,7 @@ struct meshopt_IndexAdapter<T, false>
 	{
 		size_t size = count > size_t(-1) / sizeof(unsigned int) ? size_t(-1) : count * sizeof(unsigned int);

-		data = static_cast<unsigned int*>(meshopt_Allocator::Storage::allocate(size));
+		data = static_cast<unsigned int*>(meshopt_Allocator::storage().allocate(size));

 		if (input)
 		{
@@ -799,7 +1056,7 @@ struct meshopt_IndexAdapter<T, false>
 				result[i] = T(data[i]);
 		}

-		meshopt_Allocator::Storage::deallocate(data);
+		meshopt_Allocator::storage().deallocate(data);
 	}
 };

@@ -830,6 +1087,30 @@ inline size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const
 	return meshopt_generateVertexRemapMulti(destination, indices ? in.data : NULL, index_count, vertex_count, streams, stream_count);
 }

+template <typename F>
+inline size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, F callback)
+{
+	struct Call // local adapter: gives the callable F a plain function-pointer shape
+	{
+		static int compare(void* context, unsigned int lhs, unsigned int rhs) { return (*static_cast<F*>(context))(lhs, rhs) ? 1 : 0; } // context is the &callback passed below; the callable's result is normalized to 0/1
+	};
+
+	return meshopt_generateVertexRemapCustom(destination, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride, &Call::compare, &callback); // callback is a by-value parameter, so its address stays valid for the duration of this call
+}
+
+template <typename T, typename F>
+inline size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, F callback)
+{
+	struct Call // local adapter: gives the callable F a plain function-pointer shape
+	{
+		static int compare(void* context, unsigned int lhs, unsigned int rhs) { return (*static_cast<F*>(context))(lhs, rhs) ? 1 : 0; } // context is the &callback passed below; the callable's result is normalized to 0/1
+	};
+
+	meshopt_IndexAdapter<T> in(NULL, indices, indices ? index_count : 0); // indices may be NULL: the adapter is then built over zero elements
+
+	return meshopt_generateVertexRemapCustom(destination, indices ? in.data : NULL, index_count, vertex_positions, vertex_count, vertex_positions_stride, &Call::compare, &callback); // NULL indices are forwarded as NULL while index_count is passed through unchanged
+}
+
 template <typename T>
 inline void meshopt_remapIndexBuffer(T* destination, const T* indices, size_t index_count, const unsigned int* remap)
 {
@@ -875,6 +1156,19 @@ inline void meshopt_generateTessellationIndexBuffer(T* destination, const T* ind
 	meshopt_generateTessellationIndexBuffer(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
 }

+template <typename T>
+inline size_t meshopt_generateProvokingIndexBuffer(T* destination, unsigned int* reorder, const T* indices, size_t index_count, size_t vertex_count)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count); // exposes the input indices as unsigned int through in.data
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count); // exposes an unsigned int output buffer through out.data, with destination as the adapter's result
+
+	size_t bound = vertex_count + (index_count / 3); // vertex_count plus one per index triple
+	assert(size_t(T(bound - 1)) == bound - 1); // bound - 1 must fit in T
+	(void)bound; // bound is otherwise unused when assert compiles out
+
+	return meshopt_generateProvokingIndexBuffer(out.data, reorder, in.data, index_count, vertex_count);
+}
+
 template <typename T>
 inline void meshopt_optimizeVertexCache(T* destination, const T* indices, size_t index_count, size_t vertex_count)
 {
@@ -961,6 +1255,11 @@ inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const
 	return meshopt_decodeIndexSequence(destination, index_count, sizeof(T), buffer, buffer_size);
 }

+inline size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level)
+{ // convenience overload: forwards to the 7-argument function, supplying the trailing argument
+	return meshopt_encodeVertexBufferLevel(buffer, buffer_size, vertices, vertex_count, vertex_size, level, -1); // NOTE(review): -1 presumably selects a default for the last parameter - confirm against the C declaration
+}
+
 template <typename T>
 inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* result_error)
 {
@@ -979,13 +1278,39 @@ inline size_t meshopt_simplifyWithAttributes(T* destination, const T* indices, s
 	return meshopt_simplifyWithAttributes(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, vertex_attributes, vertex_attributes_stride, attribute_weights, attribute_count, vertex_lock, target_index_count, target_error, options, result_error);
 }

+template <typename T>
+inline size_t meshopt_simplifyWithUpdate(T* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error)
+{
+	meshopt_IndexAdapter<T> inout(indices, indices, index_count); // the same buffer is passed as both the adapter's result and its input, so indices is updated in place
+
+	return meshopt_simplifyWithUpdate(inout.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, vertex_attributes, vertex_attributes_stride, attribute_weights, attribute_count, vertex_lock, target_index_count, target_error, options, result_error); // all remaining arguments are forwarded unchanged
+}
+
 template <typename T>
 inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error)
 {
 	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
 	meshopt_IndexAdapter<T> out(destination, NULL, index_count);

-	return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_index_count, target_error, result_error);
+	return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, NULL, target_index_count, target_error, result_error);
+}
+
+template <typename T>
+inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count); // exposes the input indices as unsigned int through in.data
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count); // exposes an unsigned int output buffer through out.data, with destination as the adapter's result
+
+	return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, vertex_lock, target_index_count, target_error, result_error); // overload that forwards the caller's vertex_lock
+}
+
+template <typename T>
+inline size_t meshopt_simplifyPrune(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count); // exposes the input indices as unsigned int through in.data
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count); // exposes an unsigned int output buffer through out.data, with destination as the adapter's result
+
+	return meshopt_simplifyPrune(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_error); // remaining arguments are forwarded unchanged
 }

 template <typename T>
@@ -1007,11 +1332,19 @@ inline size_t meshopt_unstripify(T* destination, const T* indices, size_t index_
 }

 template <typename T>
-inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int buffer_size)
+inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size)
 {
 	meshopt_IndexAdapter<T> in(NULL, indices, index_count);

-	return meshopt_analyzeVertexCache(in.data, index_count, vertex_count, cache_size, warp_size, buffer_size);
+	return meshopt_analyzeVertexCache(in.data, index_count, vertex_count, cache_size, warp_size, primgroup_size);
+}
+
+template <typename T>
+inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count); // input-only adapter: exposes the indices as unsigned int through in.data
+
+	return meshopt_analyzeVertexFetch(in.data, index_count, vertex_count, vertex_size); // remaining arguments are forwarded unchanged
 }

 template <typename T>
@@ -1023,11 +1356,11 @@ inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size
 }

 template <typename T>
-inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size)
+inline meshopt_CoverageStatistics meshopt_analyzeCoverage(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
 {
 	meshopt_IndexAdapter<T> in(NULL, indices, index_count);

-	return meshopt_analyzeVertexFetch(in.data, index_count, vertex_count, vertex_size);
+	return meshopt_analyzeCoverage(in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
 }

 template <typename T>
@@ -1046,6 +1379,22 @@ inline size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int*
 	return meshopt_buildMeshletsScan(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_count, max_vertices, max_triangles);
 }

+template <typename T>
+inline size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count); // input-only adapter: exposes the indices as unsigned int through in.data
+
+	return meshopt_buildMeshletsFlex(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, min_triangles, max_triangles, cone_weight, split_factor); // output buffers and tuning parameters are forwarded unchanged
+}
+
+template <typename T>
+inline size_t meshopt_buildMeshletsSpatial(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count); // input-only adapter: exposes the indices as unsigned int through in.data
+
+	return meshopt_buildMeshletsSpatial(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, min_triangles, max_triangles, fill_weight); // output buffers and tuning parameters are forwarded unchanged
+}
+
 template <typename T>
 inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
 {
@@ -1054,6 +1403,14 @@ inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t inde
 	return meshopt_computeClusterBounds(in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
 }

+template <typename T>
+inline size_t meshopt_partitionClusters(unsigned int* destination, const T* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size)
+{
+	meshopt_IndexAdapter<T> in(NULL, cluster_indices, total_index_count); // adapter spans all clusters' indices at once (total_index_count elements)
+
+	return meshopt_partitionClusters(destination, in.data, total_index_count, cluster_index_counts, cluster_count, vertex_positions, vertex_count, vertex_positions_stride, target_partition_size); // destination is already unsigned int, so it needs no adapter
+}
+
 template <typename T>
 inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
 {
@@ -1065,7 +1422,7 @@ inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_
 #endif

 /**
- * Copyright (c) 2016-2024 Arseny Kapoulkine
+ * Copyright (c) 2016-2025 Arseny Kapoulkine
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
--- a/Source/ThirdParty/meshoptimizer/overdrawoptimizer.cpp
+++ b/Source/ThirdParty/meshoptimizer/overdrawoptimizer.cpp
@@ -10,24 +10,24 @@
 namespace meshopt
 {

-static void calculateSortData(float* sort_data, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_positions_stride, const unsigned int* clusters, size_t cluster_count)
+static void calculateSortData(float* sort_data, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned int* clusters, size_t cluster_count)
 {
 	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);

 	float mesh_centroid[3] = {};

-	for (size_t i = 0; i < index_count; ++i)
+	for (size_t i = 0; i < vertex_count; ++i)
 	{
-		const float* p = vertex_positions + vertex_stride_float * indices[i];
+		const float* p = vertex_positions + vertex_stride_float * i;

 		mesh_centroid[0] += p[0];
 		mesh_centroid[1] += p[1];
 		mesh_centroid[2] += p[2];
 	}

-	mesh_centroid[0] /= index_count;
-	mesh_centroid[1] /= index_count;
-	mesh_centroid[2] /= index_count;
+	mesh_centroid[0] /= float(vertex_count);
+	mesh_centroid[1] /= float(vertex_count);
+	mesh_centroid[2] /= float(vertex_count);

 	for (size_t cluster = 0; cluster < cluster_count; ++cluster)
 	{
@@ -306,7 +306,7 @@ void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* ind

 	// fill sort data
 	float* sort_data = allocator.allocate<float>(cluster_count);
-	calculateSortData(sort_data, indices, index_count, vertex_positions, vertex_positions_stride, clusters, cluster_count);
+	calculateSortData(sort_data, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride, clusters, cluster_count);

 	// sort clusters using sort data
 	unsigned short* sort_keys = allocator.allocate<unsigned short>(cluster_count);
--- a/Source/ThirdParty/meshoptimizer/partition.cpp
+++ b/Source/ThirdParty/meshoptimizer/partition.cpp
@@ -0,0 +1,624 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <math.h>
+#include <string.h>
+
+// This work is based on:
+// Takio Kurita. An efficient agglomerative clustering algorithm using a heap. 1991
+namespace meshopt
+{
+
+// To avoid excessive recursion for malformed inputs, we switch to bisection after some depth
+const int kMergeDepthCutoff = 40;
+
+struct ClusterAdjacency // flattened per-cluster neighbor lists, filled in by buildClusterAdjacency
+{
+	unsigned int* offsets; // cluster_count + 1 entries; entries of cluster i occupy [offsets[i], offsets[i + 1])
+	unsigned int* clusters; // neighboring cluster index for each entry
+	unsigned int* shared; // how many vertex references the two clusters have in common (edge weight)
+};
+
+static void filterClusterIndices(unsigned int* data, unsigned int* offsets, const unsigned int* cluster_indices, const unsigned int* cluster_index_counts, size_t cluster_count, unsigned char* used, size_t vertex_count, size_t total_index_count)
+{
+	(void)vertex_count; // both parameters are otherwise referenced only inside assert()
+	(void)total_index_count;
+	// Copies the concatenated per-cluster index lists into data, dropping repeats within each cluster; cluster i ends up in data[offsets[i]..offsets[i + 1])
+	size_t cluster_start = 0; // where the current cluster begins in cluster_indices
+	size_t cluster_write = 0; // next output slot in data
+
+	for (size_t i = 0; i < cluster_count; ++i)
+	{
+		offsets[i] = unsigned(cluster_write);
+
+		// copy cluster indices, skipping duplicates
+		for (size_t j = 0; j < cluster_index_counts[i]; ++j)
+		{
+			unsigned int v = cluster_indices[cluster_start + j];
+			assert(v < vertex_count);
+
+			data[cluster_write] = v; // stored unconditionally; the slot is reused by the next index if v turns out to be a repeat
+			cluster_write += 1 - used[v]; // advances by 1 for a new vertex and by 0 for a repeat, so used[] has to hold only 0 or 1
+			used[v] = 1;
+		}
+
+		// reset used flags for the next cluster
+		for (size_t j = offsets[i]; j < cluster_write; ++j)
+			used[data[j]] = 0;
+
+		cluster_start += cluster_index_counts[i];
+	}
+
+	assert(cluster_start == total_index_count);
+	assert(cluster_write <= total_index_count);
+	offsets[cluster_count] = unsigned(cluster_write); // closing entry: end of the last cluster
+}
+
+static float computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_positions_stride, float* out_center)
+{
+	// Returns the radius of a sphere centered at the average position of the referenced vertices; the center goes to out_center[0..2]
+	const size_t stride = vertex_positions_stride / sizeof(float);
+	float cx = 0, cy = 0, cz = 0;
+
+	// first pass: sum up the positions of all referenced vertices
+	for (size_t i = 0; i != index_count; ++i)
+	{
+		const float* pos = &vertex_positions[indices[i] * stride];
+		cx += pos[0];
+		cy += pos[1];
+		cz += pos[2];
+	}
+
+	// an empty index list keeps the all-zero center instead of dividing by zero
+	if (index_count != 0)
+	{
+		const float n = float(index_count);
+		cx /= n;
+		cy /= n;
+		cz /= n;
+	}
+
+	// second pass: track the largest squared distance from the center
+	float max_d2 = 0;
+	for (size_t i = 0; i != index_count; ++i)
+	{
+		const float* pos = &vertex_positions[indices[i] * stride];
+		const float ex = pos[0] - cx, ey = pos[1] - cy, ez = pos[2] - cz;
+		const float d2 = ex * ex + ey * ey + ez * ez;
+
+		if (max_d2 < d2)
+			max_d2 = d2;
+	}
+
+	out_center[0] = cx, out_center[1] = cy, out_center[2] = cz;
+	return sqrtf(max_d2);
+}
+
+static void buildClusterAdjacency(ClusterAdjacency& adjacency, const unsigned int* cluster_indices, const unsigned int* cluster_offsets, size_t cluster_count, size_t vertex_count, meshopt_Allocator& allocator)
+{ // fills adjacency with, for every cluster, the other clusters that reference at least one of its vertices and a count of those common references
+	unsigned int* ref_offsets = allocator.allocate<unsigned int>(vertex_count + 1); // one extra slot so the array can be shifted by one entry further down
+
+	// compute number of clusters referenced by each vertex
+	memset(ref_offsets, 0, vertex_count * sizeof(unsigned int));
+
+	for (size_t i = 0; i < cluster_count; ++i)
+	{
+		for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j)
+			ref_offsets[cluster_indices[j]]++;
+	}
+
+	// compute (worst-case) number of adjacent clusters for each cluster
+	size_t total_adjacency = 0;
+
+	for (size_t i = 0; i < cluster_count; ++i)
+	{
+		size_t count = 0;
+
+		// worst case is every vertex has a disjoint cluster list
+		for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j)
+			count += ref_offsets[cluster_indices[j]] - 1; // all references to this vertex except the one counted for cluster i itself
+
+		// ... but only every other cluster can be adjacent in the end
+		total_adjacency += count < cluster_count - 1 ? count : cluster_count - 1;
+	}
+
+	// we can now allocate adjacency buffers
+	adjacency.offsets = allocator.allocate<unsigned int>(cluster_count + 1);
+	adjacency.clusters = allocator.allocate<unsigned int>(total_adjacency);
+	adjacency.shared = allocator.allocate<unsigned int>(total_adjacency);
+
+	// convert ref counts to offsets
+	size_t total_refs = 0;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		size_t count = ref_offsets[i];
+		ref_offsets[i] = unsigned(total_refs);
+		total_refs += count;
+	}
+
+	unsigned int* ref_data = allocator.allocate<unsigned int>(total_refs);
+
+	// fill cluster refs for each vertex
+	for (size_t i = 0; i < cluster_count; ++i)
+	{
+		for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j)
+			ref_data[ref_offsets[cluster_indices[j]]++] = unsigned(i); // the post-increment moves that vertex's write position forward
+	}
+
+	// after the previous pass, ref_offsets contain the end of the data for each vertex; shift it forward to get the start
+	memmove(ref_offsets + 1, ref_offsets, vertex_count * sizeof(unsigned int));
+	ref_offsets[0] = 0;
+
+	// fill cluster adjacency for each cluster...
+	adjacency.offsets[0] = 0;
+
+	for (size_t i = 0; i < cluster_count; ++i)
+	{
+		unsigned int* adj = adjacency.clusters + adjacency.offsets[i];
+		unsigned int* shd = adjacency.shared + adjacency.offsets[i];
+		size_t count = 0; // number of distinct neighbors collected for cluster i so far
+
+		for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j)
+		{
+			unsigned int v = cluster_indices[j];
+
+			// merge the entire cluster list of each vertex into current list
+			for (size_t k = ref_offsets[v]; k < ref_offsets[v + 1]; ++k)
+			{
+				unsigned int c = ref_data[k];
+				assert(c < cluster_count);
+
+				if (c == unsigned(i))
+					continue; // a cluster is never listed as its own neighbor
+
+				// if the cluster is already in the list, increment the shared count
+				bool found = false;
+				for (size_t l = 0; l < count; ++l)
+					if (adj[l] == c)
+					{
+						found = true;
+						shd[l]++;
+						break;
+					}
+
+				// .. or append a new cluster
+				if (!found)
+				{
+					adj[count] = c;
+					shd[count] = 1;
+					count++;
+				}
+			}
+		}
+
+		// mark the end of the adjacency list; the next cluster will start there as well
+		adjacency.offsets[i + 1] = adjacency.offsets[i] + unsigned(count);
+	}
+
+	assert(adjacency.offsets[cluster_count] <= total_adjacency);
+
+	// ref_offsets can't be deallocated as it was allocated before adjacency
+	allocator.deallocate(ref_data);
+}
+
+struct ClusterGroup // one record per cluster; clusters are chained into groups through next, with the group-level data kept on the root record
+{
+	int group; // index of the root record of the group this cluster belongs to; -1 while the group is detached from merging
+	int next; // next cluster in the group's chain; -1 at the end
+	unsigned int size; // number of clusters in the group; 0 unless root
+	unsigned int vertices; // vertex count of the group (an estimate once groups have been merged)
+
+	float center[3]; // bounding sphere center; computed only when vertex positions are provided
+	float radius; // bounding sphere radius
+};
+
+struct GroupOrder // entry of the min-heap maintained by heapPush/heapPop
+{
+	unsigned int id; // index of the group's root record
+	int order; // heap key; entries with a smaller value are popped first
+};
+
+static void heapPush(GroupOrder* heap, size_t size, GroupOrder item)
+{
+	// Min-heap insert keyed on GroupOrder::order: heap[0..size) is a valid heap and slot [size] receives the new item
+	size_t pos = size;
+	heap[pos] = item;
+	// sift up while the element orders strictly before its parent
+	while (pos > 0)
+	{
+		size_t parent = (pos - 1) / 2;
+		if (!(heap[pos].order < heap[parent].order))
+			break;
+		GroupOrder held = heap[parent];
+		heap[parent] = heap[pos];
+		heap[pos] = held;
+		pos = parent;
+	}
+}
+
+static GroupOrder heapPop(GroupOrder* heap, size_t size)
+{
+	// Removes and returns the entry with the smallest order from the min-heap heap[0..size)
+	assert(size > 0);
+	const GroupOrder result = heap[0];
+
+	// the last element takes over the root; the heap now spans [0, last)
+	const size_t last = size - 1;
+	heap[0] = heap[last];
+
+	// sift down until neither child orders strictly before the current node
+	for (size_t node = 0, child = 1; child < last; child = node * 2 + 1)
+	{
+		// step to the right sibling only when it is strictly smaller than the left one
+		if (child + 1 < last && heap[child + 1].order < heap[child].order)
+			child += 1;
+
+		// the smaller child does not order before the node: heap property holds
+		if (!(heap[child].order < heap[node].order))
+			break;
+
+		GroupOrder held = heap[node];
+		heap[node] = heap[child];
+		heap[child] = held;
+		node = child;
+	}
+
+	return result;
+}
+
+static unsigned int countShared(const ClusterGroup* groups, int group1, int group2, const ClusterAdjacency& adjacency)
+{
+	// Adds up the shared weight of every adjacency entry that links a cluster of group1 to a cluster of group2
+	unsigned int sum = 0;
+	for (int a = group1; a >= 0; a = groups[a].next)
+		for (int b = group2; b >= 0; b = groups[b].next)
+		{
+			// look for b in a's neighbor list; only the first match contributes
+			unsigned int e = adjacency.offsets[a];
+			while (e < adjacency.offsets[a + 1] && adjacency.clusters[e] != unsigned(b))
+				++e;
+			if (e < adjacency.offsets[a + 1])
+				sum += adjacency.shared[e];
+		}
+
+	return sum;
+}
+
+static void mergeBounds(ClusterGroup& target, const ClusterGroup& source)
+{ // grows target's bounding sphere (center, radius) so that it also encloses source's sphere
+	float r1 = target.radius, r2 = source.radius;
+	float dx = source.center[0] - target.center[0], dy = source.center[1] - target.center[1], dz = source.center[2] - target.center[2];
+	float d = sqrtf(dx * dx + dy * dy + dz * dz); // distance between the two centers
+	// target's sphere lies strictly inside source's sphere: take over source's sphere as is
+	if (d + r1 < r2)
+	{
+		target.center[0] = source.center[0];
+		target.center[1] = source.center[1];
+		target.center[2] = source.center[2];
+		target.radius = source.radius;
+		return;
+	}
+	// source's sphere sticks out of target's; when this test fails, target already encloses source and stays unchanged
+	if (d + r2 > r1)
+	{
+		float k = d > 0 ? (d + r2 - r1) / (2 * d) : 0.f; // fraction of the center-to-center offset to move by; 0 when the centers coincide
+
+		target.center[0] += dx * k;
+		target.center[1] += dy * k;
+		target.center[2] += dz * k;
+		target.radius = (d + r2 + r1) / 2; // half of the total span of both spheres along the line through their centers
+	}
+}
+
+static float boundsScore(const ClusterGroup& target, const ClusterGroup& source)
+{
+	float r1 = target.radius, r2 = source.radius; // the score compares target's radius with the radius of the sphere enclosing both groups
+	float ox = source.center[0] - target.center[0], oy = source.center[1] - target.center[1], oz = source.center[2] - target.center[2];
+	float dist = sqrtf(ox * ox + oy * oy + oz * oz);
+	float merged = r2; // enclosing radius; stays r2 when target's sphere sits strictly inside source's
+	if (!(dist + r1 < r2))
+		merged = dist + r2 < r1 ? r1 : (dist + r2 + r1) / 2; // source inside target, otherwise half the combined span
+	return merged > 0 ? r1 / merged : 0.f; // 0 when the enclosing radius is not positive
+}
+
+static int pickGroupToMerge(const ClusterGroup* groups, int id, const ClusterAdjacency& adjacency, size_t max_partition_size, bool use_bounds)
+{ // returns the root index of the adjacent group with the highest merge score for group id, or -1 when no candidate qualifies
+	assert(groups[id].size > 0);
+
+	float group_rsqrt = 1.f / sqrtf(float(int(groups[id].vertices)));
+
+	int best_group = -1;
+	float best_score = 0; // a candidate has to score strictly above 0 to be picked
+
+	for (int ci = id; ci >= 0; ci = groups[ci].next)
+	{
+		for (unsigned int adj = adjacency.offsets[ci]; adj != adjacency.offsets[ci + 1]; ++adj)
+		{
+			int other = groups[adjacency.clusters[adj]].group;
+			if (other < 0)
+				continue; // negative group index: the neighboring cluster is not attached to a mergeable group
+
+			assert(groups[other].size > 0);
+			if (groups[id].size + groups[other].size > max_partition_size)
+				continue; // the merged group would exceed the size cap
+
+			unsigned int shared = countShared(groups, id, other, adjacency);
+			float other_rsqrt = 1.f / sqrtf(float(int(groups[other].vertices)));
+
+			// normalize shared count by the expected boundary of each group (+ keeps scoring symmetric)
+			float score = float(int(shared)) * (group_rsqrt + other_rsqrt);
+
+			// incorporate spatial score to favor merging nearby groups
+			if (use_bounds)
+				score *= 1.f + 0.4f * boundsScore(groups[id], groups[other]);
+
+			if (score > best_score)
+			{
+				best_group = other;
+				best_score = score;
+			}
+		}
+	}
+
+	return best_group;
+}
+
+static void mergeLeaf(ClusterGroup* groups, unsigned int* order, size_t count, size_t target_partition_size, size_t max_partition_size)
+{ // for every group in order[0..count) below the target size, merges it into the listed group with the highest boundsScore that keeps the combined size within max_partition_size
+	for (size_t i = 0; i < count; ++i)
+	{
+		unsigned int id = order[i];
+		if (groups[id].size == 0 || groups[id].size >= target_partition_size)
+			continue; // already merged into another group, or large enough
+
+		float best_score = -1.f; // starts below 0 so that a candidate scoring 0 still qualifies
+		int best_group = -1;
+
+		for (size_t j = 0; j < count; ++j)
+		{
+			unsigned int other = order[j];
+			if (id == other || groups[other].size == 0)
+				continue;
+
+			if (groups[id].size + groups[other].size > max_partition_size)
+				continue;
+
+			// favor merging nearby groups
+			float score = boundsScore(groups[id], groups[other]);
+
+			if (score > best_score)
+			{
+				best_score = score;
+				best_group = other;
+			}
+		}
+
+		// merge id *into* best_group; that way, we may merge more groups into the same best_group, maximizing the chance of reaching target
+		if (best_group != -1)
+		{
+			// combine groups by linking them together
+			unsigned int tail = best_group;
+			while (groups[tail].next >= 0)
+				tail = groups[tail].next;
+
+			groups[tail].next = id; // id's own chain stays attached behind it
+
+			// update group sizes; note, we omit vertices update for simplicity as it's not used for spatial merge
+			groups[best_group].size += groups[id].size;
+			groups[id].size = 0; // size 0 marks id as no longer being a root
+
+			// merge bounding spheres
+			mergeBounds(groups[best_group], groups[id]);
+			groups[id].radius = 0.f;
+		}
+	}
+}
+
+static size_t mergePartition(unsigned int* order, size_t count, const ClusterGroup* groups, int axis, float pivot)
+{
+	// Reorders order[0..count) so that ids whose center[axis] is < pivot come first; returns how many of them there are
+	size_t split = 0;
+
+	for (size_t i = 0; i != count; ++i)
+	{
+		const unsigned int id = order[i];
+		const bool below = groups[id].center[axis] < pivot;
+
+		// exchange slots split and i regardless of the test outcome (a no-op when they coincide);
+		// the boundary moves past the element only when it belongs to the "< pivot" side
+		order[i] = order[split];
+		order[split] = id;
+		if (below)
+			++split;
+	}
+
+	return split;
+}
+
+static void mergeSpatial(ClusterGroup* groups, unsigned int* order, size_t count, size_t target_partition_size, size_t max_partition_size, size_t leaf_size, int depth)
+{ // recursively splits the groups listed in order[0..count) by their centers and hands sufficiently small subsets to mergeLeaf
+	size_t total = 0;
+	for (size_t i = 0; i < count; ++i)
+		total += groups[order[i]].size;
+
+	if (total <= max_partition_size || count <= leaf_size)
+		return mergeLeaf(groups, order, count, target_partition_size, max_partition_size);
+
+	float mean[3] = {};
+	float vars[3] = {}; // running sums of squared deviations; never divided by the count since only their relative size is compared
+	float runc = 1, runs = 1; // 1-based sample counter and its reciprocal, both updated in the loop header
+
+	// gather statistics on the points in the subtree using Welford's algorithm
+	for (size_t i = 0; i < count; ++i, runc += 1.f, runs = 1.f / runc)
+	{
+		const float* point = groups[order[i]].center;
+
+		for (int k = 0; k < 3; ++k)
+		{
+			float delta = point[k] - mean[k];
+			mean[k] += delta * runs;
+			vars[k] += delta * (point[k] - mean[k]);
+		}
+	}
+
+	// split axis is one where the variance is largest
+	int axis = (vars[0] >= vars[1] && vars[0] >= vars[2]) ? 0 : (vars[1] >= vars[2] ? 1 : 2);
+
+	float split = mean[axis];
+	size_t middle = mergePartition(order, count, groups, axis, split);
+
+	// enforce balance for degenerate partitions
+	// this also ensures recursion depth is bounded on pathological inputs
+	if (middle <= leaf_size / 2 || count - middle <= leaf_size / 2 || depth >= kMergeDepthCutoff)
+		middle = count / 2; // plain halving of the current order
+
+	// recursion depth is logarithmic and bounded due to max depth check above
+	mergeSpatial(groups, order, middle, target_partition_size, max_partition_size, leaf_size, depth + 1);
+	mergeSpatial(groups, order + middle, count - middle, target_partition_size, max_partition_size, leaf_size, depth + 1);
+}
+
+} // namespace meshopt
+
+size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size)
+{ // writes a partition id for every cluster into destination[0..cluster_count) and returns the number of partitions; vertex_positions may be NULL
+	using namespace meshopt;
+
+	assert((vertex_positions == NULL || vertex_positions_stride >= 12) && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+	assert(target_partition_size > 0);
+
+	size_t max_partition_size = target_partition_size + target_partition_size / 3; // merges may take a group up to a third above the target
+
+	meshopt_Allocator allocator;
+
+	unsigned char* used = allocator.allocate<unsigned char>(vertex_count);
+	memset(used, 0, vertex_count);
+
+	unsigned int* cluster_newindices = allocator.allocate<unsigned int>(total_index_count);
+	unsigned int* cluster_offsets = allocator.allocate<unsigned int>(cluster_count + 1);
+
+	// make new cluster index list that filters out duplicate indices
+	filterClusterIndices(cluster_newindices, cluster_offsets, cluster_indices, cluster_index_counts, cluster_count, used, vertex_count, total_index_count);
+	cluster_indices = cluster_newindices; // from here on only the filtered list is used
+
+	// build cluster adjacency along with edge weights (shared vertex count)
+	ClusterAdjacency adjacency = {};
+	buildClusterAdjacency(adjacency, cluster_indices, cluster_offsets, cluster_count, vertex_count, allocator);
+
+	ClusterGroup* groups = allocator.allocate<ClusterGroup>(cluster_count);
+	memset(groups, 0, sizeof(ClusterGroup) * cluster_count);
+
+	GroupOrder* order = allocator.allocate<GroupOrder>(cluster_count);
+	size_t pending = 0; // number of entries currently in the heap
+
+	// create a singleton group for each cluster and order them by priority
+	for (size_t i = 0; i < cluster_count; ++i)
+	{
+		groups[i].group = int(i);
+		groups[i].next = -1;
+		groups[i].size = 1;
+		groups[i].vertices = cluster_offsets[i + 1] - cluster_offsets[i];
+		assert(groups[i].vertices > 0);
+
+		// compute bounding sphere for each cluster if positions are provided
+		if (vertex_positions)
+			groups[i].radius = computeClusterBounds(cluster_indices + cluster_offsets[i], cluster_offsets[i + 1] - cluster_offsets[i], vertex_positions, vertex_positions_stride, groups[i].center);
+
+		GroupOrder item = {};
+		item.id = unsigned(i);
+		item.order = groups[i].vertices; // groups with fewer vertices are popped first
+
+		heapPush(order, pending++, item); // heapPush receives the size before the insertion
+	}
+
+	// iteratively merge the smallest group with the best group
+	while (pending)
+	{
+		GroupOrder top = heapPop(order, pending--); // heapPop receives the size before the removal
+
+		// this group was merged into another group earlier
+		if (groups[top.id].size == 0)
+			continue;
+
+		// disassociate clusters from the group to prevent them from being merged again; we will re-associate them if the group is reinserted
+		for (int i = top.id; i >= 0; i = groups[i].next)
+		{
+			assert(groups[i].group == int(top.id));
+			groups[i].group = -1;
+		}
+
+		// the group is large enough, emit as is
+		if (groups[top.id].size >= target_partition_size)
+			continue;
+
+		int best_group = pickGroupToMerge(groups, top.id, adjacency, max_partition_size, /* use_bounds= */ vertex_positions);
+
+		// we can't grow the group any more, emit as is
+		if (best_group == -1)
+			continue;
+
+		// compute shared vertices to adjust the total vertices estimate after merging
+		unsigned int shared = countShared(groups, top.id, best_group, adjacency);
+
+		// combine groups by linking them together
+		unsigned int tail = top.id;
+		while (groups[tail].next >= 0)
+			tail = groups[tail].next;
+
+		groups[tail].next = best_group;
+
+		// update group sizes; note, the vertex update is a O(1) approximation which avoids recomputing the true size
+		groups[top.id].size += groups[best_group].size;
+		groups[top.id].vertices += groups[best_group].vertices;
+		groups[top.id].vertices = (groups[top.id].vertices > shared) ? groups[top.id].vertices - shared : 1; // clamped so the estimate never drops below 1
+
+		groups[best_group].size = 0;
+		groups[best_group].vertices = 0;
+
+		// merge bounding spheres if bounds are available
+		if (vertex_positions)
+		{
+			mergeBounds(groups[top.id], groups[best_group]);
+			groups[best_group].radius = 0;
+		}
+
+		// re-associate all clusters back to the merged group
+		for (int i = top.id; i >= 0; i = groups[i].next)
+			groups[i].group = int(top.id);
+
+		top.order = groups[top.id].vertices;
+		heapPush(order, pending++, top);
+	}
+
+	// if vertex positions are provided, we do a final pass to see if we can merge small groups based on spatial locality alone
+	if (vertex_positions)
+	{
+		unsigned int* merge_order = reinterpret_cast<unsigned int*>(order); // reuses the heap storage, which is empty once the loop above has finished
+		size_t merge_offset = 0;
+
+		for (size_t i = 0; i < cluster_count; ++i)
+			if (groups[i].size)
+				merge_order[merge_offset++] = unsigned(i);
+
+		mergeSpatial(groups, merge_order, merge_offset, target_partition_size, max_partition_size, /* leaf_size= */ 8, 0);
+	}
+
+	// output each remaining group
+	size_t next_group = 0;
+
+	for (size_t i = 0; i < cluster_count; ++i)
+	{
+		if (groups[i].size == 0)
+			continue;
+
+		for (int j = int(i); j >= 0; j = groups[j].next)
+			destination[j] = unsigned(next_group); // every cluster chained to root i gets the same partition id
+
+		next_group++;
+	}
+
+	assert(next_group <= cluster_count);
+	return next_group;
+}
--- a/Source/ThirdParty/meshoptimizer/overdrawanalyzer.cpp
+++ b/Source/ThirdParty/meshoptimizer/overdrawanalyzer.cpp
@@ -18,14 +18,6 @@ struct OverdrawBuffer
 	unsigned int overdraw[kViewport][kViewport][2];
 };

-#ifndef min
-#define min(a, b) ((a) < (b) ? (a) : (b))
-#endif
-
-#ifndef max
-#define max(a, b) ((a) > (b) ? (a) : (b))
-#endif
-
 static float computeDepthGradients(float& dzdx, float& dzdy, float x1, float y1, float z1, float x2, float y2, float z2, float x3, float y3, float z3)
 {
 	// z2 = z1 + dzdx * (x2 - x1) + dzdy * (y2 - y1)
@@ -36,8 +28,8 @@ static float computeDepthGradients(float& dzdx, float& dzdy, float x1, float y1,
 	float det = (x2 - x1) * (y3 - y1) - (y2 - y1) * (x3 - x1);
 	float invdet = (det == 0) ? 0 : 1 / det;

-	dzdx = (z2 - z1) * (y3 - y1) - (y2 - y1) * (z3 - z1) * invdet;
-	dzdy = (x2 - x1) * (z3 - z1) - (z2 - z1) * (x3 - x1) * invdet;
+	dzdx = ((z2 - z1) * (y3 - y1) - (y2 - y1) * (z3 - z1)) * invdet;
+	dzdy = ((x2 - x1) * (z3 - z1) - (z2 - z1) * (x3 - x1)) * invdet;

 	return det;
 }
@@ -76,11 +68,26 @@ static void rasterize(OverdrawBuffer* buffer, float v1x, float v1y, float v1z, f
 	// bounding rectangle, clipped against viewport
 	// since we rasterize pixels with covered centers, min >0.5 should round up
 	// as for max, due to top-left filling convention we will never rasterize right/bottom edges
-	// so max >= 0.5 should round down
-	int minx = max((min(X1, min(X2, X3)) + 7) >> 4, 0);
-	int maxx = min((max(X1, max(X2, X3)) + 7) >> 4, kViewport);
-	int miny = max((min(Y1, min(Y2, Y3)) + 7) >> 4, 0);
-	int maxy = min((max(Y1, max(Y2, Y3)) + 7) >> 4, kViewport);
+	// so max >= 0.5 should round down for inclusive bounds, and up for exclusive (in our case)
+	int minx = X1 < X2 ? X1 : X2;
+	minx = minx < X3 ? minx : X3;
+	minx = (minx + 7) >> 4;
+	minx = minx < 0 ? 0 : minx;
+
+	int miny = Y1 < Y2 ? Y1 : Y2;
+	miny = miny < Y3 ? miny : Y3;
+	miny = (miny + 7) >> 4;
+	miny = miny < 0 ? 0 : miny;
+
+	int maxx = X1 > X2 ? X1 : X2;
+	maxx = maxx > X3 ? maxx : X3;
+	maxx = (maxx + 7) >> 4;
+	maxx = maxx > kViewport ? kViewport : maxx;
+
+	int maxy = Y1 > Y2 ? Y1 : Y2;
+	maxy = maxy > Y3 ? maxy : Y3;
+	maxy = (maxy + 7) >> 4;
+	maxy = maxy > kViewport ? kViewport : maxy;

 	// deltas, 28.4 fixed point
 	int DX12 = X1 - X2;
@@ -139,22 +146,10 @@ static void rasterize(OverdrawBuffer* buffer, float v1x, float v1y, float v1z, f
 	}
 }

-} // namespace meshopt
-
-meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+static float transformTriangles(float* triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
 {
-	using namespace meshopt;
-
-	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
-	assert(vertex_positions_stride % sizeof(float) == 0);
-
-	meshopt_Allocator allocator;
-
 	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);

-	meshopt_OverdrawStatistics result = {};
-
 	float minv[3] = {FLT_MAX, FLT_MAX, FLT_MAX};
 	float maxv[3] = {-FLT_MAX, -FLT_MAX, -FLT_MAX};

@@ -164,15 +159,20 @@ meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices,

 		for (int j = 0; j < 3; ++j)
 		{
-			minv[j] = min(minv[j], v[j]);
-			maxv[j] = max(maxv[j], v[j]);
+			float vj = v[j];
+
+			minv[j] = minv[j] > vj ? vj : minv[j];
+			maxv[j] = maxv[j] < vj ? vj : maxv[j];
 		}
 	}

-	float extent = max(maxv[0] - minv[0], max(maxv[1] - minv[1], maxv[2] - minv[2]));
-	float scale = kViewport / extent;
+	float extent = 0.f;

-	float* triangles = allocator.allocate<float>(index_count * 3);
+	extent = (maxv[0] - minv[0]) < extent ? extent : (maxv[0] - minv[0]);
+	extent = (maxv[1] - minv[1]) < extent ? extent : (maxv[1] - minv[1]);
+	extent = (maxv[2] - minv[2]) < extent ? extent : (maxv[2] - minv[2]);
+
+	float scale = kViewport / extent;

 	for (size_t i = 0; i < index_count; ++i)
 	{
@@ -186,31 +186,55 @@ meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices,
 		triangles[i * 3 + 2] = (v[2] - minv[2]) * scale;
 	}

+	return extent;
+}
+
+static void rasterizeTriangles(OverdrawBuffer* buffer, const float* triangles, size_t index_count, int axis)
+{
+	// triangles stores 3 floats per index, so each group of 3 indices (9 floats) is one triangle
+	for (size_t first = 0; first < index_count; first += 3)
+	{
+		const float* a = triangles + 3 * first;
+		const float* b = a + 3;
+		const float* c = a + 6;
+
+		// each axis value permutes the vertex components differently:
+		// the component whose index equals axis is passed last within every vertex triple,
+		// and an axis value other than 0, 1 or 2 rasterizes nothing
+
+		if (axis == 0)
+			rasterize(buffer, a[2], a[1], a[0], b[2], b[1], b[0], c[2], c[1], c[0]);
+		else if (axis == 1)
+			rasterize(buffer, a[0], a[2], a[1], b[0], b[2], b[1], c[0], c[2], c[1]);
+		else if (axis == 2)
+			rasterize(buffer, a[1], a[0], a[2], b[1], b[0], b[2], c[1], c[0], c[2]);
+
+	}
+}
+
+} // namespace meshopt
+
+meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	meshopt_Allocator allocator;
+
+	meshopt_OverdrawStatistics result = {};
+
+	float* triangles = allocator.allocate<float>(index_count * 3);
+	transformTriangles(triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride);
+
 	OverdrawBuffer* buffer = allocator.allocate<OverdrawBuffer>(1);

 	for (int axis = 0; axis < 3; ++axis)
 	{
 		memset(buffer, 0, sizeof(OverdrawBuffer));
-
-		for (size_t i = 0; i < index_count; i += 3)
-		{
-			const float* vn0 = &triangles[3 * (i + 0)];
-			const float* vn1 = &triangles[3 * (i + 1)];
-			const float* vn2 = &triangles[3 * (i + 2)];
-
-			switch (axis)
-			{
-			case 0:
-				rasterize(buffer, vn0[2], vn0[1], vn0[0], vn1[2], vn1[1], vn1[0], vn2[2], vn2[1], vn2[0]);
-				break;
-			case 1:
-				rasterize(buffer, vn0[0], vn0[2], vn0[1], vn1[0], vn1[2], vn1[1], vn2[0], vn2[2], vn2[1]);
-				break;
-			case 2:
-				rasterize(buffer, vn0[1], vn0[0], vn0[2], vn1[1], vn1[0], vn1[2], vn2[1], vn2[0], vn2[2]);
-				break;
-			}
-		}
+		rasterizeTriangles(buffer, triangles, index_count, axis);

 		for (int y = 0; y < kViewport; ++y)
 			for (int x = 0; x < kViewport; ++x)
@@ -227,3 +251,39 @@ meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices,

 	return result;
 }
+
+meshopt_CoverageStatistics meshopt_analyzeCoverage(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{ // rasterizes the mesh once per axis and reports, for each axis, the fraction of viewport pixels touched, plus the extent returned by transformTriangles
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	meshopt_Allocator allocator;
+
+	meshopt_CoverageStatistics result = {};
+
+	float* triangles = allocator.allocate<float>(index_count * 3); // 3 floats for every index
+	float extent = transformTriangles(triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride);
+
+	OverdrawBuffer* buffer = allocator.allocate<OverdrawBuffer>(1);
+
+	for (int axis = 0; axis < 3; ++axis)
+	{
+		memset(buffer, 0, sizeof(OverdrawBuffer)); // start every axis from an empty buffer
+		rasterizeTriangles(buffer, triangles, index_count, axis);
+
+		unsigned int covered = 0;
+
+		for (int y = 0; y < kViewport; ++y)
+			for (int x = 0; x < kViewport; ++x)
+				covered += (buffer->overdraw[y][x][0] | buffer->overdraw[y][x][1]) > 0; // a pixel counts once when either of its two counters is non-zero
+
+		result.coverage[axis] = float(covered) / float(kViewport * kViewport);
+	}
+
+	result.extent = extent;
+
+	return result;
+}
--- a/Source/ThirdParty/meshoptimizer/simplifier.cpp
+++ b/Source/ThirdParty/meshoptimizer/simplifier.cpp
--- a/Source/ThirdParty/meshoptimizer/spatialorder.cpp
+++ b/Source/ThirdParty/meshoptimizer/spatialorder.cpp
@@ -10,18 +10,19 @@
 namespace meshopt
 {

-// "Insert" two 0 bits after each of the 10 low bits of x
-inline unsigned int part1By2(unsigned int x)
+// "Insert" two 0 bits after each of the 20 low bits of x
+inline unsigned long long part1By2(unsigned long long x)
 {
-	x &= 0x000003ff;                  // x = ---- ---- ---- ---- ---- --98 7654 3210
-	x = (x ^ (x << 16)) & 0xff0000ff; // x = ---- --98 ---- ---- ---- ---- 7654 3210
-	x = (x ^ (x << 8)) & 0x0300f00f;  // x = ---- --98 ---- ---- 7654 ---- ---- 3210
-	x = (x ^ (x << 4)) & 0x030c30c3;  // x = ---- --98 ---- 76-- --54 ---- 32-- --10
-	x = (x ^ (x << 2)) & 0x09249249;  // x = ---- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
+	x &= 0x000fffffull;                          // x = ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- jihg fedc ba98 7654 3210
+	x = (x ^ (x << 32)) & 0x000f00000000ffffull; // x = ---- ---- ---- jihg ---- ---- ---- ---- ---- ---- ---- ---- fedc ba98 7654 3210
+	x = (x ^ (x << 16)) & 0x000f0000ff0000ffull; // x = ---- ---- ---- jihg ---- ---- ---- ---- fedc ba98 ---- ---- ---- ---- 7654 3210
+	x = (x ^ (x << 8)) & 0x000f00f00f00f00full;  // x = ---- ---- ---- jihg ---- ---- fedc ---- ---- ba98 ---- ---- 7654 ---- ---- 3210
+	x = (x ^ (x << 4)) & 0x00c30c30c30c30c3ull;  // x = ---- ---- ji-- --hg ---- fe-- --dc ---- ba-- --98 ---- 76-- --54 ---- 32-- --10
+	x = (x ^ (x << 2)) & 0x0249249249249249ull;  // x = ---- --j- -i-- h--g --f- -e-- d--c --b- -a-- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
 	return x;
 }

-static void computeOrder(unsigned int* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride)
+static void computeOrder(unsigned long long* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, bool morton)
 {
 	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);

@@ -47,66 +48,171 @@ static void computeOrder(unsigned int* result, const float* vertex_positions_dat
 	extent = (maxv[1] - minv[1]) < extent ? extent : (maxv[1] - minv[1]);
 	extent = (maxv[2] - minv[2]) < extent ? extent : (maxv[2] - minv[2]);

-	float scale = extent == 0 ? 0.f : 1.f / extent;
+	// rescale each axis to 16 bits to get 48-bit Morton codes
+	float scale = extent == 0 ? 0.f : 65535.f / extent;

 	// generate Morton order based on the position inside a unit cube
 	for (size_t i = 0; i < vertex_count; ++i)
 	{
 		const float* v = vertex_positions_data + i * vertex_stride_float;

-		int x = int((v[0] - minv[0]) * scale * 1023.f + 0.5f);
-		int y = int((v[1] - minv[1]) * scale * 1023.f + 0.5f);
-		int z = int((v[2] - minv[2]) * scale * 1023.f + 0.5f);
+		int x = int((v[0] - minv[0]) * scale + 0.5f);
+		int y = int((v[1] - minv[1]) * scale + 0.5f);
+		int z = int((v[2] - minv[2]) * scale + 0.5f);

-		result[i] = part1By2(x) | (part1By2(y) << 1) | (part1By2(z) << 2);
+		if (morton)
+			result[i] = part1By2(x) | (part1By2(y) << 1) | (part1By2(z) << 2);
+		else
+			result[i] = ((unsigned long long)x << 0) | ((unsigned long long)y << 20) | ((unsigned long long)z << 40);
 	}
 }

-static void computeHistogram(unsigned int (&hist)[1024][3], const unsigned int* data, size_t count)
+static void radixSort10(unsigned int* destination, const unsigned int* source, const unsigned short* keys, size_t count) // one counting-sort pass: writes the indices from source into destination ordered by keys[index]
 {
+	unsigned int hist[1024]; // one bucket per possible 10-bit key value
 	memset(hist, 0, sizeof(hist));
 
-	// compute 3 10-bit histograms in parallel
+	// compute histogram; every key must be below 1024 (10-bit) because it indexes hist directly
 	for (size_t i = 0; i < count; ++i)
-	{
-		unsigned int id = data[i];
+		hist[keys[i]]++;
 
-		hist[(id >> 0) & 1023][0]++;
-		hist[(id >> 10) & 1023][1]++;
-		hist[(id >> 20) & 1023][2]++;
-	}
-
-	unsigned int sumx = 0, sumy = 0, sumz = 0;
+	unsigned int sum = 0;
 
 	// replace histogram data with prefix histogram sums in-place
 	for (int i = 0; i < 1024; ++i)
 	{
-		unsigned int hx = hist[i][0], hy = hist[i][1], hz = hist[i][2];
-
-		hist[i][0] = sumx;
-		hist[i][1] = sumy;
-		hist[i][2] = sumz;
-
-		sumx += hx;
-		sumy += hy;
-		sumz += hz;
+		unsigned int h = hist[i];
+		hist[i] = sum; // hist[i] now holds the first output slot for key value i
+		sum += h;
 	}
 
-	assert(sumx == count && sumy == count && sumz == count);
+	assert(sum == count);
+
+	// scatter indices into destination by key; source is walked front to back, so equal keys keep their source order
+	for (size_t i = 0; i < count; ++i)
+	{
+		unsigned int id = keys[source[i]];
+
+		destination[hist[id]++] = source[i];
+	}
 }

-static void radixPass(unsigned int* destination, const unsigned int* source, const unsigned int* keys, size_t count, unsigned int (&hist)[1024][3], int pass)
+static void computeHistogram(unsigned int (&hist)[256][2], const unsigned short* data, size_t count) // builds the per-byte start offsets consumed by the two radixPass calls over 16-bit keys
 {
-	int bitoff = pass * 10;
+	memset(hist, 0, sizeof(hist));
+
+	// compute 2 8-bit histograms in parallel: column 0 counts the low byte of each key, column 1 the high byte
+	for (size_t i = 0; i < count; ++i)
+	{
+		unsigned long long id = data[i];
+
+		hist[(id >> 0) & 255][0]++;
+		hist[(id >> 8) & 255][1]++;
+	}
+
+	unsigned int sum0 = 0, sum1 = 0;
+
+	// replace histogram data with prefix histogram sums in-place, so hist[v][p] becomes the first output slot for byte value v in pass p
+	for (int i = 0; i < 256; ++i)
+	{
+		unsigned int h0 = hist[i][0], h1 = hist[i][1];
+
+		hist[i][0] = sum0;
+		hist[i][1] = sum1;
+
+		sum0 += h0;
+		sum1 += h1;
+	}
+
+	assert(sum0 == count && sum1 == count); // every key contributes exactly once to each column
+}
+
+static void radixPass(unsigned int* destination, const unsigned int* source, const unsigned short* keys, size_t count, unsigned int (&hist)[256][2], int pass) // scatters the indices in source into destination by one byte of their key; advances the offsets in hist as it goes
+{
+	int bitoff = pass * 8; // pass 0 selects the low byte of the key, pass 1 the high byte
 
 	for (size_t i = 0; i < count; ++i)
 	{
-		unsigned int id = (keys[source[i]] >> bitoff) & 1023;
+		unsigned int id = unsigned(keys[source[i]] >> bitoff) & 255; // key is looked up by the index stored in source, not by position
 
 		destination[hist[id][pass]++] = source[i];
 	}
 }

+static void partitionPoints(unsigned int* target, const unsigned int* order, const unsigned char* sides, size_t split, size_t count) // order-preserving two-way partition of order into target, driven by the per-point flags in sides
+{
+	size_t l = 0, r = split; // next write slots: side 0 fills target from 0, side 1 fills target from split
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		unsigned char side = sides[order[i]]; // flag is looked up by point index, not by position in order
+		target[side ? r : l] = order[i];
+		l += 1; // together with the next line: l advances only when side == 0
+		l -= side;
+		r += side; // r advances only when side == 1
+	}
+
+	assert(l == split && r == count); // holds only if exactly split of the visited points carry side == 0
+}
+
+static void splitPoints(unsigned int* destination, unsigned int* orderx, unsigned int* ordery, unsigned int* orderz, const unsigned long long* keys, size_t count, void* scratch, size_t cluster_size) // recursively halves the point range along its widest axis until each piece has at most cluster_size points
+{
+	if (count <= cluster_size) // small enough to stop splitting: emit this range's points in their x order
+	{
+		memcpy(destination, orderx, count * sizeof(unsigned int));
+		return;
+	}
+
+	unsigned int* axes[3] = {orderx, ordery, orderz};
+
+	int bestk = -1; // axis with the largest extent over this range
+	unsigned int bestdim = 0;
+
+	for (int k = 0; k < 3; ++k)
+	{
+		const unsigned int mask = (1 << 20) - 1; // each axis occupies its own 20-bit field of the key
+		unsigned int dim = (unsigned(keys[axes[k][count - 1]] >> (k * 20)) & mask) - (unsigned(keys[axes[k][0]] >> (k * 20)) & mask); // coordinate of the last minus the first entry of axis k's order; presumably each order is sorted by its axis - confirm at the caller
+
+		if (dim >= bestdim) // >= picks an axis even when every extent is 0; ties go to the later axis
+		{
+			bestk = k;
+			bestdim = dim;
+		}
+	}
+
+	assert(bestk >= 0);
+
+	// split roughly in half, with the left split always being aligned to cluster size
+	size_t split = ((count / 2) + cluster_size - 1) / cluster_size * cluster_size;
+	assert(split > 0 && split < count);
+
+	// mark sides of split for partitioning; the flags live in scratch right after the count-sized index buffer used as temp below
+	unsigned char* sides = static_cast<unsigned char*>(scratch) + count * sizeof(unsigned int);
+
+	for (size_t i = 0; i < split; ++i)
+		sides[axes[bestk][i]] = 0;
+
+	for (size_t i = split; i < count; ++i)
+		sides[axes[bestk][i]] = 1;
+
+	// partition all axes into two sides, maintaining order (the split axis is skipped: its first split entries already form the left side)
+	unsigned int* temp = static_cast<unsigned int*>(scratch);
+
+	for (int k = 0; k < 3; ++k)
+	{
+		if (k == bestk)
+			continue;
+
+		unsigned int* axis = axes[k];
+		memcpy(temp, axis, sizeof(unsigned int) * count);
+		partitionPoints(axis, temp, sides, split, count);
+	}
+
+	// recursion depth is logarithmic and bounded as we always split in approximately half
+	splitPoints(destination, orderx, ordery, orderz, keys, split, scratch, cluster_size);
+	splitPoints(destination + split, orderx + split, ordery + split, orderz + split, keys, count - split, scratch, cluster_size);
+}
+
 } // namespace meshopt

 void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
@@ -118,21 +224,26 @@ void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_pos

 	meshopt_Allocator allocator;

-	unsigned int* keys = allocator.allocate<unsigned int>(vertex_count);
-	computeOrder(keys, vertex_positions, vertex_count, vertex_positions_stride);
+	unsigned long long* keys = allocator.allocate<unsigned long long>(vertex_count);
+	computeOrder(keys, vertex_positions, vertex_count, vertex_positions_stride, /* morton= */ true);

-	unsigned int hist[1024][3];
-	computeHistogram(hist, keys, vertex_count);
-
-	unsigned int* scratch = allocator.allocate<unsigned int>(vertex_count);
+	unsigned int* scratch = allocator.allocate<unsigned int>(vertex_count * 2); // 4b for order + 2b for keys
+	unsigned short* keyk = (unsigned short*)(scratch + vertex_count);

 	for (size_t i = 0; i < vertex_count; ++i)
 		destination[i] = unsigned(i);

-	// 3-pass radix sort computes the resulting order into scratch
-	radixPass(scratch, destination, keys, vertex_count, hist, 0);
-	radixPass(destination, scratch, keys, vertex_count, hist, 1);
-	radixPass(scratch, destination, keys, vertex_count, hist, 2);
+	unsigned int* order[] = {scratch, destination};
+
+	// 5-pass radix sort computes the resulting order into scratch
+	for (int k = 0; k < 5; ++k)
+	{
+		// copy 10-bit key segments into keyk to reduce cache pressure during radix pass
+		for (size_t i = 0; i < vertex_count; ++i)
+			keyk[i] = (unsigned short)((keys[i] >> (k * 10)) & 1023);
+
+		radixSort10(order[k % 2], order[(k + 1) % 2], keyk, vertex_count);
+	}

 	// since our remap table is mapping old=>new, we need to reverse it
 	for (size_t i = 0; i < vertex_count; ++i)
@@ -192,3 +303,39 @@ void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int*
 		destination[r * 3 + 2] = c;
 	}
 }
+
+void meshopt_spatialClusterPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t cluster_size) // fills destination with point indices grouped into consecutive runs of at most cluster_size points
+{
+	using namespace meshopt;
+
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+	assert(cluster_size > 0);
+
+	meshopt_Allocator allocator;
+
+	unsigned long long* keys = allocator.allocate<unsigned long long>(vertex_count);
+	computeOrder(keys, vertex_positions, vertex_count, vertex_positions_stride, /* morton= */ false); // non-Morton keys keep x/y/z in separate 20-bit fields (bits 0, 20 and 40)
+
+	unsigned int* order = allocator.allocate<unsigned int>(vertex_count * 3); // three index arrays back to back, one sorted order per axis
+	unsigned int* scratch = allocator.allocate<unsigned int>(vertex_count * 2); // 4b for order + 1b for side or 2b for keys
+	unsigned short* keyk = reinterpret_cast<unsigned short*>(scratch + vertex_count);
+
+	for (int k = 0; k < 3; ++k)
+	{
+		// copy the low 16 bits of axis k's 20-bit key field into keyk to reduce cache pressure during radix pass
+		for (size_t i = 0; i < vertex_count; ++i)
+			keyk[i] = (unsigned short)(keys[i] >> (k * 20));
+
+		unsigned int hist[256][2];
+		computeHistogram(hist, keyk, vertex_count);
+
+		for (size_t i = 0; i < vertex_count; ++i)
+			order[k * vertex_count + i] = unsigned(i);
+
+		radixPass(scratch, order + k * vertex_count, keyk, vertex_count, hist, 0); // low byte pass, result in scratch
+		radixPass(order + k * vertex_count, scratch, keyk, vertex_count, hist, 1); // high byte pass, sorted order ends up back in axis k's slice of order
+	}
+
+	splitPoints(destination, order, order + vertex_count, order + 2 * vertex_count, keys, vertex_count, scratch, cluster_size);
+}
--- a/Source/ThirdParty/meshoptimizer/stripifier.cpp
+++ b/Source/ThirdParty/meshoptimizer/stripifier.cpp
@@ -10,14 +10,14 @@
 namespace meshopt
 {

-static unsigned int findStripFirst(const unsigned int buffer[][3], unsigned int buffer_size, const unsigned int* valence)
+static unsigned int findStripFirst(const unsigned int buffer[][3], unsigned int buffer_size, const unsigned char* valence)
 {
 	unsigned int index = 0;
 	unsigned int iv = ~0u;

 	for (size_t i = 0; i < buffer_size; ++i)
 	{
-		unsigned int va = valence[buffer[i][0]], vb = valence[buffer[i][1]], vc = valence[buffer[i][2]];
+		unsigned char va = valence[buffer[i][0]], vb = valence[buffer[i][1]], vc = valence[buffer[i][2]];
 		unsigned int v = (va < vb && va < vc) ? va : (vb < vc ? vb : vc);

 		if (v < iv)
@@ -71,8 +71,9 @@ size_t meshopt_stripify(unsigned int* destination, const unsigned int* indices,
 	size_t strip_size = 0;

 	// compute vertex valence; this is used to prioritize starting triangle for strips
-	unsigned int* valence = allocator.allocate<unsigned int>(vertex_count);
-	memset(valence, 0, vertex_count * sizeof(unsigned int));
+	// note: we use 8-bit counters for performance; for outlier vertices the valence is incorrect but that just affects the heuristic
+	unsigned char* valence = allocator.allocate<unsigned char>(vertex_count);
+	memset(valence, 0, vertex_count);

 	for (size_t i = 0; i < index_count; ++i)
 	{
@@ -151,7 +152,7 @@ size_t meshopt_stripify(unsigned int* destination, const unsigned int* indices,
 		{
 			// if we didn't find anything, we need to find the next new triangle
 			// we use a heuristic to maximize the strip length
-			unsigned int i = findStripFirst(buffer, buffer_size, &valence[0]);
+			unsigned int i = findStripFirst(buffer, buffer_size, valence);
 			unsigned int a = buffer[i][0], b = buffer[i][1], c = buffer[i][2];

 			// ordered removal from the buffer
--- a/Source/ThirdParty/meshoptimizer/vertexcodec.cpp
+++ b/Source/ThirdParty/meshoptimizer/vertexcodec.cpp
--- a/Source/ThirdParty/meshoptimizer/vertexfilter.cpp
+++ b/Source/ThirdParty/meshoptimizer/vertexfilter.cpp
@@ -109,28 +109,33 @@ static void decodeFilterOct(T* data, size_t count)

 static void decodeFilterQuat(short* data, size_t count)
 {
-	const float scale = 1.f / sqrtf(2.f);
+	const float scale = 32767.f / sqrtf(2.f);

 	for (size_t i = 0; i < count; ++i)
 	{
 		// recover scale from the high byte of the component
 		int sf = data[i * 4 + 3] | 3;
-		float ss = scale / float(sf);
+		float s = float(sf);

-		// convert x/y/z to [-1..1] (scaled...)
-		float x = float(data[i * 4 + 0]) * ss;
-		float y = float(data[i * 4 + 1]) * ss;
-		float z = float(data[i * 4 + 2]) * ss;
+		// convert x/y/z to floating point (unscaled! implied scale of 1/sqrt(2.f) * 1/sf)
+		float x = float(data[i * 4 + 0]);
+		float y = float(data[i * 4 + 1]);
+		float z = float(data[i * 4 + 2]);

-		// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
-		float ww = 1.f - x * x - y * y - z * z;
+		// reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors
+		float ws = s * s;
+		float ww = ws * 2.f - x * x - y * y - z * z;
 		float w = sqrtf(ww >= 0.f ? ww : 0.f);

+		// compute final scale; note that all computations above are unscaled
+		// we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range
+		float ss = scale / s;
+
 		// rounded signed float->int
-		int xf = int(x * 32767.f + (x >= 0.f ? 0.5f : -0.5f));
-		int yf = int(y * 32767.f + (y >= 0.f ? 0.5f : -0.5f));
-		int zf = int(z * 32767.f + (z >= 0.f ? 0.5f : -0.5f));
-		int wf = int(w * 32767.f + 0.5f);
+		int xf = int(x * ss + (x >= 0.f ? 0.5f : -0.5f));
+		int yf = int(y * ss + (y >= 0.f ? 0.5f : -0.5f));
+		int zf = int(z * ss + (z >= 0.f ? 0.5f : -0.5f));
+		int wf = int(w * ss + 0.5f);

 		int qc = data[i * 4 + 3] & 3;

@@ -165,6 +170,47 @@ static void decodeFilterExp(unsigned int* data, size_t count)
 		data[i] = u.ui;
 	}
 }
+
+template <typename ST, typename T> // ST is the type co/cg are cast to for sign extension; T is the stored component type
+static void decodeFilterColor(T* data, size_t count) // decodes count 4-component y/co/cg/a values to r/g/b/a in place
+{
+	const float max = float((1 << (sizeof(T) * 8)) - 1); // all-ones value for T's width: 255 for 8-bit T, 65535 for 16-bit T
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		// recover scale from alpha high bit: smear the highest set bit into all lower bits
+		int as = data[i * 4 + 3];
+		as |= as >> 1;
+		as |= as >> 2;
+		as |= as >> 4;
+		as |= as >> 8; // noop for 8-bit
+
+		// convert to RGB in fixed point (co/cg are sign extended)
+		int y = data[i * 4 + 0], co = ST(data[i * 4 + 1]), cg = ST(data[i * 4 + 2]);
+
+		int r = y + co - cg;
+		int g = y + cg;
+		int b = y - co - cg;
+
+		// expand alpha by one bit to match other components: the shift pushes the high marker bit out of the as mask and the low bit is duplicated
+		int a = data[i * 4 + 3];
+		a = ((a << 1) & as) | (a & 1);
+
+		// compute scaling factor; NOTE(review): as is 0 when the stored alpha is 0, which divides by zero - presumably the encoder always sets a high bit, confirm
+		float ss = max / float(as);
+
+		// rounded float->int
+		int rf = int(float(r) * ss + 0.5f);
+		int gf = int(float(g) * ss + 0.5f);
+		int bf = int(float(b) * ss + 0.5f);
+		int af = int(float(a) * ss + 0.5f);
+
+		data[i * 4 + 0] = T(rf);
+		data[i * 4 + 1] = T(gf);
+		data[i * 4 + 2] = T(bf);
+		data[i * 4 + 3] = T(af);
+	}
+}
 #endif

 #if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
@@ -201,7 +247,7 @@ inline uint64_t rotateleft64(uint64_t v, int x)
 #endif

 #ifdef SIMD_SSE
-static void decodeFilterOctSimd(signed char* data, size_t count)
+static void decodeFilterOctSimd8(signed char* data, size_t count)
 {
 	const __m128 sign = _mm_set1_ps(-0.f);

@@ -246,7 +292,7 @@ static void decodeFilterOctSimd(signed char* data, size_t count)
 	}
 }

-static void decodeFilterOctSimd(short* data, size_t count)
+static void decodeFilterOctSimd16(short* data, size_t count)
 {
 	const __m128 sign = _mm_set1_ps(-0.f);

@@ -295,8 +341,9 @@ static void decodeFilterOctSimd(short* data, size_t count)
 		__m128i res_1 = _mm_unpackhi_epi16(xzr, y0r);

 		// patch in .w
-		res_0 = _mm_or_si128(res_0, _mm_and_si128(_mm_castps_si128(n4_0), _mm_set1_epi64x(0xffff000000000000)));
-		res_1 = _mm_or_si128(res_1, _mm_and_si128(_mm_castps_si128(n4_1), _mm_set1_epi64x(0xffff000000000000)));
+		__m128i maskw = _mm_set_epi32(0xffff0000, 0, 0xffff0000, 0);
+		res_0 = _mm_or_si128(res_0, _mm_and_si128(_mm_castps_si128(n4_0), maskw));
+		res_1 = _mm_or_si128(res_1, _mm_and_si128(_mm_castps_si128(n4_1), maskw));

 		_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 0) * 4]), res_0);
 		_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 2) * 4]), res_1);
@@ -305,7 +352,7 @@ static void decodeFilterOctSimd(short* data, size_t count)

 static void decodeFilterQuatSimd(short* data, size_t count)
 {
-	const float scale = 1.f / sqrtf(2.f);
+	const float scale = 32767.f / sqrtf(2.f);

 	for (size_t i = 0; i < count; i += 4)
 	{
@@ -324,24 +371,27 @@ static void decodeFilterQuatSimd(short* data, size_t count)

 		// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
 		__m128i sf = _mm_or_si128(cf, _mm_set1_epi32(3));
-		__m128 ss = _mm_div_ps(_mm_set1_ps(scale), _mm_cvtepi32_ps(sf));
+		__m128 s = _mm_cvtepi32_ps(sf);

-		// convert x/y/z to [-1..1] (scaled...)
-		__m128 x = _mm_mul_ps(_mm_cvtepi32_ps(xf), ss);
-		__m128 y = _mm_mul_ps(_mm_cvtepi32_ps(yf), ss);
-		__m128 z = _mm_mul_ps(_mm_cvtepi32_ps(zf), ss);
+		// convert x/y/z to floating point (unscaled! implied scale of 1/sqrt(2.f) * 1/sf)
+		__m128 x = _mm_cvtepi32_ps(xf);
+		__m128 y = _mm_cvtepi32_ps(yf);
+		__m128 z = _mm_cvtepi32_ps(zf);

-		// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
-		__m128 ww = _mm_sub_ps(_mm_set1_ps(1.f), _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z))));
+		// reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors
+		__m128 ws = _mm_mul_ps(s, _mm_add_ps(s, s)); // s*2s instead of 2*(s*s) to work around clang bug with integer multiplication
+		__m128 ww = _mm_sub_ps(ws, _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z))));
 		__m128 w = _mm_sqrt_ps(_mm_max_ps(ww, _mm_setzero_ps()));

-		__m128 s = _mm_set1_ps(32767.f);
+		// compute final scale; note that all computations above are unscaled
+		// we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range
+		__m128 ss = _mm_div_ps(_mm_set1_ps(scale), s);

 		// rounded signed float->int
-		__m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s));
-		__m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s));
-		__m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s));
-		__m128i wr = _mm_cvtps_epi32(_mm_mul_ps(w, s));
+		__m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, ss));
+		__m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, ss));
+		__m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, ss));
+		__m128i wr = _mm_cvtps_epi32(_mm_mul_ps(w, ss));

 		// mix x/z and w/y to make 16-bit unpack easier
 		__m128i xzr = _mm_or_si128(_mm_and_si128(xr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(zr, 16));
@@ -385,6 +435,105 @@ static void decodeFilterExpSimd(unsigned int* data, size_t count)
 		_mm_storeu_ps(reinterpret_cast<float*>(&data[i]), r);
 	}
 }
+
+static void decodeFilterColorSimd8(unsigned char* data, size_t count) // SSE path: decodes 8-bit y/co/cg/a values to r/g/b/a in place
+{
+	for (size_t i = 0; i < count; i += 4) // 4 colors (one 16-byte vector) per iteration; presumably count is a multiple of 4 - confirm at the call site
+	{
+		__m128i c4 = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[i * 4]));
+
+		// unpack y/co/cg/a (co/cg are sign extended with arithmetic shifts)
+		__m128i yf = _mm_and_si128(c4, _mm_set1_epi32(0xff));
+		__m128i cof = _mm_srai_epi32(_mm_slli_epi32(c4, 16), 24);
+		__m128i cgf = _mm_srai_epi32(_mm_slli_epi32(c4, 8), 24);
+		__m128i af = _mm_srli_epi32(c4, 24);
+
+		// recover scale from alpha high bit: smear the highest set bit into all lower bits
+		__m128i as = af;
+		as = _mm_or_si128(as, _mm_srli_epi32(as, 1));
+		as = _mm_or_si128(as, _mm_srli_epi32(as, 2));
+		as = _mm_or_si128(as, _mm_srli_epi32(as, 4));
+
+		// expand alpha by one bit to match other components
+		af = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(af, 1), as), _mm_and_si128(af, _mm_set1_epi32(1)));
+
+		// compute scaling factor 255 / as; _mm_rcp_ps is an approximate reciprocal, unlike the divide used by the 16-bit variant
+		__m128 ss = _mm_mul_ps(_mm_set1_ps(255.f), _mm_rcp_ps(_mm_cvtepi32_ps(as)));
+
+		// convert to RGB in fixed point
+		__m128i rf = _mm_add_epi32(yf, _mm_sub_epi32(cof, cgf));
+		__m128i gf = _mm_add_epi32(yf, cgf);
+		__m128i bf = _mm_sub_epi32(yf, _mm_add_epi32(cof, cgf));
+
+		// rounded signed float->int
+		__m128i rr = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(rf), ss));
+		__m128i gr = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(gf), ss));
+		__m128i br = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(bf), ss));
+		__m128i ar = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(af), ss));
+
+		// repack rgba into final value; channels are OR-ed without masking, so each is assumed to fit in 8 bits - NOTE(review): confirm for malformed input
+		__m128i res = rr;
+		res = _mm_or_si128(res, _mm_slli_epi32(gr, 8));
+		res = _mm_or_si128(res, _mm_slli_epi32(br, 16));
+		res = _mm_or_si128(res, _mm_slli_epi32(ar, 24));
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[i * 4]), res);
+	}
+}
+
+static void decodeFilterColorSimd16(unsigned short* data, size_t count) // SSE path: decodes 16-bit y/co/cg/a values to r/g/b/a in place
+{
+	for (size_t i = 0; i < count; i += 4) // 4 colors (two 16-byte vectors) per iteration; presumably count is a multiple of 4 - confirm at the call site
+	{
+		__m128i c4_0 = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[(i + 0) * 4]));
+		__m128i c4_1 = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[(i + 2) * 4]));
+
+		// gather both y/co 16-bit pairs in each 32-bit lane
+		__m128i c4_yco = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(c4_0), _mm_castsi128_ps(c4_1), _MM_SHUFFLE(2, 0, 2, 0)));
+		__m128i c4_cga = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(c4_0), _mm_castsi128_ps(c4_1), _MM_SHUFFLE(3, 1, 3, 1)));
+
+		// unpack y/co/cg/a components (co/cg are sign extended with arithmetic shifts)
+		__m128i yf = _mm_and_si128(c4_yco, _mm_set1_epi32(0xffff));
+		__m128i cof = _mm_srai_epi32(c4_yco, 16);
+		__m128i cgf = _mm_srai_epi32(_mm_slli_epi32(c4_cga, 16), 16);
+		__m128i af = _mm_srli_epi32(c4_cga, 16);
+
+		// recover scale from alpha high bit: smear the highest set bit into all lower bits
+		__m128i as = af;
+		as = _mm_or_si128(as, _mm_srli_epi32(as, 1));
+		as = _mm_or_si128(as, _mm_srli_epi32(as, 2));
+		as = _mm_or_si128(as, _mm_srli_epi32(as, 4));
+		as = _mm_or_si128(as, _mm_srli_epi32(as, 8));
+
+		// expand alpha by one bit to match other components
+		af = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(af, 1), as), _mm_and_si128(af, _mm_set1_epi32(1)));
+
+		// compute scaling factor 65535 / as with a real divide rather than a reciprocal estimate
+		__m128 ss = _mm_div_ps(_mm_set1_ps(65535.f), _mm_cvtepi32_ps(as));
+
+		// convert to RGB in fixed point
+		__m128i rf = _mm_add_epi32(yf, _mm_sub_epi32(cof, cgf));
+		__m128i gf = _mm_add_epi32(yf, cgf);
+		__m128i bf = _mm_sub_epi32(yf, _mm_add_epi32(cof, cgf));
+
+		// rounded signed float->int
+		__m128i rr = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(rf), ss));
+		__m128i gr = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(gf), ss));
+		__m128i br = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(bf), ss));
+		__m128i ar = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(af), ss));
+
+		// mix r/b and g/a to make 16-bit unpack easier (low halves are masked to 16 bits, high halves are shifted into place)
+		__m128i rbr = _mm_or_si128(_mm_and_si128(rr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(br, 16));
+		__m128i gar = _mm_or_si128(_mm_and_si128(gr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(ar, 16));
+
+		// pack r/g/b/a using 16-bit unpacks
+		__m128i res_0 = _mm_unpacklo_epi16(rbr, gar);
+		__m128i res_1 = _mm_unpackhi_epi16(rbr, gar);
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 0) * 4]), res_0);
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 2) * 4]), res_1);
+	}
+}
 #endif

 #if defined(SIMD_NEON) && !defined(__aarch64__) && !defined(_M_ARM64)
@@ -401,10 +550,17 @@ inline float32x4_t vdivq_f32(float32x4_t x, float32x4_t y)
 	r = vmulq_f32(r, vrecpsq_f32(y, r)); // refine rcp estimate
 	return vmulq_f32(x, r);
 }
+
+#ifndef __ARM_FEATURE_FMA // provide a stand-in only when the target does not advertise fused multiply-add
+inline float32x4_t vfmaq_f32(float32x4_t x, float32x4_t y, float32x4_t z) // same argument order as the intrinsic: accumulator first, then the two factors
+{
+	return vaddq_f32(x, vmulq_f32(y, z)); // x + y * z as a separate multiply and add, i.e. not fused
+}
+#endif // __ARM_FEATURE_FMA
 #endif

 #ifdef SIMD_NEON
-static void decodeFilterOctSimd(signed char* data, size_t count)
+static void decodeFilterOctSimd8(signed char* data, size_t count)
 {
 	const int32x4_t sign = vdupq_n_s32(0x80000000);

@@ -431,29 +587,27 @@ static void decodeFilterOctSimd(signed char* data, size_t count)
 		y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));

 		// compute normal length & scale
-		float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
+		float32x4_t ll = vfmaq_f32(vfmaq_f32(vmulq_f32(x, x), y, y), z, z);
 		float32x4_t rl = vrsqrteq_f32(ll);
 		float32x4_t s = vmulq_f32(vdupq_n_f32(127.f), rl);

 		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
-		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		// note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
 		const float32x4_t fsnap = vdupq_n_f32(3 << 22);

-		int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
-		int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
-		int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
+		int32x4_t xr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, x, s));
+		int32x4_t yr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, y, s));
+		int32x4_t zr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, z, s));

 		// combine xr/yr/zr into final value
-		int32x4_t res = vandq_s32(n4, vdupq_n_s32(0xff000000));
-		res = vorrq_s32(res, vandq_s32(xr, vdupq_n_s32(0xff)));
-		res = vorrq_s32(res, vshlq_n_s32(vandq_s32(yr, vdupq_n_s32(0xff)), 8));
-		res = vorrq_s32(res, vshlq_n_s32(vandq_s32(zr, vdupq_n_s32(0xff)), 16));
+		int32x4_t res = vsliq_n_s32(xr, vsliq_n_s32(yr, zr, 8), 8);
+		res = vbslq_s32(vdupq_n_u32(0xff000000), n4, res);

 		vst1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]), res);
 	}
 }

-static void decodeFilterOctSimd(short* data, size_t count)
+static void decodeFilterOctSimd16(short* data, size_t count)
 {
 	const int32x4_t sign = vdupq_n_s32(0x80000000);

@@ -485,21 +639,25 @@ static void decodeFilterOctSimd(short* data, size_t count)
 		y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));

 		// compute normal length & scale
-		float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
+		float32x4_t ll = vfmaq_f32(vfmaq_f32(vmulq_f32(x, x), y, y), z, z);
+#if !defined(__aarch64__) && !defined(_M_ARM64)
 		float32x4_t rl = vrsqrteq_f32(ll);
 		rl = vmulq_f32(rl, vrsqrtsq_f32(vmulq_f32(rl, ll), rl)); // refine rsqrt estimate
 		float32x4_t s = vmulq_f32(vdupq_n_f32(32767.f), rl);
+#else
+		float32x4_t s = vdivq_f32(vdupq_n_f32(32767.f), vsqrtq_f32(ll));
+#endif

 		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
 		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
 		const float32x4_t fsnap = vdupq_n_f32(3 << 22);

-		int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
-		int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
-		int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
+		int32x4_t xr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, x, s));
+		int32x4_t yr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, y, s));
+		int32x4_t zr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, z, s));

 		// mix x/z and y/0 to make 16-bit unpack easier
-		int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
+		int32x4_t xzr = vsliq_n_s32(xr, zr, 16);
 		int32x4_t y0r = vandq_s32(yr, vdupq_n_s32(0xffff));

 		// pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
@@ -517,7 +675,7 @@ static void decodeFilterOctSimd(short* data, size_t count)

 static void decodeFilterQuatSimd(short* data, size_t count)
 {
-	const float scale = 1.f / sqrtf(2.f);
+	const float scale = 32767.f / sqrtf(2.f);

 	for (size_t i = 0; i < count; i += 4)
 	{
@@ -536,43 +694,52 @@ static void decodeFilterQuatSimd(short* data, size_t count)

 		// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
 		int32x4_t sf = vorrq_s32(cf, vdupq_n_s32(3));
-		float32x4_t ss = vdivq_f32(vdupq_n_f32(scale), vcvtq_f32_s32(sf));
+		float32x4_t s = vcvtq_f32_s32(sf);

-		// convert x/y/z to [-1..1] (scaled...)
-		float32x4_t x = vmulq_f32(vcvtq_f32_s32(xf), ss);
-		float32x4_t y = vmulq_f32(vcvtq_f32_s32(yf), ss);
-		float32x4_t z = vmulq_f32(vcvtq_f32_s32(zf), ss);
+		// convert x/y/z to floating point (unscaled! implied scale of 1/sqrt(2.f) * 1/sf)
+		float32x4_t x = vcvtq_f32_s32(xf);
+		float32x4_t y = vcvtq_f32_s32(yf);
+		float32x4_t z = vcvtq_f32_s32(zf);

-		// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
-		float32x4_t ww = vsubq_f32(vdupq_n_f32(1.f), vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z))));
+		// reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors
+		float32x4_t ws = vmulq_f32(s, s);
+		float32x4_t ww = vsubq_f32(vaddq_f32(ws, ws), vfmaq_f32(vfmaq_f32(vmulq_f32(x, x), y, y), z, z));
 		float32x4_t w = vsqrtq_f32(vmaxq_f32(ww, vdupq_n_f32(0.f)));

-		float32x4_t s = vdupq_n_f32(32767.f);
+		// compute final scale; note that all computations above are unscaled
+		// we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range
+		float32x4_t ss = vdivq_f32(vdupq_n_f32(scale), s);

 		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
 		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
 		const float32x4_t fsnap = vdupq_n_f32(3 << 22);

-		int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
-		int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
-		int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
-		int32x4_t wr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(w, s), fsnap));
+		int32x4_t xr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, x, ss));
+		int32x4_t yr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, y, ss));
+		int32x4_t zr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, z, ss));
+		int32x4_t wr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, w, ss));

 		// mix x/z and w/y to make 16-bit unpack easier
-		int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
-		int32x4_t wyr = vorrq_s32(vandq_s32(wr, vdupq_n_s32(0xffff)), vshlq_n_s32(yr, 16));
+		int32x4_t xzr = vsliq_n_s32(xr, zr, 16);
+		int32x4_t wyr = vsliq_n_s32(wr, yr, 16);

 		// pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
-		int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[0]);
-		int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[1]);
+		uint64x2_t res_0 = vreinterpretq_u64_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[0]);
+		uint64x2_t res_1 = vreinterpretq_u64_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[1]);
+
+		// store results to stack so that we can rotate using scalar instructions
+		// TODO: volatile works around LLVM mis-optimizing code; https://github.com/llvm/llvm-project/issues/166808
+		volatile uint64_t res[4];
+		vst1q_u64(const_cast<uint64_t*>(&res[0]), res_0);
+		vst1q_u64(const_cast<uint64_t*>(&res[2]), res_1);

 		// rotate and store
-		uint64_t* out = (uint64_t*)&data[i * 4];
+		uint64_t* out = reinterpret_cast<uint64_t*>(&data[i * 4]);

-		out[0] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 0), vgetq_lane_s32(cf, 0) << 4);
-		out[1] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 1), vgetq_lane_s32(cf, 1) << 4);
-		out[2] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 0), vgetq_lane_s32(cf, 2) << 4);
-		out[3] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 1), vgetq_lane_s32(cf, 3) << 4);
+		out[0] = rotateleft64(res[0], data[(i + 0) * 4 + 3] << 4);
+		out[1] = rotateleft64(res[1], data[(i + 1) * 4 + 3] << 4);
+		out[2] = rotateleft64(res[2], data[(i + 2) * 4 + 3] << 4);
+		out[3] = rotateleft64(res[3], data[(i + 3) * 4 + 3] << 4);
 	}
 }

@@ -595,10 +762,112 @@ static void decodeFilterExpSimd(unsigned int* data, size_t count)
 		vst1q_f32(reinterpret_cast<float*>(&data[i]), r);
 	}
 }
+
+static void decodeFilterColorSimd8(unsigned char* data, size_t count)
+{
+	for (size_t i = 0; i < count; i += 4)
+	{
+		int32x4_t c4 = vld1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]));
+
+		// unpack y/co/cg/a (co/cg are sign extended with arithmetic shifts)
+		int32x4_t yf = vandq_s32(c4, vdupq_n_s32(0xff));
+		int32x4_t cof = vshrq_n_s32(vshlq_n_s32(c4, 16), 24);
+		int32x4_t cgf = vshrq_n_s32(vshlq_n_s32(c4, 8), 24);
+		int32x4_t af = vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(c4), 24));
+
+		// recover scale from alpha high bit
+		int32x4_t as = af;
+		as = vorrq_s32(as, vshrq_n_s32(as, 1));
+		as = vorrq_s32(as, vshrq_n_s32(as, 2));
+		as = vorrq_s32(as, vshrq_n_s32(as, 4));
+
+		// expand alpha by one bit to match other components
+		af = vorrq_s32(vandq_s32(vshlq_n_s32(af, 1), as), vandq_s32(af, vdupq_n_s32(1)));
+
+		// compute scaling factor
+		float32x4_t ss = vmulq_f32(vdupq_n_f32(255.f), vrecpeq_f32(vcvtq_f32_s32(as)));
+
+		// convert to RGB in fixed point
+		int32x4_t rf = vaddq_s32(yf, vsubq_s32(cof, cgf));
+		int32x4_t gf = vaddq_s32(yf, cgf);
+		int32x4_t bf = vsubq_s32(yf, vaddq_s32(cof, cgf));
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
+		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
+
+		int32x4_t rr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(rf), ss));
+		int32x4_t gr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(gf), ss));
+		int32x4_t br = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(bf), ss));
+		int32x4_t ar = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(af), ss));
+
+		// repack rgba into final value
+		int32x4_t res = vsliq_n_s32(rr, vsliq_n_s32(gr, vsliq_n_s32(br, ar, 8), 8), 8);
+
+		vst1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]), res);
+	}
+}
+
+static void decodeFilterColorSimd16(unsigned short* data, size_t count)
+{
+	for (size_t i = 0; i < count; i += 4)
+	{
+		int32x4_t c4_0 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4]));
+		int32x4_t c4_1 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4]));
+
+		// gather both y/co 16-bit pairs in each 32-bit lane
+		int32x4_t c4_yco = vuzpq_s32(c4_0, c4_1).val[0];
+		int32x4_t c4_cga = vuzpq_s32(c4_0, c4_1).val[1];
+
+		// unpack y/co/cg/a components (co/cg are sign extended with arithmetic shifts)
+		int32x4_t yf = vandq_s32(c4_yco, vdupq_n_s32(0xffff));
+		int32x4_t cof = vshrq_n_s32(c4_yco, 16);
+		int32x4_t cgf = vshrq_n_s32(vshlq_n_s32(c4_cga, 16), 16);
+		int32x4_t af = vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(c4_cga), 16));
+
+		// recover scale from alpha high bit
+		int32x4_t as = af;
+		as = vorrq_s32(as, vshrq_n_s32(as, 1));
+		as = vorrq_s32(as, vshrq_n_s32(as, 2));
+		as = vorrq_s32(as, vshrq_n_s32(as, 4));
+		as = vorrq_s32(as, vshrq_n_s32(as, 8));
+
+		// expand alpha by one bit to match other components
+		af = vorrq_s32(vandq_s32(vshlq_n_s32(af, 1), as), vandq_s32(af, vdupq_n_s32(1)));
+
+		// compute scaling factor
+		float32x4_t ss = vdivq_f32(vdupq_n_f32(65535.f), vcvtq_f32_s32(as));
+
+		// convert to RGB in fixed point
+		int32x4_t rf = vaddq_s32(yf, vsubq_s32(cof, cgf));
+		int32x4_t gf = vaddq_s32(yf, cgf);
+		int32x4_t bf = vsubq_s32(yf, vaddq_s32(cof, cgf));
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
+
+		int32x4_t rr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(rf), ss));
+		int32x4_t gr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(gf), ss));
+		int32x4_t br = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(bf), ss));
+		int32x4_t ar = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(af), ss));
+
+		// mix r/b and g/a to make 16-bit unpack easier
+		int32x4_t rbr = vsliq_n_s32(rr, br, 16);
+		int32x4_t gar = vsliq_n_s32(gr, ar, 16);
+
+		// pack r/g/b/a using 16-bit unpacks
+		int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(rbr), vreinterpretq_s16_s32(gar)).val[0]);
+		int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(rbr), vreinterpretq_s16_s32(gar)).val[1]);
+
+		vst1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4]), res_0);
+		vst1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4]), res_1);
+	}
+}
 #endif

 #ifdef SIMD_WASM
-static void decodeFilterOctSimd(signed char* data, size_t count)
+static void decodeFilterOctSimd8(signed char* data, size_t count)
 {
 	const v128_t sign = wasm_f32x4_splat(-0.f);

@@ -647,10 +916,11 @@ static void decodeFilterOctSimd(signed char* data, size_t count)
 	}
 }

-static void decodeFilterOctSimd(short* data, size_t count)
+static void decodeFilterOctSimd16(short* data, size_t count)
 {
 	const v128_t sign = wasm_f32x4_splat(-0.f);
-	const v128_t zmask = wasm_i32x4_splat(0x7fff);
+	// TODO: volatile here works around LLVM mis-optimizing code; https://github.com/llvm/llvm-project/issues/149457
+	volatile v128_t zmask = wasm_i32x4_splat(0x7fff);

 	for (size_t i = 0; i < count; i += 4)
 	{
@@ -711,7 +981,7 @@ static void decodeFilterOctSimd(short* data, size_t count)

 static void decodeFilterQuatSimd(short* data, size_t count)
 {
-	const float scale = 1.f / sqrtf(2.f);
+	const float scale = 32767.f / sqrtf(2.f);

 	for (size_t i = 0; i < count; i += 4)
 	{
@@ -730,28 +1000,31 @@ static void decodeFilterQuatSimd(short* data, size_t count)

 		// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
 		v128_t sf = wasm_v128_or(cf, wasm_i32x4_splat(3));
-		v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(scale), wasm_f32x4_convert_i32x4(sf));
+		v128_t s = wasm_f32x4_convert_i32x4(sf);

-		// convert x/y/z to [-1..1] (scaled...)
-		v128_t x = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(xf), ss);
-		v128_t y = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(yf), ss);
-		v128_t z = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(zf), ss);
+		// convert x/y/z to floating point (unscaled! implied scale of 1/sqrt(2.f) * 1/sf)
+		v128_t x = wasm_f32x4_convert_i32x4(xf);
+		v128_t y = wasm_f32x4_convert_i32x4(yf);
+		v128_t z = wasm_f32x4_convert_i32x4(zf);

-		// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
+		// reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors
 		// note: i32x4_max with 0 is equivalent to f32x4_max
-		v128_t ww = wasm_f32x4_sub(wasm_f32x4_splat(1.f), wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z))));
+		v128_t ws = wasm_f32x4_mul(s, s);
+		v128_t ww = wasm_f32x4_sub(wasm_f32x4_add(ws, ws), wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z))));
 		v128_t w = wasm_f32x4_sqrt(wasm_i32x4_max(ww, wasm_i32x4_splat(0)));

-		v128_t s = wasm_f32x4_splat(32767.f);
+		// compute final scale; note that all computations above are unscaled
+		// we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range
+		v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(scale), s);

 		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
 		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
 		const v128_t fsnap = wasm_f32x4_splat(3 << 22);

-		v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap);
-		v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap);
-		v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap);
-		v128_t wr = wasm_f32x4_add(wasm_f32x4_mul(w, s), fsnap);
+		v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, ss), fsnap);
+		v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, ss), fsnap);
+		v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, ss), fsnap);
+		v128_t wr = wasm_f32x4_add(wasm_f32x4_mul(w, ss), fsnap);

 		// mix x/z and w/y to make 16-bit unpack easier
 		v128_t xzr = wasm_v128_or(wasm_v128_and(xr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(zr, 16));
@@ -762,8 +1035,7 @@ static void decodeFilterQuatSimd(short* data, size_t count)
 		v128_t res_1 = wasmx_unpackhi_v16x8(wyr, xzr);

 		// compute component index shifted left by 4 (and moved into i32x4 slot)
-		// TODO: volatile here works around LLVM mis-optimizing code; https://github.com/emscripten-core/emscripten/issues/11449
-		volatile v128_t cm = wasm_i32x4_shl(cf, 4);
+		v128_t cm = wasm_i32x4_shl(cf, 4);

 		// rotate and store
 		uint64_t* out = reinterpret_cast<uint64_t*>(&data[i * 4]);
@@ -794,6 +1066,117 @@ static void decodeFilterExpSimd(unsigned int* data, size_t count)
 		wasm_v128_store(&data[i], r);
 	}
 }
+
+static void decodeFilterColorSimd8(unsigned char* data, size_t count)
+{
+	// TODO: volatile here works around LLVM mis-optimizing code; https://github.com/llvm/llvm-project/issues/149457
+	volatile v128_t zero = wasm_i32x4_splat(0);
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		v128_t c4 = wasm_v128_load(&data[i * 4]);
+
+		// unpack y/co/cg/a (co/cg are sign extended with arithmetic shifts)
+		v128_t yf = wasm_v128_and(c4, wasm_i32x4_splat(0xff));
+		v128_t cof = wasm_i32x4_shr(wasm_i32x4_shl(c4, 16), 24);
+		v128_t cgf = wasm_i32x4_shr(wasm_i32x4_shl(c4, 8), 24);
+		v128_t af = wasm_v128_or(zero, wasm_u32x4_shr(c4, 24));
+
+		// recover scale from alpha high bit
+		v128_t as = af;
+		as = wasm_v128_or(as, wasm_i32x4_shr(as, 1));
+		as = wasm_v128_or(as, wasm_i32x4_shr(as, 2));
+		as = wasm_v128_or(as, wasm_i32x4_shr(as, 4));
+
+		// expand alpha by one bit to match other components
+		af = wasm_v128_or(wasm_v128_and(wasm_i32x4_shl(af, 1), as), wasm_v128_and(af, wasm_i32x4_splat(1)));
+
+		// compute scaling factor
+		v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(255.f), wasm_f32x4_convert_i32x4(as));
+
+		// convert to RGB in fixed point
+		v128_t rf = wasm_i32x4_add(yf, wasm_i32x4_sub(cof, cgf));
+		v128_t gf = wasm_i32x4_add(yf, cgf);
+		v128_t bf = wasm_i32x4_sub(yf, wasm_i32x4_add(cof, cgf));
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
+		const v128_t fsnap = wasm_f32x4_splat(3 << 22);
+
+		v128_t rr = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(rf), ss), fsnap);
+		v128_t gr = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(gf), ss), fsnap);
+		v128_t br = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(bf), ss), fsnap);
+		v128_t ar = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(af), ss), fsnap);
+
+		// repack rgba into final value
+		v128_t res = wasm_v128_and(rr, wasm_i32x4_splat(0xff));
+		res = wasm_v128_or(res, wasm_i32x4_shl(wasm_v128_and(gr, wasm_i32x4_splat(0xff)), 8));
+		res = wasm_v128_or(res, wasm_i32x4_shl(wasm_v128_and(br, wasm_i32x4_splat(0xff)), 16));
+		res = wasm_v128_or(res, wasm_i32x4_shl(ar, 24));
+
+		wasm_v128_store(&data[i * 4], res);
+	}
+}
+
+static void decodeFilterColorSimd16(unsigned short* data, size_t count)
+{
+	// TODO: volatile here works around LLVM mis-optimizing code; https://github.com/llvm/llvm-project/issues/149457
+	volatile v128_t zero = wasm_i32x4_splat(0);
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		v128_t c4_0 = wasm_v128_load(&data[(i + 0) * 4]);
+		v128_t c4_1 = wasm_v128_load(&data[(i + 2) * 4]);
+
+		// gather both y/co 16-bit pairs in each 32-bit lane
+		v128_t c4_yco = wasmx_unziplo_v32x4(c4_0, c4_1);
+		v128_t c4_cga = wasmx_unziphi_v32x4(c4_0, c4_1);
+
+		// unpack y/co/cg/a components (co/cg are sign extended with arithmetic shifts)
+		v128_t yf = wasm_v128_and(c4_yco, wasm_i32x4_splat(0xffff));
+		v128_t cof = wasm_i32x4_shr(c4_yco, 16);
+		v128_t cgf = wasm_i32x4_shr(wasm_i32x4_shl(c4_cga, 16), 16);
+		v128_t af = wasm_v128_or(zero, wasm_u32x4_shr(c4_cga, 16));
+
+		// recover scale from alpha high bit
+		v128_t as = af;
+		as = wasm_v128_or(as, wasm_i32x4_shr(as, 1));
+		as = wasm_v128_or(as, wasm_i32x4_shr(as, 2));
+		as = wasm_v128_or(as, wasm_i32x4_shr(as, 4));
+		as = wasm_v128_or(as, wasm_i32x4_shr(as, 8));
+
+		// expand alpha by one bit to match other components
+		af = wasm_v128_or(wasm_v128_and(wasm_i32x4_shl(af, 1), as), wasm_v128_and(af, wasm_i32x4_splat(1)));
+
+		// compute scaling factor
+		v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(65535.f), wasm_f32x4_convert_i32x4(as));
+
+		// convert to RGB in fixed point
+		v128_t rf = wasm_i32x4_add(yf, wasm_i32x4_sub(cof, cgf));
+		v128_t gf = wasm_i32x4_add(yf, cgf);
+		v128_t bf = wasm_i32x4_sub(yf, wasm_i32x4_add(cof, cgf));
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		const v128_t fsnap = wasm_f32x4_splat(3 << 22);
+
+		v128_t rr = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(rf), ss), fsnap);
+		v128_t gr = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(gf), ss), fsnap);
+		v128_t br = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(bf), ss), fsnap);
+		v128_t ar = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(af), ss), fsnap);
+
+		// mix r/b and g/a to make 16-bit unpack easier
+		v128_t rbr = wasm_v128_or(wasm_v128_and(rr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(br, 16));
+		v128_t gar = wasm_v128_or(wasm_v128_and(gr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(ar, 16));
+
+		// pack r/g/b/a using 16-bit unpacks
+		v128_t res_0 = wasmx_unpacklo_v16x8(rbr, gar);
+		v128_t res_1 = wasmx_unpackhi_v16x8(rbr, gar);
+
+		wasm_v128_store(&data[(i + 0) * 4], res_0);
+		wasm_v128_store(&data[(i + 2) * 4], res_1);
+	}
+}
 #endif

 // optimized variant of frexp
@@ -807,7 +1190,7 @@ inline int optlog2(float v)

 	u.f = v;
 	// +1 accounts for implicit 1. in mantissa; denormalized numbers will end up clamped to min_exp by calling code
-	return u.ui == 0 ? 0 : int((u.ui >> 23) & 0xff) - 127 + 1;
+	return v == 0 ? 0 : int((u.ui >> 23) & 0xff) - 127 + 1;
 }

 // optimized variant of ldexp
@@ -833,9 +1216,9 @@ void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride)

 #if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
 	if (stride == 4)
-		dispatchSimd(decodeFilterOctSimd, static_cast<signed char*>(buffer), count, 4);
+		dispatchSimd(decodeFilterOctSimd8, static_cast<signed char*>(buffer), count, 4);
 	else
-		dispatchSimd(decodeFilterOctSimd, static_cast<short*>(buffer), count, 4);
+		dispatchSimd(decodeFilterOctSimd16, static_cast<short*>(buffer), count, 4);
 #else
 	if (stride == 4)
 		decodeFilterOct(static_cast<signed char*>(buffer), count);
@@ -871,10 +1254,29 @@ void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride)
 #endif
 }

+void meshopt_decodeFilterColor(void* buffer, size_t count, size_t stride)
+{
+	using namespace meshopt;
+
+	assert(stride == 4 || stride == 8);
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+	if (stride == 4)
+		dispatchSimd(decodeFilterColorSimd8, static_cast<unsigned char*>(buffer), count, 4);
+	else
+		dispatchSimd(decodeFilterColorSimd16, static_cast<unsigned short*>(buffer), count, 4);
+#else
+	if (stride == 4)
+		decodeFilterColor<signed char>(static_cast<unsigned char*>(buffer), count);
+	else
+		decodeFilterColor<short>(static_cast<unsigned short*>(buffer), count);
+#endif
+}
+
 void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data)
 {
 	assert(stride == 4 || stride == 8);
-	assert(bits >= 1 && bits <= 16);
+	assert(bits >= 2 && bits <= 16);

 	signed char* d8 = static_cast<signed char*>(destination);
 	short* d16 = static_cast<short*>(destination);
@@ -1010,6 +1412,20 @@ void meshopt_encodeFilterExp(void* destination_, size_t count, size_t stride, in
 				component_exp[j] = (min_exp < e) ? e : min_exp;
 			}
 		}
+		else if (mode == meshopt_EncodeExpClamped)
+		{
+			for (size_t j = 0; j < stride_float; ++j)
+			{
+				int e = optlog2(v[j]);
+
+				component_exp[j] = (0 < e) ? e : 0;
+			}
+		}
+		else
+		{
+			// the code below assumes component_exp is initialized outside of the loop
+			assert(mode == meshopt_EncodeExpSharedComponent);
+		}

 		for (size_t j = 0; j < stride_float; ++j)
 		{
@@ -1020,7 +1436,6 @@ void meshopt_encodeFilterExp(void* destination_, size_t count, size_t stride, in

 			// compute renormalized rounded mantissa for each component
 			int mmask = (1 << 24) - 1;
-
 			int m = int(v[j] * optexp2(-exp) + (v[j] >= 0 ? 0.5f : -0.5f));

 			d[j] = (m & mmask) | (unsigned(exp) << 24);
@@ -1028,6 +1443,51 @@ void meshopt_encodeFilterExp(void* destination_, size_t count, size_t stride, in
 	}
 }

+void meshopt_encodeFilterColor(void* destination, size_t count, size_t stride, int bits, const float* data)
+{
+	assert(stride == 4 || stride == 8);
+	assert(bits >= 2 && bits <= 16);
+
+	unsigned char* d8 = static_cast<unsigned char*>(destination);
+	unsigned short* d16 = static_cast<unsigned short*>(destination);
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		const float* c = &data[i * 4];
+
+		int fr = meshopt_quantizeUnorm(c[0], bits);
+		int fg = meshopt_quantizeUnorm(c[1], bits);
+		int fb = meshopt_quantizeUnorm(c[2], bits);
+
+		// YCoCg-R encoding with truncated Co/Cg ensures that decoding can be done using integers
+		int fco = (fr - fb) / 2;
+		int tmp = fb + fco;
+		int fcg = (fg - tmp) / 2;
+		int fy = tmp + fcg;
+
+		// validate that R/G/B can be reconstructed with K bit integers
+		assert(unsigned((fy + fco - fcg) | (fy + fcg) | (fy - fco - fcg)) < (1u << bits));
+
+		// alpha: K-1-bit encoding with high bit set to 1
+		int fa = meshopt_quantizeUnorm(c[3], bits - 1) | (1 << (bits - 1));
+
+		if (stride == 4)
+		{
+			d8[i * 4 + 0] = (unsigned char)(fy);
+			d8[i * 4 + 1] = (unsigned char)(fco);
+			d8[i * 4 + 2] = (unsigned char)(fcg);
+			d8[i * 4 + 3] = (unsigned char)(fa);
+		}
+		else
+		{
+			d16[i * 4 + 0] = (unsigned short)(fy);
+			d16[i * 4 + 1] = (unsigned short)(fco);
+			d16[i * 4 + 2] = (unsigned short)(fcg);
+			d16[i * 4 + 3] = (unsigned short)(fa);
+		}
+	}
+}
+
 #undef SIMD_SSE
 #undef SIMD_NEON
 #undef SIMD_WASM
--- a/Source/ThirdParty/meshoptimizer/vfetchanalyzer.cpp
+++ b/Source/ThirdParty/meshoptimizer/vfetchanalyzer.cpp
@@ -1,58 +0,0 @@
-// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
-#include "meshoptimizer.h"
-
-#include <assert.h>
-#include <string.h>
-
-meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size)
-{
-	assert(index_count % 3 == 0);
-	assert(vertex_size > 0 && vertex_size <= 256);
-
-	meshopt_Allocator allocator;
-
-	meshopt_VertexFetchStatistics result = {};
-
-	unsigned char* vertex_visited = allocator.allocate<unsigned char>(vertex_count);
-	memset(vertex_visited, 0, vertex_count);
-
-	const size_t kCacheLine = 64;
-	const size_t kCacheSize = 128 * 1024;
-
-	// simple direct mapped cache; on typical mesh data this is close to 4-way cache, and this model is a gross approximation anyway
-	size_t cache[kCacheSize / kCacheLine] = {};
-
-	for (size_t i = 0; i < index_count; ++i)
-	{
-		unsigned int index = indices[i];
-		assert(index < vertex_count);
-
-		vertex_visited[index] = 1;
-
-		size_t start_address = index * vertex_size;
-		size_t end_address = start_address + vertex_size;
-
-		size_t start_tag = start_address / kCacheLine;
-		size_t end_tag = (end_address + kCacheLine - 1) / kCacheLine;
-
-		assert(start_tag < end_tag);
-
-		for (size_t tag = start_tag; tag < end_tag; ++tag)
-		{
-			size_t line = tag % (sizeof(cache) / sizeof(cache[0]));
-
-			// we store +1 since cache is filled with 0 by default
-			result.bytes_fetched += (cache[line] != tag + 1) * kCacheLine;
-			cache[line] = tag + 1;
-		}
-	}
-
-	size_t unique_vertex_count = 0;
-
-	for (size_t i = 0; i < vertex_count; ++i)
-		unique_vertex_count += vertex_visited[i];
-
-	result.overfetch = unique_vertex_count == 0 ? 0 : float(result.bytes_fetched) / float(unique_vertex_count * vertex_size);
-
-	return result;
-}
--- a/Source/Tools/Flax.Build/Bindings/BindingsGenerator.CSharp.cs
+++ b/Source/Tools/Flax.Build/Bindings/BindingsGenerator.CSharp.cs
@@ -703,6 +703,8 @@ namespace Flax.Build.Bindings
                else if (nativeType.EndsWith("[]"))
                {
                    parameterMarshalType = $"MarshalUsing(typeof(FlaxEngine.Interop.ArrayMarshaller<,>))";
+                    if (!parameterInfo.IsOut && !parameterInfo.IsRef)
+                        parameterMarshalType += ", In"; // The usage of 'LibraryImportAttribute' does not follow recommendations. It is recommended to use explicit '[In]' and '[Out]' attributes on array parameters.
                }

                if (!string.IsNullOrEmpty(parameterMarshalType))
--- a/Source/Tools/Flax.Build/Deps/Dependencies/AGS.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/AGS.cs
@@ -18,6 +18,23 @@ namespace Flax.Deps.Dependencies
            get => new[] { TargetPlatform.Windows };
        }

+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                    };
+                default: return new TargetArchitecture[0];
+                }
+            }
+        }
+
        /// <inheritdoc />
        public override void Build(BuildOptions options)
        {
@@ -30,7 +47,7 @@ namespace Flax.Deps.Dependencies
            // Copy files
            foreach (var platform in options.Platforms)
            {
-                BuildStarted(platform);
+                BuildStarted(platform, TargetArchitecture.x64);
                var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
                Utilities.FileCopy(Path.Combine(root, "ags_lib/lib/amd_ags_x64.lib"), Path.Combine(depsFolder, "amd_ags_x64.lib"));
                Utilities.FileCopy(Path.Combine(root, "ags_lib/lib/amd_ags_x64.dll"), Path.Combine(depsFolder, "amd_ags_x64.dll"));
--- a/Source/Tools/Flax.Build/Deps/Dependencies/Assimp.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/Assimp.cs
@@ -2,6 +2,7 @@

 using System.Collections.Generic;
 using System.IO;
+using System.Linq;
 using Flax.Build;

 namespace Flax.Deps.Dependencies
@@ -39,6 +40,36 @@ namespace Flax.Deps.Dependencies
            }
        }

+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                case TargetPlatform.Linux:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        //TargetArchitecture.ARM64,
+                    };
+                case TargetPlatform.Mac:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                default: return new TargetArchitecture[0];
+                }
+            }
+        }
+
        /// <inheritdoc />
        public override void Build(BuildOptions options)
        {
@@ -91,22 +122,22 @@ namespace Flax.Deps.Dependencies

            foreach (var platform in options.Platforms)
            {
-                BuildStarted(platform);
-                switch (platform)
+                foreach (var architecture in options.Architectures)
                {
-                case TargetPlatform.Windows:
-                {
-                    var configuration = "Release";
-                    var binariesWin = new[]
+                    BuildStarted(platform, architecture);
+                    switch (platform)
                    {
-                        Path.Combine("bin", configuration, "assimp-vc140-md.dll"),
-                        Path.Combine("lib", configuration, "assimp-vc140-md.lib"),
-                    };
+                    case TargetPlatform.Windows:
+                    {
+                        var configuration = "Release";
+                        var binariesWin = new[]
+                        {
+                            Path.Combine("bin", configuration, "assimp-vc140-md.dll"),
+                            Path.Combine("lib", configuration, "assimp-vc140-md.lib"),
+                        };

-                    // Build for Windows
-                    File.Delete(Path.Combine(root, "CMakeCache.txt"));
-                    foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
-                    {
+                        // Build for Windows
+                        File.Delete(Path.Combine(root, "CMakeCache.txt"));
                        var buildDir = Path.Combine(root, "build-" + architecture);
                        var solutionPath = Path.Combine(buildDir, "Assimp.sln");
                        SetupDirectory(buildDir, true);
@@ -116,42 +147,40 @@ namespace Flax.Deps.Dependencies
                        var depsFolder = GetThirdPartyFolder(options, platform, architecture);
                        foreach (var file in binariesWin)
                            Utilities.FileCopy(Path.Combine(buildDir, file), Path.Combine(depsFolder, Path.GetFileName(file)));
+                        break;
                    }
-
-                    break;
-                }
-                case TargetPlatform.Linux:
-                {
-                    var envVars = new Dictionary<string, string>
+                    case TargetPlatform.Linux:
                    {
-                        { "CC", "clang-" + Configuration.LinuxClangMinVer },
-                        { "CC_FOR_BUILD", "clang-" + Configuration.LinuxClangMinVer },
-                        { "CXX", "clang++-" + Configuration.LinuxClangMinVer },
-                        { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
-                    };
+                        var envVars = new Dictionary<string, string>
+                        {
+                            { "CC", "clang-" + Configuration.LinuxClangMinVer },
+                            { "CC_FOR_BUILD", "clang-" + Configuration.LinuxClangMinVer },
+                            { "CXX", "clang++-" + Configuration.LinuxClangMinVer },
+                            { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
+                        };

-                    // Build for Linux
-                    RunCmake(root, platform, TargetArchitecture.x64, " -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF " + globalConfig, envVars);
-                    Utilities.Run("make", null, null, root, Utilities.RunOptions.DefaultTool, envVars);
-                    configHeaderFilePath = Path.Combine(root, "include", "assimp", "config.h");
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
-                    Utilities.FileCopy(Path.Combine(root, "lib", "libassimp.a"), Path.Combine(depsFolder, "libassimp.a"));
-                    break;
-                }
-                case TargetPlatform.Mac:
-                {
-                    // Build for Mac
-                    foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
+                        // Build for Linux
+                        File.Delete(Path.Combine(root, "CMakeCache.txt"));
+                        RunCmake(root, platform, architecture, " -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF " + globalConfig, envVars);
+                        Utilities.Run("make", null, null, root, Utilities.RunOptions.DefaultTool, envVars);
+                        configHeaderFilePath = Path.Combine(root, "include", "assimp", "config.h");
+                        var depsFolder = GetThirdPartyFolder(options, platform, architecture);
+                        Utilities.FileCopy(Path.Combine(root, "lib", "libassimp.a"), Path.Combine(depsFolder, "libassimp.a"));
+                        break;
+                    }
+                    case TargetPlatform.Mac:
                    {
+                        // Build for Mac
+                        File.Delete(Path.Combine(root, "CMakeCache.txt"));
                        RunCmake(root, platform, architecture, " -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF " + globalConfig);
                        Utilities.Run("make", null, null, root, Utilities.RunOptions.DefaultTool);
                        configHeaderFilePath = Path.Combine(root, "include", "assimp", "config.h");
                        var depsFolder = GetThirdPartyFolder(options, platform, architecture);
                        Utilities.FileCopy(Path.Combine(root, "lib", "libassimp.a"), Path.Combine(depsFolder, "libassimp.a"));
                        Utilities.Run("make", "clean", null, root, Utilities.RunOptions.DefaultTool);
+                        break;
+                    }
                    }
-                    break;
-                }
                }
            }

--- a/Source/Tools/Flax.Build/Deps/Dependencies/DirectXMesh.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/DirectXMesh.cs
@@ -28,6 +28,24 @@ namespace Flax.Deps.Dependencies
            }
        }

+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                default: return new TargetArchitecture[0];
+                }
+            }
+        }
+
        /// <inheritdoc />
        public override void Build(BuildOptions options)
        {
@@ -46,12 +64,12 @@ namespace Flax.Deps.Dependencies

            foreach (var platform in options.Platforms)
            {
-                BuildStarted(platform);
-                switch (platform)
+                foreach (var architecture in options.Architectures)
                {
-                case TargetPlatform.Windows:
-                {
-                    foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
+                    BuildStarted(platform, architecture);
+                    switch (platform)
+                    {
+                    case TargetPlatform.Windows:
                    {
                        Deploy.VCEnvironment.BuildSolution(solutionPath, configuration, architecture.ToString());
                        var depsFolder = GetThirdPartyFolder(options, TargetPlatform.Windows, architecture);
@@ -61,7 +79,7 @@ namespace Flax.Deps.Dependencies
                        }
                    }
                    break;
-                }
+                    }
                }
            }

--- a/Source/Tools/Flax.Build/Deps/Dependencies/DirectXShaderCompiler.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/DirectXShaderCompiler.cs
@@ -1,6 +1,5 @@
 // Copyright (c) Wojciech Figat. All rights reserved.

-using System;
 using System.IO;
 using System.Linq;
 using Flax.Build;
@@ -31,22 +30,40 @@ namespace Flax.Deps.Dependencies
            }
        }

+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures // architectures reported for this dependency, chosen by BuildPlatform
+        {
+            get
+            {
+                switch (BuildPlatform) // NOTE(review): BuildPlatform is presumably the platform running the deps build — confirm
+                {
+                case TargetPlatform.Windows:
+                    return new[] // Windows: both x64 and ARM64 are reported
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                default: return new TargetArchitecture[0]; // any other BuildPlatform value: empty array (no architectures)
+                }
+            }
+        }
+
        /// <inheritdoc />
        public override void Build(BuildOptions options)
        {
            foreach (var platform in options.Platforms)
            {
-                BuildStarted(platform);
-                switch (platform)
+                foreach (var architecture in options.Architectures)
                {
-                case TargetPlatform.Windows:
-                {
-                    var sdk = WindowsPlatformBase.GetSDKs().Last();
-                    var sdkLibLocation = Path.Combine(sdk.Value, "Lib", WindowsPlatformBase.GetSDKVersion(sdk.Key).ToString(), "um");
-                    string binLocation = Path.Combine(sdk.Value, "bin", WindowsPlatformBase.GetSDKVersion(sdk.Key).ToString());
-
-                    foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
+                    BuildStarted(platform, architecture);
+                    switch (platform)
                    {
+                    case TargetPlatform.Windows:
+                    {
+                        var sdk = WindowsPlatformBase.GetSDKs().Last();
+                        var sdkLibLocation = Path.Combine(sdk.Value, "Lib", WindowsPlatformBase.GetSDKVersion(sdk.Key).ToString(), "um");
+                        string binLocation = Path.Combine(sdk.Value, "bin", WindowsPlatformBase.GetSDKVersion(sdk.Key).ToString());
+
                        var depsFolder = GetThirdPartyFolder(options, platform, architecture);

                        string dxilLocation = @$"{binLocation}\{architecture}\dxil.dll";
@@ -60,9 +77,9 @@ namespace Flax.Deps.Dependencies
                        string d3dcompilerLibLocation = @$"{sdkLibLocation}\{architecture}\d3dcompiler.lib";
                        Utilities.FileCopy(dxcompilerLibLocation, Path.Combine(depsFolder, Path.GetFileName(dxcompilerLibLocation)));
                        Utilities.FileCopy(d3dcompilerLibLocation, Path.Combine(depsFolder, "d3dcompiler_47.lib"));
+                        break;
+                    }
                    }
-                    break;
-                }
                }
            }
        }
--- a/Source/Tools/Flax.Build/Deps/Dependencies/DirectXTex.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/DirectXTex.cs
@@ -30,6 +30,30 @@ namespace Flax.Deps.Dependencies
            }
        }

+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures // architectures reported for this dependency, chosen by BuildPlatform
+        {
+            get
+            {
+                switch (BuildPlatform) // NOTE(review): the cases below include Xbox platforms — confirm BuildPlatform can actually take those values
+                {
+                case TargetPlatform.Windows:
+                    return new[] // Windows: both x64 and ARM64 are reported
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                case TargetPlatform.XboxOne:
+                case TargetPlatform.XboxScarlett:
+                    return new[] // both Xbox cases share one result: x64 only
+                    {
+                        TargetArchitecture.x64,
+                    };
+                default: return new TargetArchitecture[0]; // any other BuildPlatform value: empty array (no architectures)
+                }
+            }
+        }
+
        /// <inheritdoc />
        public override void Build(BuildOptions options)
        {
@@ -47,44 +71,44 @@ namespace Flax.Deps.Dependencies

            foreach (var platform in options.Platforms)
            {
-                BuildStarted(platform);
-                switch (platform)
+                foreach (var architecture in options.Architectures)
                {
-                case TargetPlatform.Windows:
-                {
-                    var solutionPath = Path.Combine(root, "DirectXTex_Desktop_2022_Win10.sln");
-                    var binFolder = Path.Combine(root, "DirectXTex", "Bin", "Desktop_2022_Win10");
-                    foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
+                    BuildStarted(platform, architecture);
+                    switch (platform)
                    {
+                    case TargetPlatform.Windows:
+                    {
+                        var solutionPath = Path.Combine(root, "DirectXTex_Desktop_2022_Win10.sln");
+                        var binFolder = Path.Combine(root, "DirectXTex", "Bin", "Desktop_2022_Win10");
                        Deploy.VCEnvironment.BuildSolution(solutionPath, configuration, architecture.ToString());
                        var depsFolder = GetThirdPartyFolder(options, platform, architecture);
                        foreach (var file in outputFileNames)
                            Utilities.FileCopy(Path.Combine(binFolder, architecture.ToString(), configuration, file), Path.Combine(depsFolder, file));
+                        break;
+                    }
+                    case TargetPlatform.UWP:
+                    {
+                        var solutionPath = Path.Combine(root, "DirectXTex_Windows10_2019.sln");
+                        var binFolder = Path.Combine(root, "DirectXTex", "Bin", "Windows10_2019");
+                        Deploy.VCEnvironment.BuildSolution(solutionPath, configuration, "x64");
+                        var depsFolder = GetThirdPartyFolder(options, platform, architecture);
+                        foreach (var file in outputFileNames)
+                            Utilities.FileCopy(Path.Combine(binFolder, "x64", configuration, file), Path.Combine(depsFolder, file));
+                        break;
+                    }
+                    case TargetPlatform.XboxOne:
+                    case TargetPlatform.XboxScarlett:
+                    {
+                        var solutionPath = Path.Combine(root, "DirectXTex_GDK_2022.sln");
+                        var binFolder = Path.Combine(root, "DirectXTex", "Bin", "GDK_2022");
+                        var xboxName = platform == TargetPlatform.XboxOne ? "Gaming.Xbox.XboxOne.x64" : "Gaming.Xbox.Scarlett.x64";
+                        Deploy.VCEnvironment.BuildSolution(solutionPath, configuration, xboxName);
+                        var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
+                        foreach (var file in outputFileNames)
+                            Utilities.FileCopy(Path.Combine(binFolder, xboxName, configuration, file), Path.Combine(depsFolder, file));
+                        break;
+                    }
                    }
-                    break;
-                }
-                case TargetPlatform.UWP:
-                {
-                    var solutionPath = Path.Combine(root, "DirectXTex_Windows10_2019.sln");
-                    var binFolder = Path.Combine(root, "DirectXTex", "Bin", "Windows10_2019");
-                    Deploy.VCEnvironment.BuildSolution(solutionPath, configuration, "x64");
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
-                    foreach (var file in outputFileNames)
-                        Utilities.FileCopy(Path.Combine(binFolder, "x64", configuration, file), Path.Combine(depsFolder, file));
-                    break;
-                }
-                case TargetPlatform.XboxOne:
-                case TargetPlatform.XboxScarlett:
-                {
-                    var solutionPath = Path.Combine(root, "DirectXTex_GDK_2022.sln");
-                    var binFolder = Path.Combine(root, "DirectXTex", "Bin", "GDK_2022");
-                    var xboxName = platform == TargetPlatform.XboxOne ? "Gaming.Xbox.XboxOne.x64" : "Gaming.Xbox.Scarlett.x64";
-                    Deploy.VCEnvironment.BuildSolution(solutionPath, configuration, xboxName);
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
-                    foreach (var file in outputFileNames)
-                        Utilities.FileCopy(Path.Combine(binFolder, xboxName, configuration, file), Path.Combine(depsFolder, file));
-                    break;
-                }
                }
            }

--- a/Source/Tools/Flax.Build/Deps/Dependencies/EnvDTE.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/EnvDTE.cs
@@ -0,0 +1,92 @@
+// Copyright (c) Wojciech Figat. All rights reserved.
+
+using System.IO;
+using System.IO.Compression;
+using Flax.Build;
+
+namespace Flax.Deps.Dependencies
+{
+    /// <summary>
+    /// Visual Studio EnvDTE COM library. https://learn.microsoft.com/en-us/dotnet/api/envdte?view=visualstudiosdk-2022 (this dependency deploys the Microsoft.VisualStudio.Setup.Configuration.Native NuGet package: its static library per architecture and the Setup.Configuration.h header).
+    /// </summary>
+    /// <seealso cref="Flax.Deps.Dependency" />
+    class EnvDTE : Dependency
+    {
+        /// <inheritdoc />
+        public override TargetPlatform[] Platforms // target platforms reported for this dependency, chosen by BuildPlatform
+        {
+            get
+            {
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows:
+                    return new[] // Windows: only the Windows target is reported
+                    {
+                        TargetPlatform.Windows,
+                    };
+                default: return new TargetPlatform[0]; // any other BuildPlatform value: empty array (nothing to build)
+                }
+            }
+        }
+
+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures // architectures reported for this dependency, chosen by BuildPlatform
+        {
+            get
+            {
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows:
+                    return new[] // Windows: both x64 and ARM64 are reported
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                default: return new TargetArchitecture[0]; // any other BuildPlatform value: empty array (no architectures)
+                }
+            }
+        }
+
+        /// <inheritdoc />
+        public override void Build(BuildOptions options) // downloads the NuGet package, extracts it, then copies the .lib and header for every platform/architecture pair in options
+        {
+            // The former `options.IntermediateFolder.Replace("/" + GetType().Name, ...)` statement was dropped: string.Replace returns a new string and its result was discarded, so it never changed the folder used below.
+
+            // Get the source (the download is skipped when package.zip already exists in the intermediate folder)
+            var root = options.IntermediateFolder;
+            var packagePath = Path.Combine(root, "package.zip");
+            if (!File.Exists(packagePath))
+            {
+                Downloader.DownloadFileFromUrlToPath("https://www.nuget.org/api/v2/package/Microsoft.VisualStudio.Setup.Configuration.Native/3.14.2075", packagePath);
+            }
+            var extractedPath = Path.Combine(root, "extracted");
+            if (!Directory.Exists(extractedPath)) // extraction is skipped when the "extracted" folder is already present
+            {
+                using (ZipArchive archive = ZipFile.Open(packagePath, ZipArchiveMode.Read))
+                    archive.ExtractToDirectory(extractedPath);
+            }
+            root = extractedPath; // from here on, paths are resolved inside the extracted package
+
+            foreach (var platform in options.Platforms)
+            {
+                foreach (var architecture in options.Architectures)
+                {
+                    BuildStarted(platform, architecture);
+                    switch (platform) // platforms other than Windows have no case and are left untouched
+                    {
+                    case TargetPlatform.Windows:
+                    {
+                        var bin = Path.Combine(root, "lib", "native", "v141", architecture.ToString().ToLowerInvariant()); // invariant casing: the folder name must not depend on the current culture
+                        var depsFolder = GetThirdPartyFolder(options, platform, architecture);
+                        Utilities.FileCopy(Path.Combine(bin, "Microsoft.VisualStudio.Setup.Configuration.Native.lib"), Path.Combine(depsFolder, "Microsoft.VisualStudio.Setup.Configuration.Native.lib"));
+
+                        var include = Path.Combine(root, "lib", "native", "include"); // the header source path does not depend on the architecture; it is copied on every iteration
+                        Utilities.FileCopy(Path.Combine(include, "Setup.Configuration.h"), Path.Combine(options.ThirdPartyFolder, "Microsoft.VisualStudio.Setup.Configuration.Native", "Setup.Configuration.h"));
+                        break;
+                    }
+                    }
+                }
+            }
+        }
+    }
+}
--- a/Source/Tools/Flax.Build/Deps/Dependencies/NewtonsoftJson.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/NewtonsoftJson.cs
@@ -36,6 +36,24 @@ namespace Flax.Deps.Dependencies
            }
        }

+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures // architectures reported for this dependency, chosen by BuildPlatform
+        {
+            get
+            {
+                switch (BuildPlatform) // NOTE(review): BuildPlatform is presumably the platform running the deps build — confirm
+                {
+                case TargetPlatform.Windows:
+                    return new[] // Windows: both x64 and ARM64 are reported
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                default: return new TargetArchitecture[0]; // any other BuildPlatform value: empty array (no architectures)
+                }
+            }
+        }
+
        /// <inheritdoc />
        public override void Build(BuildOptions options)
        {
--- a/Source/Tools/Flax.Build/Deps/Dependencies/NvCloth.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/NvCloth.cs
@@ -1,5 +1,6 @@
 // Copyright (c) Wojciech Figat. All rights reserved.

+using System;
 using System.Collections.Generic;
 using System.IO;
 using System.Linq;
@@ -16,40 +17,6 @@ namespace Flax.Deps.Dependencies
    {
        private string root, nvCloth;

-        /// <inheritdoc />
-        public override TargetPlatform[] Platforms
-        {
-            get
-            {
-                switch (BuildPlatform)
-                {
-                case TargetPlatform.Windows:
-                    return new[]
-                    {
-                        TargetPlatform.Windows,
-                        TargetPlatform.XboxOne,
-                        TargetPlatform.XboxScarlett,
-                        TargetPlatform.PS4,
-                        TargetPlatform.PS5,
-                        TargetPlatform.Switch,
-                        TargetPlatform.Android,
-                    };
-                case TargetPlatform.Linux:
-                    return new[]
-                    {
-                        TargetPlatform.Linux,
-                    };
-                case TargetPlatform.Mac:
-                    return new[]
-                    {
-                        TargetPlatform.Mac,
-                        TargetPlatform.iOS,
-                    };
-                default: return new TargetPlatform[0];
-                }
-            }
-        }
-
        /// <inheritdoc />
        public override void Build(BuildOptions options)
        {
@@ -59,41 +26,51 @@ namespace Flax.Deps.Dependencies
            // Get the source
            CloneGitRepoSingleBranch(root, "https://github.com/FlaxEngine/NvCloth.git", "master");

+            // Patch the CMakeLists.txt to support custom compilation flags
+            foreach (var os in new[] { "android", "ios", "linux", "mac", "windows", })
+            {
+                var filePath = Path.Combine(nvCloth, "compiler", "cmake", os, "CMakeLists.txt");
+                var appendLine = "SET(CMAKE_CXX_FLAGS \"${CMAKE_CXX_FLAGS} ${NVCLOTH_CXX_FLAGS}\")";
+                if (!File.ReadAllText(filePath).Contains(appendLine))
+                    File.AppendAllText(filePath, Environment.NewLine + appendLine + Environment.NewLine);
+            }
+
            foreach (var platform in options.Platforms)
            {
-                BuildStarted(platform);
-                switch (platform)
+                foreach (var architecture in options.Architectures)
                {
-                case TargetPlatform.Windows:
-                    Build(options, platform, TargetArchitecture.x64);
-                    Build(options, platform, TargetArchitecture.ARM64);
-                    break;
-                case TargetPlatform.XboxOne:
-                case TargetPlatform.XboxScarlett:
-                    Build(options, platform, TargetArchitecture.x64);
-                    break;
-                case TargetPlatform.PS4:
-                case TargetPlatform.PS5:
-                    Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "NvCloth"), root, true, true);
-                    Build(options, platform, TargetArchitecture.x64);
-                    break;
-                case TargetPlatform.Switch:
-                    Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "NvCloth"), root, true, true);
-                    Build(options, platform, TargetArchitecture.ARM64);
-                    break;
-                case TargetPlatform.Android:
-                    Build(options, platform, TargetArchitecture.ARM64);
-                    break;
-                case TargetPlatform.Mac:
-                    Build(options, platform, TargetArchitecture.x64);
-                    Build(options, platform, TargetArchitecture.ARM64);
-                    break;
-                case TargetPlatform.iOS:
-                    Build(options, platform, TargetArchitecture.ARM64);
-                    break;
-                case TargetPlatform.Linux:
-                    Build(options, platform, TargetArchitecture.x64);
-                    break;
+                    BuildStarted(platform, architecture);
+                    switch (platform)
+                    {
+                    case TargetPlatform.Windows:
+                        Build(options, platform, architecture);
+                        break;
+                    case TargetPlatform.XboxOne:
+                    case TargetPlatform.XboxScarlett:
+                        Build(options, platform, TargetArchitecture.x64);
+                        break;
+                    case TargetPlatform.PS4:
+                    case TargetPlatform.PS5:
+                        Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "NvCloth"), root, true, true);
+                        Build(options, platform, TargetArchitecture.x64);
+                        break;
+                    case TargetPlatform.Switch:
+                        Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "NvCloth"), root, true, true);
+                        Build(options, platform, TargetArchitecture.ARM64);
+                        break;
+                    case TargetPlatform.Android:
+                        Build(options, platform, TargetArchitecture.ARM64);
+                        break;
+                    case TargetPlatform.Mac:
+                        Build(options, platform, architecture);
+                        break;
+                    case TargetPlatform.iOS:
+                        Build(options, platform, TargetArchitecture.ARM64);
+                        break;
+                    case TargetPlatform.Linux:
+                        Build(options, platform, architecture);
+                        break;
+                    }
                }
            }

@@ -110,7 +87,7 @@ namespace Flax.Deps.Dependencies
            // Peek options
            var binariesPrefix = string.Empty;
            var binariesPostfix = string.Empty;
-            var cmakeArgs = "-DNV_CLOTH_ENABLE_DX11=0 -DNV_CLOTH_ENABLE_CUDA=0 -DPX_GENERATE_GPU_PROJECTS=0";
+            var cmakeArgs = "-DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DNV_CLOTH_ENABLE_DX11=0 -DNV_CLOTH_ENABLE_CUDA=0 -DPX_GENERATE_GPU_PROJECTS=0";
            var cmakeName = string.Empty;
            var buildFolder = Path.Combine(nvCloth, "compiler", platform.ToString() + '_' + architecture.ToString());
            var envVars = new Dictionary<string, string>();
@@ -154,7 +131,7 @@ namespace Flax.Deps.Dependencies
                }
                break;
            case TargetPlatform.Mac:
-                cmakeArgs += " -DTARGET_BUILD_PLATFORM=mac";
+                cmakeArgs += " -DTARGET_BUILD_PLATFORM=mac -DNVCLOTH_CXX_FLAGS=\"-Wno-error=poison-system-directories -Wno-error=missing-include-dirs\"";
                cmakeName = "mac";
                binariesPrefix = "lib";
                break;
@@ -164,7 +141,7 @@ namespace Flax.Deps.Dependencies
                binariesPrefix = "lib";
                break;
            case TargetPlatform.Linux:
-                cmakeArgs += " -DTARGET_BUILD_PLATFORM=linux";
+                cmakeArgs += " -DTARGET_BUILD_PLATFORM=linux -DNVCLOTH_CXX_FLAGS=\"-Wno-error=poison-system-directories -Wno-error=missing-include-dirs\"";
                cmakeName = "linux";
                binariesPrefix = "lib";
                envVars.Add("CC", "clang-" + Configuration.LinuxClangMinVer);
--- a/Source/Tools/Flax.Build/Deps/Dependencies/OpenAL.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/OpenAL.cs
@@ -1,5 +1,5 @@
 // Copyright (c) Wojciech Figat. All rights reserved.
-
+//#define USE_GIT_REPOSITORY
 using System;
 using System.Collections.Generic;
 using System.IO;
@@ -45,132 +45,75 @@ namespace Flax.Deps.Dependencies
            }
        }

+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures // architectures reported for this dependency, chosen by BuildPlatform
+        {
+            get
+            {
+                switch (BuildPlatform) // NOTE(review): the cases below include iOS and Android — confirm BuildPlatform can actually take those values
+                {
+                case TargetPlatform.Windows:
+                    return new[] // Windows: x64 and ARM64
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                case TargetPlatform.Linux:
+                    return new[] // Linux: x64 only
+                    {
+                        TargetArchitecture.x64,
+                        //TargetArchitecture.ARM64, // NOTE(review): ARM64 is disabled for the Linux case; the reason is not visible here — confirm
+                    };
+                case TargetPlatform.Mac:
+                    return new[] // Mac: x64 and ARM64
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                case TargetPlatform.iOS:
+                    return new[] // iOS: ARM64 only
+                    {
+                        TargetArchitecture.ARM64,
+                    };
+                case TargetPlatform.Android:
+                    return new[] // Android: ARM64 only
+                    {
+                        TargetArchitecture.ARM64,
+                    };
+                default: return new TargetArchitecture[0]; // any other BuildPlatform value: empty array (no architectures)
+                }
+            }
+        }
+
        /// <inheritdoc />
        public override void Build(BuildOptions options)
        {
            var root = options.IntermediateFolder;
            var version = "1.24.3";
            var configuration = "Release";
+            var cmakeArgs = "-DCMAKE_POLICY_VERSION_MINIMUM=3.5";
            var dstIncludePath = Path.Combine(options.ThirdPartyFolder, "OpenAL");
            var noSSL = true; // OpenAL Soft website has broken certs

-            foreach (var platform in options.Platforms)
-            {
-                BuildStarted(platform);
-                switch (platform)
-                {
-                case TargetPlatform.Windows:
-                {
-                    var binariesToCopy = new[]
-                    {
-                        "OpenAL32.lib",
-                        "OpenAL32.dll",
-                    };
-
-                    // Get the source
-                    CloneGitRepo(root, "https://github.com/kcat/openal-soft.git");
-                    GitCheckout(root, "master", "dc7d7054a5b4f3bec1dc23a42fd616a0847af948"); // 1.24.3
-
-                    // Build for Win64 and ARM64
-                    foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
-                    {
-                        var buildDir = Path.Combine(root, "build-" + architecture.ToString());
-                        var solutionPath = Path.Combine(buildDir, "OpenAL.sln");
-
-                        RunCmake(root, platform, architecture, $"-B\"{buildDir}\" -DBUILD_SHARED_LIBS=OFF -DCMAKE_C_FLAGS=\"/D_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR /EHsc\" -DCMAKE_CXX_FLAGS=\"/D_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR /EHsc\"");
-                        Deploy.VCEnvironment.BuildSolution(solutionPath, configuration, architecture.ToString());
-                        var depsFolder = GetThirdPartyFolder(options, platform, architecture);
-                        foreach (var file in binariesToCopy)
-                            Utilities.FileCopy(Path.Combine(buildDir, configuration, file), Path.Combine(depsFolder, Path.GetFileName(file)));
-                    }
-                    
-#if false
-                    // Get the binaries
-                    var packagePath = Path.Combine(root, "package.zip");
-                    if (!File.Exists(packagePath))
-                        Downloader.DownloadFileFromUrlToPath("https://openal-soft.org/openal-binaries/openal-soft-" + version + "-bin.zip", packagePath, noSSL);
-                    using (ZipArchive archive = ZipFile.Open(packagePath, ZipArchiveMode.Read))
-                    {
-                        if (!Directory.Exists(root))
-                            archive.ExtractToDirectory(root);
-                        root = Path.Combine(root, archive.Entries.First().FullName);
-                    }
-
-                    // Deploy Win64 binaries
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
-                    Utilities.FileCopy(Path.Combine(root, "bin", "Win64", "soft_oal.dll"), Path.Combine(depsFolder, "OpenAL32.dll"));
-                    Utilities.FileCopy(Path.Combine(root, "libs", "Win64", "OpenAL32.lib"), Path.Combine(depsFolder, "OpenAL32.lib"));
-
-                    // Deploy license
-                    Utilities.FileCopy(Path.Combine(root, "COPYING"), Path.Combine(dstIncludePath, "COPYING"), true);
-
-                    // Deploy header files
-                    var files = Directory.GetFiles(Path.Combine(root, "include", "AL"));
-                    foreach (var file in files)
-                    {
-                        Utilities.FileCopy(file, Path.Combine(dstIncludePath, Path.GetFileName(file)));
-                    }
+#if !USE_GIT_REPOSITORY
+            if (options.Platforms.Contains(TargetPlatform.Windows))
 #endif
-                    break;
-                }
-                case TargetPlatform.Linux:
+            {
+                // Get the source
+                CloneGitRepo(root, "https://github.com/kcat/openal-soft.git");
+                GitCheckout(root, "master", "dc7d7054a5b4f3bec1dc23a42fd616a0847af948"); // 1.24.3
+            }
+#if !USE_GIT_REPOSITORY
+            else
+            {
+                // Get the source
+                var packagePath = Path.Combine(root, $"package-{version}.zip");
+                if (!File.Exists(packagePath))
                {
-                    var binariesToCopy = new[]
-                    {
-                        "libopenal.a",
-                    };
-                    var envVars = new Dictionary<string, string>
-                    {
-                        { "CC", "clang-" + Configuration.LinuxClangMinVer },
-                        { "CC_FOR_BUILD", "clang-" + Configuration.LinuxClangMinVer },
-                        { "CXX", "clang++-" + Configuration.LinuxClangMinVer },
-                        { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
-                    };
-                    var config = $"-DALSOFT_REQUIRE_ALSA=ON " +
-                                 $"-DALSOFT_REQUIRE_OSS=ON " +
-                                 $"-DALSOFT_REQUIRE_PORTAUDIO=ON " +
-                                 $"-DALSOFT_REQUIRE_PULSEAUDIO=ON " +
-                                 $"-DALSOFT_REQUIRE_JACK=ON " +
-                                 $"-DALSOFT_REQUIRE_PIPEWIRE=ON " +
-                                 $"-DALSOFT_EMBED_HRTF_DATA=YES ";
-
-                    // Get the source
-                    var packagePath = Path.Combine(root, "package.zip");
-                    File.Delete(packagePath);
-                    Downloader.DownloadFileFromUrlToPath("https://openal-soft.org/openal-releases/openal-soft-" + version + ".tar.bz2", packagePath, noSSL);
-                    Utilities.Run("tar", "xjf " + packagePath.Replace('\\', '/'), null, root, Utilities.RunOptions.ConsoleLogOutput);
-
-                    // Use separate build directory
-                    root = Path.Combine(root, "openal-soft-" + version);
-                    var buildDir = Path.Combine(root, "build");
-                    SetupDirectory(buildDir, true);
-
-                    // Build for Linux
-                    Utilities.Run("cmake", $"-G \"Unix Makefiles\" -DCMAKE_BUILD_TYPE={configuration} -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DLIBTYPE=STATIC {config} ..", null, buildDir, Utilities.RunOptions.ConsoleLogOutput, envVars);
-                    BuildCmake(buildDir, configuration, envVars);
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
-                    foreach (var file in binariesToCopy)
-                        Utilities.FileCopy(Path.Combine(buildDir, file), Path.Combine(depsFolder, file));
-                    break;
-                }
-                case TargetPlatform.Android:
-                {
-                    var binariesToCopy = new[]
-                    {
-                        "libopenal.a",
-                    };
-                    var envVars = new Dictionary<string, string>
-                    {
-                        { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
-                    };
-                    var config = " -DALSOFT_REQUIRE_OBOE=OFF -DALSOFT_REQUIRE_OPENSL=ON -DALSOFT_EMBED_HRTF_DATA=YES";
-
-                    // Get the source
-                    var packagePath = Path.Combine(root, "package.zip");
-                    File.Delete(packagePath);
                    Downloader.DownloadFileFromUrlToPath("https://openal-soft.org/openal-releases/openal-soft-" + version + ".tar.bz2", packagePath, noSSL);
                    if (Platform.BuildTargetPlatform == TargetPlatform.Windows)
                    {
+                        // TODO: Maybe use PowerShell Expand-Archive instead?
                        var sevenZip = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.ProgramFiles), "7-Zip", "7z.exe");
                        Utilities.Run(sevenZip, "x package.zip", null, root);
                        Utilities.Run(sevenZip, "x package", null, root);
@@ -179,89 +122,167 @@ namespace Flax.Deps.Dependencies
                    {
                        Utilities.Run("tar", "xjf " + packagePath.Replace('\\', '/'), null, root, Utilities.RunOptions.ConsoleLogOutput);
                    }
-
-                    // Use separate build directory
-                    root = Path.Combine(root, "openal-soft-" + version);
-                    var buildDir = Path.Combine(root, "build");
-                    SetupDirectory(buildDir, true);
-
-                    // Build
-                    RunCmake(buildDir, platform, TargetArchitecture.ARM64, ".. -DLIBTYPE=STATIC -DCMAKE_BUILD_TYPE=" + configuration + config, envVars);
-                    BuildCmake(buildDir, envVars);
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
-                    foreach (var file in binariesToCopy)
-                        Utilities.FileCopy(Path.Combine(buildDir, file), Path.Combine(depsFolder, file));
-                    break;
                }
-                case TargetPlatform.Mac:
+            }
+#endif
+
+            foreach (var platform in options.Platforms)
+            {
+                foreach (var architecture in options.Architectures)
                {
-                    var binariesToCopy = new[]
+                    BuildStarted(platform, architecture);
+                    switch (platform)
                    {
-                        "libopenal.a",
-                    };
-                    var envVars = new Dictionary<string, string>
+                    case TargetPlatform.Windows:
                    {
-                        { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
-                    };
-                    var config = " -DALSOFT_REQUIRE_COREAUDIO=ON -DALSOFT_EMBED_HRTF_DATA=YES";
+                        var binariesToCopy = new[]
+                        {
+                            "OpenAL32.lib",
+                            "OpenAL32.dll",
+                        };

-                    // Get the source
-                    var packagePath = Path.Combine(root, "package.zip");
-                    File.Delete(packagePath);
-                    Downloader.DownloadFileFromUrlToPath("https://openal-soft.org/openal-releases/openal-soft-" + version + ".tar.bz2", packagePath, noSSL);
-                    Utilities.Run("tar", "xjf " + packagePath.Replace('\\', '/'), null, root, Utilities.RunOptions.ConsoleLogOutput);
-
-                    // Use separate build directory
-                    root = Path.Combine(root, "openal-soft-" + version);
-                    var buildDir = Path.Combine(root, "build");
-
-                    // Build for Mac
-                    foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
-                    {
+                        // Build for Windows
+                        var buildDir = Path.Combine(root, "build-" + architecture.ToString());
+                        var solutionPath = Path.Combine(buildDir, "OpenAL.sln");
                        SetupDirectory(buildDir, true);
-                        RunCmake(buildDir, platform, architecture, ".. -DLIBTYPE=STATIC -DCMAKE_BUILD_TYPE=" + configuration + config, envVars);
+                        RunCmake(root, platform, architecture, $"-B\"{buildDir}\" -DBUILD_SHARED_LIBS=OFF -DCMAKE_C_FLAGS=\"/D_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR /EHsc\" -DCMAKE_CXX_FLAGS=\"/D_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR /EHsc\" " + cmakeArgs);
+                        Deploy.VCEnvironment.BuildSolution(solutionPath, configuration, architecture.ToString());
+                        var depsFolder = GetThirdPartyFolder(options, platform, architecture);
+                        foreach (var file in binariesToCopy)
+                            Utilities.FileCopy(Path.Combine(buildDir, configuration, file), Path.Combine(depsFolder, Path.GetFileName(file)));
+                        break;
+                    }
+                    case TargetPlatform.Linux:
+                    {
+                        // Static library produced by the CMake build; copied into the deps folder at the end
+                        var binariesToCopy = new[] { "libopenal.a" };
+                        var envVars = new Dictionary<string, string>
+                        {
+                            { "CC", "clang-" + Configuration.LinuxClangMinVer },
+                            { "CC_FOR_BUILD", "clang-" + Configuration.LinuxClangMinVer },
+                            { "CXX", "clang++-" + Configuration.LinuxClangMinVer },
+                            { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
+                        };
+                        // Must start with a space: it is concatenated directly after the CMAKE_BUILD_TYPE value below
+                        var config = " -DALSOFT_REQUIRE_ALSA=ON " +
+                                     "-DALSOFT_REQUIRE_OSS=ON " +
+                                     "-DALSOFT_REQUIRE_PORTAUDIO=ON " +
+                                     "-DALSOFT_REQUIRE_PULSEAUDIO=ON " +
+                                     "-DALSOFT_REQUIRE_JACK=ON " +
+                                     "-DALSOFT_REQUIRE_PIPEWIRE=ON " +
+                                     "-DALSOFT_EMBED_HRTF_DATA=YES "
+                                     + cmakeArgs;
+
+#if !USE_GIT_REPOSITORY
+                        // Enter the extracted sources folder only once; this case runs again for every architecture
+                        if (Path.GetFileName(root) != "openal-soft-" + version)
+                            root = Path.Combine(root, "openal-soft-" + version);
+#endif
+                        var buildDir = Path.Combine(root, "build-" + architecture.ToString());
+                        SetupDirectory(buildDir, true);
+
+                        // Configure and build in a per-architecture build directory, then deploy the library
+                        RunCmake(root, platform, architecture, $"-B\"{buildDir}\" -DLIBTYPE=STATIC -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_BUILD_TYPE=" + configuration + config, envVars);
+                        BuildCmake(buildDir, configuration, envVars);
+                        var depsFolder = GetThirdPartyFolder(options, platform, architecture);
+                        foreach (var file in binariesToCopy)
+                            Utilities.FileCopy(Path.Combine(buildDir, file), Path.Combine(depsFolder, file));
+                        break;
+                    }
+                    case TargetPlatform.Android:
+                    {
+                        // Static library produced by the CMake build; copied into the deps folder at the end
+                        var binariesToCopy = new[] { "libopenal.a" };
+                        var envVars = new Dictionary<string, string>
+                        {
+                            { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
+                        };
+                        // Must start with a space: it is concatenated directly after the CMAKE_BUILD_TYPE value below
+                        var config = " -DALSOFT_REQUIRE_OBOE=OFF -DALSOFT_REQUIRE_OPENSL=ON -DALSOFT_EMBED_HRTF_DATA=YES " + cmakeArgs;
+
+#if !USE_GIT_REPOSITORY
+                        // Enter the extracted sources folder only once; this case runs again for every architecture
+                        if (Path.GetFileName(root) != "openal-soft-" + version)
+                            root = Path.Combine(root, "openal-soft-" + version);
+#endif
+                        var buildDir = Path.Combine(root, "build-" + architecture.ToString());
+                        SetupDirectory(buildDir, true);
+
+                        // Build (Android is always configured and deployed as ARM64, independent of the loop architecture)
+                        RunCmake(root, platform, TargetArchitecture.ARM64, $"-B\"{buildDir}\" -DLIBTYPE=STATIC -DCMAKE_BUILD_TYPE=" + configuration + config, envVars);
+                        BuildCmake(buildDir, envVars);
+                        var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
+                        foreach (var file in binariesToCopy)
+                            Utilities.FileCopy(Path.Combine(buildDir, file), Path.Combine(depsFolder, file));
+                        break;
+                    }
+                    case TargetPlatform.Mac:
+                    {
+                        // Static library produced by the CMake build; copied into the deps folder at the end
+                        var binariesToCopy = new[] { "libopenal.a" };
+                        var envVars = new Dictionary<string, string>
+                        {
+                            { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
+                        };
+                        // Leading space matters: this string is concatenated directly after the CMAKE_BUILD_TYPE value below
+                        var config = " -DALSOFT_REQUIRE_COREAUDIO=ON -DALSOFT_EMBED_HRTF_DATA=YES " + cmakeArgs;
+
+#if !USE_GIT_REPOSITORY
+                        // Enter the extracted sources folder only once; this case runs again for every architecture
+                        if (Path.GetFileName(root) != "openal-soft-" + version)
+                            root = Path.Combine(root, "openal-soft-" + version);
+#endif
+                        var buildDir = Path.Combine(root, "build-" + architecture.ToString());
+                        SetupDirectory(buildDir, true);
+
+                        // Build for Mac in a per-architecture build directory, then deploy the library
+                        RunCmake(root, platform, architecture, $"-B\"{buildDir}\" -DLIBTYPE=STATIC -DCMAKE_BUILD_TYPE=" + configuration + config, envVars);
                         BuildCmake(buildDir, envVars);
                         var depsFolder = GetThirdPartyFolder(options, platform, architecture);
                         foreach (var file in binariesToCopy)
                             Utilities.FileCopy(Path.Combine(buildDir, file), Path.Combine(depsFolder, file));
+                        break;
                     }
-                    break;
-                }
-                case TargetPlatform.iOS:
-                {
-                    var binariesToCopy = new[]
+                    case TargetPlatform.iOS:
                     {
-                        "libopenal.a",
-                    };
-                    var envVars = new Dictionary<string, string>
-                    {
-                        { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
-                    };
-                    var config = " -DALSOFT_REQUIRE_COREAUDIO=ON -DALSOFT_EMBED_HRTF_DATA=YES";
+                        // Static library produced by the CMake build; copied into the deps folder at the end
+                        var binariesToCopy = new[] { "libopenal.a" };
+                        var envVars = new Dictionary<string, string>
+                        {
+                            { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
+                        };
+                        // Leading space matters: this string is concatenated directly after the CMAKE_BUILD_TYPE value below
+                        var config = " -DALSOFT_REQUIRE_COREAUDIO=ON -DALSOFT_EMBED_HRTF_DATA=YES " + cmakeArgs;

-                    // Get the source
-                    var packagePath = Path.Combine(root, "package.zip");
-                    if (!File.Exists(packagePath))
-                    {
-                        Downloader.DownloadFileFromUrlToPath("https://openal-soft.org/openal-releases/openal-soft-" + version + ".tar.bz2", packagePath, noSSL);
-                        Utilities.Run("tar", "xjf " + packagePath.Replace('\\', '/'), null, root, Utilities.RunOptions.ConsoleLogOutput);
+#if !USE_GIT_REPOSITORY
+                        // Enter the extracted sources folder only once; earlier loop iterations may already have done so
+                        if (Path.GetFileName(root) != "openal-soft-" + version)
+                            root = Path.Combine(root, "openal-soft-" + version);
+#endif
+                        var buildDir = Path.Combine(root, "build-" + architecture.ToString());
+                        SetupDirectory(buildDir, true);
+
+                        // Build for iOS (always configured and deployed as ARM64, independent of the loop architecture)
+                        RunCmake(root, platform, TargetArchitecture.ARM64, $"-B\"{buildDir}\" -DCMAKE_SYSTEM_NAME=iOS -DALSOFT_OSX_FRAMEWORK=ON -DLIBTYPE=STATIC -DCMAKE_BUILD_TYPE=" + configuration + config, envVars);
+                        BuildCmake(buildDir, envVars);
+                        var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
+                        foreach (var file in binariesToCopy)
+                            Utilities.FileCopy(Path.Combine(buildDir, file), Path.Combine(depsFolder, file));
+                        break;
+                    }
                    }
-
-                    // Use separate build directory
-                    root = Path.Combine(root, "openal-soft-" + version);
-                    var buildDir = Path.Combine(root, "build");
-
-                    // Build for iOS
-                    SetupDirectory(buildDir, true);
-                    RunCmake(buildDir, platform, TargetArchitecture.ARM64, ".. -DCMAKE_SYSTEM_NAME=iOS -DALSOFT_OSX_FRAMEWORK=ON -DLIBTYPE=STATIC -DCMAKE_BUILD_TYPE=" + configuration + config, envVars);
-                    BuildCmake(buildDir, envVars);
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
-                    foreach (var file in binariesToCopy)
-                        Utilities.FileCopy(Path.Combine(buildDir, file), Path.Combine(depsFolder, file));
-                    break;
-                }
                }
            }
+
+            // Deploy license: copy COPYING from the source root into the destination include folder (trailing `true` is presumably the overwrite flag - confirm against Utilities.FileCopy)
+            Utilities.FileCopy(Path.Combine(root, "COPYING"), Path.Combine(dstIncludePath, "COPYING"), true);
+
+            // Deploy header files: every file directly inside <root>/include/AL is copied flat (by file name) into the destination include folder
+            var files = Directory.GetFiles(Path.Combine(root, "include", "AL"));
+            foreach (var file in files)
+            {
+                Utilities.FileCopy(file, Path.Combine(dstIncludePath, Path.GetFileName(file)));
+            }
        }
    }
 }
--- a/Source/Tools/Flax.Build/Deps/Dependencies/PhysX.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/PhysX.cs
@@ -17,40 +17,6 @@ namespace Flax.Deps.Dependencies
    /// <seealso cref="Flax.Deps.Dependency" />
    class PhysX : Dependency
    {
-        /// <inheritdoc />
-        public override TargetPlatform[] Platforms
-        {
-            get
-            {
-                switch (BuildPlatform)
-                {
-                case TargetPlatform.Windows:
-                    return new[]
-                    {
-                        TargetPlatform.Windows,
-                        TargetPlatform.XboxOne,
-                        TargetPlatform.PS4,
-                        TargetPlatform.PS5,
-                        TargetPlatform.XboxScarlett,
-                        TargetPlatform.Android,
-                        TargetPlatform.Switch,
-                    };
-                case TargetPlatform.Linux:
-                    return new[]
-                    {
-                        TargetPlatform.Linux,
-                    };
-                case TargetPlatform.Mac:
-                    return new[]
-                    {
-                        TargetPlatform.Mac,
-                        TargetPlatform.iOS,
-                    };
-                default: return new TargetPlatform[0];
-                }
-            }
-        }
-
        private string root;
        private string projectGenDir;
        private string projectGenPath;
@@ -65,8 +31,13 @@ namespace Flax.Deps.Dependencies
                if (cmakeSwitch.HasAttribute("name") && cmakeSwitch.Attributes["name"].Value == name)
                {
                    cmakeSwitch.Attributes["value"].Value = value;
+                    return;
                }
            }
+            var child = cmakeSwitches.OwnerDocument.CreateElement(cmakeSwitches.ChildNodes[0].Name);
+            child.SetAttribute("name", name);
+            child.SetAttribute("value", value);
+            cmakeSwitches.AppendChild(child);
        }

        private void Build(BuildOptions options, string preset, TargetPlatform targetPlatform, TargetArchitecture architecture)
@@ -94,11 +65,14 @@ namespace Flax.Deps.Dependencies
            case TargetPlatform.Windows:
                if (architecture == TargetArchitecture.ARM64)
                {
-                    // Windows ARM64 doesn't have GPU support, so avoid copying those DLLs around
+                    // Windows ARM64 doesn't have precompiled files for GPU support, so avoid copying those DLLs around
                    ConfigureCmakeSwitch(cmakeSwitches, "PX_COPY_EXTERNAL_DLL", "OFF");
                    ConfigureCmakeSwitch(cmakeParams, "PX_COPY_EXTERNAL_DLL", "OFF");
                }
                break;
+            case TargetPlatform.Linux:
+                ConfigureCmakeSwitch(cmakeParams, "PHYSX_CXX_FLAGS", "\"-Wno-error=format -Wno-error=unused-but-set-variable -Wno-error=switch-default -Wno-error=invalid-offsetof -Wno-error=unsafe-buffer-usage -Wno-error=unsafe-buffer-usage-in-libc-call -Wno-error=missing-include-dirs\"");
+                break;
            case TargetPlatform.Android:
                ConfigureCmakeSwitch(cmakeParams, "CMAKE_INSTALL_PREFIX", $"install/android-{Configuration.AndroidPlatformApi}/PhysX");
                ConfigureCmakeSwitch(cmakeParams, "ANDROID_NATIVE_API_LEVEL", $"android-{Configuration.AndroidPlatformApi}");
@@ -106,6 +80,7 @@ namespace Flax.Deps.Dependencies
                break;
            case TargetPlatform.Mac:
                ConfigureCmakeSwitch(cmakeParams, "CMAKE_OSX_DEPLOYMENT_TARGET", Configuration.MacOSXMinVer);
+                ConfigureCmakeSwitch(cmakeParams, "PHYSX_CXX_FLAGS", "\"-Wno-error=format -Wno-error=unused-but-set-variable -Wno-error=switch-default -Wno-error=invalid-offsetof -Wno-error=unsafe-buffer-usage -Wno-error=unsafe-buffer-usage-in-libc-call -Wno-error=missing-include-dirs\"");
                break;
            case TargetPlatform.iOS:
                ConfigureCmakeSwitch(cmakeParams, "CMAKE_OSX_DEPLOYMENT_TARGET", Configuration.iOSMinVer);
@@ -122,10 +97,11 @@ namespace Flax.Deps.Dependencies
            string bits;
            string arch;
            string binariesSubDir;
-            string buildPlatform;
+            string buildPlatform = architecture == TargetArchitecture.x86 ? "Win32" : architecture.ToString();
            bool suppressBitsPostfix = false;
            string binariesPrefix = string.Empty;
            var envVars = new Dictionary<string, string>();
+            envVars.Add("CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel);
            switch (architecture)
            {
            case TargetArchitecture.x86:
@@ -146,15 +122,6 @@ namespace Flax.Deps.Dependencies
                break;
            default: throw new InvalidArchitectureException(architecture);
            }
-            switch (architecture)
-            {
-            case TargetArchitecture.x86:
-                buildPlatform = "Win32";
-                break;
-            default:
-                buildPlatform = architecture.ToString();
-                break;
-            }
            var msBuildProps = new Dictionary<string, string>();
            switch (targetPlatform)
            {
@@ -385,60 +352,84 @@ namespace Flax.Deps.Dependencies

            foreach (var platform in options.Platforms)
            {
-                BuildStarted(platform);
-                switch (platform)
+                foreach (var architecture in options.Architectures)
                {
-                case TargetPlatform.Windows:
-                {
-                    Build(options, "vc17win64", platform, TargetArchitecture.x64);
-                    Build(options, "vc17win-arm64", platform, TargetArchitecture.ARM64);
-                    break;
-                }
-                case TargetPlatform.Linux:
-                {
-                    Build(options, "linux", platform, TargetArchitecture.x64);
-                    break;
-                }
-                case TargetPlatform.PS4:
-                {
-                    Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "PhysX"), root, true, true);
-                    Build(options, "ps4", platform, TargetArchitecture.x64);
-                    break;
-                }
-                case TargetPlatform.PS5:
-                {
-                    Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "PhysX"), root, true, true);
-                    Build(options, "ps5", platform, TargetArchitecture.x64);
-                    break;
-                }
-                case TargetPlatform.XboxScarlett:
-                case TargetPlatform.XboxOne:
-                {
-                    Build(options, "vc16win64", platform, TargetArchitecture.x64);
-                    break;
-                }
-                case TargetPlatform.Android:
-                {
-                    Build(options, "android", platform, TargetArchitecture.ARM64);
-                    break;
-                }
-                case TargetPlatform.Switch:
-                {
-                    Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "PhysX"), root, true, true);
-                    Build(options, "switch64", platform, TargetArchitecture.ARM64);
-                    break;
-                }
-                case TargetPlatform.Mac:
-                {
-                    Build(options, "mac64", platform, TargetArchitecture.x64);
-                    Build(options, "mac-arm64", platform, TargetArchitecture.ARM64);
-                    break;
-                }
-                case TargetPlatform.iOS:
-                {
-                    Build(options, "ios64", platform, TargetArchitecture.ARM64);
-                    break;
-                }
+                    BuildStarted(platform, architecture);
+                    switch (platform)
+                    {
+                    case TargetPlatform.Windows:
+                    {
+                        // Presets exist only for x64 and ARM64; prefer the vc18 preset when the v145 toolset is present, else (or on failure) use vc17
+                        var isX64 = architecture == TargetArchitecture.x64;
+                        if (!isX64 && architecture != TargetArchitecture.ARM64)
+                            throw new InvalidArchitectureException(architecture);
+                        var fallbackPreset = isX64 ? "vc17win64" : "vc17win-arm64";
+                        if (!WindowsPlatform.GetToolsets().Any(toolset => toolset.Key == WindowsPlatformToolset.v145))
+                        {
+                            Build(options, fallbackPreset, platform, architecture);
+                            break;
+                        }
+                        try
+                        {
+                            Build(options, isX64 ? "vc18win64" : "vc18win-arm64", platform, architecture);
+                        }
+                        catch (Exception buildError)
+                        {
+                            Log.Warning($"Failed to generate VS2026 solution for PhysX, fallback to VS2022: {buildError.Message}");
+                            Build(options, fallbackPreset, platform, architecture);
+                        }
+                        break;
+                    }
+                    case TargetPlatform.Linux:
+                    {
+                        Build(options, "linux", platform, architecture);
+                        break;
+                    }
+                    case TargetPlatform.PS4:
+                    {
+                        Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "PhysX"), root, true, true);
+                        Build(options, "ps4", platform, TargetArchitecture.x64);
+                        break;
+                    }
+                    case TargetPlatform.PS5:
+                    {
+                        Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "PhysX"), root, true, true);
+                        Build(options, "ps5", platform, TargetArchitecture.x64);
+                        break;
+                    }
+                    case TargetPlatform.XboxScarlett:
+                    case TargetPlatform.XboxOne:
+                    {
+                        Build(options, "vc16win64", platform, TargetArchitecture.x64);
+                        break;
+                    }
+                    case TargetPlatform.Android:
+                    {
+                        Build(options, "android", platform, TargetArchitecture.ARM64);
+                        break;
+                    }
+                    case TargetPlatform.Switch:
+                    {
+                        Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "PhysX"), root, true, true);
+                        Build(options, "switch64", platform, TargetArchitecture.ARM64);
+                        break;
+                    }
+                    case TargetPlatform.Mac:
+                    {
+                        switch (architecture) // one PhysX preset per supported Mac architecture; anything else is rejected
+                        {
+                        case TargetArchitecture.x64: Build(options, "mac64", platform, architecture); break;
+                        case TargetArchitecture.ARM64: Build(options, "mac-arm64", platform, architecture); break;
+                        default: throw new InvalidArchitectureException(architecture);
+                        }
+                        break;
+                    }
+                    case TargetPlatform.iOS:
+                    {
+                        Build(options, "ios64", platform, TargetArchitecture.ARM64);
+                        break;
+                    }
+                    }
                }
            }

@@ -446,7 +437,7 @@ namespace Flax.Deps.Dependencies
            var dstIncludePath = Path.Combine(options.ThirdPartyFolder, "PhysX");
            Directory.GetFiles(dstIncludePath, "*.h", SearchOption.AllDirectories).ToList().ForEach(File.Delete);
            Utilities.FileCopy(Path.Combine(root, "LICENSE.md"), Path.Combine(dstIncludePath, "License.txt"));
-            Utilities.DirectoryCopy(Path.Combine(root, "physx", "include"), dstIncludePath);
+            Utilities.DirectoryCopy(Path.Combine(root, "physx", "include"), dstIncludePath, true, true);
        }
    }
 }
--- a/Source/Tools/Flax.Build/Deps/Dependencies/UVAtlas.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/UVAtlas.cs
@@ -29,6 +29,24 @@ namespace Flax.Deps.Dependencies
            }
        }

+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                // Architectures are reported only when BuildPlatform is Windows (x64 and ARM64);
+                // every other build platform yields an empty list.
+                if (BuildPlatform != TargetPlatform.Windows)
+                    return new TargetArchitecture[0];
+
+                return new[]
+                {
+                    TargetArchitecture.x64,
+                    TargetArchitecture.ARM64,
+                };
+            }
+        }
+
        /// <inheritdoc />
        public override void Build(BuildOptions options)
        {
@@ -47,23 +65,23 @@ namespace Flax.Deps.Dependencies

            foreach (var platform in options.Platforms)
            {
-                BuildStarted(platform);
-                switch (platform)
+                foreach (var architecture in options.Architectures)
                {
-                case TargetPlatform.Windows:
-                {
-                    // Build for Win64
-                    foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
+                    BuildStarted(platform, architecture);
+                    switch (platform)
                    {
+                    case TargetPlatform.Windows:
+                    {
+                        // Build for Windows
                        Deploy.VCEnvironment.BuildSolution(solutionPath, configuration, architecture.ToString(), new Dictionary<string, string>() { { "RestorePackagesConfig", "true" } });
                        var depsFolder = GetThirdPartyFolder(options, TargetPlatform.Windows, architecture);
                        foreach (var file in outputFileNames)
                        {
                            Utilities.FileCopy(Path.Combine(binFolder, architecture.ToString(), "Release", file), Path.Combine(depsFolder, file));
                        }
+                        break;
+                    }
                    }
-                    break;
-                }
                }
            }

--- a/Show More
+++ b/Show More