From 215467ae42fe55947718fb2147283b4d89826d75 Mon Sep 17 00:00:00 2001
From: Saas <kaktusreiniger@gmail.com>
Date: Sun, 14 Sep 2025 18:14:38 +0200
Subject: [PATCH 01/33] add ribbon menu buttons to quick open product local
 folder

---
 Source/Editor/Modules/UIModule.cs | 12 ++++++++++++
 1 file changed, 12 insertions(+)
diff --git a/Source/Editor/Modules/UIModule.cs b/Source/Editor/Modules/UIModule.cs
index 19a0c1142..4839f321f 100644
--- a/Source/Editor/Modules/UIModule.cs
+++ b/Source/Editor/Modules/UIModule.cs
@@ -78,6 +78,7 @@ namespace FlaxEditor.Modules
         private ContextMenuButton _menuToolsProfilerWindow;
         private ContextMenuButton _menuToolsSetTheCurrentSceneViewAsDefault;
         private ContextMenuButton _menuToolsTakeScreenshot;
+        private ContextMenuButton _menuToolsOpenLocalFolder;
         private ContextMenuChildMenu _menuWindowApplyWindowLayout;
 
         private ToolStripButton _toolStripSaveAll;
@@ -623,6 +624,13 @@ namespace FlaxEditor.Modules
             _menuToolsTakeScreenshot = cm.AddButton("Take screenshot", inputOptions.TakeScreenshot, Editor.Windows.TakeScreenshot);
             cm.AddSeparator();
             cm.AddButton("Plugins", () => Editor.Windows.PluginsWin.Show());
+            cm.AddSeparator();
+            var childMenu = cm.AddChildMenu("Open product local folder");
+            childMenu.ContextMenu.AddButton("Editor", () => FileSystem.ShowFileExplorer(Globals.ProductLocalFolder));
+            string localAppData = Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData);
+            GameSettings settings = GameSettings.Load<GameSettings>();
+            string path = Path.Combine(localAppData, settings.CompanyName, settings.ProductName);
+            _menuToolsOpenLocalFolder = childMenu.ContextMenu.AddButton("Cooked game", () => FileSystem.ShowFileExplorer(path));
 
             // Window
             MenuWindow = MainMenu.AddButton("Window");
@@ -949,6 +957,10 @@ namespace FlaxEditor.Modules
             _menuToolsBuildNavMesh.Enabled = canEdit;
             _menuToolsCancelBuilding.Enabled = GameCooker.IsRunning;
             _menuToolsSetTheCurrentSceneViewAsDefault.Enabled = Level.ScenesCount > 0;
+            string localAppData = Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData);
+            GameSettings settings = GameSettings.Load<GameSettings>();
+            string path = Path.Combine(localAppData, settings.CompanyName, settings.ProductName);
+            _menuToolsOpenLocalFolder.Enabled = Directory.Exists(path);
 
             c.PerformLayout();
         }

From 241a8bc7643bf033ebc99ac8c2f22d34e590597e Mon Sep 17 00:00:00 2001
From: Saas <kaktusreiniger@gmail.com>
Date: Sat, 11 Oct 2025 20:30:51 +0200
Subject: [PATCH 02/33] add incrementing/decrementing value box's value with
 arrow keys

---
 Source/Editor/GUI/Input/ValueBox.cs | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/Source/Editor/GUI/Input/ValueBox.cs b/Source/Editor/GUI/Input/ValueBox.cs
index 674ee0697..db3ae5413 100644
--- a/Source/Editor/GUI/Input/ValueBox.cs
+++ b/Source/Editor/GUI/Input/ValueBox.cs
@@ -99,6 +99,11 @@ namespace FlaxEditor.GUI.Input
         /// </summary>
         public event Action SlidingEnd;
 
+        /// <summary>
+        /// If enabled, pressing the arrow up or down key increments/decrements the value.
+        /// </summary>
+        public bool ArrowKeysIncrement = true;
+
         /// <summary>
         /// Gets or sets the slider speed. Use value 0 to disable and hide slider UI.
         /// </summary>
@@ -239,6 +244,27 @@ namespace FlaxEditor.GUI.Input
             ResetViewOffset();
         }
 
+        /// <inheritdoc />
+        public override bool OnKeyDown(KeyboardKeys key)
+        {
+            if (ArrowKeysIncrement && (key == KeyboardKeys.ArrowUp || key == KeyboardKeys.ArrowDown))
+            {
+                bool altDown = Root.GetKey(KeyboardKeys.Alt);
+                bool shiftDown = Root.GetKey(KeyboardKeys.Shift);
+                bool controlDown = Root.GetKey(KeyboardKeys.Control);
+                float deltaValue = altDown ? 0.1f : (shiftDown ? 10f : (controlDown ? 100f : 1));
+                float slideDelta = key == KeyboardKeys.ArrowUp ? deltaValue : -deltaValue;
+
+                _startSlideValue = Value;
+                ApplySliding(slideDelta);
+                EndSliding();
+                Focus();
+                return true;
+            }
+
+            return base.OnKeyDown(key);
+        }
+
         /// <inheritdoc />
         public override bool OnMouseDown(Float2 location, MouseButton button)
         {

From b2b855200f633154973bbfb1e697298a4e132b2c Mon Sep 17 00:00:00 2001
From: Saas <kaktusreiniger@gmail.com>
Date: Sat, 11 Oct 2025 22:02:14 +0200
Subject: [PATCH 03/33] allow entering numbers with digit separator into value
 boxes (f.e. 1_000)

---
 Source/Editor/Utilities/ShuntingYardParser.cs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Source/Editor/Utilities/ShuntingYardParser.cs b/Source/Editor/Utilities/ShuntingYardParser.cs
index 47e2275e5..fe473389c 100644
--- a/Source/Editor/Utilities/ShuntingYardParser.cs
+++ b/Source/Editor/Utilities/ShuntingYardParser.cs
@@ -444,6 +444,9 @@ namespace FlaxEditor.Utilities
         /// <returns>The result value.</returns>
         public static double Parse(string text)
         {
+            // Hack to allow parsing numbers while using "_" as a separator (like this: 1_000)
+            text = text.Replace("_", string.Empty);
+
             var tokens = Tokenize(text);
             var rpn = OrderTokens(tokens);
             return EvaluateRPN(rpn);

From 76f0768b99dea2a6dac3be1e4e40f9701d0fb2d9 Mon Sep 17 00:00:00 2001
From: Saas <kaktusreiniger@gmail.com>
Date: Sun, 12 Oct 2025 16:14:07 +0200
Subject: [PATCH 04/33] use float literal for default arrow key delta in value box

---
 Source/Editor/GUI/Input/ValueBox.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Source/Editor/GUI/Input/ValueBox.cs b/Source/Editor/GUI/Input/ValueBox.cs
index db3ae5413..88ec9a4ee 100644
--- a/Source/Editor/GUI/Input/ValueBox.cs
+++ b/Source/Editor/GUI/Input/ValueBox.cs
@@ -252,7 +252,7 @@ namespace FlaxEditor.GUI.Input
                 bool altDown = Root.GetKey(KeyboardKeys.Alt);
                 bool shiftDown = Root.GetKey(KeyboardKeys.Shift);
                 bool controlDown = Root.GetKey(KeyboardKeys.Control);
-                float deltaValue = altDown ? 0.1f : (shiftDown ? 10f : (controlDown ? 100f : 1));
+                float deltaValue = altDown ? 0.1f : (shiftDown ? 10f : (controlDown ? 100f : 1f));
                 float slideDelta = key == KeyboardKeys.ArrowUp ? deltaValue : -deltaValue;
 
                 _startSlideValue = Value;

From 48100cf9fc4bbb577bd68c47df651e9f9d5e605d Mon Sep 17 00:00:00 2001
From: Saas <kaktusreiniger@gmail.com>
Date: Mon, 13 Oct 2025 11:16:07 +0200
Subject: [PATCH 05/33] fix group element text clipping

fixes group element header text clipping outside of the header and into the settings icon (if there is one).
Also some extra code to handle script editor, as that displays some extra icons in the header.
---
 Source/Editor/CustomEditors/Dedicated/ScriptsEditor.cs      | 3 ++-
 .../Editor/CustomEditors/Elements/Container/GroupElement.cs | 3 ++-
 Source/Engine/UI/GUI/Panels/DropPanel.cs                    | 6 ++++--
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/Source/Editor/CustomEditors/Dedicated/ScriptsEditor.cs b/Source/Editor/CustomEditors/Dedicated/ScriptsEditor.cs
index 356ae5ee4..36a993e2c 100644
--- a/Source/Editor/CustomEditors/Dedicated/ScriptsEditor.cs
+++ b/Source/Editor/CustomEditors/Dedicated/ScriptsEditor.cs
@@ -909,7 +909,8 @@ namespace FlaxEditor.CustomEditors.Dedicated
                 settingsButton.Tag = script;
                 settingsButton.Clicked += OnSettingsButtonClicked;
 
-                group.Panel.HeaderTextMargin = new Margin(scriptDrag.Right - 12, 15, 2, 2);
+                // Adjust margin to not overlap with other ui elements in the header
+                group.Panel.HeaderTextMargin = group.Panel.HeaderTextMargin with { Left = scriptDrag.Right - 12, Right = settingsButton.Width + Utilities.Constants.UIMargin };
                 group.Object(values, editor);
                 // Remove drop down arrows and containment lines if no objects in the group
                 if (group.Children.Count == 0)
diff --git a/Source/Editor/CustomEditors/Elements/Container/GroupElement.cs b/Source/Editor/CustomEditors/Elements/Container/GroupElement.cs
index 64bc9080b..055c6a29d 100644
--- a/Source/Editor/CustomEditors/Elements/Container/GroupElement.cs
+++ b/Source/Editor/CustomEditors/Elements/Container/GroupElement.cs
@@ -44,7 +44,8 @@ namespace FlaxEditor.CustomEditors.Elements
         {
             var style = Style.Current;
             var settingsButtonSize = Panel.HeaderHeight;
-            return new Image
+            Panel.HeaderTextMargin = Panel.HeaderTextMargin with { Right = settingsButtonSize + Utilities.Constants.UIMargin }; // Keep the header text clear of the settings button
+            return new Image
             {
                 TooltipText = "Settings",
                 AutoFocus = true,
diff --git a/Source/Engine/UI/GUI/Panels/DropPanel.cs b/Source/Engine/UI/GUI/Panels/DropPanel.cs
index 0bfa799c2..c71223cb4 100644
--- a/Source/Engine/UI/GUI/Panels/DropPanel.cs
+++ b/Source/Engine/UI/GUI/Panels/DropPanel.cs
@@ -361,7 +361,7 @@ namespace FlaxEngine.GUI
             var style = Style.Current;
             var enabled = EnabledInHierarchy;
 
-            // Paint Background
+            // Draw Background
             var backgroundColor = BackgroundColor;
             if (backgroundColor.A > 0.0f)
             {
@@ -388,7 +388,7 @@ namespace FlaxEngine.GUI
                     ArrowImageOpened?.Draw(dropDownRect, arrowColor);
             }
 
-            // Text
+            // Header text
             var textRect = new Rectangle(textLeft, 0, Width - textLeft, HeaderHeight);
             _headerTextMargin.ShrinkRectangle(ref textRect);
             var textColor = HeaderTextColor;
@@ -397,7 +397,9 @@ namespace FlaxEngine.GUI
                 textColor *= 0.6f;
             }
 
+            Render2D.PushClip(textRect);
             Render2D.DrawText(HeaderTextFont.GetFont(), HeaderTextMaterial, HeaderText, textRect, textColor, TextAlignment.Near, TextAlignment.Center);
+            Render2D.PopClip();
 
             if (!_isClosed && EnableContainmentLines)
             {

From 79351f0c4d94af5158e3120814a2df237fb5bcde Mon Sep 17 00:00:00 2001
From: Saas <kaktusreiniger@gmail.com>
Date: Wed, 15 Oct 2025 21:09:24 +0200
Subject: [PATCH 06/33] Hide array editor "Size" text when array title overlaps
 it

---
 .../CustomEditors/Editors/CollectionEditor.cs | 25 ++++++++++++++++---
 Source/Engine/UI/GUI/Panels/DropPanel.cs      |  7 +++++-
 2 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/Source/Editor/CustomEditors/Editors/CollectionEditor.cs b/Source/Editor/CustomEditors/Editors/CollectionEditor.cs
index b977dab63..38f088e8c 100644
--- a/Source/Editor/CustomEditors/Editors/CollectionEditor.cs
+++ b/Source/Editor/CustomEditors/Editors/CollectionEditor.cs
@@ -450,6 +450,7 @@ namespace FlaxEditor.CustomEditors.Editors
         protected bool NotNullItems;
 
         private IntValueBox _sizeBox;
+        private Label _label;
         private Color _background;
         private int _elementsCount, _minCount, _maxCount;
         private bool _readOnly;
@@ -566,7 +567,7 @@ namespace FlaxEditor.CustomEditors.Editors
                     Parent = dropPanel,
                 };
 
-                var label = new Label
+                _label = new Label
                 {
                     Text = "Size",
                     AnchorPreset = AnchorPresets.TopRight,
@@ -650,7 +651,7 @@ namespace FlaxEditor.CustomEditors.Editors
                 panel.Panel.Size = new Float2(0, 18);
                 panel.Panel.Margin = new Margin(0, 0, Utilities.Constants.UIMargin, 0);
 
-                var removeButton = panel.Button("-", "Remove the last item");
+                var removeButton = panel.Button("-", "Remove the last item.");
                 removeButton.Button.Size = new Float2(16, 16);
                 removeButton.Button.Enabled = size > _minCount;
                 removeButton.Button.AnchorPreset = AnchorPresets.TopRight;
@@ -661,7 +662,7 @@ namespace FlaxEditor.CustomEditors.Editors
                     Resize(Count - 1);
                 };
 
-                var addButton = panel.Button("+", "Add a new item");
+                var addButton = panel.Button("+", "Add a new item.");
                 addButton.Button.Size = new Float2(16, 16);
                 addButton.Button.Enabled = (!NotNullItems || size > 0) && size < _maxCount;
                 addButton.Button.AnchorPreset = AnchorPresets.TopRight;
@@ -672,8 +673,10 @@ namespace FlaxEditor.CustomEditors.Editors
                     Resize(Count + 1);
                 };
             }
-        }
 
+            Layout.ContainerControl.SizeChanged += OnLayoutSizeChanged;
+        }
+        
         private void OnSetupContextMenu(ContextMenu menu, DropPanel panel)
         {
             if (menu.Items.Any(x => x is ContextMenuButton b && b.Text.Equals("Open All", StringComparison.Ordinal)))
@@ -696,10 +699,24 @@ namespace FlaxEditor.CustomEditors.Editors
             });
         }
 
+        private void OnLayoutSizeChanged(Control control)
+        {
+            // _label is created in a nested branch during setup, so it may be unset when this handler fires
+            if (_label != null && Layout.ContainerControl is DropPanel dropPanel)
+            {
+                // Hide "Size" text (by making it transparent) when array editor title overlaps
+                var headerTextSize = dropPanel.HeaderTextFont.GetFont().MeasureText(dropPanel.HeaderText);
+                var overlaps = headerTextSize.X + DropPanel.DropDownIconSize >= _label.Left;
+                var textColor = overlaps ? Color.Transparent : FlaxEngine.GUI.Style.Current.Foreground;
+                _label.TextColor = _label.TextColorHighlighted = textColor;
+            }
+        }
+
         /// <inheritdoc />
         protected override void Deinitialize()
         {
             _sizeBox = null;
+            Layout.ContainerControl.SizeChanged -= OnLayoutSizeChanged;
 
             base.Deinitialize();
         }
diff --git a/Source/Engine/UI/GUI/Panels/DropPanel.cs b/Source/Engine/UI/GUI/Panels/DropPanel.cs
index 0bfa799c2..650acce53 100644
--- a/Source/Engine/UI/GUI/Panels/DropPanel.cs
+++ b/Source/Engine/UI/GUI/Panels/DropPanel.cs
@@ -11,6 +11,11 @@ namespace FlaxEngine.GUI
     [ActorToolbox("GUI")]
     public class DropPanel : ContainerControl
     {
+        /// <summary>
+        /// Size of the drop down icon. 
+        /// </summary>
+        public const float DropDownIconSize = 14.0f;
+
         /// <summary>
         /// The header height.
         /// </summary>
@@ -379,7 +384,7 @@ namespace FlaxEngine.GUI
             float textLeft = 0;
             if (EnableDropDownIcon)
             {
-                textLeft += 14;
+                textLeft += DropDownIconSize;
                 var dropDownRect = new Rectangle(2, (HeaderHeight - 12) / 2, 12, 12);
                 var arrowColor = _mouseOverHeader ? style.Foreground : style.ForegroundGrey;
                 if (_isClosed)

From 5ec2476aec23971bfbd6782033ebb8896acba6b6 Mon Sep 17 00:00:00 2001
From: Saas <kaktusreiniger@gmail.com>
Date: Thu, 16 Oct 2025 21:05:10 +0200
Subject: [PATCH 07/33] add "*" to editor settings window title when settings
 are dirty

---
 Source/Editor/Windows/EditorOptionsWindow.cs | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/Source/Editor/Windows/EditorOptionsWindow.cs b/Source/Editor/Windows/EditorOptionsWindow.cs
index 0ee9a92d7..c6bf2fd16 100644
--- a/Source/Editor/Windows/EditorOptionsWindow.cs
+++ b/Source/Editor/Windows/EditorOptionsWindow.cs
@@ -45,7 +45,7 @@ namespace FlaxEditor.Windows
             {
                 Parent = this
             };
-            _saveButton = (ToolStripButton)toolstrip.AddButton(editor.Icons.Save64, SaveData).LinkTooltip("Save");
+            _saveButton = (ToolStripButton)toolstrip.AddButton(editor.Icons.Save64, SaveData).LinkTooltip("Save.");
             _saveButton.Enabled = false;
 
             _tabs = new Tabs
@@ -104,6 +104,8 @@ namespace FlaxEditor.Windows
             {
                 _saveButton.Enabled = true;
                 _isDataDirty = true;
+                if (!Title.EndsWith('*'))
+                    Title += "*";
             }
         }
 
@@ -113,6 +115,8 @@ namespace FlaxEditor.Windows
             {
                 _saveButton.Enabled = false;
                 _isDataDirty = false;
+                if (Title.EndsWith('*'))
+                    Title = Title.Remove(Title.Length - 1);
             }
         }
 

From 26261a209096a6df824e1ceaa5577a8356a44a9c Mon Sep 17 00:00:00 2001
From: Ari Vuollet <ari.vuollet@goat.moe>
Date: Sun, 12 Oct 2025 14:42:22 +0300
Subject: [PATCH 08/33] Support Visual Studio 2026 as a generator for CMake
 dependencies

---
 .../Tools/Flax.Build/Deploy/VCEnvironment.cs  |  6 ++++-
 .../Flax.Build/Deps/Dependencies/PhysX.cs     | 26 +++++++++----------
 Source/Tools/Flax.Build/Deps/Dependency.cs    | 26 ++++++++++++++++++-
 3 files changed, 43 insertions(+), 15 deletions(-)

diff --git a/Source/Tools/Flax.Build/Deploy/VCEnvironment.cs b/Source/Tools/Flax.Build/Deploy/VCEnvironment.cs
index cfcbf9866..afe5c8e3c 100644
--- a/Source/Tools/Flax.Build/Deploy/VCEnvironment.cs
+++ b/Source/Tools/Flax.Build/Deploy/VCEnvironment.cs
@@ -241,7 +241,11 @@ namespace Flax.Deploy
 
             if (!File.Exists(solutionFile))
             {
-                throw new Exception(string.Format("Unable to build solution {0}. Solution file not found.", solutionFile));
+                // CMake VS2026 generator prefers .slnx solution files, just swap the extension for CMake dependencies
+                if (File.Exists(Path.ChangeExtension(solutionFile, "slnx")))
+                    solutionFile = Path.ChangeExtension(solutionFile, "slnx");
+                else
+                    throw new Exception(string.Format("Unable to build solution {0}. Solution file not found.", solutionFile));
             }
 
             string cmdLine = string.Format("\"{0}\" /m /t:Restore,Build /p:Configuration=\"{1}\" /p:Platform=\"{2}\" {3} /nologo", solutionFile, buildConfig, buildPlatform, Verbosity);
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/PhysX.cs b/Source/Tools/Flax.Build/Deps/Dependencies/PhysX.cs
index 39f7ad975..fe7dd39f7 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/PhysX.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/PhysX.cs
@@ -94,7 +94,7 @@ namespace Flax.Deps.Dependencies
             case TargetPlatform.Windows:
                 if (architecture == TargetArchitecture.ARM64)
                 {
-                    // Windows ARM64 doesn't have GPU support, so avoid copying those DLLs around
+                    // Windows ARM64 doesn't have precompiled files for GPU support, so avoid copying those DLLs around
                     ConfigureCmakeSwitch(cmakeSwitches, "PX_COPY_EXTERNAL_DLL", "OFF");
                     ConfigureCmakeSwitch(cmakeParams, "PX_COPY_EXTERNAL_DLL", "OFF");
                 }
@@ -122,7 +122,7 @@ namespace Flax.Deps.Dependencies
             string bits;
             string arch;
             string binariesSubDir;
-            string buildPlatform;
+            string buildPlatform = architecture == TargetArchitecture.x86 ? "Win32" : architecture.ToString();
             bool suppressBitsPostfix = false;
             string binariesPrefix = string.Empty;
             var envVars = new Dictionary<string, string>();
@@ -146,15 +146,6 @@ namespace Flax.Deps.Dependencies
                 break;
             default: throw new InvalidArchitectureException(architecture);
             }
-            switch (architecture)
-            {
-            case TargetArchitecture.x86:
-                buildPlatform = "Win32";
-                break;
-            default:
-                buildPlatform = architecture.ToString();
-                break;
-            }
             var msBuildProps = new Dictionary<string, string>();
             switch (targetPlatform)
             {
@@ -390,8 +381,17 @@ namespace Flax.Deps.Dependencies
                 {
                 case TargetPlatform.Windows:
                 {
-                    Build(options, "vc17win64", platform, TargetArchitecture.x64);
-                    Build(options, "vc17win-arm64", platform, TargetArchitecture.ARM64);
+                    try // Prefer the VS2026 presets; any failure falls back to the VS2022 presets below
+                    {
+                        Build(options, "vc18win64", platform, architecture);
+                        Build(options, "vc18win-arm64", platform, architecture);
+                    }
+                    catch (System.Exception ex)
+                    {
+                        Log.Verbose("Failed to generate VS2026 solution for PhysX (" + ex.Message + "), fallback to VS2022");
+                        Build(options, "vc17win64", platform, architecture);
+                        Build(options, "vc17win-arm64", platform, architecture);
+                    }
                     break;
                 }
                 case TargetPlatform.Linux:
diff --git a/Source/Tools/Flax.Build/Deps/Dependency.cs b/Source/Tools/Flax.Build/Deps/Dependency.cs
index 43cdcc146..010a45175 100644
--- a/Source/Tools/Flax.Build/Deps/Dependency.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependency.cs
@@ -47,6 +47,24 @@ namespace Flax.Deps
         /// </summary>
         protected static TargetPlatform BuildPlatform => Platform.BuildPlatform.Target;
 
+
+        private static Version? _cmakeVersion;
+
+        /// <summary>
+        /// Gets the version reported by <c>cmake --version</c> (queried on first access, then cached).
+        /// </summary>
+        protected static Version CMakeVersion => _cmakeVersion ??= QueryCMakeVersion();
+
+        private static Version QueryCMakeVersion()
+        {
+            var versionOutput = Utilities.ReadProcessOutput("cmake", "--version");
+            // Capture only the leading numeric components, dropping any suffix such as "-rc1" or a dash before a Git hash
+            var match = System.Text.RegularExpressions.Regex.Match(versionOutput, @"cmake version (\d+(\.\d+){1,3})");
+            if (!match.Success)
+                throw new FormatException("Failed to parse CMake version from 'cmake --version' output: " + versionOutput);
+            return new Version(match.Groups[1].Value);
+        }
+
         /// <summary>
         /// Gets the platforms list supported by this dependency to build on the current build platform (based on <see cref="Platform.BuildPlatform"/>).
         /// </summary>
@@ -309,7 +327,13 @@ namespace Flax.Deps
                     break;
                 default: throw new InvalidArchitectureException(architecture);
                 }
-                cmdLine = string.Format("CMakeLists.txt -G \"Visual Studio 17 2022\" -A {0}", arch);
+                if (CMakeVersion.Major > 4 || (CMakeVersion.Major == 4 && CMakeVersion.Minor >= 2))
+                {
+                    // This generates both .sln and .slnx solution files
+                    cmdLine = string.Format("CMakeLists.txt -G \"Visual Studio 17 2022\" -G \"Visual Studio 18 2026\" -A {0}", arch);
+                }
+                else
+                    cmdLine = string.Format("CMakeLists.txt -G \"Visual Studio 17 2022\" -A {0}", arch);
                 break;
             }
             case TargetPlatform.PS4:

From 0c462315f080c0fbe0d93367772c2d25d400d792 Mon Sep 17 00:00:00 2001
From: Ari Vuollet <ari.vuollet@goat.moe>
Date: Sun, 12 Oct 2025 17:00:58 +0300
Subject: [PATCH 09/33] Fix CMake compatibility errors with dependencies

---
 Source/Tools/Flax.Build/Deps/Dependencies/NvCloth.cs |  2 +-
 Source/Tools/Flax.Build/Deps/Dependencies/OpenAL.cs  | 12 +++++++-----
 .../Tools/Flax.Build/Deps/Dependencies/freetype.cs   | 10 +++++-----
 Source/Tools/Flax.Build/Deps/Dependencies/vorbis.cs  |  4 ++--
 4 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/NvCloth.cs b/Source/Tools/Flax.Build/Deps/Dependencies/NvCloth.cs
index 1120a94f8..f296bc9b9 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/NvCloth.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/NvCloth.cs
@@ -110,7 +110,7 @@ namespace Flax.Deps.Dependencies
             // Peek options
             var binariesPrefix = string.Empty;
             var binariesPostfix = string.Empty;
-            var cmakeArgs = "-DNV_CLOTH_ENABLE_DX11=0 -DNV_CLOTH_ENABLE_CUDA=0 -DPX_GENERATE_GPU_PROJECTS=0";
+            var cmakeArgs = "-DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DNV_CLOTH_ENABLE_DX11=0 -DNV_CLOTH_ENABLE_CUDA=0 -DPX_GENERATE_GPU_PROJECTS=0";
             var cmakeName = string.Empty;
             var buildFolder = Path.Combine(nvCloth, "compiler", platform.ToString() + '_' + architecture.ToString());
             var envVars = new Dictionary<string, string>();
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/OpenAL.cs b/Source/Tools/Flax.Build/Deps/Dependencies/OpenAL.cs
index 319ad70b3..79be778ef 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/OpenAL.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/OpenAL.cs
@@ -51,6 +51,7 @@ namespace Flax.Deps.Dependencies
             var root = options.IntermediateFolder;
             var version = "1.24.3";
             var configuration = "Release";
+            var cmakeArgs = "-DCMAKE_POLICY_VERSION_MINIMUM=3.5";
             var dstIncludePath = Path.Combine(options.ThirdPartyFolder, "OpenAL");
             var noSSL = true; // OpenAL Soft website has broken certs
 
@@ -77,7 +78,7 @@ namespace Flax.Deps.Dependencies
                         var buildDir = Path.Combine(root, "build-" + architecture.ToString());
                         var solutionPath = Path.Combine(buildDir, "OpenAL.sln");
 
-                        RunCmake(root, platform, architecture, $"-B\"{buildDir}\" -DBUILD_SHARED_LIBS=OFF -DCMAKE_C_FLAGS=\"/D_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR /EHsc\" -DCMAKE_CXX_FLAGS=\"/D_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR /EHsc\"");
+                        RunCmake(root, platform, architecture, $"-B\"{buildDir}\" -DBUILD_SHARED_LIBS=OFF -DCMAKE_C_FLAGS=\"/D_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR /EHsc\" -DCMAKE_CXX_FLAGS=\"/D_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR /EHsc\" " + cmakeArgs);
                         Deploy.VCEnvironment.BuildSolution(solutionPath, configuration, architecture.ToString());
                         var depsFolder = GetThirdPartyFolder(options, platform, architecture);
                         foreach (var file in binariesToCopy)
@@ -132,7 +133,8 @@ namespace Flax.Deps.Dependencies
                                  $"-DALSOFT_REQUIRE_PULSEAUDIO=ON " +
                                  $"-DALSOFT_REQUIRE_JACK=ON " +
                                  $"-DALSOFT_REQUIRE_PIPEWIRE=ON " +
-                                 $"-DALSOFT_EMBED_HRTF_DATA=YES ";
+                                 $"-DALSOFT_EMBED_HRTF_DATA=YES "
+                                 + cmakeArgs;
 
                     // Get the source
                     var packagePath = Path.Combine(root, "package.zip");
@@ -163,7 +165,7 @@ namespace Flax.Deps.Dependencies
                     {
                         { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
                     };
-                    var config = " -DALSOFT_REQUIRE_OBOE=OFF -DALSOFT_REQUIRE_OPENSL=ON -DALSOFT_EMBED_HRTF_DATA=YES";
+                    var config = " -DALSOFT_REQUIRE_OBOE=OFF -DALSOFT_REQUIRE_OPENSL=ON -DALSOFT_EMBED_HRTF_DATA=YES " + cmakeArgs;
 
                     // Get the source
                     var packagePath = Path.Combine(root, "package.zip");
@@ -203,7 +205,7 @@ namespace Flax.Deps.Dependencies
                     {
                         { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
                     };
-                    var config = " -DALSOFT_REQUIRE_COREAUDIO=ON -DALSOFT_EMBED_HRTF_DATA=YES";
+                    var config = " -DALSOFT_REQUIRE_COREAUDIO=ON -DALSOFT_EMBED_HRTF_DATA=YES " + cmakeArgs;
 
                     // Get the source
                     var packagePath = Path.Combine(root, "package.zip");
@@ -237,7 +239,7 @@ namespace Flax.Deps.Dependencies
                     {
                         { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
                     };
-                    var config = " -DALSOFT_REQUIRE_COREAUDIO=ON -DALSOFT_EMBED_HRTF_DATA=YES";
+                    var config = " -DALSOFT_REQUIRE_COREAUDIO=ON -DALSOFT_EMBED_HRTF_DATA=YES " + cmakeArgs;
 
                     // Get the source
                     var packagePath = Path.Combine(root, "package.zip");
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/freetype.cs b/Source/Tools/Flax.Build/Deps/Dependencies/freetype.cs
index 89ed09a72..0692ad8e9 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/freetype.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/freetype.cs
@@ -143,7 +143,7 @@ namespace Flax.Deps.Dependencies
                     // Build for Linux
                     SetupDirectory(buildDir, true);
                     var toolchain = UnixToolchain.GetToolchainName(platform, TargetArchitecture.x64);
-                    Utilities.Run("cmake", string.Format("-G \"Unix Makefiles\" -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DFT_WITH_BZIP2=OFF -DFT_WITH_ZLIB=OFF -DFT_WITH_PNG=OFF -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER_TARGET={0} ..", toolchain), null, buildDir, Utilities.RunOptions.DefaultTool, envVars);
+                    Utilities.Run("cmake", string.Format("-G \"Unix Makefiles\" -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DFT_WITH_BZIP2=OFF -DFT_WITH_ZLIB=OFF -DFT_WITH_PNG=OFF -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER_TARGET={0} ..", toolchain), null, buildDir, Utilities.RunOptions.DefaultTool, envVars);
                     Utilities.Run("cmake", "--build .", null, buildDir, Utilities.RunOptions.DefaultTool, envVars);
                     var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
                     Utilities.FileCopy(Path.Combine(buildDir, libraryFileName), Path.Combine(depsFolder, libraryFileName));
@@ -214,7 +214,7 @@ namespace Flax.Deps.Dependencies
 
                     // Build for Android
                     SetupDirectory(buildDir, true);
-                    RunCmake(buildDir, TargetPlatform.Android, TargetArchitecture.ARM64, ".. -DFT_WITH_BZIP2=OFF -DFT_WITH_ZLIB=OFF -DFT_WITH_PNG=OFF -DCMAKE_BUILD_TYPE=Release");
+                    RunCmake(buildDir, TargetPlatform.Android, TargetArchitecture.ARM64, ".. -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DFT_WITH_BZIP2=OFF -DFT_WITH_ZLIB=OFF -DFT_WITH_PNG=OFF -DCMAKE_BUILD_TYPE=Release");
                     BuildCmake(buildDir);
                     var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
                     Utilities.FileCopy(Path.Combine(buildDir, libraryFileName), Path.Combine(depsFolder, libraryFileName));
@@ -224,7 +224,7 @@ namespace Flax.Deps.Dependencies
                 {
                     // Build for Switch
                     SetupDirectory(buildDir, true);
-                    RunCmake(buildDir, platform, TargetArchitecture.ARM64, ".. -DCMAKE_BUILD_TYPE=Release");
+                    RunCmake(buildDir, platform, TargetArchitecture.ARM64, ".. -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DCMAKE_BUILD_TYPE=Release");
                     BuildCmake(buildDir);
                     var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
                     Utilities.FileCopy(Path.Combine(buildDir, libraryFileName), Path.Combine(depsFolder, libraryFileName));
@@ -236,7 +236,7 @@ namespace Flax.Deps.Dependencies
                     foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
                     {
                         SetupDirectory(buildDir, true);
-                        RunCmake(buildDir, platform, architecture, ".. -DCMAKE_BUILD_TYPE=Release");
+                        RunCmake(buildDir, platform, architecture, ".. -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DCMAKE_BUILD_TYPE=Release");
                         BuildCmake(buildDir);
                         var depsFolder = GetThirdPartyFolder(options, platform, architecture);
                         Utilities.FileCopy(Path.Combine(buildDir, libraryFileName), Path.Combine(depsFolder, libraryFileName));
@@ -253,7 +253,7 @@ namespace Flax.Deps.Dependencies
 
                     // Build for iOS
                     SetupDirectory(buildDir, true);
-                    RunCmake(buildDir, platform, TargetArchitecture.ARM64, ".. -DIOS_PLATFORM=OS -DCMAKE_SYSTEM_NAME=iOS -DCMAKE_BUILD_TYPE=Release -DFT_WITH_BZIP2=OFF -DFT_WITH_ZLIB=OFF -DFT_WITH_PNG=OFF");
+                    RunCmake(buildDir, platform, TargetArchitecture.ARM64, ".. -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DIOS_PLATFORM=OS -DCMAKE_SYSTEM_NAME=iOS -DCMAKE_BUILD_TYPE=Release -DFT_WITH_BZIP2=OFF -DFT_WITH_ZLIB=OFF -DFT_WITH_PNG=OFF");
                     BuildCmake(buildDir);
                     var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
                     Utilities.FileCopy(Path.Combine(buildDir, libraryFileName), Path.Combine(depsFolder, libraryFileName));
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/vorbis.cs b/Source/Tools/Flax.Build/Deps/Dependencies/vorbis.cs
index d22f8696f..f2dc8af3b 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/vorbis.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/vorbis.cs
@@ -297,7 +297,7 @@ namespace Flax.Deps.Dependencies
             {
                 var solutionPath = Path.Combine(oggBuildDir, "ogg.sln");
 
-                RunCmake(oggRoot, platform, architecture, $"-B\"{oggBuildDir}\" -DBUILD_SHARED_LIBS=OFF");
+                RunCmake(oggRoot, platform, architecture, $"-B\"{oggBuildDir}\" -DBUILD_SHARED_LIBS=OFF -DCMAKE_POLICY_VERSION_MINIMUM=3.5");
                 Deploy.VCEnvironment.BuildSolution(solutionPath, configurationMsvc, architecture.ToString());
                 foreach (var file in oggBinariesToCopyWindowsCmake)
                     binariesToCopy.Add((Path.Combine(oggBuildDir, configurationMsvc, file.Item1), file.Item2));
@@ -308,7 +308,7 @@ namespace Flax.Deps.Dependencies
                 var oggLibraryPath = Path.Combine(oggBuildDir, configurationMsvc, "ogg" + ext);
                 var solutionPath = Path.Combine(vorbisBuildDir, "vorbis.sln");
 
-                RunCmake(vorbisRoot, platform, architecture, $"-B\"{vorbisBuildDir}\" -DOGG_INCLUDE_DIR=\"{Path.Combine(oggRoot, "include")}\" -DOGG_LIBRARY=\"{oggLibraryPath}\" -DBUILD_SHARED_LIBS=OFF");
+                RunCmake(vorbisRoot, platform, architecture, $"-B\"{vorbisBuildDir}\" -DOGG_INCLUDE_DIR=\"{Path.Combine(oggRoot, "include")}\" -DOGG_LIBRARY=\"{oggLibraryPath}\" -DBUILD_SHARED_LIBS=OFF -DCMAKE_POLICY_VERSION_MINIMUM=3.5");
                 Deploy.VCEnvironment.BuildSolution(solutionPath, configurationMsvc, architecture.ToString());
                 foreach (var file in vorbisBinariesToCopyWindowsCmake)
                     binariesToCopy.Add((Path.Combine(vorbisBuildDir, "lib", configurationMsvc, file.Item1), file.Item2));

From 028f3a7871aecdff3cc46df707f16643c1e4328b Mon Sep 17 00:00:00 2001
From: Ari Vuollet <ari.vuollet@goat.moe>
Date: Sat, 18 Oct 2025 02:33:22 +0300
Subject: [PATCH 10/33] Add support for building dependencies with specific
 architecture

---
 .../Tools/Flax.Build/Deps/Dependencies/AGS.cs |  20 +-
 .../Flax.Build/Deps/Dependencies/Assimp.cs    | 103 +++--
 .../Deps/Dependencies/DirectXMesh.cs          |  30 +-
 .../Dependencies/DirectXShaderCompiler.cs     |  40 +-
 .../Deps/Dependencies/DirectXTex.cs           |  80 ++--
 .../Deps/Dependencies/NewtonsoftJson.cs       |  18 +
 .../Flax.Build/Deps/Dependencies/NvCloth.cs   |  95 +++--
 .../Flax.Build/Deps/Dependencies/OpenAL.cs    | 365 +++++++++---------
 .../Flax.Build/Deps/Dependencies/PhysX.cs     | 158 +++++---
 .../Flax.Build/Deps/Dependencies/UVAtlas.cs   |  46 ++-
 .../Flax.Build/Deps/Dependencies/astc.cs      |  38 +-
 .../Flax.Build/Deps/Dependencies/curl.cs      | 140 ++++---
 .../Flax.Build/Deps/Dependencies/dbghelp.cs   |  34 +-
 .../Flax.Build/Deps/Dependencies/freetype.cs  | 314 ++++++++-------
 .../Flax.Build/Deps/Dependencies/glslang.cs   | 171 ++++----
 .../Flax.Build/Deps/Dependencies/mono.cs      |  30 ++
 .../Flax.Build/Deps/Dependencies/nethost.cs   |  64 ++-
 .../Flax.Build/Deps/Dependencies/nvapi.cs     |  19 +-
 .../Flax.Build/Deps/Dependencies/vorbis.cs    | 259 +++++++------
 Source/Tools/Flax.Build/Deps/Dependency.cs    |  14 +-
 Source/Tools/Flax.Build/Deps/DepsBuilder.cs   |  31 +-
 21 files changed, 1280 insertions(+), 789 deletions(-)

diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/AGS.cs b/Source/Tools/Flax.Build/Deps/Dependencies/AGS.cs
index 60be17f0b..c756bb28b 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/AGS.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/AGS.cs
@@ -18,6 +18,24 @@ namespace Flax.Deps.Dependencies
             get => new[] { TargetPlatform.Windows };
         }
 
+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                default: return new TargetArchitecture[0];
+                }
+            }
+        }
+
         /// <inheritdoc />
         public override void Build(BuildOptions options)
         {
@@ -30,7 +48,7 @@ namespace Flax.Deps.Dependencies
             // Copy files
             foreach (var platform in options.Platforms)
             {
-                BuildStarted(platform);
+                BuildStarted(platform, TargetArchitecture.x64);
                 var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
                 Utilities.FileCopy(Path.Combine(root, "ags_lib/lib/amd_ags_x64.lib"), Path.Combine(depsFolder, "amd_ags_x64.lib"));
                 Utilities.FileCopy(Path.Combine(root, "ags_lib/lib/amd_ags_x64.dll"), Path.Combine(depsFolder, "amd_ags_x64.dll"));
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/Assimp.cs b/Source/Tools/Flax.Build/Deps/Dependencies/Assimp.cs
index bb1c4fa3c..e31a3a059 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/Assimp.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/Assimp.cs
@@ -2,6 +2,7 @@
 
 using System.Collections.Generic;
 using System.IO;
+using System.Linq;
 using Flax.Build;
 
 namespace Flax.Deps.Dependencies
@@ -39,6 +40,36 @@ namespace Flax.Deps.Dependencies
             }
         }
 
+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                case TargetPlatform.Linux:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        //TargetArchitecture.ARM64,
+                    };
+                case TargetPlatform.Mac:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                default: return new TargetArchitecture[0];
+                }
+            }
+        }
+
         /// <inheritdoc />
         public override void Build(BuildOptions options)
         {
@@ -91,22 +122,22 @@ namespace Flax.Deps.Dependencies
 
             foreach (var platform in options.Platforms)
             {
-                BuildStarted(platform);
-                switch (platform)
+                foreach (var architecture in options.Architectures)
                 {
-                case TargetPlatform.Windows:
-                {
-                    var configuration = "Release";
-                    var binariesWin = new[]
+                    BuildStarted(platform, architecture);
+                    switch (platform)
                     {
-                        Path.Combine("bin", configuration, "assimp-vc140-md.dll"),
-                        Path.Combine("lib", configuration, "assimp-vc140-md.lib"),
-                    };
+                    case TargetPlatform.Windows:
+                    {
+                        var configuration = "Release";
+                        var binariesWin = new[]
+                        {
+                            Path.Combine("bin", configuration, "assimp-vc140-md.dll"),
+                            Path.Combine("lib", configuration, "assimp-vc140-md.lib"),
+                        };
 
-                    // Build for Windows
-                    File.Delete(Path.Combine(root, "CMakeCache.txt"));
-                    foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
-                    {
+                        // Build for Windows
+                        File.Delete(Path.Combine(root, "CMakeCache.txt"));
                         var buildDir = Path.Combine(root, "build-" + architecture);
                         var solutionPath = Path.Combine(buildDir, "Assimp.sln");
                         SetupDirectory(buildDir, true);
@@ -116,42 +147,38 @@ namespace Flax.Deps.Dependencies
                         var depsFolder = GetThirdPartyFolder(options, platform, architecture);
                         foreach (var file in binariesWin)
                             Utilities.FileCopy(Path.Combine(buildDir, file), Path.Combine(depsFolder, Path.GetFileName(file)));
+                        break;
                     }
-
-                    break;
-                }
-                case TargetPlatform.Linux:
-                {
-                    var envVars = new Dictionary<string, string>
+                    case TargetPlatform.Linux:
                     {
-                        { "CC", "clang-" + Configuration.LinuxClangMinVer },
-                        { "CC_FOR_BUILD", "clang-" + Configuration.LinuxClangMinVer },
-                        { "CXX", "clang++-" + Configuration.LinuxClangMinVer },
-                        { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
-                    };
+                        var envVars = new Dictionary<string, string>
+                        {
+                            { "CC", "clang-" + Configuration.LinuxClangMinVer },
+                            { "CC_FOR_BUILD", "clang-" + Configuration.LinuxClangMinVer },
+                            { "CXX", "clang++-" + Configuration.LinuxClangMinVer },
+                            { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
+                        };
 
-                    // Build for Linux
-                    RunCmake(root, platform, TargetArchitecture.x64, " -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF " + globalConfig, envVars);
-                    Utilities.Run("make", null, null, root, Utilities.RunOptions.DefaultTool, envVars);
-                    configHeaderFilePath = Path.Combine(root, "include", "assimp", "config.h");
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
-                    Utilities.FileCopy(Path.Combine(root, "lib", "libassimp.a"), Path.Combine(depsFolder, "libassimp.a"));
-                    break;
-                }
-                case TargetPlatform.Mac:
-                {
-                    // Build for Mac
-                    foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
+                        // Build for Linux
+                        RunCmake(root, platform, architecture, " -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF " + globalConfig, envVars);
+                        Utilities.Run("make", null, null, root, Utilities.RunOptions.DefaultTool, envVars);
+                        configHeaderFilePath = Path.Combine(root, "include", "assimp", "config.h");
+                        var depsFolder = GetThirdPartyFolder(options, platform, architecture);
+                        Utilities.FileCopy(Path.Combine(root, "lib", "libassimp.a"), Path.Combine(depsFolder, "libassimp.a"));
+                        break;
+                    }
+                    case TargetPlatform.Mac:
                     {
+                        // Build for Mac
                         RunCmake(root, platform, architecture, " -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF " + globalConfig);
                         Utilities.Run("make", null, null, root, Utilities.RunOptions.DefaultTool);
                         configHeaderFilePath = Path.Combine(root, "include", "assimp", "config.h");
                         var depsFolder = GetThirdPartyFolder(options, platform, architecture);
                         Utilities.FileCopy(Path.Combine(root, "lib", "libassimp.a"), Path.Combine(depsFolder, "libassimp.a"));
                         Utilities.Run("make", "clean", null, root, Utilities.RunOptions.DefaultTool);
+                        break;
+                    }
                     }
-                    break;
-                }
                 }
             }
 
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/DirectXMesh.cs b/Source/Tools/Flax.Build/Deps/Dependencies/DirectXMesh.cs
index e631b280b..0da78e580 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/DirectXMesh.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/DirectXMesh.cs
@@ -28,6 +28,24 @@ namespace Flax.Deps.Dependencies
             }
         }
 
+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                default: return new TargetArchitecture[0];
+                }
+            }
+        }
+
         /// <inheritdoc />
         public override void Build(BuildOptions options)
         {
@@ -46,12 +64,12 @@ namespace Flax.Deps.Dependencies
 
             foreach (var platform in options.Platforms)
             {
-                BuildStarted(platform);
-                switch (platform)
+                foreach (var architecture in options.Architectures)
                 {
-                case TargetPlatform.Windows:
-                {
-                    foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
+                    BuildStarted(platform, architecture);
+                    switch (platform)
+                    {
+                    case TargetPlatform.Windows:
                     {
                         Deploy.VCEnvironment.BuildSolution(solutionPath, configuration, architecture.ToString());
                         var depsFolder = GetThirdPartyFolder(options, TargetPlatform.Windows, architecture);
@@ -61,7 +79,7 @@ namespace Flax.Deps.Dependencies
                         }
                     }
                     break;
-                }
+                    }
                 }
             }
 
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/DirectXShaderCompiler.cs b/Source/Tools/Flax.Build/Deps/Dependencies/DirectXShaderCompiler.cs
index 894af3840..3c48290ee 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/DirectXShaderCompiler.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/DirectXShaderCompiler.cs
@@ -31,22 +31,40 @@ namespace Flax.Deps.Dependencies
             }
         }
 
+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                default: return new TargetArchitecture[0];
+                }
+            }
+        }
+
         /// <inheritdoc />
         public override void Build(BuildOptions options)
         {
             foreach (var platform in options.Platforms)
             {
-                BuildStarted(platform);
-                switch (platform)
+                foreach (var architecture in options.Architectures)
                 {
-                case TargetPlatform.Windows:
-                {
-                    var sdk = WindowsPlatformBase.GetSDKs().Last();
-                    var sdkLibLocation = Path.Combine(sdk.Value, "Lib", WindowsPlatformBase.GetSDKVersion(sdk.Key).ToString(), "um");
-                    string binLocation = Path.Combine(sdk.Value, "bin", WindowsPlatformBase.GetSDKVersion(sdk.Key).ToString());
-
-                    foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
+                    BuildStarted(platform, architecture);
+                    switch (platform)
                     {
+                    case TargetPlatform.Windows:
+                    {
+                        var sdk = WindowsPlatformBase.GetSDKs().Last();
+                        var sdkLibLocation = Path.Combine(sdk.Value, "Lib", WindowsPlatformBase.GetSDKVersion(sdk.Key).ToString(), "um");
+                        string binLocation = Path.Combine(sdk.Value, "bin", WindowsPlatformBase.GetSDKVersion(sdk.Key).ToString());
+
                         var depsFolder = GetThirdPartyFolder(options, platform, architecture);
 
                         string dxilLocation = @$"{binLocation}\{architecture}\dxil.dll";
@@ -60,9 +78,9 @@ namespace Flax.Deps.Dependencies
                         string d3dcompilerLibLocation = @$"{sdkLibLocation}\{architecture}\d3dcompiler.lib";
                         Utilities.FileCopy(dxcompilerLibLocation, Path.Combine(depsFolder, Path.GetFileName(dxcompilerLibLocation)));
                         Utilities.FileCopy(d3dcompilerLibLocation, Path.Combine(depsFolder, "d3dcompiler_47.lib"));
+                        break;
+                    }
                     }
-                    break;
-                }
                 }
             }
         }
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/DirectXTex.cs b/Source/Tools/Flax.Build/Deps/Dependencies/DirectXTex.cs
index c0d1a461f..3a842b48c 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/DirectXTex.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/DirectXTex.cs
@@ -30,6 +30,24 @@ namespace Flax.Deps.Dependencies
             }
         }
 
+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                default: return new TargetArchitecture[0];
+                }
+            }
+        }
+
         /// <inheritdoc />
         public override void Build(BuildOptions options)
         {
@@ -47,44 +65,44 @@ namespace Flax.Deps.Dependencies
 
             foreach (var platform in options.Platforms)
             {
-                BuildStarted(platform);
-                switch (platform)
+                foreach (var architecture in options.Architectures)
                 {
-                case TargetPlatform.Windows:
-                {
-                    var solutionPath = Path.Combine(root, "DirectXTex_Desktop_2022_Win10.sln");
-                    var binFolder = Path.Combine(root, "DirectXTex", "Bin", "Desktop_2022_Win10");
-                    foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
+                    BuildStarted(platform, architecture);
+                    switch (platform)
                     {
+                    case TargetPlatform.Windows:
+                    {
+                        var solutionPath = Path.Combine(root, "DirectXTex_Desktop_2022_Win10.sln");
+                        var binFolder = Path.Combine(root, "DirectXTex", "Bin", "Desktop_2022_Win10");
                         Deploy.VCEnvironment.BuildSolution(solutionPath, configuration, architecture.ToString());
                         var depsFolder = GetThirdPartyFolder(options, platform, architecture);
                         foreach (var file in outputFileNames)
                             Utilities.FileCopy(Path.Combine(binFolder, architecture.ToString(), configuration, file), Path.Combine(depsFolder, file));
+                        break;
+                    }
+                    case TargetPlatform.UWP:
+                    {
+                        var solutionPath = Path.Combine(root, "DirectXTex_Windows10_2019.sln");
+                        var binFolder = Path.Combine(root, "DirectXTex", "Bin", "Windows10_2019");
+                        Deploy.VCEnvironment.BuildSolution(solutionPath, configuration, "x64");
+                        var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64); // Only x64 is built above, so always deploy to the x64 folder (same as the Xbox case)
+                        foreach (var file in outputFileNames)
+                            Utilities.FileCopy(Path.Combine(binFolder, "x64", configuration, file), Path.Combine(depsFolder, file));
+                        break;
+                    }
+                    case TargetPlatform.XboxOne:
+                    case TargetPlatform.XboxScarlett:
+                    {
+                        var solutionPath = Path.Combine(root, "DirectXTex_GDK_2022.sln");
+                        var binFolder = Path.Combine(root, "DirectXTex", "Bin", "GDK_2022");
+                        var xboxName = platform == TargetPlatform.XboxOne ? "Gaming.Xbox.XboxOne.x64" : "Gaming.Xbox.Scarlett.x64";
+                        Deploy.VCEnvironment.BuildSolution(solutionPath, configuration, xboxName);
+                        var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
+                        foreach (var file in outputFileNames)
+                            Utilities.FileCopy(Path.Combine(binFolder, xboxName, configuration, file), Path.Combine(depsFolder, file));
+                        break;
+                    }
                     }
-                    break;
-                }
-                case TargetPlatform.UWP:
-                {
-                    var solutionPath = Path.Combine(root, "DirectXTex_Windows10_2019.sln");
-                    var binFolder = Path.Combine(root, "DirectXTex", "Bin", "Windows10_2019");
-                    Deploy.VCEnvironment.BuildSolution(solutionPath, configuration, "x64");
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
-                    foreach (var file in outputFileNames)
-                        Utilities.FileCopy(Path.Combine(binFolder, "x64", configuration, file), Path.Combine(depsFolder, file));
-                    break;
-                }
-                case TargetPlatform.XboxOne:
-                case TargetPlatform.XboxScarlett:
-                {
-                    var solutionPath = Path.Combine(root, "DirectXTex_GDK_2022.sln");
-                    var binFolder = Path.Combine(root, "DirectXTex", "Bin", "GDK_2022");
-                    var xboxName = platform == TargetPlatform.XboxOne ? "Gaming.Xbox.XboxOne.x64" : "Gaming.Xbox.Scarlett.x64";
-                    Deploy.VCEnvironment.BuildSolution(solutionPath, configuration, xboxName);
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
-                    foreach (var file in outputFileNames)
-                        Utilities.FileCopy(Path.Combine(binFolder, xboxName, configuration, file), Path.Combine(depsFolder, file));
-                    break;
-                }
                 }
             }
 
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/NewtonsoftJson.cs b/Source/Tools/Flax.Build/Deps/Dependencies/NewtonsoftJson.cs
index 495de4734..58fb21b25 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/NewtonsoftJson.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/NewtonsoftJson.cs
@@ -36,6 +36,24 @@ namespace Flax.Deps.Dependencies
             }
         }
 
+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                default: return new TargetArchitecture[0];
+                }
+            }
+        }
+
         /// <inheritdoc />
         public override void Build(BuildOptions options)
         {
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/NvCloth.cs b/Source/Tools/Flax.Build/Deps/Dependencies/NvCloth.cs
index f296bc9b9..25557cb70 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/NvCloth.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/NvCloth.cs
@@ -50,6 +50,36 @@ namespace Flax.Deps.Dependencies
             }
         }
 
+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                case TargetPlatform.Linux:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        //TargetArchitecture.ARM64,
+                    };
+                case TargetPlatform.Mac:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                default: return new TargetArchitecture[0];
+                }
+            }
+        }
+
         /// <inheritdoc />
         public override void Build(BuildOptions options)
         {
@@ -61,39 +91,40 @@ namespace Flax.Deps.Dependencies
 
             foreach (var platform in options.Platforms)
             {
-                BuildStarted(platform);
-                switch (platform)
+                foreach (var architecture in options.Architectures)
                 {
-                case TargetPlatform.Windows:
-                    Build(options, platform, TargetArchitecture.x64);
-                    Build(options, platform, TargetArchitecture.ARM64);
-                    break;
-                case TargetPlatform.XboxOne:
-                case TargetPlatform.XboxScarlett:
-                    Build(options, platform, TargetArchitecture.x64);
-                    break;
-                case TargetPlatform.PS4:
-                case TargetPlatform.PS5:
-                    Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "NvCloth"), root, true, true);
-                    Build(options, platform, TargetArchitecture.x64);
-                    break;
-                case TargetPlatform.Switch:
-                    Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "NvCloth"), root, true, true);
-                    Build(options, platform, TargetArchitecture.ARM64);
-                    break;
-                case TargetPlatform.Android:
-                    Build(options, platform, TargetArchitecture.ARM64);
-                    break;
-                case TargetPlatform.Mac:
-                    Build(options, platform, TargetArchitecture.x64);
-                    Build(options, platform, TargetArchitecture.ARM64);
-                    break;
-                case TargetPlatform.iOS:
-                    Build(options, platform, TargetArchitecture.ARM64);
-                    break;
-                case TargetPlatform.Linux:
-                    Build(options, platform, TargetArchitecture.x64);
-                    break;
+                    BuildStarted(platform, architecture);
+                    switch (platform)
+                    {
+                    case TargetPlatform.Windows:
+                        Build(options, platform, architecture);
+                        break;
+                    case TargetPlatform.XboxOne:
+                    case TargetPlatform.XboxScarlett:
+                        Build(options, platform, TargetArchitecture.x64);
+                        break;
+                    case TargetPlatform.PS4:
+                    case TargetPlatform.PS5:
+                        Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "NvCloth"), root, true, true);
+                        Build(options, platform, TargetArchitecture.x64);
+                        break;
+                    case TargetPlatform.Switch:
+                        Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "NvCloth"), root, true, true);
+                        Build(options, platform, TargetArchitecture.ARM64);
+                        break;
+                    case TargetPlatform.Android:
+                        Build(options, platform, TargetArchitecture.ARM64);
+                        break;
+                    case TargetPlatform.Mac:
+                        Build(options, platform, architecture);
+                        break;
+                    case TargetPlatform.iOS:
+                        Build(options, platform, TargetArchitecture.ARM64);
+                        break;
+                    case TargetPlatform.Linux:
+                        Build(options, platform, architecture);
+                        break;
+                    }
                 }
             }
 
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/OpenAL.cs b/Source/Tools/Flax.Build/Deps/Dependencies/OpenAL.cs
index 79be778ef..c87931179 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/OpenAL.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/OpenAL.cs
@@ -45,6 +45,36 @@ namespace Flax.Deps.Dependencies
             }
         }
 
+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                case TargetPlatform.Linux:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        //TargetArchitecture.ARM64,
+                    };
+                case TargetPlatform.Mac:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                default: return new TargetArchitecture[0];
+                }
+            }
+        }
+
         /// <inheritdoc />
         public override void Build(BuildOptions options)
         {
@@ -55,26 +85,55 @@ namespace Flax.Deps.Dependencies
             var dstIncludePath = Path.Combine(options.ThirdPartyFolder, "OpenAL");
             var noSSL = true; // OpenAL Soft website has broken certs
 
+            // Fetch the sources once, up-front, so every platform/architecture iteration below shares them:
+            // - Windows targets build from a pinned git checkout placed directly in 'root',
+            // - all other targets build from the release tarball unpacked into 'root/openal-soft-<version>'.
+            if (options.Platforms.Contains(TargetPlatform.Windows))
+            {
+                // Get the source
+                CloneGitRepo(root, "https://github.com/kcat/openal-soft.git");
+                GitCheckout(root, "master", "d3875f333fb6abe2f39d82caca329414871ae53b"); // 1.23.1
+            }
+            else
+            {
+                // Get the source (the release package is a bzip2-compressed tarball, not a zip archive, despite the local file name)
+                var packagePath = Path.Combine(root, $"package-{version}.zip");
+                // Download and unpack only once; an already present package is treated as already extracted
+                if (!File.Exists(packagePath))
+                {
+                    Downloader.DownloadFileFromUrlToPath("https://openal-soft.org/openal-releases/openal-soft-" + version + ".tar.bz2", packagePath, noSSL);
+                    if (Platform.BuildTargetPlatform == TargetPlatform.Windows)
+                    {
+                        // Unpack in two passes with 7-Zip: the bzip2 layer first, then the tar file it contains
+                        var sevenZip = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.ProgramFiles), "7-Zip", "7z.exe");
+                        Utilities.Run(sevenZip, "x " + Path.GetFileName(packagePath), null, root);
+                        Utilities.Run(sevenZip, "x " + Path.GetFileNameWithoutExtension(packagePath), null, root);
+                    }
+                    else
+                    {
+                        Utilities.Run("tar", "xjf " + packagePath.Replace('\\', '/'), null, root, Utilities.RunOptions.ConsoleLogOutput);
+                    }
+                }
+                // 'root' is left unchanged on purpose: the per-platform cases resolve the 'openal-soft-<version>' folder themselves
+            }
+
+
             foreach (var platform in options.Platforms)
             {
-                BuildStarted(platform);
-                switch (platform)
+                foreach (var architecture in options.Architectures)
                 {
-                case TargetPlatform.Windows:
-                {
-                    var binariesToCopy = new[]
+                    BuildStarted(platform, architecture);
+                    switch (platform)
                     {
-                        "OpenAL32.lib",
-                        "OpenAL32.dll",
-                    };
-
-                    // Get the source
-                    CloneGitRepo(root, "https://github.com/kcat/openal-soft.git");
-                    GitCheckout(root, "master", "dc7d7054a5b4f3bec1dc23a42fd616a0847af948"); // 1.24.3
-
-                    // Build for Win64 and ARM64
-                    foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
+                    case TargetPlatform.Windows:
                     {
+                        var binariesToCopy = new[]
+                        {
+                            "OpenAL32.lib",
+                            "OpenAL32.dll",
+                        };
+
+                        // Build for Windows
                         var buildDir = Path.Combine(root, "build-" + architecture.ToString());
                         var solutionPath = Path.Combine(buildDir, "OpenAL.sln");
 
@@ -83,185 +142,147 @@ namespace Flax.Deps.Dependencies
                         var depsFolder = GetThirdPartyFolder(options, platform, architecture);
                         foreach (var file in binariesToCopy)
                             Utilities.FileCopy(Path.Combine(buildDir, configuration, file), Path.Combine(depsFolder, Path.GetFileName(file)));
-                    }
-                    
+
 #if false
-                    // Get the binaries
-                    var packagePath = Path.Combine(root, "package.zip");
-                    if (!File.Exists(packagePath))
-                        Downloader.DownloadFileFromUrlToPath("https://openal-soft.org/openal-binaries/openal-soft-" + version + "-bin.zip", packagePath, noSSL);
-                    using (ZipArchive archive = ZipFile.Open(packagePath, ZipArchiveMode.Read))
-                    {
-                        if (!Directory.Exists(root))
-                            archive.ExtractToDirectory(root);
-                        root = Path.Combine(root, archive.Entries.First().FullName);
-                    }
+                        // Get the binaries
+                        var packagePath = Path.Combine(root, "package.zip");
+                        if (!File.Exists(packagePath))
+                            Downloader.DownloadFileFromUrlToPath("https://openal-soft.org/openal-binaries/openal-soft-" + version + "-bin.zip", packagePath, noSSL);
+                        using (ZipArchive archive = ZipFile.Open(packagePath, ZipArchiveMode.Read))
+                        {
+                            if (!Directory.Exists(root))
+                                archive.ExtractToDirectory(root);
+                            root = Path.Combine(root, archive.Entries.First().FullName);
+                        }
 
-                    // Deploy Win64 binaries
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
-                    Utilities.FileCopy(Path.Combine(root, "bin", "Win64", "soft_oal.dll"), Path.Combine(depsFolder, "OpenAL32.dll"));
-                    Utilities.FileCopy(Path.Combine(root, "libs", "Win64", "OpenAL32.lib"), Path.Combine(depsFolder, "OpenAL32.lib"));
+                        // Deploy Win64 binaries
+                        var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
+                        Utilities.FileCopy(Path.Combine(root, "bin", "Win64", "soft_oal.dll"), Path.Combine(depsFolder, "OpenAL32.dll"));
+                        Utilities.FileCopy(Path.Combine(root, "libs", "Win64", "OpenAL32.lib"), Path.Combine(depsFolder, "OpenAL32.lib"));
 
-                    // Deploy license
-                    Utilities.FileCopy(Path.Combine(root, "COPYING"), Path.Combine(dstIncludePath, "COPYING"), true);
+                        // Deploy license
+                        Utilities.FileCopy(Path.Combine(root, "COPYING"), Path.Combine(dstIncludePath, "COPYING"), true);
 
-                    // Deploy header files
-                    var files = Directory.GetFiles(Path.Combine(root, "include", "AL"));
-                    foreach (var file in files)
-                    {
-                        Utilities.FileCopy(file, Path.Combine(dstIncludePath, Path.GetFileName(file)));
-                    }
+                        // Deploy header files
+                        var files = Directory.GetFiles(Path.Combine(root, "include", "AL"));
+                        foreach (var file in files)
+                        {
+                            Utilities.FileCopy(file, Path.Combine(dstIncludePath, Path.GetFileName(file)));
+                        }
 #endif
-                    break;
-                }
-                case TargetPlatform.Linux:
-                {
-                    var binariesToCopy = new[]
-                    {
-                        "libopenal.a",
-                    };
-                    var envVars = new Dictionary<string, string>
-                    {
-                        { "CC", "clang-" + Configuration.LinuxClangMinVer },
-                        { "CC_FOR_BUILD", "clang-" + Configuration.LinuxClangMinVer },
-                        { "CXX", "clang++-" + Configuration.LinuxClangMinVer },
-                        { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
-                    };
-                    var config = $"-DALSOFT_REQUIRE_ALSA=ON " +
-                                 $"-DALSOFT_REQUIRE_OSS=ON " +
-                                 $"-DALSOFT_REQUIRE_PORTAUDIO=ON " +
-                                 $"-DALSOFT_REQUIRE_PULSEAUDIO=ON " +
-                                 $"-DALSOFT_REQUIRE_JACK=ON " +
-                                 $"-DALSOFT_REQUIRE_PIPEWIRE=ON " +
-                                 $"-DALSOFT_EMBED_HRTF_DATA=YES "
-                                 + cmakeArgs;
-
-                    // Get the source
-                    var packagePath = Path.Combine(root, "package.zip");
-                    File.Delete(packagePath);
-                    Downloader.DownloadFileFromUrlToPath("https://openal-soft.org/openal-releases/openal-soft-" + version + ".tar.bz2", packagePath, noSSL);
-                    Utilities.Run("tar", "xjf " + packagePath.Replace('\\', '/'), null, root, Utilities.RunOptions.ConsoleLogOutput);
-
-                    // Use separate build directory
-                    root = Path.Combine(root, "openal-soft-" + version);
-                    var buildDir = Path.Combine(root, "build");
-                    SetupDirectory(buildDir, true);
-
-                    // Build for Linux
-                    Utilities.Run("cmake", $"-G \"Unix Makefiles\" -DCMAKE_BUILD_TYPE={configuration} -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DLIBTYPE=STATIC {config} ..", null, buildDir, Utilities.RunOptions.ConsoleLogOutput, envVars);
-                    BuildCmake(buildDir, configuration, envVars);
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
-                    foreach (var file in binariesToCopy)
-                        Utilities.FileCopy(Path.Combine(buildDir, file), Path.Combine(depsFolder, file));
-                    break;
-                }
-                case TargetPlatform.Android:
-                {
-                    var binariesToCopy = new[]
-                    {
-                        "libopenal.a",
-                    };
-                    var envVars = new Dictionary<string, string>
-                    {
-                        { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
-                    };
-                    var config = " -DALSOFT_REQUIRE_OBOE=OFF -DALSOFT_REQUIRE_OPENSL=ON -DALSOFT_EMBED_HRTF_DATA=YES " + cmakeArgs;
-
-                    // Get the source
-                    var packagePath = Path.Combine(root, "package.zip");
-                    File.Delete(packagePath);
-                    Downloader.DownloadFileFromUrlToPath("https://openal-soft.org/openal-releases/openal-soft-" + version + ".tar.bz2", packagePath, noSSL);
-                    if (Platform.BuildTargetPlatform == TargetPlatform.Windows)
-                    {
-                        var sevenZip = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.ProgramFiles), "7-Zip", "7z.exe");
-                        Utilities.Run(sevenZip, "x package.zip", null, root);
-                        Utilities.Run(sevenZip, "x package", null, root);
+                        break;
                     }
-                    else
+                    case TargetPlatform.Linux:
                     {
-                        Utilities.Run("tar", "xjf " + packagePath.Replace('\\', '/'), null, root, Utilities.RunOptions.ConsoleLogOutput);
+                        var binariesToCopy = new[]
+                        {
+                            "libopenal.a",
+                        };
+                        var envVars = new Dictionary<string, string>
+                        {
+                            { "CC", "clang-" + Configuration.LinuxClangMinVer },
+                            { "CC_FOR_BUILD", "clang-" + Configuration.LinuxClangMinVer },
+                            { "CXX", "clang++-" + Configuration.LinuxClangMinVer },
+                            { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
+                        };
+                        var config = $"-DALSOFT_REQUIRE_ALSA=ON " +
+                                     $"-DALSOFT_REQUIRE_OSS=ON " +
+                                     $"-DALSOFT_REQUIRE_PORTAUDIO=ON " +
+                                     $"-DALSOFT_REQUIRE_PULSEAUDIO=ON " +
+                                     $"-DALSOFT_REQUIRE_JACK=ON " +
+                                     $"-DALSOFT_REQUIRE_PIPEWIRE=ON " +
+                                     $"-DALSOFT_EMBED_HRTF_DATA=YES "
+                                     + cmakeArgs;
+
+                        // Use separate build directory (resolved into a local so 'root' is not nested deeper on every loop iteration)
+                        var openalSourceDir = Path.Combine(root, "openal-soft-" + version);
+                        var buildDir = Path.Combine(openalSourceDir, "build");
+                        SetupDirectory(buildDir, true);
+
+                        // Build for Linux
+                        Utilities.Run("cmake", $"-G \"Unix Makefiles\" -DCMAKE_BUILD_TYPE={configuration} -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DLIBTYPE=STATIC {config} ..", null, buildDir, Utilities.RunOptions.ConsoleLogOutput, envVars);
+                        BuildCmake(buildDir, configuration, envVars);
+                        var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
+                        foreach (var file in binariesToCopy)
+                            Utilities.FileCopy(Path.Combine(buildDir, file), Path.Combine(depsFolder, file));
+                        break;
                     }
-
-                    // Use separate build directory
-                    root = Path.Combine(root, "openal-soft-" + version);
-                    var buildDir = Path.Combine(root, "build");
-                    SetupDirectory(buildDir, true);
-
-                    // Build
-                    RunCmake(buildDir, platform, TargetArchitecture.ARM64, ".. -DLIBTYPE=STATIC -DCMAKE_BUILD_TYPE=" + configuration + config, envVars);
-                    BuildCmake(buildDir, envVars);
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
-                    foreach (var file in binariesToCopy)
-                        Utilities.FileCopy(Path.Combine(buildDir, file), Path.Combine(depsFolder, file));
-                    break;
-                }
-                case TargetPlatform.Mac:
-                {
-                    var binariesToCopy = new[]
+                    case TargetPlatform.Android:
                     {
-                        "libopenal.a",
-                    };
-                    var envVars = new Dictionary<string, string>
+                        var binariesToCopy = new[]
+                        {
+                            "libopenal.a",
+                        };
+                        var envVars = new Dictionary<string, string>
+                        {
+                            { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
+                        };
+                        var config = " -DALSOFT_REQUIRE_OBOE=OFF -DALSOFT_REQUIRE_OPENSL=ON -DALSOFT_EMBED_HRTF_DATA=YES " + cmakeArgs; // leading space is required: this is appended directly after the -DCMAKE_BUILD_TYPE value
+
+                        // Use separate build directory (resolved into a local so 'root' is not nested deeper on every loop iteration)
+                        var openalSourceDir = Path.Combine(root, "openal-soft-" + version);
+                        var buildDir = Path.Combine(openalSourceDir, "build");
+                        SetupDirectory(buildDir, true);
+
+                        // Build
+                        RunCmake(buildDir, platform, TargetArchitecture.ARM64, ".. -DLIBTYPE=STATIC -DCMAKE_BUILD_TYPE=" + configuration + config, envVars);
+                        BuildCmake(buildDir, envVars);
+                        var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
+                        foreach (var file in binariesToCopy)
+                            Utilities.FileCopy(Path.Combine(buildDir, file), Path.Combine(depsFolder, file));
+                        break;
+                    }
+                    case TargetPlatform.Mac:
                     {
-                        { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
-                    };
-                    var config = " -DALSOFT_REQUIRE_COREAUDIO=ON -DALSOFT_EMBED_HRTF_DATA=YES " + cmakeArgs;
+                        var binariesToCopy = new[]
+                        {
+                            "libopenal.a",
+                        };
+                        var envVars = new Dictionary<string, string>
+                        {
+                            { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
+                        };
+                        var config = " -DALSOFT_REQUIRE_COREAUDIO=ON -DALSOFT_EMBED_HRTF_DATA=YES " + cmakeArgs;
 
-                    // Get the source
-                    var packagePath = Path.Combine(root, "package.zip");
-                    File.Delete(packagePath);
-                    Downloader.DownloadFileFromUrlToPath("https://openal-soft.org/openal-releases/openal-soft-" + version + ".tar.bz2", packagePath, noSSL);
-                    Utilities.Run("tar", "xjf " + packagePath.Replace('\\', '/'), null, root, Utilities.RunOptions.ConsoleLogOutput);
+                        // Use separate build directory (resolved into a local so the second architecture pass does not nest 'root' one level deeper)
+                        var openalSourceDir = Path.Combine(root, "openal-soft-" + version);
+                        var buildDir = Path.Combine(openalSourceDir, "build");
 
-                    // Use separate build directory
-                    root = Path.Combine(root, "openal-soft-" + version);
-                    var buildDir = Path.Combine(root, "build");
-
-                    // Build for Mac
-                    foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
-                    {
+                        // Build for Mac
                         SetupDirectory(buildDir, true);
                         RunCmake(buildDir, platform, architecture, ".. -DLIBTYPE=STATIC -DCMAKE_BUILD_TYPE=" + configuration + config, envVars);
                         BuildCmake(buildDir, envVars);
                         var depsFolder = GetThirdPartyFolder(options, platform, architecture);
                         foreach (var file in binariesToCopy)
                             Utilities.FileCopy(Path.Combine(buildDir, file), Path.Combine(depsFolder, file));
+                        break;
                     }
-                    break;
-                }
-                case TargetPlatform.iOS:
-                {
-                    var binariesToCopy = new[]
+                    case TargetPlatform.iOS:
                     {
-                        "libopenal.a",
-                    };
-                    var envVars = new Dictionary<string, string>
-                    {
-                        { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
-                    };
-                    var config = " -DALSOFT_REQUIRE_COREAUDIO=ON -DALSOFT_EMBED_HRTF_DATA=YES " + cmakeArgs;
+                        var binariesToCopy = new[]
+                        {
+                            "libopenal.a",
+                        };
+                        var envVars = new Dictionary<string, string>
+                        {
+                            { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
+                        };
+                        var config = " -DALSOFT_REQUIRE_COREAUDIO=ON -DALSOFT_EMBED_HRTF_DATA=YES " + cmakeArgs;
 
-                    // Get the source
-                    var packagePath = Path.Combine(root, "package.zip");
-                    if (!File.Exists(packagePath))
-                    {
-                        Downloader.DownloadFileFromUrlToPath("https://openal-soft.org/openal-releases/openal-soft-" + version + ".tar.bz2", packagePath, noSSL);
-                        Utilities.Run("tar", "xjf " + packagePath.Replace('\\', '/'), null, root, Utilities.RunOptions.ConsoleLogOutput);
+                        // Use separate build directory (resolved into a local so 'root' is not nested deeper on every loop iteration)
+                        var openalSourceDir = Path.Combine(root, "openal-soft-" + version);
+                        var buildDir = Path.Combine(openalSourceDir, "build");
+
+                        // Build for iOS
+                        SetupDirectory(buildDir, true);
+                        RunCmake(buildDir, platform, TargetArchitecture.ARM64, ".. -DCMAKE_SYSTEM_NAME=iOS -DALSOFT_OSX_FRAMEWORK=ON -DLIBTYPE=STATIC -DCMAKE_BUILD_TYPE=" + configuration + config, envVars);
+                        BuildCmake(buildDir, envVars);
+                        var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
+                        foreach (var file in binariesToCopy)
+                            Utilities.FileCopy(Path.Combine(buildDir, file), Path.Combine(depsFolder, file));
+                        break;
+                    }
                     }
-
-                    // Use separate build directory
-                    root = Path.Combine(root, "openal-soft-" + version);
-                    var buildDir = Path.Combine(root, "build");
-
-                    // Build for iOS
-                    SetupDirectory(buildDir, true);
-                    RunCmake(buildDir, platform, TargetArchitecture.ARM64, ".. -DCMAKE_SYSTEM_NAME=iOS -DALSOFT_OSX_FRAMEWORK=ON -DLIBTYPE=STATIC -DCMAKE_BUILD_TYPE=" + configuration + config, envVars);
-                    BuildCmake(buildDir, envVars);
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
-                    foreach (var file in binariesToCopy)
-                        Utilities.FileCopy(Path.Combine(buildDir, file), Path.Combine(depsFolder, file));
-                    break;
-                }
                 }
             }
         }
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/PhysX.cs b/Source/Tools/Flax.Build/Deps/Dependencies/PhysX.cs
index fe7dd39f7..c5016c3e9 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/PhysX.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/PhysX.cs
@@ -51,6 +51,36 @@ namespace Flax.Deps.Dependencies
             }
         }
 
+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                switch (BuildPlatform) // one architecture list per BuildPlatform value
+                {
+                case TargetPlatform.Windows: // x64 and ARM64
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                case TargetPlatform.Linux: // x64 only; the ARM64 entry below is commented out
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        //TargetArchitecture.ARM64, // NOTE(review): disabled entry - presumably ARM64 Linux is not supported yet; confirm before enabling
+                    };
+                case TargetPlatform.Mac: // x64 and ARM64
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                default: return new TargetArchitecture[0]; // every other BuildPlatform value yields an empty array
+                }
+            }
+        }
+
         private string root;
         private string projectGenDir;
         private string projectGenPath;
@@ -376,69 +406,79 @@ namespace Flax.Deps.Dependencies
 
             foreach (var platform in options.Platforms)
             {
-                BuildStarted(platform);
-                switch (platform)
+                foreach (var architecture in options.Architectures)
                 {
-                case TargetPlatform.Windows:
-                {
-                    try
+                    BuildStarted(platform, architecture);
+                    switch (platform)
                     {
-                        Build(options, "vc18win64", platform, architecture);
-                        Build(options, "vc18win-arm64", platform, architecture);
-                    }
-                    catch
+                    case TargetPlatform.Windows:
                     {
-                        Log.Verbose("Failed to generate VS2026 solution for PhysX, fallback to VS2022");
-                        Build(options, "vc17win64", platform, architecture);
-                        Build(options, "vc17win-arm64", platform, architecture);
+                        if (architecture == TargetArchitecture.x64 || architecture == TargetArchitecture.ARM64)
+                        {
+                            try
+                            {
+                                Build(options, architecture == TargetArchitecture.x64 ? "vc18win64" : "vc18win-arm64", platform, architecture);
+                            }
+                            catch
+                            {
+                                Log.Verbose("Failed to generate VS2026 solution for PhysX, fallback to VS2022");
+                                Build(options, architecture == TargetArchitecture.x64 ? "vc17win64" : "vc17win-arm64", platform, architecture);
+                            }
+                        }
+                        else
+                            throw new InvalidArchitectureException(architecture);
+                        break;
+                    }
+                    case TargetPlatform.Linux:
+                    {
+                        Build(options, "linux", platform, architecture);
+                        break;
+                    }
+                    case TargetPlatform.PS4:
+                    {
+                        Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "PhysX"), root, true, true);
+                        Build(options, "ps4", platform, TargetArchitecture.x64);
+                        break;
+                    }
+                    case TargetPlatform.PS5:
+                    {
+                        Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "PhysX"), root, true, true);
+                        Build(options, "ps5", platform, TargetArchitecture.x64);
+                        break;
+                    }
+                    case TargetPlatform.XboxScarlett:
+                    case TargetPlatform.XboxOne:
+                    {
+                        Build(options, "vc16win64", platform, TargetArchitecture.x64);
+                        break;
+                    }
+                    case TargetPlatform.Android:
+                    {
+                        Build(options, "android", platform, TargetArchitecture.ARM64);
+                        break;
+                    }
+                    case TargetPlatform.Switch:
+                    {
+                        Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "PhysX"), root, true, true);
+                        Build(options, "switch64", platform, TargetArchitecture.ARM64);
+                        break;
+                    }
+                    case TargetPlatform.Mac:
+                    {
+                        if (architecture == TargetArchitecture.x64)
+                            Build(options, "mac64", platform, architecture);
+                        else if (architecture == TargetArchitecture.ARM64)
+                            Build(options, "mac-arm64", platform, architecture);
+                        else
+                            throw new InvalidArchitectureException(architecture);
+                        break;
+                    }
+                    case TargetPlatform.iOS:
+                    {
+                        Build(options, "ios64", platform, TargetArchitecture.ARM64);
+                        break;
+                    }
                     }
-                    break;
-                }
-                case TargetPlatform.Linux:
-                {
-                    Build(options, "linux", platform, TargetArchitecture.x64);
-                    break;
-                }
-                case TargetPlatform.PS4:
-                {
-                    Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "PhysX"), root, true, true);
-                    Build(options, "ps4", platform, TargetArchitecture.x64);
-                    break;
-                }
-                case TargetPlatform.PS5:
-                {
-                    Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "PhysX"), root, true, true);
-                    Build(options, "ps5", platform, TargetArchitecture.x64);
-                    break;
-                }
-                case TargetPlatform.XboxScarlett:
-                case TargetPlatform.XboxOne:
-                {
-                    Build(options, "vc16win64", platform, TargetArchitecture.x64);
-                    break;
-                }
-                case TargetPlatform.Android:
-                {
-                    Build(options, "android", platform, TargetArchitecture.ARM64);
-                    break;
-                }
-                case TargetPlatform.Switch:
-                {
-                    Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "PhysX"), root, true, true);
-                    Build(options, "switch64", platform, TargetArchitecture.ARM64);
-                    break;
-                }
-                case TargetPlatform.Mac:
-                {
-                    Build(options, "mac64", platform, TargetArchitecture.x64);
-                    Build(options, "mac-arm64", platform, TargetArchitecture.ARM64);
-                    break;
-                }
-                case TargetPlatform.iOS:
-                {
-                    Build(options, "ios64", platform, TargetArchitecture.ARM64);
-                    break;
-                }
                 }
             }
 
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/UVAtlas.cs b/Source/Tools/Flax.Build/Deps/Dependencies/UVAtlas.cs
index 617b82af0..f0d29dba9 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/UVAtlas.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/UVAtlas.cs
@@ -29,6 +29,36 @@ namespace Flax.Deps.Dependencies
             }
         }
 
+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                switch (BuildPlatform) // one architecture list per BuildPlatform value
+                {
+                case TargetPlatform.Windows: // x64 and ARM64
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                case TargetPlatform.Linux: // x64 only; the ARM64 entry below is commented out
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        //TargetArchitecture.ARM64, // NOTE(review): disabled entry - presumably ARM64 Linux is not supported yet; confirm before enabling
+                    };
+                case TargetPlatform.Mac: // x64 and ARM64
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                default: return new TargetArchitecture[0]; // every other BuildPlatform value yields an empty array
+                }
+            }
+        }
+
         /// <inheritdoc />
         public override void Build(BuildOptions options)
         {
@@ -47,23 +77,23 @@ namespace Flax.Deps.Dependencies
 
             foreach (var platform in options.Platforms)
             {
-                BuildStarted(platform);
-                switch (platform)
+                foreach (var architecture in options.Architectures)
                 {
-                case TargetPlatform.Windows:
-                {
-                    // Build for Win64
-                    foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
+                    BuildStarted(platform, architecture);
+                    switch (platform)
                     {
+                    case TargetPlatform.Windows:
+                    {
+                        // Build for Windows
                         Deploy.VCEnvironment.BuildSolution(solutionPath, configuration, architecture.ToString(), new Dictionary<string, string>() { { "RestorePackagesConfig", "true" } });
                         var depsFolder = GetThirdPartyFolder(options, TargetPlatform.Windows, architecture);
                         foreach (var file in outputFileNames)
                         {
                             Utilities.FileCopy(Path.Combine(binFolder, architecture.ToString(), "Release", file), Path.Combine(depsFolder, file));
                         }
+                        break;
+                    }
                     }
-                    break;
-                }
                 }
             }
 
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/astc.cs b/Source/Tools/Flax.Build/Deps/Dependencies/astc.cs
index d5886810d..40ae9d1e0 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/astc.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/astc.cs
@@ -34,6 +34,30 @@ namespace Flax.Deps.Dependencies
             }
         }
 
+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                // Windows and Mac list the same architecture pair; any other
+                // build platform gets an empty list.
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows:
+                case TargetPlatform.Mac:
+                {
+                    var desktopArchitectures = new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                    return desktopArchitectures;
+                }
+                default: return new TargetArchitecture[0];
+                }
+            }
+        }
+
         /// <inheritdoc />
         public override void Build(BuildOptions options)
         {
@@ -45,12 +69,12 @@ namespace Flax.Deps.Dependencies
 
             foreach (var platform in options.Platforms)
             {
-                BuildStarted(platform);
-                switch (platform)
+                foreach (var architecture in options.Architectures)
                 {
-                case TargetPlatform.Windows:
-
-                    foreach (var architecture in new []{ TargetArchitecture.x64, TargetArchitecture.ARM64 })
+                    BuildStarted(platform, architecture);
+                    switch (platform)
+                    {
+                    case TargetPlatform.Windows:
                     {
                         string buildDir = Path.Combine(root, "build-" + architecture.ToString());
                         var isa = architecture == TargetArchitecture.ARM64 ? "-DASTCENC_ISA_NEON=ON" : "-DASTCENC_ISA_SSE2=ON";
@@ -62,8 +86,7 @@ namespace Flax.Deps.Dependencies
                         Utilities.FileCopy(Path.Combine(buildDir, "Source/Release", lib), Path.Combine(depsFolder, "astcenc.lib"));
                     }
                     break;
-                case TargetPlatform.Mac:
-                    foreach (var architecture in new []{ TargetArchitecture.x64, TargetArchitecture.ARM64 })
+                    case TargetPlatform.Mac:
                     {
                         string buildDir = Path.Combine(root, "build-" + architecture.ToString());
                         var isa = architecture == TargetArchitecture.ARM64 ? "-DASTCENC_ISA_NEON=ON" : "-DASTCENC_ISA_SSE2=ON";
@@ -75,6 +98,7 @@ namespace Flax.Deps.Dependencies
                         Utilities.FileCopy(Path.Combine(buildDir, "Source", lib), Path.Combine(depsFolder, "libastcenc.a"));
                     }
                     break;
+                    }
                 }
             }
 
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/curl.cs b/Source/Tools/Flax.Build/Deps/Dependencies/curl.cs
index 447f573a7..a559f1c7f 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/curl.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/curl.cs
@@ -41,6 +41,36 @@ namespace Flax.Deps.Dependencies
             }
         }
 
+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                // Architecture list per build platform; anything that is not
+                // handled below yields an empty list.
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows:
+                case TargetPlatform.Mac:
+                {
+                    var dualArchitectures = new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                    return dualArchitectures;
+                }
+                case TargetPlatform.Linux:
+                {
+                    // Linux lists x64 only; the ARM64 entry is left disabled
+                    var linuxArchitectures = new[] { TargetArchitecture.x64 };
+                    return linuxArchitectures;
+                }
+                default: return new TargetArchitecture[0];
+                }
+            }
+        }
+
         /// <inheritdoc />
         public override void Build(BuildOptions options)
         {
@@ -69,14 +99,14 @@ namespace Flax.Deps.Dependencies
 
             foreach (var platform in options.Platforms)
             {
-                BuildStarted(platform);
-                switch (platform)
+                foreach (var architecture in options.Architectures)
                 {
-                case TargetPlatform.Windows:
-                {
-                    // Build for Win64 and ARM64
-                    foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
+                    BuildStarted(platform, architecture);
+                    switch (platform)
                     {
+                    case TargetPlatform.Windows:
+                    {
+                        // Build for Windows
                         var buildDir = Path.Combine(root, "build-" + architecture.ToString());
                         var solutionPath = Path.Combine(buildDir, "CURL.sln");
 
@@ -85,57 +115,55 @@ namespace Flax.Deps.Dependencies
                         var depsFolder = GetThirdPartyFolder(options, platform, architecture);
                         foreach (var file in binariesToCopyWin)
                             Utilities.FileCopy(Path.Combine(buildDir, "lib", configuration, file), Path.Combine(depsFolder, Path.GetFileName(file)));
+                        break;
                     }
-                    break;
-                }
-                case TargetPlatform.Linux:
-                {
-                    // Build for Linux
-                    var settings = new[]
+                    case TargetPlatform.Linux:
                     {
-                        "--without-librtmp",
-                        "--without-ssl",
-                        "--with-gnutls",
-                        "--disable-ipv6",
-                        "--disable-manual",
-                        "--disable-verbose",
-                        "--disable-shared",
-                        "--enable-static",
-                        "-disable-ldap --disable-sspi --disable-ftp --disable-file --disable-dict --disable-telnet --disable-tftp --disable-rtsp --disable-pop3 --disable-imap --disable-smtp --disable-gopher --disable-smb",
-                    };
-                    var envVars = new Dictionary<string, string>
-                    {
-                        { "CC", "clang-" + Configuration.LinuxClangMinVer },
-                        { "CC_FOR_BUILD", "clang-" + Configuration.LinuxClangMinVer },
-                        { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
-                    };
-                    var buildDir = Path.Combine(root, "build");
-                    SetupDirectory(buildDir, true);
-                    Utilities.Run("chmod", "+x configure", null, root, Utilities.RunOptions.DefaultTool);
-                    Utilities.Run(Path.Combine(root, "configure"), string.Join(" ", settings) + " --prefix=\"" + buildDir + "\"", null, root, Utilities.RunOptions.DefaultTool, envVars);
-                    Utilities.Run("make", null, null, root, Utilities.RunOptions.DefaultTool);
-                    Utilities.Run("make", "install", null, root, Utilities.RunOptions.DefaultTool);
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
-                    var filename = "libcurl.a";
-                    Utilities.FileCopy(Path.Combine(buildDir, "lib", filename), Path.Combine(depsFolder, filename));
-                    break;
-                }
-                case TargetPlatform.Mac:
-                {
-                    // Build for Mac
-                    var settings = new[]
-                    {
-                        "--with-secure-transport",
-                        "--without-librtmp",
-                        "--disable-ipv6",
-                        "--disable-manual",
-                        "--disable-verbose",
-                        "--disable-shared",
-                        "--enable-static",
-                        "-disable-ldap --disable-sspi --disable-ftp --disable-file --disable-dict --disable-telnet --disable-tftp --disable-rtsp --disable-pop3 --disable-imap --disable-smtp --disable-gopher --disable-smb",
-                    };
-                    foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
+                        // Build for Linux
+                        var settings = new[]
+                        {
+                            "--without-librtmp",
+                            "--without-ssl",
+                            "--with-gnutls",
+                            "--disable-ipv6",
+                            "--disable-manual",
+                            "--disable-verbose",
+                            "--disable-shared",
+                            "--enable-static",
+                            "-disable-ldap --disable-sspi --disable-ftp --disable-file --disable-dict --disable-telnet --disable-tftp --disable-rtsp --disable-pop3 --disable-imap --disable-smtp --disable-gopher --disable-smb",
+                        };
+                        var envVars = new Dictionary<string, string>
+                        {
+                            { "CC", "clang-" + Configuration.LinuxClangMinVer },
+                            { "CC_FOR_BUILD", "clang-" + Configuration.LinuxClangMinVer },
+                            { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
+                        };
+                        var buildDir = Path.Combine(root, "build");
+                        SetupDirectory(buildDir, true);
+                        Utilities.Run("chmod", "+x configure", null, root, Utilities.RunOptions.DefaultTool);
+                        Utilities.Run(Path.Combine(root, "configure"), string.Join(" ", settings) + " --prefix=\"" + buildDir + "\"", null, root, Utilities.RunOptions.DefaultTool, envVars);
+                        Utilities.Run("make", null, null, root, Utilities.RunOptions.DefaultTool);
+                        Utilities.Run("make", "install", null, root, Utilities.RunOptions.DefaultTool);
+                        var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
+                        var filename = "libcurl.a";
+                        Utilities.FileCopy(Path.Combine(buildDir, "lib", filename), Path.Combine(depsFolder, filename));
+                        break;
+                    }
+                    case TargetPlatform.Mac:
                     {
+                        // Build for Mac
+                        var settings = new[]
+                        {
+                            "--with-secure-transport",
+                            "--without-librtmp",
+                            "--disable-ipv6",
+                            "--disable-manual",
+                            "--disable-verbose",
+                            "--disable-shared",
+                            "--enable-static",
+                            "-disable-ldap --disable-sspi --disable-ftp --disable-file --disable-dict --disable-telnet --disable-tftp --disable-rtsp --disable-pop3 --disable-imap --disable-smtp --disable-gopher --disable-smb",
+                        };
+
                         var arch = GetAppleArchName(architecture);
                         var archName = arch + "-apple-darwin19";
                         if (architecture == TargetArchitecture.ARM64)
@@ -162,9 +190,9 @@ namespace Flax.Deps.Dependencies
                         var depsFolder = GetThirdPartyFolder(options, platform, architecture);
                         var filename = "libcurl.a";
                         Utilities.FileCopy(Path.Combine(buildDir, "lib", filename), Path.Combine(depsFolder, filename));
+                        break;
+                    }
                     }
-                    break;
-                }
                 }
             }
 
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/dbghelp.cs b/Source/Tools/Flax.Build/Deps/Dependencies/dbghelp.cs
index 7017560fb..34fac56e0 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/dbghelp.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/dbghelp.cs
@@ -30,27 +30,45 @@ namespace Flax.Deps.Dependencies
             }
         }
 
+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                // Only the Windows build platform lists architectures; everything else gets an empty list
+                if (BuildPlatform != TargetPlatform.Windows)
+                    return new TargetArchitecture[0];
+
+                var windowsArchitectures = new[]
+                {
+                    TargetArchitecture.x64,
+                    TargetArchitecture.ARM64,
+                };
+                return windowsArchitectures;
+            }
+        }
+
         /// <inheritdoc />
         public override void Build(BuildOptions options)
         {
             foreach (var platform in options.Platforms)
             {
-                BuildStarted(platform);
-                switch (platform)
+                foreach (var architecture in options.Architectures) // one pass per platform/architecture pair
                 {
-                case TargetPlatform.Windows:
-                {
-                    var sdk = WindowsPlatformBase.GetSDKs().Last();
-                    foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
+                    BuildStarted(platform, architecture);
+                    switch (platform) // only Windows has a case; other platforms do no work here
                     {
+                    case TargetPlatform.Windows:
+                    {
+                        var sdk = WindowsPlatformBase.GetSDKs().Last(); // last entry from GetSDKs(); NOTE(review): presumably the newest installed SDK - confirm ordering
                         var depsFolder = GetThirdPartyFolder(options, platform, architecture);
                         var libLocation = @$"{sdk.Value}Debuggers\lib\{architecture}\dbghelp.lib";
                         var dllLocation = @$"{sdk.Value}Debuggers\{architecture}\dbghelp.dll";
                         Utilities.FileCopy(libLocation, Path.Combine(depsFolder, Path.GetFileName(libLocation)));
                         Utilities.FileCopy(dllLocation, Path.Combine(depsFolder, Path.GetFileName(dllLocation)));
+                        break;
+                    }
                     }
-                    break;
-                }
                 }
             }
         }
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/freetype.cs b/Source/Tools/Flax.Build/Deps/Dependencies/freetype.cs
index 0692ad8e9..ec3ab5e18 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/freetype.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/freetype.cs
@@ -49,6 +49,36 @@ namespace Flax.Deps.Dependencies
             }
         }
 
+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                var buildPlatform = BuildPlatform;
+                // Windows and Mac share the same architecture pair
+                if (buildPlatform == TargetPlatform.Windows || buildPlatform == TargetPlatform.Mac)
+                {
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                }
+
+                // Linux lists x64 only; the ARM64 entry is left disabled
+                if (buildPlatform == TargetPlatform.Linux)
+                {
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                    };
+                }
+
+                // Any other build platform gets an empty list
+                return new TargetArchitecture[0];
+            }
+        }
+
         /// <inheritdoc />
         public override void Build(BuildOptions options)
         {
@@ -94,171 +124,167 @@ namespace Flax.Deps.Dependencies
 
             foreach (var platform in options.Platforms)
             {
-                BuildStarted(platform);
-                switch (platform)
+                foreach (var architecture in options.Architectures)
                 {
-                case TargetPlatform.Windows:
-                {
-                    // Patch the RuntimeLibrary value
-                    File.WriteAllText(vcxprojPath, vcxprojContents);
-
-                    // Build for Windows
-                    foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
+                    BuildStarted(platform, architecture);
+                    switch (platform)
                     {
+                    case TargetPlatform.Windows:
+                    {
+                        // Patch the RuntimeLibrary value
+                        File.WriteAllText(vcxprojPath, vcxprojContents);
+
+                        // Build for Windows
                         Deploy.VCEnvironment.BuildSolution(vsSolutionPath, configurationMsvc, architecture.ToString(), msvcProps);
                         var depsFolder = GetThirdPartyFolder(options, platform, architecture);
                         foreach (var filename in binariesToCopyMsvc)
                             Utilities.FileCopy(Path.Combine(root, "objs", architecture.ToString(), configurationMsvc, filename), Path.Combine(depsFolder, filename));
+                        break;
                     }
-                    break;
-                }
-                case TargetPlatform.Linux:
-                {
-                            var envVars = new Dictionary<string, string>
+                    case TargetPlatform.Linux:
                     {
-                        { "CC", "clang-" + Configuration.LinuxClangMinVer },
-                        { "CC_FOR_BUILD", "clang-" + Configuration.LinuxClangMinVer },
-                        { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
-                    };
+                        var envVars = new Dictionary<string, string>
+                        {
+                            { "CC", "clang-" + Configuration.LinuxClangMinVer },
+                            { "CC_FOR_BUILD", "clang-" + Configuration.LinuxClangMinVer },
+                            { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
+                        };
 
-                    // Fix scripts
-                    Utilities.Run("dos2unix", "autogen.sh", null, root, Utilities.RunOptions.ThrowExceptionOnError, envVars);
-                    Utilities.Run("dos2unix", "configure", null, root, Utilities.RunOptions.ThrowExceptionOnError, envVars);
-                    //Utilities.Run("sed", "-i -e \'s/\r$//\' autogen.sh", null, root, Utilities.RunOptions.ThrowExceptionOnError, envVars);
-                    //Utilities.Run("sed", "-i -e \'s/\r$//\' configure", null, root, Utilities.RunOptions.ThrowExceptionOnError, envVars);
-                    Utilities.Run("chmod", "+x autogen.sh", null, root, Utilities.RunOptions.ThrowExceptionOnError);
-                    Utilities.Run("chmod", "+x configure", null, root, Utilities.RunOptions.ThrowExceptionOnError);
+                        // Fix scripts
+                        Utilities.Run("dos2unix", "autogen.sh", null, root, Utilities.RunOptions.ThrowExceptionOnError, envVars);
+                        Utilities.Run("dos2unix", "configure", null, root, Utilities.RunOptions.ThrowExceptionOnError, envVars);
+                        //Utilities.Run("sed", "-i -e \'s/\r$//\' autogen.sh", null, root, Utilities.RunOptions.ThrowExceptionOnError, envVars);
+                        //Utilities.Run("sed", "-i -e \'s/\r$//\' configure", null, root, Utilities.RunOptions.ThrowExceptionOnError, envVars);
+                        Utilities.Run("chmod", "+x autogen.sh", null, root, Utilities.RunOptions.ThrowExceptionOnError);
+                        Utilities.Run("chmod", "+x configure", null, root, Utilities.RunOptions.ThrowExceptionOnError);
 
-                    Utilities.Run(Path.Combine(root, "autogen.sh"), null, null, root, Utilities.RunOptions.ThrowExceptionOnError, envVars);
+                        Utilities.Run(Path.Combine(root, "autogen.sh"), null, null, root, Utilities.RunOptions.ThrowExceptionOnError, envVars);
 
-                    // Disable using libpng even if it's found on the system
-                    var cmakeFile = Path.Combine(root, "CMakeLists.txt");
-                    File.WriteAllText(cmakeFile,
-                                      File.ReadAllText(cmakeFile)
-                                          .Replace("find_package(PNG)", "")
-                                          .Replace("find_package(ZLIB)", "")
-                                          .Replace("find_package(BZip2)", "")
-                                     );
+                        // Disable using libpng, zlib and bzip2 even if they're found on the system
+                        var cmakeFile = Path.Combine(root, "CMakeLists.txt");
+                        File.WriteAllText(cmakeFile,
+                                          File.ReadAllText(cmakeFile)
+                                              .Replace("find_package(PNG)", "")
+                                              .Replace("find_package(ZLIB)", "")
+                                              .Replace("find_package(BZip2)", "")
+                                         );
 
-                    // Build for Linux
-                    SetupDirectory(buildDir, true);
-                    var toolchain = UnixToolchain.GetToolchainName(platform, TargetArchitecture.x64);
-                    Utilities.Run("cmake", string.Format("-G \"Unix Makefiles\" -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DFT_WITH_BZIP2=OFF -DFT_WITH_ZLIB=OFF -DFT_WITH_PNG=OFF -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER_TARGET={0} ..", toolchain), null, buildDir, Utilities.RunOptions.DefaultTool, envVars);
-                    Utilities.Run("cmake", "--build .", null, buildDir, Utilities.RunOptions.DefaultTool, envVars);
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
-                    Utilities.FileCopy(Path.Combine(buildDir, libraryFileName), Path.Combine(depsFolder, libraryFileName));
-
-                    break;
-                }
-                case TargetPlatform.PS4:
-                {
-                    // Get the build data files
-                    Utilities.DirectoryCopy(
-                                            Path.Combine(GetBinariesFolder(options, platform), "Data", "freetype"),
-                                            Path.Combine(root, "builds", "PS4"), false, true);
-
-                    // Build for PS4
-                    var solutionPath = Path.Combine(root, "builds", "PS4", "freetype.sln");
-                    Deploy.VCEnvironment.BuildSolution(solutionPath, "Release", "ORBIS");
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
-                    Utilities.FileCopy(Path.Combine(root, "lib", "PS4", libraryFileName), Path.Combine(depsFolder, libraryFileName));
-
-                    break;
-                }
-                case TargetPlatform.PS5:
-                {
-                    // Get the build data files
-                    Utilities.DirectoryCopy(
-                                            Path.Combine(GetBinariesFolder(options, platform), "Data", "freetype"),
-                                            Path.Combine(root, "builds", "PS5"), false, true);
-                    Utilities.ReplaceInFile(Path.Combine(root, "include\\freetype\\config\\ftstdlib.h"), "#define ft_getenv  getenv", "char* ft_getenv(const char* n);");
-
-                    // Build for PS5
-                    var solutionPath = Path.Combine(root, "builds", "PS5", "freetype.sln");
-                    Deploy.VCEnvironment.BuildSolution(solutionPath, "Release", "PROSPERO");
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
-                    Utilities.FileCopy(Path.Combine(root, "lib", "PS5", libraryFileName), Path.Combine(depsFolder, libraryFileName));
-
-                    break;
-                }
-                case TargetPlatform.XboxOne:
-                {
-                    // Build for Xbox One x64
-                    Deploy.VCEnvironment.BuildSolution(vsSolutionPath, configurationMsvc, "x64", msvcProps);
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
-                    foreach (var filename in binariesToCopyMsvc)
-                        Utilities.FileCopy(Path.Combine(root, "objs", "x64", configurationMsvc, filename), Path.Combine(depsFolder, filename));
-
-                    break;
-                }
-                case TargetPlatform.XboxScarlett:
-                {
-                    // Build for Xbox Scarlett
-                    Deploy.VCEnvironment.BuildSolution(vsSolutionPath, configurationMsvc, "x64", msvcProps);
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
-                    foreach (var filename in binariesToCopyMsvc)
-                        Utilities.FileCopy(Path.Combine(root, "objs", "x64", configurationMsvc, filename), Path.Combine(depsFolder, filename));
-
-                    break;
-                }
-                case TargetPlatform.Android:
-                {
-                    // Disable using libpng even if it's found on the system
-                    var cmakeFile = Path.Combine(root, "CMakeLists.txt");
-                    File.WriteAllText(cmakeFile,
-                                      File.ReadAllText(cmakeFile)
-                                          .Replace("find_package(PNG)", "")
-                                          .Replace("find_package(ZLIB)", "")
-                                          .Replace("find_package(BZip2)", "")
-                                     );
-
-                    // Build for Android
-                    SetupDirectory(buildDir, true);
-                    RunCmake(buildDir, TargetPlatform.Android, TargetArchitecture.ARM64, ".. -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DFT_WITH_BZIP2=OFF -DFT_WITH_ZLIB=OFF -DFT_WITH_PNG=OFF -DCMAKE_BUILD_TYPE=Release");
-                    BuildCmake(buildDir);
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
-                    Utilities.FileCopy(Path.Combine(buildDir, libraryFileName), Path.Combine(depsFolder, libraryFileName));
-                    break;
-                }
-                case TargetPlatform.Switch:
-                {
-                    // Build for Switch
-                    SetupDirectory(buildDir, true);
-                    RunCmake(buildDir, platform, TargetArchitecture.ARM64, ".. -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DCMAKE_BUILD_TYPE=Release");
-                    BuildCmake(buildDir);
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
-                    Utilities.FileCopy(Path.Combine(buildDir, libraryFileName), Path.Combine(depsFolder, libraryFileName));
-                    break;
-                }
-                case TargetPlatform.Mac:
-                {
-                    // Build for Mac
-                    foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
+                        // Build for Linux
+                        SetupDirectory(buildDir, true);
+                        var toolchain = UnixToolchain.GetToolchainName(platform, architecture);
+                        Utilities.Run("cmake", string.Format("-G \"Unix Makefiles\" -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DFT_WITH_BZIP2=OFF -DFT_WITH_ZLIB=OFF -DFT_WITH_PNG=OFF -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER_TARGET={0} ..", toolchain), null, buildDir, Utilities.RunOptions.DefaultTool, envVars);
+                        Utilities.Run("cmake", "--build .", null, buildDir, Utilities.RunOptions.DefaultTool, envVars);
+                        var depsFolder = GetThirdPartyFolder(options, platform, architecture);
+                        Utilities.FileCopy(Path.Combine(buildDir, libraryFileName), Path.Combine(depsFolder, libraryFileName));
+                        break;
+                    }
+                    case TargetPlatform.PS4:
                     {
+                        // Get the build data files
+                        Utilities.DirectoryCopy(
+                                                Path.Combine(GetBinariesFolder(options, platform), "Data", "freetype"),
+                                                Path.Combine(root, "builds", "PS4"), false, true);
+
+                        // Build for PS4
+                        var solutionPath = Path.Combine(root, "builds", "PS4", "freetype.sln");
+                        Deploy.VCEnvironment.BuildSolution(solutionPath, "Release", "ORBIS");
+                        var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
+                        Utilities.FileCopy(Path.Combine(root, "lib", "PS4", libraryFileName), Path.Combine(depsFolder, libraryFileName));
+
+                        break;
+                    }
+                    case TargetPlatform.PS5:
+                    {
+                        // Get the build data files
+                        Utilities.DirectoryCopy(
+                                                Path.Combine(GetBinariesFolder(options, platform), "Data", "freetype"),
+                                                Path.Combine(root, "builds", "PS5"), false, true);
+                        Utilities.ReplaceInFile(Path.Combine(root, "include\\freetype\\config\\ftstdlib.h"), "#define ft_getenv  getenv", "char* ft_getenv(const char* n);");
+
+                        // Build for PS5
+                        var solutionPath = Path.Combine(root, "builds", "PS5", "freetype.sln");
+                        Deploy.VCEnvironment.BuildSolution(solutionPath, "Release", "PROSPERO");
+                        var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
+                        Utilities.FileCopy(Path.Combine(root, "lib", "PS5", libraryFileName), Path.Combine(depsFolder, libraryFileName));
+
+                        break;
+                    }
+                    case TargetPlatform.XboxOne:
+                    {
+                        // Build for Xbox One x64
+                        Deploy.VCEnvironment.BuildSolution(vsSolutionPath, configurationMsvc, "x64", msvcProps);
+                        var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
+                        foreach (var filename in binariesToCopyMsvc)
+                            Utilities.FileCopy(Path.Combine(root, "objs", "x64", configurationMsvc, filename), Path.Combine(depsFolder, filename));
+
+                        break;
+                    }
+                    case TargetPlatform.XboxScarlett:
+                    {
+                        // Build for Xbox Scarlett
+                        Deploy.VCEnvironment.BuildSolution(vsSolutionPath, configurationMsvc, "x64", msvcProps);
+                        var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
+                        foreach (var filename in binariesToCopyMsvc)
+                            Utilities.FileCopy(Path.Combine(root, "objs", "x64", configurationMsvc, filename), Path.Combine(depsFolder, filename));
+
+                        break;
+                    }
+                    case TargetPlatform.Android:
+                    {
+                        // Disable using libpng, zlib and bzip2 even if they're found on the system
+                        var cmakeFile = Path.Combine(root, "CMakeLists.txt");
+                        File.WriteAllText(cmakeFile,
+                                          File.ReadAllText(cmakeFile)
+                                              .Replace("find_package(PNG)", "")
+                                              .Replace("find_package(ZLIB)", "")
+                                              .Replace("find_package(BZip2)", "")
+                                         );
+
+                        // Build for Android
+                        SetupDirectory(buildDir, true);
+                        RunCmake(buildDir, TargetPlatform.Android, TargetArchitecture.ARM64, ".. -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DFT_WITH_BZIP2=OFF -DFT_WITH_ZLIB=OFF -DFT_WITH_PNG=OFF -DCMAKE_BUILD_TYPE=Release");
+                        BuildCmake(buildDir);
+                        var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
+                        Utilities.FileCopy(Path.Combine(buildDir, libraryFileName), Path.Combine(depsFolder, libraryFileName));
+                        break;
+                    }
+                    case TargetPlatform.Switch:
+                    {
+                        // Build for Switch
+                        SetupDirectory(buildDir, true);
+                        RunCmake(buildDir, platform, TargetArchitecture.ARM64, ".. -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DCMAKE_BUILD_TYPE=Release");
+                        BuildCmake(buildDir);
+                        var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
+                        Utilities.FileCopy(Path.Combine(buildDir, libraryFileName), Path.Combine(depsFolder, libraryFileName));
+                        break;
+                    }
+                    case TargetPlatform.Mac:
+                    {
+                        // Build for Mac
                         SetupDirectory(buildDir, true);
                         RunCmake(buildDir, platform, architecture, ".. -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DCMAKE_BUILD_TYPE=Release");
                         BuildCmake(buildDir);
                         var depsFolder = GetThirdPartyFolder(options, platform, architecture);
                         Utilities.FileCopy(Path.Combine(buildDir, libraryFileName), Path.Combine(depsFolder, libraryFileName));
+                        break;
                     }
-                    break;
-                }
-                case TargetPlatform.iOS:
-                {
-                    // Fix archive creation issue due to missing ar tool
-                    Utilities.ReplaceInFile(Path.Combine(root, "builds/cmake/iOS.cmake"), "set(CMAKE_SYSTEM_NAME Darwin)", "set(CMAKE_SYSTEM_NAME Darwin)\nset(CMAKE_AR ar CACHE FILEPATH \"\" FORCE)");
+                    case TargetPlatform.iOS:
+                    {
+                        // Fix archive creation issue due to missing ar tool
+                        Utilities.ReplaceInFile(Path.Combine(root, "builds/cmake/iOS.cmake"), "set(CMAKE_SYSTEM_NAME Darwin)", "set(CMAKE_SYSTEM_NAME Darwin)\nset(CMAKE_AR ar CACHE FILEPATH \"\" FORCE)");
 
-                    // Fix freetype toolchain rejecting min iPhone version
-                    Utilities.ReplaceInFile(Path.Combine(root, "builds/cmake/iOS.cmake"), "set(CMAKE_OSX_DEPLOYMENT_TARGET \"\"", "set(CMAKE_OSX_DEPLOYMENT_TARGET \"${CMAKE_OSX_DEPLOYMENT_TARGET}\"");
+                        // Fix freetype toolchain rejecting min iPhone version
+                        Utilities.ReplaceInFile(Path.Combine(root, "builds/cmake/iOS.cmake"), "set(CMAKE_OSX_DEPLOYMENT_TARGET \"\"", "set(CMAKE_OSX_DEPLOYMENT_TARGET \"${CMAKE_OSX_DEPLOYMENT_TARGET}\"");
 
-                    // Build for iOS
-                    SetupDirectory(buildDir, true);
-                    RunCmake(buildDir, platform, TargetArchitecture.ARM64, ".. -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DIOS_PLATFORM=OS -DCMAKE_SYSTEM_NAME=iOS -DCMAKE_BUILD_TYPE=Release -DFT_WITH_BZIP2=OFF -DFT_WITH_ZLIB=OFF -DFT_WITH_PNG=OFF");
-                    BuildCmake(buildDir);
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
-                    Utilities.FileCopy(Path.Combine(buildDir, libraryFileName), Path.Combine(depsFolder, libraryFileName));
-                    break;
-                }
+                        // Build for iOS
+                        SetupDirectory(buildDir, true);
+                        RunCmake(buildDir, platform, TargetArchitecture.ARM64, ".. -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DIOS_PLATFORM=OS -DCMAKE_SYSTEM_NAME=iOS -DCMAKE_BUILD_TYPE=Release -DFT_WITH_BZIP2=OFF -DFT_WITH_ZLIB=OFF -DFT_WITH_PNG=OFF");
+                        BuildCmake(buildDir);
+                        var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
+                        Utilities.FileCopy(Path.Combine(buildDir, libraryFileName), Path.Combine(depsFolder, libraryFileName));
+                        break;
+                    }
+                    }
                 }
             }
 
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/glslang.cs b/Source/Tools/Flax.Build/Deps/Dependencies/glslang.cs
index a876083f8..e5f5e4dfe 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/glslang.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/glslang.cs
@@ -38,13 +38,43 @@ namespace Flax.Deps.Dependencies
             }
         }
 
+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                // The architecture list is selected by BuildPlatform
+                var buildPlatform = BuildPlatform;
+                if (buildPlatform == TargetPlatform.Windows || buildPlatform == TargetPlatform.Mac)
+                {
+                    // Windows and Mac share the same pair of architectures
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                }
+                if (buildPlatform == TargetPlatform.Linux)
+                {
+                    // Linux is x64 only for now, ARM64 stays commented out
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        //TargetArchitecture.ARM64,
+                    };
+                }
+                // Every other platform gets an empty list
+                return new TargetArchitecture[0];
+            }
+        }
+
         /// <inheritdoc />
         public override void Build(BuildOptions options)
         {
             var root = options.IntermediateFolder;
             var installDir = Path.Combine(root, "install");
             var configuration = "Release";
-            var cmakeArgs = string.Format("-DCMAKE_INSTALL_PREFIX=\"{0}\" -DCMAKE_BUILD_TYPE={1} -DENABLE_RTTI=ON -DENABLE_CTEST=OFF -DENABLE_HLSL=ON -DENABLE_SPVREMAPPER=ON -DENABLE_GLSLANG_BINARIES=OFF", installDir, configuration);
+            var cmakeArgs = $"-DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DCMAKE_INSTALL_PREFIX=\"{installDir}\" -DCMAKE_BUILD_TYPE={configuration} -DENABLE_RTTI=ON -DENABLE_CTEST=OFF -DENABLE_HLSL=ON -DENABLE_SPVREMAPPER=ON -DENABLE_GLSLANG_BINARIES=OFF";
             var libsRoot = Path.Combine(installDir, "lib");
 
             // Get the source
@@ -56,93 +86,88 @@ namespace Flax.Deps.Dependencies
 
             foreach (var platform in options.Platforms)
             {
-                BuildStarted(platform);
-                switch (platform)
+                foreach (var architecture in options.Architectures)
                 {
-                case TargetPlatform.Windows:
-                {
-                    var outputFiles = new[]
-                    {
-                        Path.Combine(libsRoot, "GenericCodeGen.lib"),
-                        Path.Combine(libsRoot, "MachineIndependent.lib"),
-                        Path.Combine(libsRoot, "HLSL.lib"),
-                        Path.Combine(libsRoot, "OSDependent.lib"),
-                        Path.Combine(libsRoot, "OGLCompiler.lib"),
-                        Path.Combine(libsRoot, "SPIRV-Tools-opt.lib"),
-                        Path.Combine(libsRoot, "SPIRV-Tools.lib"),
-                        Path.Combine(libsRoot, "SPIRV.lib"),
-                        Path.Combine(libsRoot, "glslang.lib"),
-                    };
+                    BuildStarted(platform, architecture);
 
-                    // Build for Windows
-                    foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
+                    var buildDir = Path.Combine(root, "build-" + architecture.ToString());
+                    switch (platform)
                     {
-                        var buildDir = Path.Combine(root, "build-" + architecture.ToString());
+                    case TargetPlatform.Windows:
+                    {
+                        var outputFiles = new[]
+                        {
+                            Path.Combine(libsRoot, "GenericCodeGen.lib"),
+                            Path.Combine(libsRoot, "MachineIndependent.lib"),
+                            Path.Combine(libsRoot, "HLSL.lib"),
+                            Path.Combine(libsRoot, "OSDependent.lib"),
+                            Path.Combine(libsRoot, "OGLCompiler.lib"),
+                            Path.Combine(libsRoot, "SPIRV-Tools-opt.lib"),
+                            Path.Combine(libsRoot, "SPIRV-Tools.lib"),
+                            Path.Combine(libsRoot, "SPIRV.lib"),
+                            Path.Combine(libsRoot, "glslang.lib"),
+                        };
+
+                        // Build for Windows
                         var solutionPath = Path.Combine(buildDir, "glslang.sln");
-
                         SetupDirectory(buildDir, false);
-                        RunCmake(root, platform, architecture, cmakeArgs + $" -B\"{buildDir}\"");
-                        Utilities.Run("cmake", string.Format("--build . --config {0} --target install", configuration), null, buildDir, Utilities.RunOptions.ConsoleLogOutput);
+                        RunCmake(root, platform, architecture, $"-B\"{buildDir}\" " + cmakeArgs);
                         Deploy.VCEnvironment.BuildSolution(solutionPath, configuration, architecture.ToString());
+                        Utilities.Run("cmake", $"--build \"{buildDir}\" --config {configuration} --target install", null, buildDir, Utilities.RunOptions.ConsoleLogOutput);
                         var depsFolder = GetThirdPartyFolder(options, platform, architecture);
                         foreach (var file in outputFiles)
                         {
                             Utilities.FileCopy(file, Path.Combine(depsFolder, Path.GetFileName(file)));
                         }
+                        break;
                     }
-                    break;
-                }
-                case TargetPlatform.Linux:
-                {
-                    var outputFiles = new[]
+                    case TargetPlatform.Linux:
                     {
-                        Path.Combine(libsRoot, "libGenericCodeGen.a"),
-                        Path.Combine(libsRoot, "libMachineIndependent.a"),
-                        Path.Combine(libsRoot, "libHLSL.a"),
-                        Path.Combine(libsRoot, "libOSDependent.a"),
-                        Path.Combine(libsRoot, "libOGLCompiler.a"),
-                        Path.Combine(libsRoot, "libSPIRV-Tools-opt.a"),
-                        Path.Combine(libsRoot, "libSPIRV-Tools.a"),
-                        Path.Combine(libsRoot, "libSPIRV.a"),
-                        Path.Combine(libsRoot, "libglslang.a"),
-                    };
-                    var buildDir = root;
+                        var outputFiles = new[]
+                        {
+                            Path.Combine(libsRoot, "libGenericCodeGen.a"),
+                            Path.Combine(libsRoot, "libMachineIndependent.a"),
+                            Path.Combine(libsRoot, "libHLSL.a"),
+                            Path.Combine(libsRoot, "libOSDependent.a"),
+                            Path.Combine(libsRoot, "libOGLCompiler.a"),
+                            Path.Combine(libsRoot, "libSPIRV-Tools-opt.a"),
+                            Path.Combine(libsRoot, "libSPIRV-Tools.a"),
+                            Path.Combine(libsRoot, "libSPIRV.a"),
+                            Path.Combine(libsRoot, "libglslang.a"),
+                        };
 
-                    // Build for Linux
-                    RunCmake(root, platform, TargetArchitecture.x64, cmakeArgs);
-                    Utilities.Run("cmake", string.Format("--build . --config {0} --target install", configuration), null, buildDir, Utilities.RunOptions.ConsoleLogOutput);
-                    Utilities.Run("make", null, null, root, Utilities.RunOptions.ConsoleLogOutput);
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
-                    foreach (var file in outputFiles)
-                    {
-                        var dst = Path.Combine(depsFolder, Path.GetFileName(file));
-                        Utilities.FileCopy(file, dst);
-                        //Utilities.Run("strip", string.Format("-s \"{0}\"", dst), null, null, Utilities.RunOptions.ConsoleLogOutput);
+                        // Build for Linux
+                        RunCmake(root, platform, architecture, $"-B\"{buildDir}\" " + cmakeArgs);
+                        Utilities.Run("make", null, null, buildDir, Utilities.RunOptions.ConsoleLogOutput);
+                        Utilities.Run("cmake", $"--build \"{buildDir}\" --config {configuration} --target install", null, buildDir, Utilities.RunOptions.ConsoleLogOutput);
+                        var depsFolder = GetThirdPartyFolder(options, platform, architecture);
+                        foreach (var file in outputFiles)
+                        {
+                            var dst = Path.Combine(depsFolder, Path.GetFileName(file));
+                            Utilities.FileCopy(file, dst);
+                            //Utilities.Run("strip", string.Format("-s \"{0}\"", dst), null, null, Utilities.RunOptions.ConsoleLogOutput);
+                        }
+                        break;
                     }
-                    break;
-                }
-                case TargetPlatform.Mac:
-                {
-                    var outputFiles = new[]
+                    case TargetPlatform.Mac:
                     {
-                        Path.Combine(libsRoot, "libGenericCodeGen.a"),
-                        Path.Combine(libsRoot, "libMachineIndependent.a"),
-                        Path.Combine(libsRoot, "libHLSL.a"),
-                        Path.Combine(libsRoot, "libOSDependent.a"),
-                        Path.Combine(libsRoot, "libOGLCompiler.a"),
-                        Path.Combine(libsRoot, "libSPIRV-Tools-opt.a"),
-                        Path.Combine(libsRoot, "libSPIRV-Tools.a"),
-                        Path.Combine(libsRoot, "libSPIRV.a"),
-                        Path.Combine(libsRoot, "libglslang.a"),
-                    };
-                    var buildDir = root;
+                        var outputFiles = new[]
+                        {
+                            Path.Combine(libsRoot, "libGenericCodeGen.a"),
+                            Path.Combine(libsRoot, "libMachineIndependent.a"),
+                            Path.Combine(libsRoot, "libHLSL.a"),
+                            Path.Combine(libsRoot, "libOSDependent.a"),
+                            Path.Combine(libsRoot, "libOGLCompiler.a"),
+                            Path.Combine(libsRoot, "libSPIRV-Tools-opt.a"),
+                            Path.Combine(libsRoot, "libSPIRV-Tools.a"),
+                            Path.Combine(libsRoot, "libSPIRV.a"),
+                            Path.Combine(libsRoot, "libglslang.a"),
+                        };
 
-                    // Build for Mac
-                    foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
-                    {
-                        RunCmake(root, platform, architecture, cmakeArgs);
-                        Utilities.Run("cmake", string.Format("--build . --config {0} --target install", configuration), null, buildDir, Utilities.RunOptions.ConsoleLogOutput);
-                        Utilities.Run("make", null, null, root, Utilities.RunOptions.ConsoleLogOutput);
+                        // Build for Mac
+                        RunCmake(root, platform, architecture, $"-B\"{buildDir}\" " + cmakeArgs);
+                        Utilities.Run("make", null, null, buildDir, Utilities.RunOptions.ConsoleLogOutput);
+                        Utilities.Run("cmake", $"--build \"{buildDir}\" --config {configuration} --target install", null, buildDir, Utilities.RunOptions.ConsoleLogOutput);
                         var depsFolder = GetThirdPartyFolder(options, platform, architecture);
                         foreach (var file in outputFiles)
                         {
@@ -150,9 +175,9 @@ namespace Flax.Deps.Dependencies
                             Utilities.FileCopy(file, dst);
                             Utilities.Run("strip", string.Format("\"{0}\"", dst), null, null, Utilities.RunOptions.ConsoleLogOutput);
                         }
+                        break;
+                    }
                     }
-                    break;
-                }
                 }
             }
 
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/mono.cs b/Source/Tools/Flax.Build/Deps/Dependencies/mono.cs
index 57d2f74fe..ad402d3d4 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/mono.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/mono.cs
@@ -53,6 +53,36 @@ namespace Flax.Deps.Dependencies
             }
         }
 
+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                // The architecture list is selected by BuildPlatform
+                var buildPlatform = BuildPlatform;
+                if (buildPlatform == TargetPlatform.Windows || buildPlatform == TargetPlatform.Mac)
+                {
+                    // Windows and Mac share the same pair of architectures
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                }
+                if (buildPlatform == TargetPlatform.Linux)
+                {
+                    // Linux is x64 only for now, ARM64 stays commented out
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        //TargetArchitecture.ARM64,
+                    };
+                }
+                // Every other platform gets an empty list
+                return new TargetArchitecture[0];
+            }
+        }
+
         private string root;
         private string monoPropsPath;
         private string monoPreprocesorDefines;
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/nethost.cs b/Source/Tools/Flax.Build/Deps/Dependencies/nethost.cs
index 1991aa605..4e79718df 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/nethost.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/nethost.cs
@@ -43,6 +43,39 @@ namespace Flax.Deps.Dependencies
             }
         }
 
+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                // The architecture list is selected by BuildPlatform
+                var buildPlatform = BuildPlatform;
+                if (buildPlatform == TargetPlatform.Windows || buildPlatform == TargetPlatform.Mac)
+                {
+                    // Windows and Mac share the same pair of architectures
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                }
+                if (buildPlatform == TargetPlatform.Linux)
+                {
+                    // Linux is x64 only for now, ARM64 stays commented out
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        //TargetArchitecture.ARM64,
+                    };
+                }
+                // Every other platform gets an empty list
+                return new TargetArchitecture[0];
+            }
+        }
+
+        /// <inheritdoc />
+        public override bool BuildByDefault { get { return false; } }
+
         private string root;
         private bool cleanArtifacts;
 
@@ -331,24 +364,27 @@ namespace Flax.Deps.Dependencies
 
             foreach (var platform in options.Platforms)
             {
-                BuildStarted(platform);
-                var platformData = Path.Combine(GetBinariesFolder(options, platform), "Data", "nethost");
-                if (Directory.Exists(platformData))
-                    Utilities.DirectoryCopy(platformData, root, true, true);
-                switch (platform)
+                foreach (var architecture in options.Architectures)
                 {
-                case TargetPlatform.PS4:
-                case TargetPlatform.PS5:
-                case TargetPlatform.XboxOne:
-                case TargetPlatform.XboxScarlett:
-                    Build(options, platform, TargetArchitecture.x64);
+                    BuildStarted(platform, architecture);
+                    var platformData = Path.Combine(GetBinariesFolder(options, platform), "Data", "nethost");
+                    if (Directory.Exists(platformData))
+                        Utilities.DirectoryCopy(platformData, root, true, true);
+                    switch (platform)
+                    {
+                    case TargetPlatform.PS4:
+                    case TargetPlatform.PS5:
+                    case TargetPlatform.XboxOne:
+                    case TargetPlatform.XboxScarlett:
+                        Build(options, platform, TargetArchitecture.x64);
                     break;
-                case TargetPlatform.Android:
-                    Build(options, platform, TargetArchitecture.ARM64);
+                    case TargetPlatform.Android:
+                        Build(options, platform, TargetArchitecture.ARM64);
                     break;
-                case TargetPlatform.Switch:
-                    Build(options, platform, TargetArchitecture.ARM64);
+                    case TargetPlatform.Switch:
+                        Build(options, platform, TargetArchitecture.ARM64);
                     break;
+                    }
                 }
             }
 
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/nvapi.cs b/Source/Tools/Flax.Build/Deps/Dependencies/nvapi.cs
index d1d94b4c1..6f18a9190 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/nvapi.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/nvapi.cs
@@ -18,6 +18,23 @@ namespace Flax.Deps.Dependencies
             get => new[] { TargetPlatform.Windows };
         }
 
+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                // Only a Windows BuildPlatform yields an architecture (x64)
+                if (BuildPlatform == TargetPlatform.Windows)
+                {
+                    return new[]
+                    {
+                        TargetArchitecture.x64
+                    };
+                }
+                return new TargetArchitecture[0];
+            }
+        }
+
         /// <inheritdoc />
         public override void Build(BuildOptions options)
         {
@@ -30,7 +47,7 @@ namespace Flax.Deps.Dependencies
             // Copy files
             foreach (var platform in options.Platforms)
             {
-                BuildStarted(platform);
+                BuildStarted(platform, TargetArchitecture.x64);
                 var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
                 Utilities.FileCopy(Path.Combine(root, "amd64/nvapi64.lib"), Path.Combine(depsFolder, "nvapi64.lib"));
             }
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/vorbis.cs b/Source/Tools/Flax.Build/Deps/Dependencies/vorbis.cs
index f2dc8af3b..402adbe4f 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/vorbis.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/vorbis.cs
@@ -49,6 +49,36 @@ namespace Flax.Deps.Dependencies
             }
         }
 
+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                // The architecture list is selected by BuildPlatform
+                var buildPlatform = BuildPlatform;
+                if (buildPlatform == TargetPlatform.Windows || buildPlatform == TargetPlatform.Mac)
+                {
+                    // Windows and Mac share the same pair of architectures
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                }
+                if (buildPlatform == TargetPlatform.Linux)
+                {
+                    // Linux is x64 only for now, ARM64 stays commented out
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        //TargetArchitecture.ARM64,
+                    };
+                }
+                // Every other platform gets an empty list
+                return new TargetArchitecture[0];
+            }
+        }
+
         private struct Binary
         {
             public string Filename;
@@ -337,97 +367,98 @@ namespace Flax.Deps.Dependencies
 
             foreach (var platform in options.Platforms)
             {
-                BuildStarted(platform);
-                switch (platform)
+                foreach (var architecture in options.Architectures)
                 {
-                case TargetPlatform.Windows:
-                {
-                    BuildCmake(options, TargetPlatform.Windows, TargetArchitecture.x64);
-                    BuildCmake(options, TargetPlatform.Windows, TargetArchitecture.ARM64);
-                    break;
-                }
-                case TargetPlatform.UWP:
-                {
-                    BuildMsbuild(options, TargetPlatform.UWP, TargetArchitecture.x64);
-                    break;
-                }
-                case TargetPlatform.XboxOne:
-                {
-                    BuildMsbuild(options, TargetPlatform.XboxOne, TargetArchitecture.x64);
-                    break;
-                }
-                case TargetPlatform.Linux:
-                {
-                    // Note: assumes the libogg-dev package is pre-installed on the system
-
-                    // Get the source
-                    CloneGitRepoFast(root, "https://github.com/xiph/vorbis.git");
-
-                    var envVars = new Dictionary<string, string>
+                    BuildStarted(platform, architecture);
+                    switch (platform)
                     {
-                        { "CC", "clang-" + Configuration.LinuxClangMinVer },
-                        { "CC_FOR_BUILD", "clang-" + Configuration.LinuxClangMinVer },
-                        { "CXX", "clang++-" + Configuration.LinuxClangMinVer },
-                        { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
-                    };
-                    var buildDir = Path.Combine(root, "build");
+                    case TargetPlatform.Windows:
+                    {
+                        BuildCmake(options, TargetPlatform.Windows, architecture);
+                        break;
+                    }
+                    case TargetPlatform.UWP:
+                    {
+                        BuildMsbuild(options, TargetPlatform.UWP, architecture);
+                        break;
+                    }
+                    case TargetPlatform.XboxOne:
+                    {
+                        BuildMsbuild(options, TargetPlatform.XboxOne, architecture);
+                        break;
+                    }
+                    case TargetPlatform.Linux:
+                    {
+                        // Note: assumes the libogg-dev package is pre-installed on the system
 
-                    Utilities.Run(Path.Combine(root, "autogen.sh"), null, null, root, Utilities.RunOptions.DefaultTool, envVars);
+                        // Get the source
+                        CloneGitRepoFast(root, "https://github.com/xiph/vorbis.git");
 
-                    // Build for Linux
-                    var toolchain = UnixToolchain.GetToolchainName(platform, TargetArchitecture.x64);
-                    Utilities.Run(Path.Combine(root, "configure"), string.Format("--host={0}", toolchain), null, root, Utilities.RunOptions.ThrowExceptionOnError, envVars);
-                    SetupDirectory(buildDir, true);
-                    Utilities.Run("cmake", "-G \"Unix Makefiles\" -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_BUILD_TYPE=Release ..", null, buildDir, Utilities.RunOptions.ConsoleLogOutput, envVars);
-                    Utilities.Run("cmake", "--build .", null, buildDir, Utilities.RunOptions.ConsoleLogOutput, envVars);
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
-                    foreach (var file in binariesToCopyUnix)
-                        Utilities.FileCopy(Path.Combine(buildDir, file.SrcFolder, file.Filename), Path.Combine(depsFolder, file.Filename));
-                    break;
-                }
-                case TargetPlatform.PS4:
-                {
-                    BuildMsbuild(options, TargetPlatform.PS4, TargetArchitecture.x64);
-                    break;
-                }
-                case TargetPlatform.PS5:
-                {
-                    BuildMsbuild(options, TargetPlatform.PS5, TargetArchitecture.x64);
-                    break;
-                }
-                case TargetPlatform.XboxScarlett:
-                {
-                    BuildMsbuild(options, TargetPlatform.XboxScarlett, TargetArchitecture.x64);
-                    break;
-                }
-                case TargetPlatform.Android:
-                {
-                    var oggRoot = Path.Combine(root, "ogg");
-                    var oggBuildDir = Path.Combine(oggRoot, "build");
-                    var buildDir = Path.Combine(root, "build");
+                        var envVars = new Dictionary<string, string>
+                        {
+                            { "CC", "clang-" + Configuration.LinuxClangMinVer },
+                            { "CC_FOR_BUILD", "clang-" + Configuration.LinuxClangMinVer },
+                            { "CXX", "clang++-" + Configuration.LinuxClangMinVer },
+                            { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
+                        };
+                        var buildDir = Path.Combine(root, "build");
 
-                    // Get the source
-                    CloneGitRepoFast(root, "https://github.com/xiph/vorbis.git");
-                    CloneGitRepo(oggRoot, "https://github.com/xiph/ogg.git");
-                    GitCheckout(oggRoot, "master", "4380566a44b8d5e85ad511c9c17eb04197863ec5");
+                        Utilities.Run(Path.Combine(root, "autogen.sh"), null, null, root, Utilities.RunOptions.DefaultTool, envVars);
 
-                    // Build for Android
-                    SetupDirectory(oggBuildDir, true);
-                    RunCmake(oggBuildDir, platform, TargetArchitecture.ARM64, ".. -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=\"../install\"");
-                    Utilities.Run("cmake", "--build . --target install", null, oggBuildDir, Utilities.RunOptions.ConsoleLogOutput);
-                    SetupDirectory(buildDir, true);
-                    RunCmake(buildDir, platform, TargetArchitecture.ARM64, string.Format(".. -DCMAKE_BUILD_TYPE=Release  -DOGG_INCLUDE_DIR=\"{0}/install/include\" -DOGG_LIBRARY=\"{0}/install/lib\"", oggRoot));
-                    BuildCmake(buildDir);
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
-                    foreach (var file in binariesToCopyUnix)
-                        Utilities.FileCopy(Path.Combine(buildDir, file.SrcFolder, file.Filename), Path.Combine(depsFolder, file.Filename));
-                    break;
-                }
-                case TargetPlatform.Switch:
-                {
-                    var oggRoot = Path.Combine(root, "ogg");
-                    var oggBuildDir = Path.Combine(oggRoot, "build");
-                    var buildDir = Path.Combine(root, "build");
+                        // Build for Linux
+                        var toolchain = UnixToolchain.GetToolchainName(platform, architecture);
+                        Utilities.Run(Path.Combine(root, "configure"), string.Format("--host={0}", toolchain), null, root, Utilities.RunOptions.ThrowExceptionOnError, envVars);
+                        SetupDirectory(buildDir, true);
+                        Utilities.Run("cmake", "-G \"Unix Makefiles\" -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_BUILD_TYPE=Release ..", null, buildDir, Utilities.RunOptions.ConsoleLogOutput, envVars);
+                        Utilities.Run("cmake", "--build .", null, buildDir, Utilities.RunOptions.ConsoleLogOutput, envVars);
+                        var depsFolder = GetThirdPartyFolder(options, platform, architecture);
+                        foreach (var file in binariesToCopyUnix)
+                            Utilities.FileCopy(Path.Combine(buildDir, file.SrcFolder, file.Filename), Path.Combine(depsFolder, file.Filename));
+                        break;
+                    }
+                    case TargetPlatform.PS4:
+                    {
+                        BuildMsbuild(options, TargetPlatform.PS4, TargetArchitecture.x64);
+                        break;
+                    }
+                    case TargetPlatform.PS5:
+                    {
+                        BuildMsbuild(options, TargetPlatform.PS5, TargetArchitecture.x64);
+                        break;
+                    }
+                    case TargetPlatform.XboxScarlett:
+                    {
+                        BuildMsbuild(options, TargetPlatform.XboxScarlett, TargetArchitecture.x64);
+                        break;
+                    }
+                    case TargetPlatform.Android:
+                    {
+                        var oggRoot = Path.Combine(root, "ogg");
+                        var oggBuildDir = Path.Combine(oggRoot, "build");
+                        var buildDir = Path.Combine(root, "build");
+
+                        // Get the source
+                        CloneGitRepoFast(root, "https://github.com/xiph/vorbis.git");
+                        CloneGitRepo(oggRoot, "https://github.com/xiph/ogg.git");
+                        GitCheckout(oggRoot, "master", "4380566a44b8d5e85ad511c9c17eb04197863ec5");
+
+                        // Build for Android
+                        SetupDirectory(oggBuildDir, true);
+                        RunCmake(oggBuildDir, platform, TargetArchitecture.ARM64, ".. -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=\"../install\"");
+                        Utilities.Run("cmake", "--build . --target install", null, oggBuildDir, Utilities.RunOptions.ConsoleLogOutput);
+                        SetupDirectory(buildDir, true);
+                        RunCmake(buildDir, platform, TargetArchitecture.ARM64, string.Format(".. -DCMAKE_BUILD_TYPE=Release  -DOGG_INCLUDE_DIR=\"{0}/install/include\" -DOGG_LIBRARY=\"{0}/install/lib\"", oggRoot));
+                        BuildCmake(buildDir);
+                        var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
+                        foreach (var file in binariesToCopyUnix)
+                            Utilities.FileCopy(Path.Combine(buildDir, file.SrcFolder, file.Filename), Path.Combine(depsFolder, file.Filename));
+                        break;
+                    }
+                    case TargetPlatform.Switch:
+                    {
+                        var oggRoot = Path.Combine(root, "ogg");
+                        var oggBuildDir = Path.Combine(oggRoot, "build");
+                        var buildDir = Path.Combine(root, "build");
 
                     // Get the source
                     SetupDirectory(oggRoot, false);
@@ -457,14 +488,12 @@ namespace Flax.Deps.Dependencies
                     var oggBuildDir = Path.Combine(oggRoot, "build");
                     var buildDir = Path.Combine(root, "build");
 
-                    // Get the source
-                    CloneGitRepoFast(root, "https://github.com/xiph/vorbis.git");
-                    CloneGitRepo(oggRoot, "https://github.com/xiph/ogg.git");
-                    GitCheckout(oggRoot, "master", "4380566a44b8d5e85ad511c9c17eb04197863ec5");
+                        // Get the source
+                        CloneGitRepoFast(root, "https://github.com/xiph/vorbis.git");
+                        CloneGitRepo(oggRoot, "https://github.com/xiph/ogg.git");
+                        GitCheckout(oggRoot, "master", "4380566a44b8d5e85ad511c9c17eb04197863ec5");
 
-                    // Build for Mac
-                    foreach (var architecture in new[] { TargetArchitecture.x64, TargetArchitecture.ARM64 })
-                    {
+                        // Build for Mac
                         SetupDirectory(oggBuildDir, true);
                         RunCmake(oggBuildDir, platform, architecture, ".. -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=\"../install\"");
                         Utilities.Run("cmake", "--build . --target install", null, oggBuildDir, Utilities.RunOptions.ConsoleLogOutput);
@@ -474,32 +503,32 @@ namespace Flax.Deps.Dependencies
                         var depsFolder = GetThirdPartyFolder(options, platform, architecture);
                         foreach (var file in binariesToCopyUnix)
                             Utilities.FileCopy(Path.Combine(buildDir, file.SrcFolder, file.Filename), Path.Combine(depsFolder, file.Filename));
+                        break;
                     }
-                    break;
-                }
-                case TargetPlatform.iOS:
-                {
-                    var oggRoot = Path.Combine(root, "ogg");
-                    var oggBuildDir = Path.Combine(oggRoot, "build");
-                    var buildDir = Path.Combine(root, "build");
+                    case TargetPlatform.iOS:
+                    {
+                        var oggRoot = Path.Combine(root, "ogg");
+                        var oggBuildDir = Path.Combine(oggRoot, "build");
+                        var buildDir = Path.Combine(root, "build");
 
-                    // Get the source
-                    CloneGitRepoFast(root, "https://github.com/xiph/vorbis.git");
-                    CloneGitRepo(oggRoot, "https://github.com/xiph/ogg.git");
-                    GitCheckout(oggRoot, "master", "4380566a44b8d5e85ad511c9c17eb04197863ec5");
+                        // Get the source
+                        CloneGitRepoFast(root, "https://github.com/xiph/vorbis.git");
+                        CloneGitRepo(oggRoot, "https://github.com/xiph/ogg.git");
+                        GitCheckout(oggRoot, "master", "4380566a44b8d5e85ad511c9c17eb04197863ec5");
 
-                    // Build for Mac
-                    SetupDirectory(oggBuildDir, true);
-                    RunCmake(oggBuildDir, platform, TargetArchitecture.ARM64, ".. -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=\"../install\"");
-                    Utilities.Run("cmake", "--build . --target install", null, oggBuildDir, Utilities.RunOptions.ConsoleLogOutput);
-                    SetupDirectory(buildDir, true);
-                    RunCmake(buildDir, platform, TargetArchitecture.ARM64, string.Format(".. -DCMAKE_BUILD_TYPE=Release  -DOGG_INCLUDE_DIR=\"{0}/install/include\" -DOGG_LIBRARY=\"{0}/install/lib\"", oggRoot));
-                    BuildCmake(buildDir);
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
-                    foreach (var file in binariesToCopyUnix)
-                        Utilities.FileCopy(Path.Combine(buildDir, file.SrcFolder, file.Filename), Path.Combine(depsFolder, file.Filename));
-                    break;
-                }
+                        // Build for iOS
+                        SetupDirectory(oggBuildDir, true);
+                        RunCmake(oggBuildDir, platform, TargetArchitecture.ARM64, ".. -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=\"../install\"");
+                        Utilities.Run("cmake", "--build . --target install", null, oggBuildDir, Utilities.RunOptions.ConsoleLogOutput);
+                        SetupDirectory(buildDir, true);
+                        RunCmake(buildDir, platform, TargetArchitecture.ARM64, string.Format(".. -DCMAKE_BUILD_TYPE=Release  -DOGG_INCLUDE_DIR=\"{0}/install/include\" -DOGG_LIBRARY=\"{0}/install/lib\"", oggRoot));
+                        BuildCmake(buildDir);
+                        var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
+                        foreach (var file in binariesToCopyUnix)
+                            Utilities.FileCopy(Path.Combine(buildDir, file.SrcFolder, file.Filename), Path.Combine(depsFolder, file.Filename));
+                        break;
+                    }
+                    }
                 }
             }
 
diff --git a/Source/Tools/Flax.Build/Deps/Dependency.cs b/Source/Tools/Flax.Build/Deps/Dependency.cs
index 010a45175..381783e29 100644
--- a/Source/Tools/Flax.Build/Deps/Dependency.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependency.cs
@@ -40,6 +40,11 @@ namespace Flax.Deps
             /// The target platforms to build dependency for (contains only platforms supported by the dependency itself).
             /// </summary>
             public TargetPlatform[] Platforms;
+
+            /// <summary>
+            /// The target architectures to build dependency for (contains only architectures supported by the dependency itself).
+            /// </summary>
+            public TargetArchitecture[] Architectures;
         }
 
         /// <summary>
@@ -70,6 +75,11 @@ namespace Flax.Deps
         /// </summary>
         public abstract TargetPlatform[] Platforms { get; }
 
+        /// <summary>
+        /// Gets the architectures list supported by this dependency to build on the current build platform (based on <see cref="Platform.BuildPlatform"/>).
+        /// </summary>
+        public abstract TargetArchitecture[] Architectures { get; }
+
         /// <summary>
         /// True if build dependency by default, otherwise only when explicitly specified via command line.
         /// </summary>
@@ -85,9 +95,9 @@ namespace Flax.Deps
         /// Logs build process start.
         /// </summary>
         /// <param name="platform">Target platform.</param>
-        protected void BuildStarted(TargetPlatform platform)
+        protected void BuildStarted(TargetPlatform platform, TargetArchitecture architecture)
         {
-            Log.Info($"Building {GetType().Name} for {platform}");
+            Log.Info($"Building {GetType().Name} for {platform}{(architecture != TargetArchitecture.AnyCPU ? $" ({architecture})" : "")}");
         }
 
         /// <summary>
diff --git a/Source/Tools/Flax.Build/Deps/DepsBuilder.cs b/Source/Tools/Flax.Build/Deps/DepsBuilder.cs
index c43c39ea3..1b8389080 100644
--- a/Source/Tools/Flax.Build/Deps/DepsBuilder.cs
+++ b/Source/Tools/Flax.Build/Deps/DepsBuilder.cs
@@ -38,20 +38,21 @@ namespace Flax.Deps
             var platforms = Globals.AllPlatforms;
             if (Configuration.BuildPlatforms != null && Configuration.BuildPlatforms.Length != 0)
                 platforms = Configuration.BuildPlatforms;
-            platforms = platforms.Where(x => buildPlatform.CanBuildPlatform(x)).ToArray();
-            Log.Verbose("Building deps for platforms:");
+            platforms = platforms.Where(buildPlatform.CanBuildPlatform).ToArray();
+            var architectures = Globals.AllArchitectures;
+            if (Configuration.BuildArchitectures != null && Configuration.BuildArchitectures.Length != 0)
+                architectures = Configuration.BuildArchitectures;
+            architectures = architectures.Where(buildPlatform.CanBuildArchitecture).ToArray();
+            Log.Verbose($"Building deps for platforms {string.Join(',', platforms)}, {string.Join(',', architectures)}:");
             foreach (var platform in platforms)
             {
-                Log.Verbose(" - " + platform);
+                foreach (var architecture in architectures)
+                {
+                    Log.Verbose($" - {platform} ({architecture})");
 
-                if (Platform.IsPlatformSupported(platform, TargetArchitecture.x64))
-                    SetupDepsOutputFolder(options, platform, TargetArchitecture.x64);
-                if (Platform.IsPlatformSupported(platform, TargetArchitecture.x86))
-                    SetupDepsOutputFolder(options, platform, TargetArchitecture.x86);
-                if (Platform.IsPlatformSupported(platform, TargetArchitecture.ARM))
-                    SetupDepsOutputFolder(options, platform, TargetArchitecture.ARM);
-                if (Platform.IsPlatformSupported(platform, TargetArchitecture.ARM64))
-                    SetupDepsOutputFolder(options, platform, TargetArchitecture.ARM64);
+                    if (Platform.IsPlatformSupported(platform, architecture))
+                        SetupDepsOutputFolder(options, platform, architecture);
+                }
             }
 
             // Get all deps
@@ -80,6 +81,14 @@ namespace Flax.Deps
                     continue;
                 }
 
+                options.Architectures = architectures.Intersect(dependency.Architectures).ToArray();
+                if (options.Architectures.Length == 0)
+                {
+                    Log.Info(string.Format("Skipping {0} ({1}/{2})", name, i + 1, dependencies.Length));
+                    Log.Verbose("Architecture not used on any of the build platforms.");
+                    continue;
+                }
+
                 Log.Info(string.Format("Building {0} ({1}/{2})", name, i + 1, dependencies.Length));
 
                 options.IntermediateFolder = Path.Combine(Environment.CurrentDirectory, "Cache", "Intermediate", "Deps", name).Replace('\\', '/');

From 84c79d5192ac14abfdee0f48d36b7899e07f7bb6 Mon Sep 17 00:00:00 2001
From: Ari Vuollet <ari.vuollet@goat.moe>
Date: Sat, 18 Oct 2025 02:33:44 +0300
Subject: [PATCH 11/33] Fix building Assimp on Linux

Versioned clang++ symlinks are not available on Arch
---
 Source/Tools/Flax.Build/Deps/Dependencies/Assimp.cs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/Assimp.cs b/Source/Tools/Flax.Build/Deps/Dependencies/Assimp.cs
index e31a3a059..629a69070 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/Assimp.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/Assimp.cs
@@ -155,11 +155,12 @@ namespace Flax.Deps.Dependencies
                         {
                             { "CC", "clang-" + Configuration.LinuxClangMinVer },
                             { "CC_FOR_BUILD", "clang-" + Configuration.LinuxClangMinVer },
-                            { "CXX", "clang++-" + Configuration.LinuxClangMinVer },
+                            { "CXX", "clang-" + Configuration.LinuxClangMinVer },
                             { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
                         };
 
                         // Build for Linux
+                        File.Delete(Path.Combine(root, "CMakeCache.txt"));
                         RunCmake(root, platform, architecture, " -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF " + globalConfig, envVars);
                         Utilities.Run("make", null, null, root, Utilities.RunOptions.DefaultTool, envVars);
                         configHeaderFilePath = Path.Combine(root, "include", "assimp", "config.h");
@@ -170,6 +171,7 @@ namespace Flax.Deps.Dependencies
                     case TargetPlatform.Mac:
                     {
                         // Build for Mac
+                        File.Delete(Path.Combine(root, "CMakeCache.txt"));
                         RunCmake(root, platform, architecture, " -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF " + globalConfig);
                         Utilities.Run("make", null, null, root, Utilities.RunOptions.DefaultTool);
                         configHeaderFilePath = Path.Combine(root, "include", "assimp", "config.h");

From b8b9ba3069bb8d6c441007f9a2f0477413806d16 Mon Sep 17 00:00:00 2001
From: Ari Vuollet <ari.vuollet@goat.moe>
Date: Sat, 18 Oct 2025 02:33:54 +0300
Subject: [PATCH 12/33] Fix building curl on Linux

---
 Source/Tools/Flax.Build/Deps/Dependencies/curl.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/curl.cs b/Source/Tools/Flax.Build/Deps/Dependencies/curl.cs
index a559f1c7f..ceca92798 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/curl.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/curl.cs
@@ -123,7 +123,7 @@ namespace Flax.Deps.Dependencies
                         var settings = new[]
                         {
                             "--without-librtmp",
-                            "--without-ssl",
+                            //"--without-ssl",
                             "--with-gnutls",
                             "--disable-ipv6",
                             "--disable-manual",

From d5bd857c45539838add2f5bebeff75e5cdeef51e Mon Sep 17 00:00:00 2001
From: Ari Vuollet <ari.vuollet@goat.moe>
Date: Sat, 18 Oct 2025 02:34:04 +0300
Subject: [PATCH 13/33] Support building OpenAL from Git repository in other
 platforms

---
 .../Flax.Build/Deps/Dependencies/OpenAL.cs    | 102 ++++++++----------
 1 file changed, 45 insertions(+), 57 deletions(-)

diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/OpenAL.cs b/Source/Tools/Flax.Build/Deps/Dependencies/OpenAL.cs
index c87931179..5e194edd4 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/OpenAL.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/OpenAL.cs
@@ -1,5 +1,5 @@
 // Copyright (c) Wojciech Figat. All rights reserved.
-
+//#define USE_GIT_REPOSITORY
 using System;
 using System.Collections.Generic;
 using System.IO;
@@ -85,12 +85,15 @@ namespace Flax.Deps.Dependencies
             var dstIncludePath = Path.Combine(options.ThirdPartyFolder, "OpenAL");
             var noSSL = true; // OpenAL Soft website has broken certs
 
+#if !USE_GIT_REPOSITORY
             if (options.Platforms.Contains(TargetPlatform.Windows))
+#endif
             {
                 // Get the source
                 CloneGitRepo(root, "https://github.com/kcat/openal-soft.git");
-                GitCheckout(root, "master", "d3875f333fb6abe2f39d82caca329414871ae53b"); // 1.23.1
+                GitCheckout(root, "master", "dc7d7054a5b4f3bec1dc23a42fd616a0847af948"); // 1.24.3
             }
+#if !USE_GIT_REPOSITORY
             else
             {
                 // Get the source
@@ -98,25 +101,20 @@ namespace Flax.Deps.Dependencies
                 if (!File.Exists(packagePath))
                 {
                     Downloader.DownloadFileFromUrlToPath("https://openal-soft.org/openal-releases/openal-soft-" + version + ".tar.bz2", packagePath, noSSL);
-                    using (ZipArchive archive = ZipFile.Open(packagePath, ZipArchiveMode.Read))
+                    if (Platform.BuildTargetPlatform == TargetPlatform.Windows)
                     {
-                        if (!Directory.Exists(root))
-                            archive.ExtractToDirectory(root);
-                        root = Path.Combine(root, archive.Entries.First().FullName);
+                        // TODO: Maybe use PowerShell Expand-Archive instead?
+                        var sevenZip = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.ProgramFiles), "7-Zip", "7z.exe");
+                        Utilities.Run(sevenZip, "x package.zip", null, root);
+                        Utilities.Run(sevenZip, "x package", null, root);
+                    }
+                    else
+                    {
+                        Utilities.Run("tar", "xjf " + packagePath.Replace('\\', '/'), null, root, Utilities.RunOptions.ConsoleLogOutput);
                     }
                 }
-                /*if (Platform.BuildTargetPlatform == TargetPlatform.Windows)
-                {
-                    // TODO: Maybe use PowerShell Expand-Archive instead?
-                    var sevenZip = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.ProgramFiles), "7-Zip", "7z.exe");
-                    Utilities.Run(sevenZip, "x package.zip", null, root);
-                    Utilities.Run(sevenZip, "x package", null, root);
-                }
-                else
-                {
-                    Utilities.Run("tar", "xjf " + packagePath.Replace('\\', '/'), null, root, Utilities.RunOptions.ConsoleLogOutput);
-                }*/
             }
+#endif
 
             foreach (var platform in options.Platforms)
             {
@@ -136,40 +134,12 @@ namespace Flax.Deps.Dependencies
                         // Build for Windows
                         var buildDir = Path.Combine(root, "build-" + architecture.ToString());
                         var solutionPath = Path.Combine(buildDir, "OpenAL.sln");
-
+                        SetupDirectory(buildDir, true);
                         RunCmake(root, platform, architecture, $"-B\"{buildDir}\" -DBUILD_SHARED_LIBS=OFF -DCMAKE_C_FLAGS=\"/D_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR /EHsc\" -DCMAKE_CXX_FLAGS=\"/D_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR /EHsc\" " + cmakeArgs);
                         Deploy.VCEnvironment.BuildSolution(solutionPath, configuration, architecture.ToString());
                         var depsFolder = GetThirdPartyFolder(options, platform, architecture);
                         foreach (var file in binariesToCopy)
                             Utilities.FileCopy(Path.Combine(buildDir, configuration, file), Path.Combine(depsFolder, Path.GetFileName(file)));
-
-#if false
-                        // Get the binaries
-                        var packagePath = Path.Combine(root, "package.zip");
-                        if (!File.Exists(packagePath))
-                            Downloader.DownloadFileFromUrlToPath("https://openal-soft.org/openal-binaries/openal-soft-" + version + "-bin.zip", packagePath, noSSL);
-                        using (ZipArchive archive = ZipFile.Open(packagePath, ZipArchiveMode.Read))
-                        {
-                            if (!Directory.Exists(root))
-                                archive.ExtractToDirectory(root);
-                            root = Path.Combine(root, archive.Entries.First().FullName);
-                        }
-
-                        // Deploy Win64 binaries
-                        var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
-                        Utilities.FileCopy(Path.Combine(root, "bin", "Win64", "soft_oal.dll"), Path.Combine(depsFolder, "OpenAL32.dll"));
-                        Utilities.FileCopy(Path.Combine(root, "libs", "Win64", "OpenAL32.lib"), Path.Combine(depsFolder, "OpenAL32.lib"));
-
-                        // Deploy license
-                        Utilities.FileCopy(Path.Combine(root, "COPYING"), Path.Combine(dstIncludePath, "COPYING"), true);
-
-                        // Deploy header files
-                        var files = Directory.GetFiles(Path.Combine(root, "include", "AL"));
-                        foreach (var file in files)
-                        {
-                            Utilities.FileCopy(file, Path.Combine(dstIncludePath, Path.GetFileName(file)));
-                        }
-#endif
                         break;
                     }
                     case TargetPlatform.Linux:
@@ -195,14 +165,16 @@ namespace Flax.Deps.Dependencies
                                      + cmakeArgs;
 
                         // Use separate build directory
+#if !USE_GIT_REPOSITORY
                         root = Path.Combine(root, "openal-soft-" + version);
-                        var buildDir = Path.Combine(root, "build");
+#endif
+                        var buildDir = Path.Combine(root, "build-" + architecture.ToString());
                         SetupDirectory(buildDir, true);
 
                         // Build for Linux
-                        Utilities.Run("cmake", $"-G \"Unix Makefiles\" -DCMAKE_BUILD_TYPE={configuration} -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DLIBTYPE=STATIC {config} ..", null, buildDir, Utilities.RunOptions.ConsoleLogOutput, envVars);
+                        RunCmake(root, platform, architecture, $"-B\"{buildDir}\" -DLIBTYPE=STATIC -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_BUILD_TYPE=" + configuration + config, envVars);
                         BuildCmake(buildDir, configuration, envVars);
-                        var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.x64);
+                        var depsFolder = GetThirdPartyFolder(options, platform, architecture);
                         foreach (var file in binariesToCopy)
                             Utilities.FileCopy(Path.Combine(buildDir, file), Path.Combine(depsFolder, file));
                         break;
@@ -220,12 +192,14 @@ namespace Flax.Deps.Dependencies
                         var config = "-DALSOFT_REQUIRE_OBOE=OFF -DALSOFT_REQUIRE_OPENSL=ON -DALSOFT_EMBED_HRTF_DATA=YES " + cmakeArgs;
 
                         // Use separate build directory
+#if !USE_GIT_REPOSITORY
                         root = Path.Combine(root, "openal-soft-" + version);
-                        var buildDir = Path.Combine(root, "build");
+#endif
+                        var buildDir = Path.Combine(root, "build-" + architecture.ToString());
                         SetupDirectory(buildDir, true);
 
                         // Build
-                        RunCmake(buildDir, platform, TargetArchitecture.ARM64, ".. -DLIBTYPE=STATIC -DCMAKE_BUILD_TYPE=" + configuration + config, envVars);
+                        RunCmake(root, platform, TargetArchitecture.ARM64, $"-B\"{buildDir}\" -DLIBTYPE=STATIC -DCMAKE_BUILD_TYPE=" + configuration + config, envVars);
                         BuildCmake(buildDir, envVars);
                         var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
                         foreach (var file in binariesToCopy)
@@ -245,12 +219,14 @@ namespace Flax.Deps.Dependencies
                         var config = " -DALSOFT_REQUIRE_COREAUDIO=ON -DALSOFT_EMBED_HRTF_DATA=YES " + cmakeArgs;
 
                         // Use separate build directory
+#if !USE_GIT_REPOSITORY
                         root = Path.Combine(root, "openal-soft-" + version);
-                        var buildDir = Path.Combine(root, "build");
+#endif
+                        var buildDir = Path.Combine(root, "build-" + architecture.ToString());
+                        SetupDirectory(buildDir, true);
 
                         // Build for Mac
-                        SetupDirectory(buildDir, true);
-                        RunCmake(buildDir, platform, architecture, ".. -DLIBTYPE=STATIC -DCMAKE_BUILD_TYPE=" + configuration + config, envVars);
+                        RunCmake(root, platform, architecture, $"-B\"{buildDir}\" -DLIBTYPE=STATIC -DCMAKE_BUILD_TYPE=" + configuration + config, envVars);
                         BuildCmake(buildDir, envVars);
                         var depsFolder = GetThirdPartyFolder(options, platform, architecture);
                         foreach (var file in binariesToCopy)
@@ -270,12 +246,14 @@ namespace Flax.Deps.Dependencies
                         var config = " -DALSOFT_REQUIRE_COREAUDIO=ON -DALSOFT_EMBED_HRTF_DATA=YES " + cmakeArgs;
 
                         // Use separate build directory
+#if !USE_GIT_REPOSITORY
                         root = Path.Combine(root, "openal-soft-" + version);
-                        var buildDir = Path.Combine(root, "build");
+#endif
+                        var buildDir = Path.Combine(root, "build-" + architecture.ToString());
+                        SetupDirectory(buildDir, true);
 
                         // Build for iOS
-                        SetupDirectory(buildDir, true);
-                        RunCmake(buildDir, platform, TargetArchitecture.ARM64, ".. -DCMAKE_SYSTEM_NAME=iOS -DALSOFT_OSX_FRAMEWORK=ON -DLIBTYPE=STATIC -DCMAKE_BUILD_TYPE=" + configuration + config, envVars);
+                        RunCmake(root, platform, TargetArchitecture.ARM64, $"-B\"{buildDir}\" -DCMAKE_SYSTEM_NAME=iOS -DALSOFT_OSX_FRAMEWORK=ON -DLIBTYPE=STATIC -DCMAKE_BUILD_TYPE=" + configuration + config, envVars);
                         BuildCmake(buildDir, envVars);
                         var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
                         foreach (var file in binariesToCopy)
@@ -285,6 +263,16 @@ namespace Flax.Deps.Dependencies
                     }
                 }
             }
+
+            // Deploy license
+            Utilities.FileCopy(Path.Combine(root, "COPYING"), Path.Combine(dstIncludePath, "COPYING"), true);
+
+            // Deploy header files
+            var files = Directory.GetFiles(Path.Combine(root, "include", "AL"));
+            foreach (var file in files)
+            {
+                Utilities.FileCopy(file, Path.Combine(dstIncludePath, Path.GetFileName(file)));
+            }
         }
     }
 }

From 430e685a7ccafaba6d753371f0d218f5ad404923 Mon Sep 17 00:00:00 2001
From: Ari Vuollet <ari.vuollet@goat.moe>
Date: Sat, 18 Oct 2025 02:34:14 +0300
Subject: [PATCH 14/33] Fix building PhysX on Linux and macOS

---
 Source/Tools/Flax.Build/Deps/Dependencies/PhysX.cs | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/PhysX.cs b/Source/Tools/Flax.Build/Deps/Dependencies/PhysX.cs
index c5016c3e9..9122f6565 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/PhysX.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/PhysX.cs
@@ -95,8 +95,13 @@ namespace Flax.Deps.Dependencies
                 if (cmakeSwitch.HasAttribute("name") && cmakeSwitch.Attributes["name"].Value == name)
                 {
                     cmakeSwitch.Attributes["value"].Value = value;
+                    return;
                 }
             }
+            var child = cmakeSwitches.OwnerDocument.CreateElement(cmakeSwitches.ChildNodes[0].Name);
+            child.SetAttribute("name", name);
+            child.SetAttribute("value", value);
+            cmakeSwitches.AppendChild(child);
         }
 
         private void Build(BuildOptions options, string preset, TargetPlatform targetPlatform, TargetArchitecture architecture)
@@ -129,6 +134,10 @@ namespace Flax.Deps.Dependencies
                     ConfigureCmakeSwitch(cmakeParams, "PX_COPY_EXTERNAL_DLL", "OFF");
                 }
                 break;
+            case TargetPlatform.Linux:
+                ConfigureCmakeSwitch(cmakeParams, "CMAKE_C_FLAGS", "&quot;-Wno-error=format -Wno-error=unused-but-set-variable -Wno-error=switch-default -Wno-error=invalid-offsetof -Wno-error=unsafe-buffer-usage -Wno-error=unsafe-buffer-usage-in-libc-call -Wno-error=missing-include-dirs&quot;");
+                ConfigureCmakeSwitch(cmakeParams, "CMAKE_CXX_FLAGS", "&quot;-Wno-error=format -Wno-error=unused-but-set-variable -Wno-error=switch-default -Wno-error=invalid-offsetof -Wno-error=unsafe-buffer-usage -Wno-error=unsafe-buffer-usage-in-libc-call -Wno-error=missing-include-dirs&quot;");
+                break;
             case TargetPlatform.Android:
                 ConfigureCmakeSwitch(cmakeParams, "CMAKE_INSTALL_PREFIX", $"install/android-{Configuration.AndroidPlatformApi}/PhysX");
                 ConfigureCmakeSwitch(cmakeParams, "ANDROID_NATIVE_API_LEVEL", $"android-{Configuration.AndroidPlatformApi}");
@@ -136,6 +145,8 @@ namespace Flax.Deps.Dependencies
                 break;
             case TargetPlatform.Mac:
                 ConfigureCmakeSwitch(cmakeParams, "CMAKE_OSX_DEPLOYMENT_TARGET", Configuration.MacOSXMinVer);
+                ConfigureCmakeSwitch(cmakeParams, "CMAKE_C_FLAGS", "&quot;-Wno-error=format -Wno-error=unused-but-set-variable -Wno-error=switch-default -Wno-error=invalid-offsetof -Wno-error=unsafe-buffer-usage -Wno-error=unsafe-buffer-usage-in-libc-call -Wno-error=missing-include-dirs&quot;");
+                ConfigureCmakeSwitch(cmakeParams, "CMAKE_CXX_FLAGS", "&quot;-Wno-error=format -Wno-error=unused-but-set-variable -Wno-error=switch-default -Wno-error=invalid-offsetof -Wno-error=unsafe-buffer-usage -Wno-error=unsafe-buffer-usage-in-libc-call -Wno-error=missing-include-dirs&quot;");
                 break;
             case TargetPlatform.iOS:
                 ConfigureCmakeSwitch(cmakeParams, "CMAKE_OSX_DEPLOYMENT_TARGET", Configuration.iOSMinVer);
@@ -156,6 +167,7 @@ namespace Flax.Deps.Dependencies
             bool suppressBitsPostfix = false;
             string binariesPrefix = string.Empty;
             var envVars = new Dictionary<string, string>();
+            envVars.Add("CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel);
             switch (architecture)
             {
             case TargetArchitecture.x86:

From ebd929176cdd01c9c6edb0739ecfc6860e9fd7af Mon Sep 17 00:00:00 2001
From: Ari Vuollet <ari.vuollet@goat.moe>
Date: Sat, 18 Oct 2025 02:34:22 +0300
Subject: [PATCH 15/33] Fix python tool call on macOS for glslang

---
 Source/Tools/Flax.Build/Deps/Dependencies/glslang.cs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/glslang.cs b/Source/Tools/Flax.Build/Deps/Dependencies/glslang.cs
index e5f5e4dfe..32c14a037 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/glslang.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/glslang.cs
@@ -1,5 +1,6 @@
 // Copyright (c) Wojciech Figat. All rights reserved.
 
+using System;
 using System.IO;
 using Flax.Build;
 
@@ -82,7 +83,8 @@ namespace Flax.Deps.Dependencies
 
             // Setup the external sources
             // Requires distutils (pip install setuptools)
-            Utilities.Run("python", "update_glslang_sources.py", null, root, Utilities.RunOptions.ConsoleLogOutput);
+            if (Utilities.Run(BuildPlatform != TargetPlatform.Mac ? "python" : "python3", "update_glslang_sources.py", null, root, Utilities.RunOptions.ConsoleLogOutput) != 0)
+                throw new Exception("Failed to update glslang sources, make sure setuptools python package is installed.");
 
             foreach (var platform in options.Platforms)
             {

From 4b552563beab961b978226f9ad0dc949a97b9d83 Mon Sep 17 00:00:00 2001
From: Ari Vuollet <ari.vuollet@goat.moe>
Date: Sat, 18 Oct 2025 02:34:33 +0300
Subject: [PATCH 16/33] Fix PhysX compilation on Linux and macOS

---
 Source/Tools/Flax.Build/Deps/Dependencies/PhysX.cs | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/PhysX.cs b/Source/Tools/Flax.Build/Deps/Dependencies/PhysX.cs
index 9122f6565..f65767fd2 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/PhysX.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/PhysX.cs
@@ -135,8 +135,7 @@ namespace Flax.Deps.Dependencies
                 }
                 break;
             case TargetPlatform.Linux:
-                ConfigureCmakeSwitch(cmakeParams, "CMAKE_C_FLAGS", "&quot;-Wno-error=format -Wno-error=unused-but-set-variable -Wno-error=switch-default -Wno-error=invalid-offsetof -Wno-error=unsafe-buffer-usage -Wno-error=unsafe-buffer-usage-in-libc-call -Wno-error=missing-include-dirs&quot;");
-                ConfigureCmakeSwitch(cmakeParams, "CMAKE_CXX_FLAGS", "&quot;-Wno-error=format -Wno-error=unused-but-set-variable -Wno-error=switch-default -Wno-error=invalid-offsetof -Wno-error=unsafe-buffer-usage -Wno-error=unsafe-buffer-usage-in-libc-call -Wno-error=missing-include-dirs&quot;");
+                ConfigureCmakeSwitch(cmakeParams, "PHYSX_CXX_FLAGS", "\"-Wno-error=format -Wno-error=unused-but-set-variable -Wno-error=switch-default -Wno-error=invalid-offsetof -Wno-error=unsafe-buffer-usage -Wno-error=unsafe-buffer-usage-in-libc-call -Wno-error=missing-include-dirs\"");
                 break;
             case TargetPlatform.Android:
                 ConfigureCmakeSwitch(cmakeParams, "CMAKE_INSTALL_PREFIX", $"install/android-{Configuration.AndroidPlatformApi}/PhysX");
@@ -145,8 +144,7 @@ namespace Flax.Deps.Dependencies
                 break;
             case TargetPlatform.Mac:
                 ConfigureCmakeSwitch(cmakeParams, "CMAKE_OSX_DEPLOYMENT_TARGET", Configuration.MacOSXMinVer);
-                ConfigureCmakeSwitch(cmakeParams, "CMAKE_C_FLAGS", "&quot;-Wno-error=format -Wno-error=unused-but-set-variable -Wno-error=switch-default -Wno-error=invalid-offsetof -Wno-error=unsafe-buffer-usage -Wno-error=unsafe-buffer-usage-in-libc-call -Wno-error=missing-include-dirs&quot;");
-                ConfigureCmakeSwitch(cmakeParams, "CMAKE_CXX_FLAGS", "&quot;-Wno-error=format -Wno-error=unused-but-set-variable -Wno-error=switch-default -Wno-error=invalid-offsetof -Wno-error=unsafe-buffer-usage -Wno-error=unsafe-buffer-usage-in-libc-call -Wno-error=missing-include-dirs&quot;");
+                ConfigureCmakeSwitch(cmakeParams, "PHYSX_CXX_FLAGS", "\"-Wno-error=format -Wno-error=unused-but-set-variable -Wno-error=switch-default -Wno-error=invalid-offsetof -Wno-error=unsafe-buffer-usage -Wno-error=unsafe-buffer-usage-in-libc-call -Wno-error=missing-include-dirs\"");
                 break;
             case TargetPlatform.iOS:
                 ConfigureCmakeSwitch(cmakeParams, "CMAKE_OSX_DEPLOYMENT_TARGET", Configuration.iOSMinVer);
@@ -498,7 +496,7 @@ namespace Flax.Deps.Dependencies
             var dstIncludePath = Path.Combine(options.ThirdPartyFolder, "PhysX");
             Directory.GetFiles(dstIncludePath, "*.h", SearchOption.AllDirectories).ToList().ForEach(File.Delete);
             Utilities.FileCopy(Path.Combine(root, "LICENSE.md"), Path.Combine(dstIncludePath, "License.txt"));
-            Utilities.DirectoryCopy(Path.Combine(root, "physx", "include"), dstIncludePath);
+            Utilities.DirectoryCopy(Path.Combine(root, "physx", "include"), dstIncludePath, true, true);
         }
     }
 }

From 1d2b3bc858615bcb16acb2b8a2d492f8857d68bb Mon Sep 17 00:00:00 2001
From: Ari Vuollet <ari.vuollet@goat.moe>
Date: Sat, 18 Oct 2025 02:34:41 +0300
Subject: [PATCH 17/33] Fix NvCloth compilation on Linux and macOS

---
 .../Tools/Flax.Build/Deps/Dependencies/NvCloth.cs  | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/NvCloth.cs b/Source/Tools/Flax.Build/Deps/Dependencies/NvCloth.cs
index 25557cb70..aa15aadac 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/NvCloth.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/NvCloth.cs
@@ -1,5 +1,6 @@
 // Copyright (c) Wojciech Figat. All rights reserved.
 
+using System;
 using System.Collections.Generic;
 using System.IO;
 using System.Linq;
@@ -89,6 +90,15 @@ namespace Flax.Deps.Dependencies
             // Get the source
             CloneGitRepoSingleBranch(root, "https://github.com/FlaxEngine/NvCloth.git", "master");
 
+            // Patch the CMakeLists.txt to support custom compilation flags
+            foreach (var os in new[] { "android", "ios", "linux", "mac", "windows", })
+            {
+                var filePath = Path.Combine(nvCloth, "compiler", "cmake", os, "CMakeLists.txt");
+                var appendLine = "SET(CMAKE_CXX_FLAGS \"${CMAKE_CXX_FLAGS} ${NVCLOTH_CXX_FLAGS}\")";
+                if (!File.ReadAllText(filePath).Contains(appendLine))
+                    File.AppendAllText(filePath, Environment.NewLine + appendLine + Environment.NewLine);
+            }
+
             foreach (var platform in options.Platforms)
             {
                 foreach (var architecture in options.Architectures)
@@ -185,7 +195,7 @@ namespace Flax.Deps.Dependencies
                 }
                 break;
             case TargetPlatform.Mac:
-                cmakeArgs += " -DTARGET_BUILD_PLATFORM=mac";
+                cmakeArgs += " -DTARGET_BUILD_PLATFORM=mac -DNVCLOTH_CXX_FLAGS=\"-Wno-error=poison-system-directories -Wno-error=missing-include-dirs\"";
                 cmakeName = "mac";
                 binariesPrefix = "lib";
                 break;
@@ -195,7 +205,7 @@ namespace Flax.Deps.Dependencies
                 binariesPrefix = "lib";
                 break;
             case TargetPlatform.Linux:
-                cmakeArgs += " -DTARGET_BUILD_PLATFORM=linux";
+                cmakeArgs += " -DTARGET_BUILD_PLATFORM=linux -DNVCLOTH_CXX_FLAGS=\"-Wno-error=poison-system-directories -Wno-error=missing-include-dirs\"";
                 cmakeName = "linux";
                 binariesPrefix = "lib";
                 envVars.Add("CC", "clang-" + Configuration.LinuxClangMinVer);

From af54d04f9d567e39ee55bdf289a9ea5fb124da6d Mon Sep 17 00:00:00 2001
From: Ari Vuollet <ari.vuollet@goat.moe>
Date: Sat, 18 Oct 2025 02:34:50 +0300
Subject: [PATCH 18/33] Fix building ogg+vorbis on macOS

---
 .../Flax.Build/Deps/Dependencies/vorbis.cs    | 240 ++++++++----------
 1 file changed, 111 insertions(+), 129 deletions(-)

diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/vorbis.cs b/Source/Tools/Flax.Build/Deps/Dependencies/vorbis.cs
index 402adbe4f..5fc08cf60 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/vorbis.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/vorbis.cs
@@ -83,18 +83,20 @@ namespace Flax.Deps.Dependencies
         {
             public string Filename;
             public string SrcFolder;
+            public string DstFilename;
 
-            public Binary(string filename, string srcFolder)
+            public Binary(string filename, string srcFolder, string dstFilename = null)
             {
                 Filename = filename;
                 SrcFolder = srcFolder;
+                DstFilename = dstFilename;
             }
         }
 
         private bool hasSourcesReady;
         private string root;
         private string rootMsvcLib;
-        private string configurationMsvc;
+        private string _configuration = "Release";
         private List<string> vcxprojContentsWindows;
         private string[] vcxprojPathsWindows;
 
@@ -104,22 +106,6 @@ namespace Flax.Deps.Dependencies
             new Binary("libvorbisfile_static.lib", "libvorbisfile"),
         };
 
-        private (string, string)[] vorbisBinariesToCopyWindowsCmake =
-        {
-            ("vorbis.lib", "libvorbis_static.lib"),
-            ("vorbisfile.lib", "libvorbisfile_static.lib"),
-        };
-
-        private Binary[] oggBinariesToCopyWindows =
-        {
-            new Binary("libogg_static.lib", "ogg"),
-        };
-
-        private (string, string)[] oggBinariesToCopyWindowsCmake =
-        {
-            ("ogg.lib", "libogg_static.lib"),
-        };
-
         private void PatchWindowsTargetPlatformVersion(string windowsTargetPlatformVersion, string platformToolset)
         {
             // Fix the MSVC project settings for Windows
@@ -137,7 +123,6 @@ namespace Flax.Deps.Dependencies
                 return;
 
             hasSourcesReady = true;
-            configurationMsvc = "Release";
 
             string oggRoot = Path.Combine(root, "libogg");
             string vorbisRoot = Path.Combine(root, "libvorbis");
@@ -227,7 +212,7 @@ namespace Flax.Deps.Dependencies
                     break;
                 default: throw new InvalidArchitectureException(architecture);
                 }
-                binariesToCopy.AddRange(vorbisBinariesToCopyWindows.Select(x => new Binary(x.Filename, Path.Combine(buildDir, x.SrcFolder, buildPlatform, configurationMsvc))));
+                binariesToCopy.AddRange(vorbisBinariesToCopyWindows.Select(x => new Binary(x.Filename, Path.Combine(buildDir, x.SrcFolder, buildPlatform, _configuration))));
                 break;
             }
             case TargetPlatform.PS4:
@@ -246,7 +231,7 @@ namespace Flax.Deps.Dependencies
                                         buildDir, true, true);
                 Utilities.FileCopy(Path.Combine(GetBinariesFolder(options, platform), "Data", "ogg", "ogg", "config_types.h"),
                                    Path.Combine(root, "libogg", "include", "ogg", "config_types.h"));
-                binariesToCopy.AddRange(binariesToCopyVorbis.Select(x => new Binary(x.Filename, Path.Combine(buildDir, x.SrcFolder, buildPlatform, configurationMsvc))));
+                binariesToCopy.AddRange(binariesToCopyVorbis.Select(x => new Binary(x.Filename, Path.Combine(buildDir, x.SrcFolder, buildPlatform, _configuration))));
                 break;
             }
             case TargetPlatform.PS5:
@@ -267,7 +252,7 @@ namespace Flax.Deps.Dependencies
                 Utilities.FileCopy(
                                    Path.Combine(GetBinariesFolder(options, platform), "Data", "ogg", "ogg", "config_types.h"),
                                    Path.Combine(root, "libogg", "include", "ogg", "config_types.h"));
-                binariesToCopy.AddRange(binariesToCopyVorbis.Select(x => new Binary(x.Filename, Path.Combine(buildDir, x.SrcFolder, buildPlatform, configurationMsvc))));
+                binariesToCopy.AddRange(binariesToCopyVorbis.Select(x => new Binary(x.Filename, Path.Combine(buildDir, x.SrcFolder, buildPlatform, _configuration))));
                 break;
             }
             case TargetPlatform.XboxOne:
@@ -275,21 +260,21 @@ namespace Flax.Deps.Dependencies
                 vcxprojPaths = vcxprojPathsWindows;
                 buildPlatform = "x64";
                 PatchWindowsTargetPlatformVersion("10.0", "v143");
-                binariesToCopy.AddRange(vorbisBinariesToCopyWindows.Select(x => new Binary(x.Filename, Path.Combine(buildDir, x.SrcFolder, buildPlatform, configurationMsvc))));
+                binariesToCopy.AddRange(vorbisBinariesToCopyWindows.Select(x => new Binary(x.Filename, Path.Combine(buildDir, x.SrcFolder, buildPlatform, _configuration))));
                 break;
             case TargetPlatform.XboxScarlett:
                 buildDir = Path.Combine(rootMsvcLib, "win32", "VS2010");
                 vcxprojPaths = vcxprojPathsWindows;
                 buildPlatform = "x64";
                 PatchWindowsTargetPlatformVersion("10.0", "v143");
-                binariesToCopy.AddRange(vorbisBinariesToCopyWindows.Select(x => new Binary(x.Filename, Path.Combine(buildDir, x.SrcFolder, buildPlatform, configurationMsvc))));
+                binariesToCopy.AddRange(vorbisBinariesToCopyWindows.Select(x => new Binary(x.Filename, Path.Combine(buildDir, x.SrcFolder, buildPlatform, _configuration))));
                 break;
             default: throw new InvalidPlatformException(platform);
             }
 
             // Build
             foreach (var vcxprojPath in vcxprojPaths)
-                Deploy.VCEnvironment.BuildSolution(vcxprojPath, configurationMsvc, buildPlatform);
+                Deploy.VCEnvironment.BuildSolution(vcxprojPath, _configuration, buildPlatform);
 
             // Copy binaries
             var depsFolder = GetThirdPartyFolder(options, platform, architecture);
@@ -303,48 +288,107 @@ namespace Flax.Deps.Dependencies
 
             string oggRoot = Path.Combine(root, "libogg");
             string vorbisRoot = Path.Combine(root, "libvorbis");
-
             var oggBuildDir = Path.Combine(oggRoot, "build-" + architecture.ToString());
             var vorbisBuildDir = Path.Combine(vorbisRoot, "build-" + architecture.ToString());
+            var installDir = Path.Combine(root, "install");
 
             string ext;
+            string oggConfig = $"-DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DCMAKE_BUILD_TYPE={_configuration} -DCMAKE_INSTALL_PREFIX=\"{installDir}\"";
+            string vorbisConfig = $"-DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DCMAKE_BUILD_TYPE={_configuration} -DCMAKE_INSTALL_PREFIX=\"{installDir}\"";
+            Dictionary<string, string> envVars = new Dictionary<string, string>();
+            (string, string)[] oggBinariesToCopy;
+            Binary[] vorbisBinariesToCopy;
             switch (platform)
             {
             case TargetPlatform.Windows:
             case TargetPlatform.UWP:
             case TargetPlatform.XboxOne:
+                oggConfig += " -DBUILD_SHARED_LIBS=OFF";
+                vorbisConfig += " -DBUILD_SHARED_LIBS=OFF";
                 ext = ".lib";
                 break;
             case TargetPlatform.Linux:
+                oggConfig += " -DCMAKE_POSITION_INDEPENDENT_CODE=ON";
+                vorbisConfig += " -DCMAKE_POSITION_INDEPENDENT_CODE=ON";
+                envVars = new Dictionary<string, string>
+                {
+                    { "CC", "clang-" + Configuration.LinuxClangMinVer },
+                    { "CC_FOR_BUILD", "clang-" + Configuration.LinuxClangMinVer },
+                    { "CXX", "clang++-" + Configuration.LinuxClangMinVer },
+                    { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
+                };
+                ext = ".a";
+                break;
+            case TargetPlatform.Mac:
+                //oggConfig += $" -DOGG_INCLUDE_DIR=\"{oggRoot}/install/include\" -DOGG_LIBRARY=\"{oggRoot}/install/lib\"";
                 ext = ".a";
                 break;
             default: throw new InvalidPlatformException(platform);
             }
 
-            var binariesToCopy = new List<(string, string)>();
-
-            // Build ogg
+            switch (platform)
             {
-                var solutionPath = Path.Combine(oggBuildDir, "ogg.sln");
-
-                RunCmake(oggRoot, platform, architecture, $"-B\"{oggBuildDir}\" -DBUILD_SHARED_LIBS=OFF -DCMAKE_POLICY_VERSION_MINIMUM=3.5");
-                Deploy.VCEnvironment.BuildSolution(solutionPath, configurationMsvc, architecture.ToString());
-                foreach (var file in oggBinariesToCopyWindowsCmake)
-                    binariesToCopy.Add((Path.Combine(oggBuildDir, configurationMsvc, file.Item1), file.Item2));
+            case TargetPlatform.Windows:
+            case TargetPlatform.UWP:
+            case TargetPlatform.XboxOne:
+                oggBinariesToCopy =
+                [
+                    ("ogg.lib", "libogg_static.lib")
+                ];
+                vorbisBinariesToCopy =
+                [
+                    new Binary("vorbis.lib", "libvorbis", "libvorbis_static.lib"),
+                    new Binary("vorbisfile.lib", "libvorbisfile", "libvorbisfile_static.lib")
+                ];
+                break;
+            case TargetPlatform.Linux:
+            case TargetPlatform.Mac:
+                oggBinariesToCopy =
+                [
+                    ("libogg.a", "libogg.a")
+                ];
+                vorbisBinariesToCopy =
+                [
+                    new Binary("libvorbis.a", "lib"),
+                    new Binary("libvorbisenc.a", "lib"),
+                    new Binary("libvorbisfile.a", "lib")
+                ];
+                break;
+            default: throw new InvalidPlatformException(platform);
             }
 
+            vorbisConfig += $" -DOGG_INCLUDE_DIR=\"{Path.Combine(installDir, "include")}\" -DOGG_LIBRARY=\"{Path.Combine(installDir, "lib", "libogg" + ext)}\"";
+
+            var binariesToCopy = new List<(string, string)>();
+
+            SetupDirectory(installDir, true);
+            // Build ogg
+            {
+                SetupDirectory(oggBuildDir, true);
+                RunCmake(oggRoot, platform, architecture, $"-B\"{oggBuildDir}\" " + oggConfig, envVars);
+                if (platform == TargetPlatform.Windows)
+                    Deploy.VCEnvironment.BuildSolution(Path.Combine(oggBuildDir, "ogg.sln"), _configuration, architecture.ToString());
+                else if (platform == TargetPlatform.Mac || platform == TargetPlatform.Linux)
+                    BuildCmake(oggBuildDir);
+                Utilities.Run("cmake", "--build . --target install", null, oggBuildDir, Utilities.RunOptions.DefaultTool);
+            }
             // Build vorbis
             {
-                var oggLibraryPath = Path.Combine(oggBuildDir, configurationMsvc, "ogg" + ext);
-                var solutionPath = Path.Combine(vorbisBuildDir, "vorbis.sln");
-
-                RunCmake(vorbisRoot, platform, architecture, $"-B\"{vorbisBuildDir}\" -DOGG_INCLUDE_DIR=\"{Path.Combine(oggRoot, "include")}\" -DOGG_LIBRARY=\"{oggLibraryPath}\" -DBUILD_SHARED_LIBS=OFF -DCMAKE_POLICY_VERSION_MINIMUM=3.5");
-                Deploy.VCEnvironment.BuildSolution(solutionPath, configurationMsvc, architecture.ToString());
-                foreach (var file in vorbisBinariesToCopyWindowsCmake)
-                    binariesToCopy.Add((Path.Combine(vorbisBuildDir, "lib", configurationMsvc, file.Item1), file.Item2));
+                SetupDirectory(vorbisBuildDir, true);
+                RunCmake(vorbisRoot, platform, architecture, $"-B\"{vorbisBuildDir}\" " + vorbisConfig);
+                if (platform == TargetPlatform.Windows)
+                    Deploy.VCEnvironment.BuildSolution(Path.Combine(vorbisBuildDir, "vorbis.sln"), _configuration, architecture.ToString());
+                else if (platform == TargetPlatform.Mac || platform == TargetPlatform.Linux)
+                    BuildCmake(vorbisBuildDir);
+                Utilities.Run("cmake", "--build . --target install", null, vorbisBuildDir, Utilities.RunOptions.DefaultTool);
             }
 
             // Copy binaries
+            foreach (var file in oggBinariesToCopy)
+                binariesToCopy.Add((Path.Combine(installDir, "lib", file.Item1), file.Item2));
+            foreach (var file in vorbisBinariesToCopy)
+                binariesToCopy.Add((Path.Combine(installDir, "lib", file.Filename), file.DstFilename ?? file.Filename));
+
             var depsFolder = GetThirdPartyFolder(options, platform, architecture);
             foreach (var file in binariesToCopy)
                 Utilities.FileCopy(file.Item1, Path.Combine(depsFolder, file.Item2));
@@ -389,31 +433,7 @@ namespace Flax.Deps.Dependencies
                     }
                     case TargetPlatform.Linux:
                     {
-                        // Note: assumes the libogg-dev package is pre-installed on the system
-
-                        // Get the source
-                        CloneGitRepoFast(root, "https://github.com/xiph/vorbis.git");
-
-                        var envVars = new Dictionary<string, string>
-                        {
-                            { "CC", "clang-" + Configuration.LinuxClangMinVer },
-                            { "CC_FOR_BUILD", "clang-" + Configuration.LinuxClangMinVer },
-                            { "CXX", "clang++-" + Configuration.LinuxClangMinVer },
-                            { "CMAKE_BUILD_PARALLEL_LEVEL", CmakeBuildParallel },
-                        };
-                        var buildDir = Path.Combine(root, "build");
-
-                        Utilities.Run(Path.Combine(root, "autogen.sh"), null, null, root, Utilities.RunOptions.DefaultTool, envVars);
-
-                        // Build for Linux
-                        var toolchain = UnixToolchain.GetToolchainName(platform, architecture);
-                        Utilities.Run(Path.Combine(root, "configure"), string.Format("--host={0}", toolchain), null, root, Utilities.RunOptions.ThrowExceptionOnError, envVars);
-                        SetupDirectory(buildDir, true);
-                        Utilities.Run("cmake", "-G \"Unix Makefiles\" -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_BUILD_TYPE=Release ..", null, buildDir, Utilities.RunOptions.ConsoleLogOutput, envVars);
-                        Utilities.Run("cmake", "--build .", null, buildDir, Utilities.RunOptions.ConsoleLogOutput, envVars);
-                        var depsFolder = GetThirdPartyFolder(options, platform, architecture);
-                        foreach (var file in binariesToCopyUnix)
-                            Utilities.FileCopy(Path.Combine(buildDir, file.SrcFolder, file.Filename), Path.Combine(depsFolder, file.Filename));
+                        BuildCmake(options, TargetPlatform.Linux, architecture);
                         break;
                     }
                     case TargetPlatform.PS4:
@@ -460,51 +480,33 @@ namespace Flax.Deps.Dependencies
                         var oggBuildDir = Path.Combine(oggRoot, "build");
                         var buildDir = Path.Combine(root, "build");
 
-                    // Get the source
-                    SetupDirectory(oggRoot, false);
-                    CloneGitRepo(root, "https://github.com/xiph/vorbis.git");
-                    GitCheckout(root, "master", "98eddc72d36e3421519d54b101c09b57e4d4d10d");
-                    CloneGitRepo(oggRoot, "https://github.com/xiph/ogg.git");
-                    GitCheckout(oggRoot, "master", "4380566a44b8d5e85ad511c9c17eb04197863ec5");
-                    Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data/ogg"), oggRoot, true, true);
-                    Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data/vorbis"), buildDir, true, true);
-
-                    // Build for Switch
-                    SetupDirectory(oggBuildDir, true);
-                    RunCmake(oggBuildDir, platform, TargetArchitecture.ARM64, ".. -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=\"../install\"");
-                    Utilities.Run("cmake", "--build . --target install", null, oggBuildDir, Utilities.RunOptions.ConsoleLogOutput);
-                    Utilities.FileCopy(Path.Combine(GetBinariesFolder(options, platform), "Data/ogg", "include", "ogg", "config_types.h"), Path.Combine(oggRoot, "install", "include", "ogg", "config_types.h"));
-                    SetupDirectory(buildDir, true);
-                    RunCmake(buildDir, platform, TargetArchitecture.ARM64, string.Format(".. -DCMAKE_BUILD_TYPE=Release -DOGG_INCLUDE_DIR=\"{0}/install/include\" -DOGG_LIBRARY=\"{0}/install/lib\"", oggRoot));
-                    BuildCmake(buildDir);
-                    var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
-                    foreach (var file in binariesToCopyUnix)
-                        Utilities.FileCopy(Path.Combine(buildDir, file.SrcFolder, file.Filename), Path.Combine(depsFolder, file.Filename));
-                    break;
-                }
-                case TargetPlatform.Mac:
-                {
-                    var oggRoot = Path.Combine(root, "ogg");
-                    var oggBuildDir = Path.Combine(oggRoot, "build");
-                    var buildDir = Path.Combine(root, "build");
-
                         // Get the source
-                        CloneGitRepoFast(root, "https://github.com/xiph/vorbis.git");
+                        SetupDirectory(oggRoot, false);
+                        CloneGitRepo(root, "https://github.com/xiph/vorbis.git");
+                        GitCheckout(root, "master", "98eddc72d36e3421519d54b101c09b57e4d4d10d");
                         CloneGitRepo(oggRoot, "https://github.com/xiph/ogg.git");
                         GitCheckout(oggRoot, "master", "4380566a44b8d5e85ad511c9c17eb04197863ec5");
+                        Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data/ogg"), oggRoot, true, true);
+                        Utilities.DirectoryCopy(Path.Combine(GetBinariesFolder(options, platform), "Data/vorbis"), buildDir, true, true);
 
-                        // Build for Mac
+                        // Build for Switch
                         SetupDirectory(oggBuildDir, true);
-                        RunCmake(oggBuildDir, platform, architecture, ".. -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=\"../install\"");
+                        RunCmake(oggBuildDir, platform, TargetArchitecture.ARM64, ".. -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=\"../install\"");
                         Utilities.Run("cmake", "--build . --target install", null, oggBuildDir, Utilities.RunOptions.ConsoleLogOutput);
+                        Utilities.FileCopy(Path.Combine(GetBinariesFolder(options, platform), "Data/ogg", "include", "ogg", "config_types.h"), Path.Combine(oggRoot, "install", "include", "ogg", "config_types.h"));
                         SetupDirectory(buildDir, true);
-                        RunCmake(buildDir, platform, architecture, string.Format(".. -DCMAKE_BUILD_TYPE=Release  -DOGG_INCLUDE_DIR=\"{0}/install/include\" -DOGG_LIBRARY=\"{0}/install/lib\"", oggRoot));
+                        RunCmake(buildDir, platform, TargetArchitecture.ARM64, string.Format(".. -DCMAKE_BUILD_TYPE=Release -DOGG_INCLUDE_DIR=\"{0}/install/include\" -DOGG_LIBRARY=\"{0}/install/lib\"", oggRoot));
                         BuildCmake(buildDir);
-                        var depsFolder = GetThirdPartyFolder(options, platform, architecture);
+                        var depsFolder = GetThirdPartyFolder(options, platform, TargetArchitecture.ARM64);
                         foreach (var file in binariesToCopyUnix)
                             Utilities.FileCopy(Path.Combine(buildDir, file.SrcFolder, file.Filename), Path.Combine(depsFolder, file.Filename));
                         break;
                     }
+                    case TargetPlatform.Mac:
+                    {
+                        BuildCmake(options, TargetPlatform.Mac, architecture);
+                        break;
+                    }
                     case TargetPlatform.iOS:
                     {
                         var oggRoot = Path.Combine(root, "ogg");
@@ -532,37 +534,17 @@ namespace Flax.Deps.Dependencies
                 }
             }
 
-            // Backup files
-            if (hasSourcesReady)
-                root = rootMsvcLib;
-            var srcIncludePath = Path.Combine(root, "include", "vorbis");
-            var dstIncludePath = Path.Combine(options.ThirdPartyFolder, "vorbis");
-            foreach (var filename in filesToKeep)
-            {
-                var src = Path.Combine(dstIncludePath, filename);
-                var dst = Path.Combine(options.IntermediateFolder, filename + ".tmp");
-                Utilities.FileCopy(src, dst);
-            }
+            // Setup headers directory
+            var installDir = Path.Combine(root, "install");
+            var oggOut = Path.Combine(options.ThirdPartyFolder, "ogg");
+            var vorbisOut = Path.Combine(options.ThirdPartyFolder, "vorbis");
 
-            try
-            {
-                // Setup headers directory
-                SetupDirectory(dstIncludePath, true);
+            // Deploy header files
+            Utilities.DirectoryCopy(Path.Combine(installDir, "include", "ogg"), oggOut, true, true);
+            Utilities.DirectoryCopy(Path.Combine(installDir, "include", "vorbis"), vorbisOut, true, true);
 
-                // Deploy header files and restore files
-                Directory.GetFiles(srcIncludePath, "Makefile*").ToList().ForEach(File.Delete);
-                Utilities.DirectoryCopy(srcIncludePath, dstIncludePath, true, true);
-                Utilities.FileCopy(Path.Combine(root, "COPYING"), Path.Combine(dstIncludePath, "COPYING"));
-            }
-            finally
-            {
-                foreach (var filename in filesToKeep)
-                {
-                    var src = Path.Combine(options.IntermediateFolder, filename + ".tmp");
-                    var dst = Path.Combine(dstIncludePath, filename);
-                    Utilities.FileCopy(src, dst);
-                }
-            }
+            Utilities.FileCopy(Path.Combine(root, "libogg", "COPYING"), Path.Combine(oggOut, "COPYING"));
+            Utilities.FileCopy(Path.Combine(root, "libvorbis", "COPYING"), Path.Combine(vorbisOut, "COPYING"));
         }
     }
 }

From 47cdd0582c51ecee2680dca19bdd11333e512a9d Mon Sep 17 00:00:00 2001
From: Ari Vuollet <ari.vuollet@goat.moe>
Date: Sat, 18 Oct 2025 03:24:51 +0300
Subject: [PATCH 19/33] Check VS2026 toolset before trying to compile PhysX

---
 .../Tools/Flax.Build/Deps/Dependencies/PhysX.cs | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/PhysX.cs b/Source/Tools/Flax.Build/Deps/Dependencies/PhysX.cs
index f65767fd2..46ad23381 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/PhysX.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/PhysX.cs
@@ -425,15 +425,20 @@ namespace Flax.Deps.Dependencies
                     {
                         if (architecture == TargetArchitecture.x64 || architecture == TargetArchitecture.ARM64)
                         {
-                            try
+                            if (WindowsPlatform.GetToolsets().Any(x => x.Key == WindowsPlatformToolset.v145))
                             {
-                                Build(options, architecture == TargetArchitecture.x64 ? "vc18win64" : "vc18win-arm64", platform, architecture);
+                                try
+                                {
+                                    Build(options, architecture == TargetArchitecture.x64 ? "vc18win64" : "vc18win-arm64", platform, architecture);
+                                }
+                                catch (Exception e)
+                                {
+                                    Log.Warning($"Failed to generate VS2026 solution for PhysX, fallback to VS2022: {e.Message}");
+                                    Build(options, architecture == TargetArchitecture.x64 ? "vc17win64" : "vc17win-arm64", platform, architecture);
+                                }
                             }
-                            catch
-                            {
-                                Log.Verbose("Failed to generate VS2026 solution for PhysX, fallback to VS2022");
+                            else
                                 Build(options, architecture == TargetArchitecture.x64 ? "vc17win64" : "vc17win-arm64", platform, architecture);
-                            }
                         }
                         else
                             throw new InvalidArchitectureException(architecture);

From afd59d7eb344b7bdae0e5ae18063ab38806fc983 Mon Sep 17 00:00:00 2001
From: Ari Vuollet <ari.vuollet@goat.moe>
Date: Sat, 18 Oct 2025 03:31:07 +0300
Subject: [PATCH 20/33] Fix building vorbis on Windows

---
 Source/Tools/Flax.Build/Deps/Dependencies/vorbis.cs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/vorbis.cs b/Source/Tools/Flax.Build/Deps/Dependencies/vorbis.cs
index 5fc08cf60..6f6dde474 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/vorbis.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/vorbis.cs
@@ -295,6 +295,7 @@ namespace Flax.Deps.Dependencies
             string ext;
             string oggConfig = $"-DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DCMAKE_BUILD_TYPE={_configuration} -DCMAKE_INSTALL_PREFIX=\"{installDir}\"";
             string vorbisConfig = $"-DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DCMAKE_BUILD_TYPE={_configuration} -DCMAKE_INSTALL_PREFIX=\"{installDir}\"";
+            string liboggFilename = "libogg";
             Dictionary<string, string> envVars = new Dictionary<string, string>();
             (string, string)[] oggBinariesToCopy;
             Binary[] vorbisBinariesToCopy;
@@ -306,6 +307,7 @@ namespace Flax.Deps.Dependencies
                 oggConfig += " -DBUILD_SHARED_LIBS=OFF";
                 vorbisConfig += " -DBUILD_SHARED_LIBS=OFF";
                 ext = ".lib";
+                liboggFilename = "ogg";
                 break;
             case TargetPlatform.Linux:
                 oggConfig += " -DCMAKE_POSITION_INDEPENDENT_CODE=ON";
@@ -357,7 +359,7 @@ namespace Flax.Deps.Dependencies
             default: throw new InvalidPlatformException(platform);
             }
 
-            vorbisConfig += $" -DOGG_INCLUDE_DIR=\"{Path.Combine(installDir, "include")}\" -DOGG_LIBRARY=\"{Path.Combine(installDir, "lib", "libogg" + ext)}\"";
+            vorbisConfig += $" -DOGG_INCLUDE_DIR=\"{Path.Combine(installDir, "include")}\" -DOGG_LIBRARY=\"{Path.Combine(installDir, "lib", liboggFilename + ext)}\"";
 
             var binariesToCopy = new List<(string, string)>();
 

From b08c765400faa29a8d7399e007b0706484091cca Mon Sep 17 00:00:00 2001
From: Ari Vuollet <ari.vuollet@goat.moe>
Date: Sat, 18 Oct 2025 04:18:01 +0300
Subject: [PATCH 21/33] Add dependency build script for WinPixEventRuntime

---
 .../Deps/Dependencies/WinPixEventRuntime.cs   | 91 +++++++++++++++++++
 1 file changed, 91 insertions(+)
 create mode 100644 Source/Tools/Flax.Build/Deps/Dependencies/WinPixEventRuntime.cs

diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/WinPixEventRuntime.cs b/Source/Tools/Flax.Build/Deps/Dependencies/WinPixEventRuntime.cs
new file mode 100644
index 000000000..84a6f4f8b
--- /dev/null
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/WinPixEventRuntime.cs
@@ -0,0 +1,91 @@
+// Copyright (c) Wojciech Figat. All rights reserved.
+
+using System;
+using System.IO;
+using System.IO.Compression;
+using System.Linq;
+using Flax.Build;
+using Flax.Build.Platforms;
+
+namespace Flax.Deps.Dependencies
+{
+    /// <summary>
+    /// WinPixEventRuntime: downloads the prebuilt NuGet package and deploys its binaries. https://github.com/microsoft/PixEvents
+    /// </summary>
+    /// <seealso cref="Flax.Deps.Dependency" />
+    class WinPixEventRuntime : Dependency
+    {
+        /// <inheritdoc />
+        public override TargetPlatform[] Platforms
+        {
+            get
+            {
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows:
+                    return new[]
+                    {
+                        TargetPlatform.Windows,
+                    };
+                default: return new TargetPlatform[0]; // nothing to build unless BuildPlatform is Windows
+                }
+            }
+        }
+
+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                default: return new TargetArchitecture[0]; // nothing to build unless BuildPlatform is Windows
+                }
+            }
+        }
+
+        /// <inheritdoc />
+        public override void Build(BuildOptions options)
+        {
+            // Get the source: the NuGet package is a zip archive, downloaded only when package.zip is not already present
+            var root = options.IntermediateFolder;
+            var packagePath = Path.Combine(root, $"package.zip");
+            if (!File.Exists(packagePath))
+            {
+                Downloader.DownloadFileFromUrlToPath("https://www.nuget.org/api/v2/package/WinPixEventRuntime/1.0.240308001", packagePath);
+            }
+            var extractedPath = Path.Combine(root, "extracted");
+            if (!Directory.Exists(extractedPath)) // an existing folder is reused as-is (its contents are not re-validated)
+            {
+                using (ZipArchive archive = ZipFile.Open(packagePath, ZipArchiveMode.Read))
+                    archive.ExtractToDirectory(extractedPath);
+            }
+            root = extractedPath; // paths below are resolved inside the unpacked package
+
+            foreach (var platform in options.Platforms)
+            {
+                foreach (var architecture in options.Architectures)
+                {
+                    BuildStarted(platform, architecture);
+                    switch (platform)
+                    {
+                    case TargetPlatform.Windows:
+                    {
+                        var bin = Path.Combine(root, "bin", architecture.ToString()); // bin/<architecture> inside the package
+                        var depsFolder = GetThirdPartyFolder(options, platform, architecture);
+                        Utilities.FileCopy(Path.Combine(bin, "WinPixEventRuntime.dll"), Path.Combine(depsFolder, "WinPixEventRuntime.dll"));
+                        Utilities.FileCopy(Path.Combine(bin, "WinPixEventRuntime.lib"), Path.Combine(depsFolder, "WinPixEventRuntime.lib"));
+                        break;
+                    }
+                    } // no default case: other platforms deploy nothing
+                }
+            }
+        }
+    }
+}

From 9becddd84f43762bef6b360d63640057344fe7f2 Mon Sep 17 00:00:00 2001
From: Ari Vuollet <ari.vuollet@goat.moe>
Date: Sun, 19 Oct 2025 13:47:58 +0300
Subject: [PATCH 22/33] Add dependency build script for Visual Studio EnvDTE

---
 .../Flax.Build/Deps/Dependencies/EnvDTE.cs    | 95 +++++++++++++++++++
 1 file changed, 95 insertions(+)
 create mode 100644 Source/Tools/Flax.Build/Deps/Dependencies/EnvDTE.cs

diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/EnvDTE.cs b/Source/Tools/Flax.Build/Deps/Dependencies/EnvDTE.cs
new file mode 100644
index 000000000..32d783e81
--- /dev/null
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/EnvDTE.cs
@@ -0,0 +1,95 @@
+// Copyright (c) Wojciech Figat. All rights reserved.
+
+using System;
+using System.IO;
+using System.IO.Compression;
+using System.Linq;
+using Flax.Build;
+using Flax.Build.Platforms;
+
+namespace Flax.Deps.Dependencies
+{
+    /// <summary>
+    /// Visual Studio EnvDTE COM library. https://learn.microsoft.com/en-us/dotnet/api/envdte?view=visualstudiosdk-2022 Deploys the Microsoft.VisualStudio.Setup.Configuration.Native library and header taken from its NuGet package.
+    /// </summary>
+    /// <seealso cref="Flax.Deps.Dependency" />
+    class EnvDTE : Dependency
+    {
+        /// <inheritdoc />
+        public override TargetPlatform[] Platforms
+        {
+            get
+            {
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows:
+                    return new[]
+                    {
+                        TargetPlatform.Windows,
+                    };
+                default: return new TargetPlatform[0]; // nothing to build unless BuildPlatform is Windows
+                }
+            }
+        }
+
+        /// <inheritdoc />
+        public override TargetArchitecture[] Architectures
+        {
+            get
+            {
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                default: return new TargetArchitecture[0]; // nothing to build unless BuildPlatform is Windows
+                }
+            }
+        }
+
+        /// <inheritdoc />
+        public override void Build(BuildOptions options)
+        {
+            // Cache the package in a folder named after the NuGet package; string.Replace returns a new string, so its result must be used
+            var root = options.IntermediateFolder.Replace("/" + GetType().Name, "/Microsoft.VisualStudio.Setup.Configuration.Native");
+            Directory.CreateDirectory(root); // the renamed folder may not exist yet; does nothing when it already exists
+
+            var packagePath = Path.Combine(root, "package.zip"); // downloaded only when not already present
+            if (!File.Exists(packagePath))
+            {
+                Downloader.DownloadFileFromUrlToPath("https://www.nuget.org/api/v2/package/Microsoft.VisualStudio.Setup.Configuration.Native/3.14.2075", packagePath);
+            }
+            var extractedPath = Path.Combine(root, "extracted");
+            if (!Directory.Exists(extractedPath)) // an existing folder is reused as-is (its contents are not re-validated)
+            {
+                using (ZipArchive archive = ZipFile.Open(packagePath, ZipArchiveMode.Read))
+                    archive.ExtractToDirectory(extractedPath);
+            }
+            root = extractedPath; // paths below are resolved inside the unpacked package
+
+            foreach (var platform in options.Platforms)
+            {
+                foreach (var architecture in options.Architectures)
+                {
+                    BuildStarted(platform, architecture);
+                    switch (platform)
+                    {
+                    case TargetPlatform.Windows:
+                    {
+                        var bin = Path.Combine(root, "lib", "native", "v141", architecture.ToString().ToLowerInvariant()); // culture-independent lower-casing of the folder name
+                        var depsFolder = GetThirdPartyFolder(options, platform, architecture);
+                        Utilities.FileCopy(Path.Combine(bin, "Microsoft.VisualStudio.Setup.Configuration.Native.lib"), Path.Combine(depsFolder, "Microsoft.VisualStudio.Setup.Configuration.Native.lib"));
+
+                        var include = Path.Combine(root, "lib", "native", "include"); // header location does not depend on the architecture
+                        Utilities.FileCopy(Path.Combine(include, "Setup.Configuration.h"), Path.Combine(options.ThirdPartyFolder, "Microsoft.VisualStudio.Setup.Configuration.Native", "Setup.Configuration.h"));
+                        break;
+                    }
+                    } // no default case: other platforms deploy nothing
+                }
+            }
+        }
+    }
+}

From ea20dc6da040c6b487c43f84cf0dfe3563857def Mon Sep 17 00:00:00 2001
From: Ari Vuollet <ari.vuollet@goat.moe>
Date: Sun, 19 Oct 2025 17:24:10 +0300
Subject: [PATCH 23/33] Fix wrong build configuration used in ogg and vorbis
 for Windows

---
 .../Tools/Flax.Build/Deps/Dependencies/vorbis.cs   | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/vorbis.cs b/Source/Tools/Flax.Build/Deps/Dependencies/vorbis.cs
index 6f6dde474..c19fab782 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/vorbis.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/vorbis.cs
@@ -370,9 +370,9 @@ namespace Flax.Deps.Dependencies
                 RunCmake(oggRoot, platform, architecture, $"-B\"{oggBuildDir}\" " + oggConfig, envVars);
                 if (platform == TargetPlatform.Windows)
                     Deploy.VCEnvironment.BuildSolution(Path.Combine(oggBuildDir, "ogg.sln"), _configuration, architecture.ToString());
-                else if (platform == TargetPlatform.Mac || platform == TargetPlatform.Linux)
+                else
                     BuildCmake(oggBuildDir);
-                Utilities.Run("cmake", "--build . --target install", null, oggBuildDir, Utilities.RunOptions.DefaultTool);
+                Utilities.Run("cmake", $"--build . --config {_configuration} --target install", null, oggBuildDir, Utilities.RunOptions.DefaultTool);
             }
             // Build vorbis
             {
@@ -380,9 +380,9 @@ namespace Flax.Deps.Dependencies
                 RunCmake(vorbisRoot, platform, architecture, $"-B\"{vorbisBuildDir}\" " + vorbisConfig);
                 if (platform == TargetPlatform.Windows)
                     Deploy.VCEnvironment.BuildSolution(Path.Combine(vorbisBuildDir, "vorbis.sln"), _configuration, architecture.ToString());
-                else if (platform == TargetPlatform.Mac || platform == TargetPlatform.Linux)
+                else
                     BuildCmake(vorbisBuildDir);
-                Utilities.Run("cmake", "--build . --target install", null, vorbisBuildDir, Utilities.RunOptions.DefaultTool);
+                Utilities.Run("cmake", $"--build . --config {_configuration} --target install", null, vorbisBuildDir, Utilities.RunOptions.DefaultTool);
             }
 
             // Copy binaries
@@ -467,7 +467,7 @@ namespace Flax.Deps.Dependencies
                         // Build for Android
                         SetupDirectory(oggBuildDir, true);
                         RunCmake(oggBuildDir, platform, TargetArchitecture.ARM64, ".. -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=\"../install\"");
-                        Utilities.Run("cmake", "--build . --target install", null, oggBuildDir, Utilities.RunOptions.ConsoleLogOutput);
+                        Utilities.Run("cmake", "--build . --config Release --target install", null, oggBuildDir, Utilities.RunOptions.ConsoleLogOutput);
                         SetupDirectory(buildDir, true);
                         RunCmake(buildDir, platform, TargetArchitecture.ARM64, string.Format(".. -DCMAKE_BUILD_TYPE=Release  -DOGG_INCLUDE_DIR=\"{0}/install/include\" -DOGG_LIBRARY=\"{0}/install/lib\"", oggRoot));
                         BuildCmake(buildDir);
@@ -494,7 +494,7 @@ namespace Flax.Deps.Dependencies
                         // Build for Switch
                         SetupDirectory(oggBuildDir, true);
                         RunCmake(oggBuildDir, platform, TargetArchitecture.ARM64, ".. -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=\"../install\"");
-                        Utilities.Run("cmake", "--build . --target install", null, oggBuildDir, Utilities.RunOptions.ConsoleLogOutput);
+                        Utilities.Run("cmake", "--build . --config Release --target install", null, oggBuildDir, Utilities.RunOptions.ConsoleLogOutput);
                         Utilities.FileCopy(Path.Combine(GetBinariesFolder(options, platform), "Data/ogg", "include", "ogg", "config_types.h"), Path.Combine(oggRoot, "install", "include", "ogg", "config_types.h"));
                         SetupDirectory(buildDir, true);
                         RunCmake(buildDir, platform, TargetArchitecture.ARM64, string.Format(".. -DCMAKE_BUILD_TYPE=Release -DOGG_INCLUDE_DIR=\"{0}/install/include\" -DOGG_LIBRARY=\"{0}/install/lib\"", oggRoot));
@@ -523,7 +523,7 @@ namespace Flax.Deps.Dependencies
                         // Build for Mac
                         SetupDirectory(oggBuildDir, true);
                         RunCmake(oggBuildDir, platform, TargetArchitecture.ARM64, ".. -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=\"../install\"");
-                        Utilities.Run("cmake", "--build . --target install", null, oggBuildDir, Utilities.RunOptions.ConsoleLogOutput);
+                        Utilities.Run("cmake", "--build . --config Release --target install", null, oggBuildDir, Utilities.RunOptions.ConsoleLogOutput);
                         SetupDirectory(buildDir, true);
                         RunCmake(buildDir, platform, TargetArchitecture.ARM64, string.Format(".. -DCMAKE_BUILD_TYPE=Release  -DOGG_INCLUDE_DIR=\"{0}/install/include\" -DOGG_LIBRARY=\"{0}/install/lib\"", oggRoot));
                         BuildCmake(buildDir);

From 38d8832468a0bebd9a3362ff8664255b7ee5e0a1 Mon Sep 17 00:00:00 2001
From: Chandler Cox <mr.chandlercox@gmail.com>
Date: Fri, 19 Dec 2025 11:46:47 -0600
Subject: [PATCH 24/33] Update meshoptimizer library to 1.0

---
 Source/ThirdParty/meshoptimizer/allocator.cpp |   15 +-
 .../ThirdParty/meshoptimizer/clusterizer.cpp  | 1119 ++++++++++++--
 .../{vcacheanalyzer.cpp => indexanalyzer.cpp} |   53 +
 .../ThirdParty/meshoptimizer/indexcodec.cpp   |   52 +-
 .../meshoptimizer/indexgenerator.cpp          |  346 +++--
 .../ThirdParty/meshoptimizer/meshoptimizer.h  |  567 ++++++--
 .../meshoptimizer/overdrawoptimizer.cpp       |   14 +-
 Source/ThirdParty/meshoptimizer/partition.cpp |  624 ++++++++
 .../{overdrawanalyzer.cpp => rasterizer.cpp}  |  166 ++-
 .../ThirdParty/meshoptimizer/simplifier.cpp   | 1288 ++++++++++++++---
 .../ThirdParty/meshoptimizer/spatialorder.cpp |  239 ++-
 .../ThirdParty/meshoptimizer/stripifier.cpp   |   11 +-
 .../ThirdParty/meshoptimizer/vertexcodec.cpp  | 1108 +++++++++++---
 .../ThirdParty/meshoptimizer/vertexfilter.cpp |  640 ++++++--
 .../meshoptimizer/vfetchanalyzer.cpp          |   58 -
 15 files changed, 5247 insertions(+), 1053 deletions(-)
 rename Source/ThirdParty/meshoptimizer/{vcacheanalyzer.cpp => indexanalyzer.cpp} (58%)
 create mode 100644 Source/ThirdParty/meshoptimizer/partition.cpp
 rename Source/ThirdParty/meshoptimizer/{overdrawanalyzer.cpp => rasterizer.cpp} (62%)
 delete mode 100644 Source/ThirdParty/meshoptimizer/vfetchanalyzer.cpp

diff --git a/Source/ThirdParty/meshoptimizer/allocator.cpp b/Source/ThirdParty/meshoptimizer/allocator.cpp
index 12eda3872..6b6083da2 100644
--- a/Source/ThirdParty/meshoptimizer/allocator.cpp
+++ b/Source/ThirdParty/meshoptimizer/allocator.cpp
@@ -1,8 +1,17 @@
 // This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
 #include "meshoptimizer.h"
 
-void meshopt_setAllocator(void*(MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t), void(MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*))
+#ifdef MESHOPTIMIZER_ALLOC_EXPORT
+meshopt_Allocator::Storage& meshopt_Allocator::storage()
 {
-	meshopt_Allocator::Storage::allocate = allocate;
-	meshopt_Allocator::Storage::deallocate = deallocate;
+	static Storage s = {::operator new, ::operator delete };
+	return s;
+}
+#endif
+
+void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*))
+{
+	meshopt_Allocator::Storage& s = meshopt_Allocator::storage();
+	s.allocate = allocate;
+	s.deallocate = deallocate;
 }
diff --git a/Source/ThirdParty/meshoptimizer/clusterizer.cpp b/Source/ThirdParty/meshoptimizer/clusterizer.cpp
index 52fe5a362..73cc0ab53 100644
--- a/Source/ThirdParty/meshoptimizer/clusterizer.cpp
+++ b/Source/ThirdParty/meshoptimizer/clusterizer.cpp
@@ -6,19 +6,39 @@
 #include <math.h>
 #include <string.h>
 
+// The block below auto-detects SIMD ISA that can be used on the target platform
+#ifndef MESHOPTIMIZER_NO_SIMD
+#if defined(__SSE2__) || (defined(_MSC_VER) && defined(_M_X64))
+#define SIMD_SSE
+#include <emmintrin.h>
+#elif defined(__aarch64__) || (defined(_MSC_VER) && defined(_M_ARM64) && _MSC_VER >= 1922)
+#define SIMD_NEON
+#include <arm_neon.h>
+#endif
+#endif // !MESHOPTIMIZER_NO_SIMD
+
 // This work is based on:
 // Graham Wihlidal. Optimizing the Graphics Pipeline with Compute. 2016
 // Matthaeus Chajdas. GeometryFX 1.2 - Cluster Culling. 2016
 // Jack Ritter. An Efficient Bounding Sphere. 1990
+// Thomas Larsson. Fast and Tight Fitting Bounding Spheres. 2008
+// Ingo Wald, Vlastimil Havran. On building fast kd-Trees for Ray Tracing, and on doing that in O(N log N). 2006
 namespace meshopt
 {
 
-// This must be <= 255 since index 0xff is used internally to indice a vertex that doesn't belong to a meshlet
-const size_t kMeshletMaxVertices = 255;
+// This must be <= 256 since meshlet indices are stored as bytes
+const size_t kMeshletMaxVertices = 256;
 
 // A reasonable limit is around 2*max_vertices or less
 const size_t kMeshletMaxTriangles = 512;
 
+// We keep a limited number of seed triangles and add a few triangles per finished meshlet
+const size_t kMeshletMaxSeeds = 256;
+const size_t kMeshletAddSeeds = 4;
+
+// To avoid excessive recursion for malformed inputs, we limit the maximum depth of the tree
+const int kMeshletMaxTreeDepth = 50;
+
 struct TriangleAdjacency2
 {
 	unsigned int* counts;
@@ -70,72 +90,190 @@ static void buildTriangleAdjacency(TriangleAdjacency2& adjacency, const unsigned
 	for (size_t i = 0; i < vertex_count; ++i)
 	{
 		assert(adjacency.offsets[i] >= adjacency.counts[i]);
-
 		adjacency.offsets[i] -= adjacency.counts[i];
 	}
 }
 
-static void computeBoundingSphere(float result[4], const float points[][3], size_t count)
+static void buildTriangleAdjacencySparse(TriangleAdjacency2& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)
 {
-	assert(count > 0);
+	size_t face_count = index_count / 3;
 
-	// find extremum points along all 3 axes; for each axis we get a pair of points with min/max coordinates
-	size_t pmin[3] = {0, 0, 0};
-	size_t pmax[3] = {0, 0, 0};
+	// sparse mode can build adjacency more quickly by ignoring unused vertices, using a bit to mark visited vertices
+	const unsigned int sparse_seen = 1u << 31;
+	assert(index_count < sparse_seen);
+
+	// allocate arrays
+	adjacency.counts = allocator.allocate<unsigned int>(vertex_count);
+	adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);
+	adjacency.data = allocator.allocate<unsigned int>(index_count);
+
+	// fill triangle counts
+	for (size_t i = 0; i < index_count; ++i)
+		assert(indices[i] < vertex_count);
+
+	for (size_t i = 0; i < index_count; ++i)
+		adjacency.counts[indices[i]] = 0;
+
+	for (size_t i = 0; i < index_count; ++i)
+		adjacency.counts[indices[i]]++;
+
+	// fill offset table; uses sparse_seen bit to tag visited vertices
+	unsigned int offset = 0;
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int v = indices[i];
+
+		if ((adjacency.counts[v] & sparse_seen) == 0)
+		{
+			adjacency.offsets[v] = offset;
+			offset += adjacency.counts[v];
+			adjacency.counts[v] |= sparse_seen;
+		}
+	}
+
+	assert(offset == index_count);
+
+	// fill triangle data
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
+
+		adjacency.data[adjacency.offsets[a]++] = unsigned(i);
+		adjacency.data[adjacency.offsets[b]++] = unsigned(i);
+		adjacency.data[adjacency.offsets[c]++] = unsigned(i);
+	}
+
+	// fix offsets that have been disturbed by the previous pass
+	// also fix counts (that were marked with sparse_seen by the first pass)
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int v = indices[i];
+
+		if (adjacency.counts[v] & sparse_seen)
+		{
+			adjacency.counts[v] &= ~sparse_seen;
+
+			assert(adjacency.offsets[v] >= adjacency.counts[v]);
+			adjacency.offsets[v] -= adjacency.counts[v];
+		}
+	}
+}
+
+static void clearUsed(short* used, size_t vertex_count, const unsigned int* indices, size_t index_count) // resets used[] entries to -1
+{
+	// for sparse inputs, it's faster to only clear vertices referenced by the index buffer
+	if (vertex_count <= index_count)
+		memset(used, -1, vertex_count * sizeof(short)); // dense case: every byte becomes 0xFF, so each short reads back as -1
+	else
+		for (size_t i = 0; i < index_count; ++i) // sparse case: entries not referenced by indices are left untouched
+		{
+			assert(indices[i] < vertex_count);
+			used[indices[i]] = -1;
+		}
+}
+
+static void computeBoundingSphere(float result[4], const float* points, size_t count, size_t points_stride, const float* radii, size_t radii_stride, size_t axis_count)
+{
+	static const float kAxes[7][3] = {
+	    // X, Y, Z
+	    {1, 0, 0},
+	    {0, 1, 0},
+	    {0, 0, 1},
+
+	    // XYZ, -XYZ, X-YZ, XY-Z; normalized to unit length
+	    {0.57735026f, 0.57735026f, 0.57735026f},
+	    {-0.57735026f, 0.57735026f, 0.57735026f},
+	    {0.57735026f, -0.57735026f, 0.57735026f},
+	    {0.57735026f, 0.57735026f, -0.57735026f},
+	};
+
+	assert(count > 0);
+	assert(axis_count <= sizeof(kAxes) / sizeof(kAxes[0]));
+
+	size_t points_stride_float = points_stride / sizeof(float);
+	size_t radii_stride_float = radii_stride / sizeof(float);
+
+	// find extremum points along all axes; for each axis we get a pair of points with min/max coordinates
+	size_t pmin[7], pmax[7];
+	float tmin[7], tmax[7];
+
+	for (size_t axis = 0; axis < axis_count; ++axis)
+	{
+		pmin[axis] = pmax[axis] = 0;
+		tmin[axis] = FLT_MAX;
+		tmax[axis] = -FLT_MAX;
+	}
 
 	for (size_t i = 0; i < count; ++i)
 	{
-		const float* p = points[i];
+		const float* p = points + i * points_stride_float;
+		float r = radii[i * radii_stride_float];
 
-		for (int axis = 0; axis < 3; ++axis)
+		for (size_t axis = 0; axis < axis_count; ++axis)
 		{
-			pmin[axis] = (p[axis] < points[pmin[axis]][axis]) ? i : pmin[axis];
-			pmax[axis] = (p[axis] > points[pmax[axis]][axis]) ? i : pmax[axis];
+			const float* ax = kAxes[axis];
+
+			float tp = ax[0] * p[0] + ax[1] * p[1] + ax[2] * p[2];
+			float tpmin = tp - r, tpmax = tp + r;
+
+			pmin[axis] = (tpmin < tmin[axis]) ? i : pmin[axis];
+			pmax[axis] = (tpmax > tmax[axis]) ? i : pmax[axis];
+			tmin[axis] = (tpmin < tmin[axis]) ? tpmin : tmin[axis];
+			tmax[axis] = (tpmax > tmax[axis]) ? tpmax : tmax[axis];
 		}
 	}
 
 	// find the pair of points with largest distance
-	float paxisd2 = 0;
-	int paxis = 0;
+	size_t paxis = 0;
+	float paxisdr = 0;
 
-	for (int axis = 0; axis < 3; ++axis)
+	for (size_t axis = 0; axis < axis_count; ++axis)
 	{
-		const float* p1 = points[pmin[axis]];
-		const float* p2 = points[pmax[axis]];
+		const float* p1 = points + pmin[axis] * points_stride_float;
+		const float* p2 = points + pmax[axis] * points_stride_float;
+		float r1 = radii[pmin[axis] * radii_stride_float];
+		float r2 = radii[pmax[axis] * radii_stride_float];
 
 		float d2 = (p2[0] - p1[0]) * (p2[0] - p1[0]) + (p2[1] - p1[1]) * (p2[1] - p1[1]) + (p2[2] - p1[2]) * (p2[2] - p1[2]);
+		float dr = sqrtf(d2) + r1 + r2;
 
-		if (d2 > paxisd2)
+		if (dr > paxisdr)
 		{
-			paxisd2 = d2;
+			paxisdr = dr;
 			paxis = axis;
 		}
 	}
 
 	// use the longest segment as the initial sphere diameter
-	const float* p1 = points[pmin[paxis]];
-	const float* p2 = points[pmax[paxis]];
+	const float* p1 = points + pmin[paxis] * points_stride_float;
+	const float* p2 = points + pmax[paxis] * points_stride_float;
+	float r1 = radii[pmin[paxis] * radii_stride_float];
+	float r2 = radii[pmax[paxis] * radii_stride_float];
 
-	float center[3] = {(p1[0] + p2[0]) / 2, (p1[1] + p2[1]) / 2, (p1[2] + p2[2]) / 2};
-	float radius = sqrtf(paxisd2) / 2;
+	float paxisd = sqrtf((p2[0] - p1[0]) * (p2[0] - p1[0]) + (p2[1] - p1[1]) * (p2[1] - p1[1]) + (p2[2] - p1[2]) * (p2[2] - p1[2]));
+	float paxisk = paxisd > 0 ? (paxisd + r2 - r1) / (2 * paxisd) : 0.f;
+
+	float center[3] = {p1[0] + (p2[0] - p1[0]) * paxisk, p1[1] + (p2[1] - p1[1]) * paxisk, p1[2] + (p2[2] - p1[2]) * paxisk};
+	float radius = paxisdr / 2;
 
 	// iteratively adjust the sphere up until all points fit
 	for (size_t i = 0; i < count; ++i)
 	{
-		const float* p = points[i];
+		const float* p = points + i * points_stride_float;
+		float r = radii[i * radii_stride_float];
+
 		float d2 = (p[0] - center[0]) * (p[0] - center[0]) + (p[1] - center[1]) * (p[1] - center[1]) + (p[2] - center[2]) * (p[2] - center[2]);
+		float d = sqrtf(d2);
 
-		if (d2 > radius * radius)
+		if (d + r > radius)
 		{
-			float d = sqrtf(d2);
-			assert(d > 0);
+			float k = d > 0 ? (d + r - radius) / (2 * d) : 0.f;
 
-			float k = 0.5f + (radius / d) / 2;
-
-			center[0] = center[0] * k + p[0] * (1 - k);
-			center[1] = center[1] * k + p[1] * (1 - k);
-			center[2] = center[2] * k + p[2] * (1 - k);
-			radius = (radius + d) / 2;
+			center[0] += k * (p[0] - center[0]);
+			center[1] += k * (p[1] - center[1]);
+			center[2] += k * (p[2] - center[2]);
+			radius = (radius + d + r) / 2;
 		}
 	}
 
@@ -151,12 +289,12 @@ struct Cone
 	float nx, ny, nz;
 };
 
-static float getMeshletScore(float distance2, float spread, float cone_weight, float expected_radius)
+static float getMeshletScore(float distance, float spread, float cone_weight, float expected_radius)
 {
 	float cone = 1.f - spread * cone_weight;
 	float cone_clamped = cone < 1e-3f ? 1e-3f : cone;
 
-	return (1 + sqrtf(distance2) / expected_radius * (1 - cone_weight)) * cone_clamped;
+	return (1 + distance / expected_radius * (1 - cone_weight)) * cone_clamped;
 }
 
 static Cone getMeshletCone(const Cone& acc, unsigned int triangle_count)
@@ -221,72 +359,61 @@ static float computeTriangleCones(Cone* triangles, const unsigned int* indices,
 	return mesh_area;
 }
 
-static void finishMeshlet(meshopt_Meshlet& meshlet, unsigned char* meshlet_triangles)
+static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int b, unsigned int c, short* used, meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t meshlet_offset, size_t max_vertices, size_t max_triangles, bool split = false)
 {
-	size_t offset = meshlet.triangle_offset + meshlet.triangle_count * 3;
-
-	// fill 4b padding with 0
-	while (offset & 3)
-		meshlet_triangles[offset++] = 0;
-}
-
-static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int b, unsigned int c, unsigned char* used, meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t meshlet_offset, size_t max_vertices, size_t max_triangles)
-{
-	unsigned char& av = used[a];
-	unsigned char& bv = used[b];
-	unsigned char& cv = used[c];
+	short& av = used[a];
+	short& bv = used[b];
+	short& cv = used[c];
 
 	bool result = false;
 
-	unsigned int used_extra = (av == 0xff) + (bv == 0xff) + (cv == 0xff);
+	int used_extra = (av < 0) + (bv < 0) + (cv < 0);
 
-	if (meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles)
+	if (meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles || split)
 	{
 		meshlets[meshlet_offset] = meshlet;
 
 		for (size_t j = 0; j < meshlet.vertex_count; ++j)
-			used[meshlet_vertices[meshlet.vertex_offset + j]] = 0xff;
-
-		finishMeshlet(meshlet, meshlet_triangles);
+			used[meshlet_vertices[meshlet.vertex_offset + j]] = -1;
 
 		meshlet.vertex_offset += meshlet.vertex_count;
-		meshlet.triangle_offset += (meshlet.triangle_count * 3 + 3) & ~3; // 4b padding
+		meshlet.triangle_offset += meshlet.triangle_count * 3;
 		meshlet.vertex_count = 0;
 		meshlet.triangle_count = 0;
 
 		result = true;
 	}
 
-	if (av == 0xff)
+	if (av < 0)
 	{
-		av = (unsigned char)meshlet.vertex_count;
+		av = short(meshlet.vertex_count);
 		meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = a;
 	}
 
-	if (bv == 0xff)
+	if (bv < 0)
 	{
-		bv = (unsigned char)meshlet.vertex_count;
+		bv = short(meshlet.vertex_count);
 		meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = b;
 	}
 
-	if (cv == 0xff)
+	if (cv < 0)
 	{
-		cv = (unsigned char)meshlet.vertex_count;
+		cv = short(meshlet.vertex_count);
 		meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = c;
 	}
 
-	meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 0] = av;
-	meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 1] = bv;
-	meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 2] = cv;
+	meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 0] = (unsigned char)av;
+	meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 1] = (unsigned char)bv;
+	meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 2] = (unsigned char)cv;
 	meshlet.triangle_count++;
 
 	return result;
 }
 
-static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Cone* meshlet_cone, unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, const unsigned char* used, float meshlet_expected_radius, float cone_weight, unsigned int* out_extra)
+static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Cone& meshlet_cone, const unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, const short* used, float meshlet_expected_radius, float cone_weight)
 {
 	unsigned int best_triangle = ~0u;
-	unsigned int best_extra = 5;
+	int best_priority = 5;
 	float best_score = FLT_MAX;
 
 	for (size_t i = 0; i < meshlet.vertex_count; ++i)
@@ -301,61 +428,159 @@ static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Co
 			unsigned int triangle = neighbors[j];
 			unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];
 
-			unsigned int extra = (used[a] == 0xff) + (used[b] == 0xff) + (used[c] == 0xff);
+			int extra = (used[a] < 0) + (used[b] < 0) + (used[c] < 0);
+			assert(extra <= 2);
+
+			int priority = -1;
 
 			// triangles that don't add new vertices to meshlets are max. priority
-			if (extra != 0)
-			{
-				// artificially increase the priority of dangling triangles as they're expensive to add to new meshlets
-				if (live_triangles[a] == 1 || live_triangles[b] == 1 || live_triangles[c] == 1)
-					extra = 0;
-
-				extra++;
-			}
+			if (extra == 0)
+				priority = 0;
+			// artificially increase the priority of dangling triangles as they're expensive to add to new meshlets
+			else if (live_triangles[a] == 1 || live_triangles[b] == 1 || live_triangles[c] == 1)
+				priority = 1;
+			// if two vertices have live count of 2, removing this triangle will make another triangle dangling which is good for overall flow
+			else if ((live_triangles[a] == 2) + (live_triangles[b] == 2) + (live_triangles[c] == 2) >= 2)
+				priority = 1 + extra;
+			// otherwise adjust priority to be after the above cases, 3 or 4 based on used[] count
+			else
+				priority = 2 + extra;
 
 			// since topology-based priority is always more important than the score, we can skip scoring in some cases
-			if (extra > best_extra)
+			if (priority > best_priority)
 				continue;
 
-			float score = 0;
+			const Cone& tri_cone = triangles[triangle];
 
-			// caller selects one of two scoring functions: geometrical (based on meshlet cone) or topological (based on remaining triangles)
-			if (meshlet_cone)
-			{
-				const Cone& tri_cone = triangles[triangle];
+			float dx = tri_cone.px - meshlet_cone.px, dy = tri_cone.py - meshlet_cone.py, dz = tri_cone.pz - meshlet_cone.pz;
+			float distance = sqrtf(dx * dx + dy * dy + dz * dz);
+			float spread = tri_cone.nx * meshlet_cone.nx + tri_cone.ny * meshlet_cone.ny + tri_cone.nz * meshlet_cone.nz;
 
-				float distance2 =
-				    (tri_cone.px - meshlet_cone->px) * (tri_cone.px - meshlet_cone->px) +
-				    (tri_cone.py - meshlet_cone->py) * (tri_cone.py - meshlet_cone->py) +
-				    (tri_cone.pz - meshlet_cone->pz) * (tri_cone.pz - meshlet_cone->pz);
-
-				float spread = tri_cone.nx * meshlet_cone->nx + tri_cone.ny * meshlet_cone->ny + tri_cone.nz * meshlet_cone->nz;
-
-				score = getMeshletScore(distance2, spread, cone_weight, meshlet_expected_radius);
-			}
-			else
-			{
-				// each live_triangles entry is >= 1 since it includes the current triangle we're processing
-				score = float(live_triangles[a] + live_triangles[b] + live_triangles[c] - 3);
-			}
+			float score = getMeshletScore(distance, spread, cone_weight, meshlet_expected_radius);
 
 			// note that topology-based priority is always more important than the score
 			// this helps maintain reasonable effectiveness of meshlet data and reduces scoring cost
-			if (extra < best_extra || score < best_score)
+			if (priority < best_priority || score < best_score)
 			{
 				best_triangle = triangle;
-				best_extra = extra;
+				best_priority = priority;
 				best_score = score;
 			}
 		}
 	}
 
-	if (out_extra)
-		*out_extra = best_extra;
-
 	return best_triangle;
 }
 
+static size_t appendSeedTriangles(unsigned int* seeds, const meshopt_Meshlet& meshlet, const unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, float cornerx, float cornery, float cornerz)
+{
+	unsigned int best_seeds[kMeshletAddSeeds];
+	unsigned int best_live[kMeshletAddSeeds];
+	float best_score[kMeshletAddSeeds];
+
+	for (size_t i = 0; i < kMeshletAddSeeds; ++i)
+	{
+		best_seeds[i] = ~0u;
+		best_live[i] = ~0u;
+		best_score[i] = FLT_MAX;
+	}
+
+	for (size_t i = 0; i < meshlet.vertex_count; ++i)
+	{
+		unsigned int index = meshlet_vertices[meshlet.vertex_offset + i];
+
+		unsigned int best_neighbor = ~0u;
+		unsigned int best_neighbor_live = ~0u;
+
+		// find the neighbor with the smallest live metric
+		unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];
+		size_t neighbors_size = adjacency.counts[index];
+
+		for (size_t j = 0; j < neighbors_size; ++j)
+		{
+			unsigned int triangle = neighbors[j];
+			unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];
+
+			unsigned int live = live_triangles[a] + live_triangles[b] + live_triangles[c];
+
+			if (live < best_neighbor_live)
+			{
+				best_neighbor = triangle;
+				best_neighbor_live = live;
+			}
+		}
+
+		// add the neighbor to the list of seeds; the list is unsorted and the replacement criteria is approximate
+		if (best_neighbor == ~0u)
+			continue;
+
+		float dx = triangles[best_neighbor].px - cornerx, dy = triangles[best_neighbor].py - cornery, dz = triangles[best_neighbor].pz - cornerz;
+		float best_neighbor_score = sqrtf(dx * dx + dy * dy + dz * dz);
+
+		for (size_t j = 0; j < kMeshletAddSeeds; ++j)
+		{
+			// non-strict comparison reduces the number of duplicate seeds (triangles adjacent to multiple vertices)
+			if (best_neighbor_live < best_live[j] || (best_neighbor_live == best_live[j] && best_neighbor_score <= best_score[j]))
+			{
+				best_seeds[j] = best_neighbor;
+				best_live[j] = best_neighbor_live;
+				best_score[j] = best_neighbor_score;
+				break;
+			}
+		}
+	}
+
+	// copy surviving seeds to the output seed list (unused slots are skipped)
+	size_t seed_count = 0;
+
+	for (size_t i = 0; i < kMeshletAddSeeds; ++i)
+		if (best_seeds[i] != ~0u)
+			seeds[seed_count++] = best_seeds[i];
+
+	return seed_count;
+}
+
+static size_t pruneSeedTriangles(unsigned int* seeds, size_t seed_count, const unsigned char* emitted_flags)
+{
+	size_t result = 0;
+
+	for (size_t i = 0; i < seed_count; ++i)
+	{
+		unsigned int index = seeds[i];
+
+		seeds[result] = index;
+		result += emitted_flags[index] == 0;
+	}
+
+	return result;
+}
+
+static unsigned int selectSeedTriangle(const unsigned int* seeds, size_t seed_count, const unsigned int* indices, const Cone* triangles, const unsigned int* live_triangles, float cornerx, float cornery, float cornerz)
+{
+	unsigned int best_seed = ~0u;
+	unsigned int best_live = ~0u;
+	float best_score = FLT_MAX;
+
+	for (size_t i = 0; i < seed_count; ++i)
+	{
+		unsigned int index = seeds[i];
+		unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2];
+
+		unsigned int live = live_triangles[a] + live_triangles[b] + live_triangles[c];
+		float dx = triangles[index].px - cornerx, dy = triangles[index].py - cornery, dz = triangles[index].pz - cornerz;
+		float score = sqrtf(dx * dx + dy * dy + dz * dz);
+
+		if (live < best_live || (live == best_live && score < best_score))
+		{
+			best_seed = index;
+			best_live = live;
+			best_score = score;
+		}
+	}
+
+	return best_seed;
+}
+
 struct KDNode
 {
 	union
@@ -364,13 +589,13 @@ struct KDNode
 		unsigned int index;
 	};
 
-	// leaves: axis = 3, children = number of extra points after this one (0 if 'index' is the only point)
+	// leaves: axis = 3, children = number of points including this one
 	// branches: axis != 3, left subtree = skip 1, right subtree = skip 1+children
 	unsigned int axis : 2;
 	unsigned int children : 30;
 };
 
-static size_t kdtreePartition(unsigned int* indices, size_t count, const float* points, size_t stride, unsigned int axis, float pivot)
+static size_t kdtreePartition(unsigned int* indices, size_t count, const float* points, size_t stride, int axis, float pivot)
 {
 	size_t m = 0;
 
@@ -400,7 +625,7 @@ static size_t kdtreeBuildLeaf(size_t offset, KDNode* nodes, size_t node_count, u
 
 	result.index = indices[0];
 	result.axis = 3;
-	result.children = unsigned(count - 1);
+	result.children = unsigned(count);
 
 	// all remaining points are stored in nodes immediately following the leaf
 	for (size_t i = 1; i < count; ++i)
@@ -415,7 +640,7 @@ static size_t kdtreeBuildLeaf(size_t offset, KDNode* nodes, size_t node_count, u
 	return offset + count;
 }
 
-static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const float* points, size_t stride, unsigned int* indices, size_t count, size_t leaf_size)
+static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const float* points, size_t stride, unsigned int* indices, size_t count, size_t leaf_size, int depth)
 {
 	assert(count > 0);
 	assert(offset < node_count);
@@ -441,13 +666,14 @@ static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const
 	}
 
 	// split axis is one where the variance is largest
-	unsigned int axis = (vars[0] >= vars[1] && vars[0] >= vars[2]) ? 0 : (vars[1] >= vars[2] ? 1 : 2);
+	int axis = (vars[0] >= vars[1] && vars[0] >= vars[2]) ? 0 : (vars[1] >= vars[2] ? 1 : 2);
 
 	float split = mean[axis];
 	size_t middle = kdtreePartition(indices, count, points, stride, axis, split);
 
 	// when the partition is degenerate simply consolidate the points into a single node
-	if (middle <= leaf_size / 2 || middle >= count - leaf_size / 2)
+	// this also ensures recursion depth is bounded on pathological inputs
+	if (middle <= leaf_size / 2 || middle >= count - leaf_size / 2 || depth >= kMeshletMaxTreeDepth)
 		return kdtreeBuildLeaf(offset, nodes, node_count, indices, count);
 
 	KDNode& result = nodes[offset];
@@ -456,35 +682,40 @@ static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const
 	result.axis = axis;
 
 	// left subtree is right after our node
-	size_t next_offset = kdtreeBuild(offset + 1, nodes, node_count, points, stride, indices, middle, leaf_size);
+	size_t next_offset = kdtreeBuild(offset + 1, nodes, node_count, points, stride, indices, middle, leaf_size, depth + 1);
 
 	// distance to the right subtree is represented explicitly
+	assert(next_offset - offset > 1);
 	result.children = unsigned(next_offset - offset - 1);
 
-	return kdtreeBuild(next_offset, nodes, node_count, points, stride, indices + middle, count - middle, leaf_size);
+	return kdtreeBuild(next_offset, nodes, node_count, points, stride, indices + middle, count - middle, leaf_size, depth + 1);
 }
 
 static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, size_t stride, const unsigned char* emitted_flags, const float* position, unsigned int& result, float& limit)
 {
 	const KDNode& node = nodes[root];
 
+	if (node.children == 0)
+		return;
+
 	if (node.axis == 3)
 	{
 		// leaf
-		for (unsigned int i = 0; i <= node.children; ++i)
+		bool inactive = true;
+
+		for (unsigned int i = 0; i < node.children; ++i)
 		{
 			unsigned int index = nodes[root + i].index;
 
 			if (emitted_flags[index])
 				continue;
 
+			inactive = false;
+
 			const float* point = points + index * stride;
 
-			float distance2 =
-			    (point[0] - position[0]) * (point[0] - position[0]) +
-			    (point[1] - position[1]) * (point[1] - position[1]) +
-			    (point[2] - position[2]) * (point[2] - position[2]);
-			float distance = sqrtf(distance2);
+			float dx = point[0] - position[0], dy = point[1] - position[1], dz = point[2] - position[2];
+			float distance = sqrtf(dx * dx + dy * dy + dz * dz);
 
 			if (distance < limit)
 			{
@@ -492,6 +723,10 @@ static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points,
 				limit = distance;
 			}
 		}
+
+		// deactivate leaves that no longer have items to emit
+		if (inactive)
+			nodes[root].children = 0;
 	}
 	else
 	{
@@ -500,6 +735,12 @@ static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points,
 		unsigned int first = (delta <= 0) ? 0 : node.children;
 		unsigned int second = first ^ node.children;
 
+		// deactivate branches that no longer have items to emit to accelerate traversal
+		// note that we do this *before* recursing which delays deactivation but keeps tail calls
+		if ((nodes[root + 1 + first].children | nodes[root + 1 + second].children) == 0)
+			nodes[root].children = 0;
+
+		// recursion depth is bounded by tree depth (which is limited by construction)
 		kdtreeNearest(nodes, root + 1 + first, points, stride, emitted_flags, position, result, limit);
 
 		// only process the other node if it can have a match based on closest distance so far
@@ -508,6 +749,380 @@ static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points,
 	}
 }
 
+struct BVHBoxT
+{
+	float min[4];
+	float max[4];
+};
+
+struct BVHBox
+{
+	float min[3];
+	float max[3];
+};
+
+#if defined(SIMD_SSE)
+static float boxMerge(BVHBoxT& box, const BVHBox& other)
+{
+	__m128 min = _mm_loadu_ps(box.min);
+	__m128 max = _mm_loadu_ps(box.max);
+
+	// note: over-read is safe because BVHBox array is allocated with padding
+	min = _mm_min_ps(min, _mm_loadu_ps(other.min));
+	max = _mm_max_ps(max, _mm_loadu_ps(other.max));
+
+	_mm_storeu_ps(box.min, min);
+	_mm_storeu_ps(box.max, max);
+
+	__m128 size = _mm_sub_ps(max, min);
+	__m128 size_yzx = _mm_shuffle_ps(size, size, _MM_SHUFFLE(0, 0, 2, 1));
+	__m128 mul = _mm_mul_ps(size, size_yzx);
+	__m128 sum_xy = _mm_add_ss(mul, _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(1, 1, 1, 1)));
+	__m128 sum_xyz = _mm_add_ss(sum_xy, _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 2, 2, 2)));
+
+	return _mm_cvtss_f32(sum_xyz);
+}
+#elif defined(SIMD_NEON)
+static float boxMerge(BVHBoxT& box, const BVHBox& other)
+{
+	float32x4_t min = vld1q_f32(box.min);
+	float32x4_t max = vld1q_f32(box.max);
+
+	// note: over-read is safe because BVHBox array is allocated with padding
+	min = vminq_f32(min, vld1q_f32(other.min));
+	max = vmaxq_f32(max, vld1q_f32(other.max));
+
+	vst1q_f32(box.min, min);
+	vst1q_f32(box.max, max);
+
+	float32x4_t size = vsubq_f32(max, min);
+	float32x4_t size_yzx = vextq_f32(vextq_f32(size, size, 3), size, 2);
+	float32x4_t mul = vmulq_f32(size, size_yzx);
+	float sum_xy = vgetq_lane_f32(mul, 0) + vgetq_lane_f32(mul, 1);
+	float sum_xyz = sum_xy + vgetq_lane_f32(mul, 2);
+
+	return sum_xyz;
+}
+#else
+static float boxMerge(BVHBoxT& box, const BVHBox& other)
+{
+	for (int k = 0; k < 3; ++k)
+	{
+		box.min[k] = other.min[k] < box.min[k] ? other.min[k] : box.min[k];
+		box.max[k] = other.max[k] > box.max[k] ? other.max[k] : box.max[k];
+	}
+
+	float sx = box.max[0] - box.min[0], sy = box.max[1] - box.min[1], sz = box.max[2] - box.min[2];
+	return sx * sy + sx * sz + sy * sz;
+}
+#endif
+
+inline unsigned int radixFloat(unsigned int v)
+{
+	// if sign bit is 0, flip sign bit
+	// if sign bit is 1, flip everything
+	unsigned int mask = (int(v) >> 31) | 0x80000000;
+	return v ^ mask;
+}
+
+static void computeHistogram(unsigned int (&hist)[1024][3], const float* data, size_t count)
+{
+	memset(hist, 0, sizeof(hist));
+
+	const unsigned int* bits = reinterpret_cast<const unsigned int*>(data);
+
+	// compute 3 10-bit histograms in parallel (dropping 2 LSB)
+	for (size_t i = 0; i < count; ++i)
+	{
+		unsigned int id = radixFloat(bits[i]);
+
+		hist[(id >> 2) & 1023][0]++;
+		hist[(id >> 12) & 1023][1]++;
+		hist[(id >> 22) & 1023][2]++;
+	}
+
+	unsigned int sum0 = 0, sum1 = 0, sum2 = 0;
+
+	// replace histogram data with prefix histogram sums in-place
+	for (int i = 0; i < 1024; ++i)
+	{
+		unsigned int hx = hist[i][0], hy = hist[i][1], hz = hist[i][2];
+
+		hist[i][0] = sum0;
+		hist[i][1] = sum1;
+		hist[i][2] = sum2;
+
+		sum0 += hx;
+		sum1 += hy;
+		sum2 += hz;
+	}
+
+	assert(sum0 == count && sum1 == count && sum2 == count);
+}
+
+static void radixPass(unsigned int* destination, const unsigned int* source, const float* keys, size_t count, unsigned int (&hist)[1024][3], int pass)
+{
+	const unsigned int* bits = reinterpret_cast<const unsigned int*>(keys);
+	int bitoff = pass * 10 + 2; // drop 2 LSB to be able to use 3 10-bit passes
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		unsigned int id = (radixFloat(bits[source[i]]) >> bitoff) & 1023;
+
+		destination[hist[id][pass]++] = source[i];
+	}
+}
+
+static void bvhPrepare(BVHBox* boxes, float* centroids, const unsigned int* indices, size_t face_count, const float* vertex_positions, size_t vertex_count, size_t vertex_stride_float)
+{
+	(void)vertex_count;
+
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
+		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+		const float* va = vertex_positions + vertex_stride_float * a;
+		const float* vb = vertex_positions + vertex_stride_float * b;
+		const float* vc = vertex_positions + vertex_stride_float * c;
+
+		BVHBox& box = boxes[i];
+
+		for (int k = 0; k < 3; ++k)
+		{
+			box.min[k] = va[k] < vb[k] ? va[k] : vb[k];
+			box.min[k] = vc[k] < box.min[k] ? vc[k] : box.min[k];
+
+			box.max[k] = va[k] > vb[k] ? va[k] : vb[k];
+			box.max[k] = vc[k] > box.max[k] ? vc[k] : box.max[k];
+
+			centroids[i + face_count * k] = (box.min[k] + box.max[k]) / 2.f;
+		}
+	}
+}
+
+static size_t bvhCountVertices(const unsigned int* order, size_t count, short* used, const unsigned int* indices, unsigned int* out = NULL)
+{
+	// count number of unique vertices
+	size_t used_vertices = 0;
+	for (size_t i = 0; i < count; ++i)
+	{
+		unsigned int index = order[i];
+		unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2];
+
+		used_vertices += (used[a] < 0) + (used[b] < 0) + (used[c] < 0);
+		used[a] = used[b] = used[c] = 1;
+
+		if (out)
+			out[i] = unsigned(used_vertices);
+	}
+
+	// reset used[] for future invocations
+	for (size_t i = 0; i < count; ++i)
+	{
+		unsigned int index = order[i];
+		unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2];
+
+		used[a] = used[b] = used[c] = -1;
+	}
+
+	return used_vertices;
+}
+
+static void bvhPackLeaf(unsigned char* boundary, size_t count)
+{
+	// mark meshlet boundary for future reassembly
+	assert(count > 0);
+
+	boundary[0] = 1;
+	memset(boundary + 1, 0, count - 1);
+}
+
+static void bvhPackTail(unsigned char* boundary, const unsigned int* order, size_t count, short* used, const unsigned int* indices, size_t max_vertices, size_t max_triangles)
+{
+	for (size_t i = 0; i < count;)
+	{
+		size_t chunk = i + max_triangles <= count ? max_triangles : count - i;
+
+		if (bvhCountVertices(order + i, chunk, used, indices) <= max_vertices)
+		{
+			bvhPackLeaf(boundary + i, chunk);
+			i += chunk;
+			continue;
+		}
+
+		// chunk is vertex bound, split it into smaller meshlets
+		assert(chunk > max_vertices / 3);
+
+		bvhPackLeaf(boundary + i, max_vertices / 3);
+		i += max_vertices / 3;
+	}
+}
+
+static bool bvhDivisible(size_t count, size_t min, size_t max)
+{
+	// count is representable as a sum of k values in [min..max] if it is in the range [k*min..k*min+k*(max-min)] for some k
+	// equivalent to ceil(count / max) <= floor(count / min), but the form below allows using idiv (see nv_cluster_builder)
+	// we avoid expensive integer divisions in the common case where min is <= max/2
+	return min * 2 <= max ? count >= min : count % min <= (count / min) * (max - min);
+}
+
+static void bvhComputeArea(float* areas, const BVHBox* boxes, const unsigned int* order, size_t count)
+{
+	BVHBoxT accuml = {{FLT_MAX, FLT_MAX, FLT_MAX, 0}, {-FLT_MAX, -FLT_MAX, -FLT_MAX, 0}};
+	BVHBoxT accumr = accuml;
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		float larea = boxMerge(accuml, boxes[order[i]]);
+		float rarea = boxMerge(accumr, boxes[order[count - 1 - i]]);
+
+		areas[i] = larea;
+		areas[i + count] = rarea;
+	}
+}
+
+static size_t bvhPivot(const float* areas, const unsigned int* vertices, size_t count, size_t step, size_t min, size_t max, float fill, size_t maxfill, float* out_cost)
+{
+	bool aligned = count >= min * 2 && bvhDivisible(count, min, max);
+	size_t end = aligned ? count - min : count - 1;
+
+	float rmaxfill = 1.f / float(int(maxfill));
+
+	// find best split that minimizes SAH
+	size_t bestsplit = 0;
+	float bestcost = FLT_MAX;
+
+	for (size_t i = min - 1; i < end; i += step)
+	{
+		size_t lsplit = i + 1, rsplit = count - (i + 1);
+
+		if (!bvhDivisible(lsplit, min, max))
+			continue;
+		if (aligned && !bvhDivisible(rsplit, min, max))
+			continue;
+
+		// areas[x] = inclusive surface area of boxes[0..x]
+		// areas[count + (count-1-x)] = inclusive surface area of boxes[x..count-1]
+		float larea = areas[i], rarea = areas[(count - 1 - (i + 1)) + count];
+		float cost = larea * float(int(lsplit)) + rarea * float(int(rsplit));
+
+		if (cost > bestcost)
+			continue;
+
+		// use vertex fill when splitting vertex limited clusters; note that we use the same (left->right) vertex count
+		// using bidirectional vertex counts is a little more expensive to compute and produces slightly worse results in practice
+		size_t lfill = vertices ? vertices[i] : lsplit;
+		size_t rfill = vertices ? vertices[i] : rsplit;
+
+		// fill cost; use floating point math to round up to maxfill to avoid expensive integer modulo
+		int lrest = int(float(int(lfill + maxfill - 1)) * rmaxfill) * int(maxfill) - int(lfill);
+		int rrest = int(float(int(rfill + maxfill - 1)) * rmaxfill) * int(maxfill) - int(rfill);
+
+		cost += fill * (float(lrest) * larea + float(rrest) * rarea);
+
+		if (cost < bestcost)
+		{
+			bestcost = cost;
+			bestsplit = i + 1;
+		}
+	}
+
+	*out_cost = bestcost;
+	return bestsplit;
+}
+
+static void bvhPartition(unsigned int* target, const unsigned int* order, const unsigned char* sides, size_t split, size_t count)
+{
+	size_t l = 0, r = split;
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		unsigned char side = sides[order[i]];
+		target[side ? r : l] = order[i];
+		l += 1;
+		l -= side;
+		r += side;
+	}
+
+	assert(l == split && r == count);
+}
+
+static void bvhSplit(const BVHBox* boxes, unsigned int* orderx, unsigned int* ordery, unsigned int* orderz, unsigned char* boundary, size_t count, int depth, void* scratch, short* used, const unsigned int* indices, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight)
+{
+	if (count <= max_triangles && bvhCountVertices(orderx, count, used, indices) <= max_vertices)
+		return bvhPackLeaf(boundary, count);
+
+	unsigned int* axes[3] = {orderx, ordery, orderz};
+
+	// we can use step=1 unconditionally but to reduce the cost for min=max case we use step=max
+	size_t step = min_triangles == max_triangles && count > max_triangles ? max_triangles : 1;
+
+	// if we could not pack the meshlet, we must be vertex bound
+	size_t mint = count <= max_triangles && max_vertices / 3 < min_triangles ? max_vertices / 3 : min_triangles;
+	size_t maxfill = count <= max_triangles ? max_vertices : max_triangles;
+
+	// find best split that minimizes SAH
+	int bestk = -1;
+	size_t bestsplit = 0;
+	float bestcost = FLT_MAX;
+
+	for (int k = 0; k < 3; ++k)
+	{
+		float* areas = static_cast<float*>(scratch);
+		unsigned int* vertices = NULL;
+
+		bvhComputeArea(areas, boxes, axes[k], count);
+
+		if (count <= max_triangles)
+		{
+			// for vertex bound clusters, count number of unique vertices for each split
+			vertices = reinterpret_cast<unsigned int*>(areas + 2 * count);
+			bvhCountVertices(axes[k], count, used, indices, vertices);
+		}
+
+		float axiscost = FLT_MAX;
+		size_t axissplit = bvhPivot(areas, vertices, count, step, mint, max_triangles, fill_weight, maxfill, &axiscost);
+
+		if (axissplit && axiscost < bestcost)
+		{
+			bestk = k;
+			bestcost = axiscost;
+			bestsplit = axissplit;
+		}
+	}
+
+	// this may happen if SAH costs along the admissible splits are NaN, or due to imbalanced splits on pathological inputs
+	if (bestk < 0 || depth >= kMeshletMaxTreeDepth)
+		return bvhPackTail(boundary, orderx, count, used, indices, max_vertices, max_triangles);
+
+	// mark sides of split for partitioning
+	unsigned char* sides = static_cast<unsigned char*>(scratch) + count * sizeof(unsigned int);
+
+	for (size_t i = 0; i < bestsplit; ++i)
+		sides[axes[bestk][i]] = 0;
+
+	for (size_t i = bestsplit; i < count; ++i)
+		sides[axes[bestk][i]] = 1;
+
+	// partition all axes into two sides, maintaining order
+	unsigned int* temp = static_cast<unsigned int*>(scratch);
+
+	for (int k = 0; k < 3; ++k)
+	{
+		if (k == bestk)
+			continue;
+
+		unsigned int* axis = axes[k];
+		memcpy(temp, axis, sizeof(unsigned int) * count);
+		bvhPartition(axis, temp, sides, bestsplit, count);
+	}
+
+	// recursion depth is bounded due to max depth check above
+	bvhSplit(boxes, orderx, ordery, orderz, boundary, bestsplit, depth + 1, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight);
+	bvhSplit(boxes, orderx + bestsplit, ordery + bestsplit, orderz + bestsplit, boundary + bestsplit, count - bestsplit, depth + 1, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight);
+}
+
 } // namespace meshopt
 
 size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles)
@@ -517,7 +1132,6 @@ size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_
 	assert(index_count % 3 == 0);
 	assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
 	assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
-	assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
 
 	(void)kMeshletMaxVertices;
 	(void)kMeshletMaxTriangles;
@@ -532,7 +1146,7 @@ size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_
 	return meshlet_limit_vertices > meshlet_limit_triangles ? meshlet_limit_vertices : meshlet_limit_triangles;
 }
 
-size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight)
+size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor)
 {
 	using namespace meshopt;
 
@@ -541,18 +1155,24 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
 	assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
-	assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
-	assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
+	assert(min_triangles >= 1 && min_triangles <= max_triangles && max_triangles <= kMeshletMaxTriangles);
 
 	assert(cone_weight >= 0 && cone_weight <= 1);
+	assert(split_factor >= 0);
+
+	if (index_count == 0)
+		return 0;
 
 	meshopt_Allocator allocator;
 
 	TriangleAdjacency2 adjacency = {};
-	buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);
+	if (vertex_count > index_count && index_count < (1u << 31))
+		buildTriangleAdjacencySparse(adjacency, indices, index_count, vertex_count, allocator);
+	else
+		buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);
 
-	unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count);
-	memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int));
+	// live triangle counts; note, we alias adjacency.counts as we remove triangles after emitting them so the counts always match
+	unsigned int* live_triangles = adjacency.counts;
 
 	size_t face_count = index_count / 3;
 
@@ -573,11 +1193,45 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve
 		kdindices[i] = unsigned(i);
 
 	KDNode* nodes = allocator.allocate<KDNode>(face_count * 2);
-	kdtreeBuild(0, nodes, face_count * 2, &triangles[0].px, sizeof(Cone) / sizeof(float), kdindices, face_count, /* leaf_size= */ 8);
+	kdtreeBuild(0, nodes, face_count * 2, &triangles[0].px, sizeof(Cone) / sizeof(float), kdindices, face_count, /* leaf_size= */ 8, 0);
 
-	// index of the vertex in the meshlet, 0xff if the vertex isn't used
-	unsigned char* used = allocator.allocate<unsigned char>(vertex_count);
-	memset(used, -1, vertex_count);
+	// find a specific corner of the mesh to use as a starting point for meshlet flow
+	float cornerx = FLT_MAX, cornery = FLT_MAX, cornerz = FLT_MAX;
+
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		const Cone& tri = triangles[i];
+
+		cornerx = cornerx > tri.px ? tri.px : cornerx;
+		cornery = cornery > tri.py ? tri.py : cornery;
+		cornerz = cornerz > tri.pz ? tri.pz : cornerz;
+	}
+
+	// index of the vertex in the meshlet, -1 if the vertex isn't used
+	short* used = allocator.allocate<short>(vertex_count);
+	clearUsed(used, vertex_count, indices, index_count);
+
+	// initial seed triangle is the one closest to the corner
+	unsigned int initial_seed = ~0u;
+	float initial_score = FLT_MAX;
+
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		const Cone& tri = triangles[i];
+
+		float dx = tri.px - cornerx, dy = tri.py - cornery, dz = tri.pz - cornerz;
+		float score = sqrtf(dx * dx + dy * dy + dz * dz);
+
+		if (initial_seed == ~0u || score < initial_score)
+		{
+			initial_seed = unsigned(i);
+			initial_score = score;
+		}
+	}
+
+	// seed triangles to continue meshlet flow
+	unsigned int seeds[kMeshletMaxSeeds] = {};
+	size_t seed_count = 0;
 
 	meshopt_Meshlet meshlet = {};
 	size_t meshlet_offset = 0;
@@ -588,46 +1242,61 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve
 	{
 		Cone meshlet_cone = getMeshletCone(meshlet_cone_acc, meshlet.triangle_count);
 
-		unsigned int best_extra = 0;
-		unsigned int best_triangle = getNeighborTriangle(meshlet, &meshlet_cone, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, cone_weight, &best_extra);
+		unsigned int best_triangle = ~0u;
 
-		// if the best triangle doesn't fit into current meshlet, the spatial scoring we've used is not very meaningful, so we re-select using topological scoring
-		if (best_triangle != ~0u && (meshlet.vertex_count + best_extra > max_vertices || meshlet.triangle_count >= max_triangles))
-		{
-			best_triangle = getNeighborTriangle(meshlet, NULL, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, 0.f, NULL);
-		}
+		// for the first triangle, we don't have a meshlet cone yet, so we use the initial seed
+		// to continue the meshlet, we select an adjacent triangle based on connectivity and spatial scoring
+		if (meshlet_offset == 0 && meshlet.triangle_count == 0)
+			best_triangle = initial_seed;
+		else
+			best_triangle = getNeighborTriangle(meshlet, meshlet_cone, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, cone_weight);
 
-		// when we run out of neighboring triangles we need to switch to spatial search; we currently just pick the closest triangle irrespective of connectivity
+		bool split = false;
+
+		// when we run out of adjacent triangles we need to switch to spatial search; we currently just pick the closest triangle irrespective of connectivity
 		if (best_triangle == ~0u)
 		{
 			float position[3] = {meshlet_cone.px, meshlet_cone.py, meshlet_cone.pz};
 			unsigned int index = ~0u;
-			float limit = FLT_MAX;
+			float distance = FLT_MAX;
 
-			kdtreeNearest(nodes, 0, &triangles[0].px, sizeof(Cone) / sizeof(float), emitted_flags, position, index, limit);
+			kdtreeNearest(nodes, 0, &triangles[0].px, sizeof(Cone) / sizeof(float), emitted_flags, position, index, distance);
 
 			best_triangle = index;
+			split = meshlet.triangle_count >= min_triangles && split_factor > 0 && distance > meshlet_expected_radius * split_factor;
 		}
 
 		if (best_triangle == ~0u)
 			break;
 
+		int best_extra = (used[indices[best_triangle * 3 + 0]] < 0) + (used[indices[best_triangle * 3 + 1]] < 0) + (used[indices[best_triangle * 3 + 2]] < 0);
+
+		// if the best triangle doesn't fit into current meshlet, we re-select using seeds to maintain global flow
+		if (split || (meshlet.vertex_count + best_extra > max_vertices || meshlet.triangle_count >= max_triangles))
+		{
+			seed_count = pruneSeedTriangles(seeds, seed_count, emitted_flags);
+			seed_count = (seed_count + kMeshletAddSeeds <= kMeshletMaxSeeds) ? seed_count : kMeshletMaxSeeds - kMeshletAddSeeds;
+			seed_count += appendSeedTriangles(seeds + seed_count, meshlet, meshlet_vertices, indices, adjacency, triangles, live_triangles, cornerx, cornery, cornerz);
+
+			unsigned int best_seed = selectSeedTriangle(seeds, seed_count, indices, triangles, live_triangles, cornerx, cornery, cornerz);
+
+			// we may not find a valid seed triangle if the mesh is disconnected as seeds are based on adjacency
+			best_triangle = best_seed != ~0u ? best_seed : best_triangle;
+		}
+
 		unsigned int a = indices[best_triangle * 3 + 0], b = indices[best_triangle * 3 + 1], c = indices[best_triangle * 3 + 2];
 		assert(a < vertex_count && b < vertex_count && c < vertex_count);
 
 		// add meshlet to the output; when the current meshlet is full we reset the accumulated bounds
-		if (appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles))
+		if (appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles, split))
 		{
 			meshlet_offset++;
 			memset(&meshlet_cone_acc, 0, sizeof(meshlet_cone_acc));
 		}
 
-		live_triangles[a]--;
-		live_triangles[b]--;
-		live_triangles[c]--;
-
 		// remove emitted triangle from adjacency data
 		// this makes sure that we spend less time traversing these lists on subsequent iterations
+		// live triangle counts are updated as a byproduct of these adjustments
 		for (size_t k = 0; k < 3; ++k)
 		{
 			unsigned int index = indices[best_triangle * 3 + k];
@@ -656,20 +1325,23 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve
 		meshlet_cone_acc.ny += triangles[best_triangle].ny;
 		meshlet_cone_acc.nz += triangles[best_triangle].nz;
 
+		assert(!emitted_flags[best_triangle]);
 		emitted_flags[best_triangle] = 1;
 	}
 
 	if (meshlet.triangle_count)
-	{
-		finishMeshlet(meshlet, meshlet_triangles);
-
 		meshlets[meshlet_offset++] = meshlet;
-	}
 
-	assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles));
+	assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, min_triangles));
+	assert(meshlet.triangle_offset + meshlet.triangle_count * 3 <= index_count && meshlet.vertex_offset + meshlet.vertex_count <= index_count);
 	return meshlet_offset;
 }
 
+size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight)
+{ // thin wrapper over meshopt_buildMeshletsFlex: max_triangles is passed for both triangle-count arguments and 0.0f for the trailing float argument
+	return meshopt_buildMeshletsFlex(meshlets, meshlet_vertices, meshlet_triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, max_triangles, max_triangles, cone_weight, 0.0f);
+}
+
 size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)
 {
 	using namespace meshopt;
@@ -678,13 +1350,12 @@ size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshle
 
 	assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
 	assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
-	assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
 
 	meshopt_Allocator allocator;
 
-	// index of the vertex in the meshlet, 0xff if the vertex isn't used
-	unsigned char* used = allocator.allocate<unsigned char>(vertex_count);
-	memset(used, -1, vertex_count);
+	// index of the vertex in the meshlet, -1 if the vertex isn't used
+	short* used = allocator.allocate<short>(vertex_count);
+	clearUsed(used, vertex_count, indices, index_count);
 
 	meshopt_Meshlet meshlet = {};
 	size_t meshlet_offset = 0;
@@ -699,13 +1370,109 @@ size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshle
 	}
 
 	if (meshlet.triangle_count)
-	{
-		finishMeshlet(meshlet, meshlet_triangles);
-
 		meshlets[meshlet_offset++] = meshlet;
-	}
 
 	assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles));
+	assert(meshlet.triangle_offset + meshlet.triangle_count * 3 <= index_count && meshlet.vertex_offset + meshlet.vertex_count <= index_count);
+	return meshlet_offset;
+}
+
+size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight)
+{
+	using namespace meshopt;
+	// sorts triangles along three axes, lets bvhSplit mark meshlet boundaries, then appends triangles in the order left in the first axis array; returns the number of meshlets written
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
+	assert(min_triangles >= 1 && min_triangles <= max_triangles && max_triangles <= kMeshletMaxTriangles);
+
+	if (index_count == 0)
+		return 0;
+
+	size_t face_count = index_count / 3;
+	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
+
+	meshopt_Allocator allocator;
+
+	// 3 floats plus 1 uint for sorting, or
+	// 2 floats plus 1 uint for pivoting, or
+	// 1 uint plus 1 byte for partitioning
+	float* scratch = allocator.allocate<float>(face_count * 4);
+
+	// compute bounding boxes and centroids for sorting
+	BVHBox* boxes = allocator.allocate<BVHBox>(face_count + 1); // padding for SIMD
+	bvhPrepare(boxes, scratch, indices, face_count, vertex_positions, vertex_count, vertex_stride_float);
+	memset(boxes + face_count, 0, sizeof(BVHBox));
+
+	unsigned int* axes = allocator.allocate<unsigned int>(face_count * 3);
+	unsigned int* temp = reinterpret_cast<unsigned int*>(scratch) + face_count * 3; // fourth face_count-sized slice of scratch; the first three slices are read as the per-axis sort keys below
+
+	for (int k = 0; k < 3; ++k)
+	{
+		unsigned int* order = axes + k * face_count;
+		const float* keys = scratch + k * face_count; // presumably the centroid coordinate for axis k written by bvhPrepare - TODO confirm
+
+		unsigned int hist[1024][3];
+		computeHistogram(hist, keys, face_count);
+
+		// 3-pass radix sort computes the resulting order into axes
+		for (size_t i = 0; i < face_count; ++i)
+			temp[i] = unsigned(i);
+
+		radixPass(order, temp, keys, face_count, hist, 0);
+		radixPass(temp, order, keys, face_count, hist, 1);
+		radixPass(order, temp, keys, face_count, hist, 2);
+	}
+
+	// index of the vertex in the meshlet, -1 if the vertex isn't used
+	short* used = allocator.allocate<short>(vertex_count);
+	clearUsed(used, vertex_count, indices, index_count);
+
+	unsigned char* boundary = allocator.allocate<unsigned char>(face_count);
+
+	bvhSplit(boxes, &axes[0], &axes[face_count], &axes[face_count * 2], boundary, face_count, 0, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight);
+
+	// compute the desired number of meshlets; note that on some meshes with a lot of vertex bound clusters this might go over the bound
+	size_t meshlet_count = 0;
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		assert(boundary[i] <= 1);
+		meshlet_count += boundary[i];
+	}
+
+	size_t meshlet_bound = meshopt_buildMeshletsBound(index_count, max_vertices, min_triangles);
+
+	// pack triangles into meshlets according to the order and boundaries marked by bvhSplit
+	meshopt_Meshlet meshlet = {};
+	size_t meshlet_offset = 0;
+	size_t meshlet_pending = meshlet_count; // number of boundary marks among the triangles not processed yet
+
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		assert(boundary[i] <= 1);
+		bool split = i > 0 && boundary[i] == 1; // the very first triangle never requests a split
+
+		// while we are over the limit, we ignore boundary[] data and disable splits until we free up enough space
+		if (split && meshlet_count > meshlet_bound && meshlet_offset + meshlet_pending >= meshlet_bound)
+			split = false;
+
+		unsigned int index = axes[i];
+		assert(index < face_count);
+
+		unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2];
+
+		// appends triangle to the meshlet and writes previous meshlet to the output if full
+		meshlet_offset += appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles, split);
+		meshlet_pending -= boundary[i];
+	}
+
+	if (meshlet.triangle_count)
+		meshlets[meshlet_offset++] = meshlet; // write out the last, partially filled meshlet
+
+	assert(meshlet_offset <= meshlet_bound);
+	assert(meshlet.triangle_offset + meshlet.triangle_count * 3 <= index_count && meshlet.vertex_offset + meshlet.vertex_count <= index_count);
 	return meshlet_offset;
 }
 
@@ -765,15 +1532,17 @@ meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t
 	if (triangles == 0)
 		return bounds;
 
+	const float rzero = 0.f;
+
 	// compute cluster bounding sphere; we'll use the center to determine normal cone apex as well
 	float psphere[4] = {};
-	computeBoundingSphere(psphere, corners[0], triangles * 3);
+	computeBoundingSphere(psphere, corners[0][0], triangles * 3, sizeof(float) * 3, &rzero, 0, 7);
 
 	float center[3] = {psphere[0], psphere[1], psphere[2]};
 
 	// treating triangle normals as points, find the bounding sphere - the sphere center determines the optimal cone axis
 	float nsphere[4] = {};
-	computeBoundingSphere(nsphere, normals, triangles);
+	computeBoundingSphere(nsphere, normals[0], triangles, sizeof(float) * 3, &rzero, 0, 3);
 
 	float axis[3] = {nsphere[0], nsphere[1], nsphere[2]};
 	float axislength = sqrtf(axis[0] * axis[0] + axis[1] * axis[1] + axis[2] * axis[2]);
@@ -883,6 +1652,33 @@ meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices
 	return meshopt_computeClusterBounds(indices, triangle_count * 3, vertex_positions, vertex_count, vertex_positions_stride);
 }
 
+meshopt_Bounds meshopt_computeSphereBounds(const float* positions, size_t count, size_t positions_stride, const float* radii, size_t radii_stride)
+{
+	using namespace meshopt;
+	// runs computeBoundingSphere over count positions (with optional per-element radii) and returns bounds with only center and radius filled in
+	assert(positions_stride >= 12 && positions_stride <= 256);
+	assert(positions_stride % sizeof(float) == 0);
+	assert((radii_stride >= 4 && radii_stride <= 256) || radii == NULL);
+	assert(radii_stride % sizeof(float) == 0);
+
+	meshopt_Bounds bounds = {};
+
+	if (count == 0)
+		return bounds; // zero-initialized result for empty input
+
+	const float rzero = 0.f; // single zero radius, read with stride 0 when radii is NULL
+
+	float psphere[4] = {};
+	computeBoundingSphere(psphere, positions, count, positions_stride, radii ? radii : &rzero, radii ? radii_stride : 0, 7);
+
+	bounds.center[0] = psphere[0];
+	bounds.center[1] = psphere[1];
+	bounds.center[2] = psphere[2];
+	bounds.radius = psphere[3];
+
+	return bounds;
+}
+
 void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t triangle_count, size_t vertex_count)
 {
 	using namespace meshopt;
@@ -950,25 +1746,28 @@ void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* mesh
 	// reorder meshlet vertices for access locality assuming index buffer is scanned sequentially
 	unsigned int order[kMeshletMaxVertices];
 
-	unsigned char remap[kMeshletMaxVertices];
-	memset(remap, -1, vertex_count);
+	short remap[kMeshletMaxVertices];
+	memset(remap, -1, vertex_count * sizeof(short));
 
 	size_t vertex_offset = 0;
 
 	for (size_t i = 0; i < triangle_count * 3; ++i)
 	{
-		unsigned char& r = remap[indices[i]];
+		short& r = remap[indices[i]];
 
-		if (r == 0xff)
+		if (r < 0)
 		{
-			r = (unsigned char)(vertex_offset);
+			r = short(vertex_offset);
 			order[vertex_offset] = vertices[indices[i]];
 			vertex_offset++;
 		}
 
-		indices[i] = r;
+		indices[i] = (unsigned char)r;
 	}
 
 	assert(vertex_offset <= vertex_count);
 	memcpy(vertices, order, vertex_offset * sizeof(unsigned int));
 }
+
+#undef SIMD_SSE
+#undef SIMD_NEON
diff --git a/Source/ThirdParty/meshoptimizer/vcacheanalyzer.cpp b/Source/ThirdParty/meshoptimizer/indexanalyzer.cpp
similarity index 58%
rename from Source/ThirdParty/meshoptimizer/vcacheanalyzer.cpp
rename to Source/ThirdParty/meshoptimizer/indexanalyzer.cpp
index 368274382..87ceeae66 100644
--- a/Source/ThirdParty/meshoptimizer/vcacheanalyzer.cpp
+++ b/Source/ThirdParty/meshoptimizer/indexanalyzer.cpp
@@ -71,3 +71,56 @@ meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const unsigned int* ind
 
 	return result;
 }
+
+meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size)
+{
+	assert(index_count % 3 == 0);
+	assert(vertex_size > 0 && vertex_size <= 256);
+	// replays the index buffer against a small direct mapped cache model; fills result.bytes_fetched and result.overfetch
+	meshopt_Allocator allocator;
+
+	meshopt_VertexFetchStatistics result = {};
+
+	unsigned char* vertex_visited = allocator.allocate<unsigned char>(vertex_count);
+	memset(vertex_visited, 0, vertex_count);
+
+	const size_t kCacheLine = 64;
+	const size_t kCacheSize = 128 * 1024;
+
+	// simple direct mapped cache; on typical mesh data this is close to 4-way cache, and this model is a gross approximation anyway
+	size_t cache[kCacheSize / kCacheLine] = {};
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices[i];
+		assert(index < vertex_count);
+
+		vertex_visited[index] = 1;
+
+		size_t start_address = index * vertex_size;
+		size_t end_address = start_address + vertex_size;
+
+		size_t start_tag = start_address / kCacheLine;
+		size_t end_tag = (end_address + kCacheLine - 1) / kCacheLine; // rounded up so a partially covered last line is included
+
+		assert(start_tag < end_tag);
+
+		for (size_t tag = start_tag; tag < end_tag; ++tag)
+		{
+			size_t line = tag % (sizeof(cache) / sizeof(cache[0]));
+
+			// we store +1 since cache is filled with 0 by default
+			result.bytes_fetched += (cache[line] != tag + 1) * kCacheLine;
+			cache[line] = tag + 1;
+		}
+	}
+
+	size_t unique_vertex_count = 0;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+		unique_vertex_count += vertex_visited[i];
+
+	result.overfetch = unique_vertex_count == 0 ? 0 : float(result.bytes_fetched) / float(unique_vertex_count * vertex_size); // fetched bytes relative to the size of the referenced vertices; 0 when nothing is referenced
+
+	return result;
+}
diff --git a/Source/ThirdParty/meshoptimizer/indexcodec.cpp b/Source/ThirdParty/meshoptimizer/indexcodec.cpp
index b30046005..7a8fd6867 100644
--- a/Source/ThirdParty/meshoptimizer/indexcodec.cpp
+++ b/Source/ThirdParty/meshoptimizer/indexcodec.cpp
@@ -14,6 +14,7 @@ const unsigned char kIndexHeader = 0xe0;
 const unsigned char kSequenceHeader = 0xd0;
 
 static int gEncodeIndexVersion = 1;
+const int kDecodeIndexVersion = 1;
 
 typedef unsigned int VertexFifo[16];
 typedef unsigned int EdgeFifo[16][2];
@@ -209,6 +210,7 @@ size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, cons
 
 		if (fer >= 0 && (fer >> 2) < 15)
 		{
+			// note: getEdgeFifo implicitly rotates triangles by matching a/b to existing edge
 			const unsigned int* order = kTriangleIndexOrder[fer & 3];
 
 			unsigned int a = indices[i + order[0]], b = indices[i + order[1]], c = indices[i + order[2]];
@@ -266,6 +268,7 @@ size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, cons
 			int fc = getVertexFifo(vertexfifo, c, vertexfifooffset);
 
 			// after rotation, a is almost always equal to next, so we don't waste bits on FIFO encoding for a
+			// note: decoder implicitly assumes that if feb=fec=0, then fea=0 (reset code); this is enforced by rotation
 			int fea = (a == next) ? (next++, 0) : 15;
 			int feb = (fb >= 0 && fb < 14) ? fb + 1 : (b == next ? (next++, 0) : 15);
 			int fec = (fc >= 0 && fc < 14) ? fc + 1 : (c == next ? (next++, 0) : 15);
@@ -354,11 +357,28 @@ size_t meshopt_encodeIndexBufferBound(size_t index_count, size_t vertex_count)
 
 void meshopt_encodeIndexVersion(int version)
 {
-	assert(unsigned(version) <= 1);
+	assert(unsigned(version) <= unsigned(meshopt::kDecodeIndexVersion));
 
 	meshopt::gEncodeIndexVersion = version;
 }
 
+int meshopt_decodeIndexVersion(const unsigned char* buffer, size_t buffer_size)
+{
+	if (buffer_size < 1)
+		return -1; // no header byte to inspect
+	// the first byte holds the stream type in its high nibble and the version in its low nibble
+	unsigned char header = buffer[0];
+
+	if ((header & 0xf0) != meshopt::kIndexHeader && (header & 0xf0) != meshopt::kSequenceHeader)
+		return -1; // neither kIndexHeader nor kSequenceHeader
+
+	int version = header & 0x0f;
+	if (version > meshopt::kDecodeIndexVersion)
+		return -1; // version is above kDecodeIndexVersion
+
+	return version;
+}
+
 int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size)
 {
 	using namespace meshopt;
@@ -374,7 +394,7 @@ int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t inde
 		return -1;
 
 	int version = buffer[0] & 0x0f;
-	if (version > 1)
+	if (version > kDecodeIndexVersion)
 		return -1;
 
 	EdgeFifo edgefifo;
@@ -415,6 +435,7 @@ int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t inde
 			// fifo reads are wrapped around 16 entry buffer
 			unsigned int a = edgefifo[(edgefifooffset - 1 - fe) & 15][0];
 			unsigned int b = edgefifo[(edgefifooffset - 1 - fe) & 15][1];
+			unsigned int c = 0;
 
 			int fec = codetri & 15;
 
@@ -424,37 +445,30 @@ int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t inde
 			{
 				// fifo reads are wrapped around 16 entry buffer
 				unsigned int cf = vertexfifo[(vertexfifooffset - 1 - fec) & 15];
-				unsigned int c = (fec == 0) ? next : cf;
+				c = (fec == 0) ? next : cf;
 
 				int fec0 = fec == 0;
 				next += fec0;
 
-				// output triangle
-				writeTriangle(destination, i, index_size, a, b, c);
-
-				// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
+				// push vertex fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
 				pushVertexFifo(vertexfifo, c, vertexfifooffset, fec0);
-
-				pushEdgeFifo(edgefifo, c, b, edgefifooffset);
-				pushEdgeFifo(edgefifo, a, c, edgefifooffset);
 			}
 			else
 			{
-				unsigned int c = 0;
-
 				// fec - (fec ^ 3) decodes 13, 14 into -1, 1
 				// note that we need to update the last index since free indices are delta-encoded
 				last = c = (fec != 15) ? last + (fec - (fec ^ 3)) : decodeIndex(data, last);
 
-				// output triangle
-				writeTriangle(destination, i, index_size, a, b, c);
-
 				// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
 				pushVertexFifo(vertexfifo, c, vertexfifooffset);
-
-				pushEdgeFifo(edgefifo, c, b, edgefifooffset);
-				pushEdgeFifo(edgefifo, a, c, edgefifooffset);
 			}
+
+			// push edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
+			pushEdgeFifo(edgefifo, c, b, edgefifooffset);
+			pushEdgeFifo(edgefifo, a, c, edgefifooffset);
+
+			// output triangle
+			writeTriangle(destination, i, index_size, a, b, c);
 		}
 		else
 		{
@@ -627,7 +641,7 @@ int meshopt_decodeIndexSequence(void* destination, size_t index_count, size_t in
 		return -1;
 
 	int version = buffer[0] & 0x0f;
-	if (version > 1)
+	if (version > kDecodeIndexVersion)
 		return -1;
 
 	const unsigned char* data = buffer + 1;
diff --git a/Source/ThirdParty/meshoptimizer/indexgenerator.cpp b/Source/ThirdParty/meshoptimizer/indexgenerator.cpp
index f6728345a..4bf9fccad 100644
--- a/Source/ThirdParty/meshoptimizer/indexgenerator.cpp
+++ b/Source/ThirdParty/meshoptimizer/indexgenerator.cpp
@@ -5,7 +5,9 @@
 #include <string.h>
 
 // This work is based on:
+// Matthias Teschner, Bruno Heidelberger, Matthias Mueller, Danat Pomeranets, Markus Gross. Optimized Spatial Hashing for Collision Detection of Deformable Objects. 2003
 // John McDonald, Mark Kilgard. Crack-Free Point-Normal Triangles using Adjacent Edge Normals. 2010
+// John Hable. Variable Rate Shading with Visibility Buffer Rendering. 2024
 namespace meshopt
 {
 
@@ -85,6 +87,46 @@ struct VertexStreamHasher
 	}
 };
 
+struct VertexCustomHasher // hashes vertices by position and compares them by position plus an optional user callback
+{
+	const float* vertex_positions;
+	size_t vertex_stride_float;
+	// optional extra equality predicate (may be NULL); context is passed through as its first argument
+	int (*callback)(void*, unsigned int, unsigned int);
+	void* context;
+
+	size_t hash(unsigned int index) const
+	{
+		const unsigned int* key = reinterpret_cast<const unsigned int*>(vertex_positions + index * vertex_stride_float); // read the three position floats as raw 32-bit patterns
+
+		unsigned int x = key[0], y = key[1], z = key[2];
+
+		// replace negative zero with zero
+		x = (x == 0x80000000) ? 0 : x;
+		y = (y == 0x80000000) ? 0 : y;
+		z = (z == 0x80000000) ? 0 : z;
+
+		// scramble bits to make sure that integer coordinates have entropy in lower bits
+		x ^= x >> 17;
+		y ^= y >> 17;
+		z ^= z >> 17;
+
+		// Optimized Spatial Hashing for Collision Detection of Deformable Objects
+		return (x * 73856093) ^ (y * 19349663) ^ (z * 83492791);
+	}
+
+	bool equal(unsigned int lhs, unsigned int rhs) const
+	{
+		const float* lp = vertex_positions + lhs * vertex_stride_float;
+		const float* rp = vertex_positions + rhs * vertex_stride_float;
+
+		if (lp[0] != rp[0] || lp[1] != rp[1] || lp[2] != rp[2])
+			return false; // positions differ, the callback is not consulted
+
+		return callback ? callback(context, lhs, rhs) : true; // positions match; without a callback that is sufficient
+	}
+};
+
 struct EdgeHasher
 {
 	const unsigned int* remap;
@@ -182,6 +224,43 @@ static void buildPositionRemap(unsigned int* remap, const float* vertex_position
 	allocator.deallocate(vertex_table);
 }
 
+template <typename Hash>
+static size_t generateVertexRemap(unsigned int* remap, const unsigned int* indices, size_t index_count, size_t vertex_count, const Hash& hash, meshopt_Allocator& allocator)
+{
+	memset(remap, -1, vertex_count * sizeof(unsigned int)); // ~0u marks vertices that have not been assigned a new index yet
+
+	size_t table_size = hashBuckets(vertex_count);
+	unsigned int* table = allocator.allocate<unsigned int>(table_size);
+	memset(table, -1, table_size * sizeof(unsigned int)); // ~0u marks empty hash table slots
+
+	unsigned int next_vertex = 0;
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices ? indices[i] : unsigned(i); // with NULL indices, element i refers to vertex i
+		assert(index < vertex_count);
+
+		if (remap[index] != ~0u)
+			continue; // this vertex already has a new index
+
+		unsigned int* entry = hashLookup(table, table_size, hash, index, ~0u);
+
+		if (*entry == ~0u)
+		{
+			*entry = index; // first vertex stored in this slot; it receives the next sequential index
+			remap[index] = next_vertex++;
+		}
+		else
+		{
+			assert(remap[*entry] != ~0u);
+			remap[index] = remap[*entry]; // slot already holds an earlier vertex; share its new index
+		}
+	}
+
+	assert(next_vertex <= vertex_count);
+	return next_vertex; // number of distinct new indices handed out
+}
+
 template <size_t BlockSize>
 static void remapVertices(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap)
 {
@@ -196,6 +275,35 @@ static void remapVertices(void* destination, const void* vertices, size_t vertex
 		}
 }
 
+template <typename Hash>
+static void generateShadowBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const Hash& hash, meshopt_Allocator& allocator)
+{
+	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
+	memset(remap, -1, vertex_count * sizeof(unsigned int)); // ~0u marks vertices whose representative has not been looked up yet
+
+	size_t table_size = hashBuckets(vertex_count);
+	unsigned int* table = allocator.allocate<unsigned int>(table_size);
+	memset(table, -1, table_size * sizeof(unsigned int)); // ~0u marks empty hash table slots
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices[i];
+		assert(index < vertex_count);
+
+		if (remap[index] == ~0u)
+		{
+			unsigned int* entry = hashLookup(table, table_size, hash, index, ~0u);
+
+			if (*entry == ~0u)
+				*entry = index; // first vertex stored in this slot becomes the representative
+
+			remap[index] = *entry; // cache the representative (an original vertex index, not a compacted one)
+		}
+
+		destination[i] = remap[index];
+	}
+}
+
 } // namespace meshopt
 
 size_t meshopt_generateVertexRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size)
@@ -207,44 +315,9 @@ size_t meshopt_generateVertexRemap(unsigned int* destination, const unsigned int
 	assert(vertex_size > 0 && vertex_size <= 256);
 
 	meshopt_Allocator allocator;
-
-	memset(destination, -1, vertex_count * sizeof(unsigned int));
-
 	VertexHasher hasher = {static_cast<const unsigned char*>(vertices), vertex_size, vertex_size};
 
-	size_t table_size = hashBuckets(vertex_count);
-	unsigned int* table = allocator.allocate<unsigned int>(table_size);
-	memset(table, -1, table_size * sizeof(unsigned int));
-
-	unsigned int next_vertex = 0;
-
-	for (size_t i = 0; i < index_count; ++i)
-	{
-		unsigned int index = indices ? indices[i] : unsigned(i);
-		assert(index < vertex_count);
-
-		if (destination[index] == ~0u)
-		{
-			unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
-
-			if (*entry == ~0u)
-			{
-				*entry = index;
-
-				destination[index] = next_vertex++;
-			}
-			else
-			{
-				assert(destination[*entry] != ~0u);
-
-				destination[index] = destination[*entry];
-			}
-		}
-	}
-
-	assert(next_vertex <= vertex_count);
-
-	return next_vertex;
+	return generateVertexRemap(destination, indices, index_count, vertex_count, hasher, allocator);
 }
 
 size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count)
@@ -262,44 +335,24 @@ size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigne
 	}
 
 	meshopt_Allocator allocator;
-
-	memset(destination, -1, vertex_count * sizeof(unsigned int));
-
 	VertexStreamHasher hasher = {streams, stream_count};
 
-	size_t table_size = hashBuckets(vertex_count);
-	unsigned int* table = allocator.allocate<unsigned int>(table_size);
-	memset(table, -1, table_size * sizeof(unsigned int));
+	return generateVertexRemap(destination, indices, index_count, vertex_count, hasher, allocator);
+}
 
-	unsigned int next_vertex = 0;
+size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, int (*callback)(void*, unsigned int, unsigned int), void* context)
+{
+	using namespace meshopt;
 
-	for (size_t i = 0; i < index_count; ++i)
-	{
-		unsigned int index = indices ? indices[i] : unsigned(i);
-		assert(index < vertex_count);
+	assert(indices || index_count == vertex_count);
+	assert(!indices || index_count % 3 == 0);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
 
-		if (destination[index] == ~0u)
-		{
-			unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
+	meshopt_Allocator allocator;
+	VertexCustomHasher hasher = {vertex_positions, vertex_positions_stride / sizeof(float), callback, context};
 
-			if (*entry == ~0u)
-			{
-				*entry = index;
-
-				destination[index] = next_vertex++;
-			}
-			else
-			{
-				assert(destination[*entry] != ~0u);
-
-				destination[index] = destination[*entry];
-			}
-		}
-	}
-
-	assert(next_vertex <= vertex_count);
-
-	return next_vertex;
+	return generateVertexRemap(destination, indices, index_count, vertex_count, hasher, allocator);
 }
 
 void meshopt_remapVertexBuffer(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap)
@@ -361,33 +414,9 @@ void meshopt_generateShadowIndexBuffer(unsigned int* destination, const unsigned
 	assert(vertex_size <= vertex_stride);
 
 	meshopt_Allocator allocator;
-
-	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
-	memset(remap, -1, vertex_count * sizeof(unsigned int));
-
 	VertexHasher hasher = {static_cast<const unsigned char*>(vertices), vertex_size, vertex_stride};
 
-	size_t table_size = hashBuckets(vertex_count);
-	unsigned int* table = allocator.allocate<unsigned int>(table_size);
-	memset(table, -1, table_size * sizeof(unsigned int));
-
-	for (size_t i = 0; i < index_count; ++i)
-	{
-		unsigned int index = indices[i];
-		assert(index < vertex_count);
-
-		if (remap[index] == ~0u)
-		{
-			unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
-
-			if (*entry == ~0u)
-				*entry = index;
-
-			remap[index] = *entry;
-		}
-
-		destination[i] = remap[index];
-	}
+	generateShadowBuffer(destination, indices, index_count, vertex_count, hasher, allocator);
 }
 
 void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count)
@@ -405,32 +434,33 @@ void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const uns
 	}
 
 	meshopt_Allocator allocator;
-
-	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
-	memset(remap, -1, vertex_count * sizeof(unsigned int));
-
 	VertexStreamHasher hasher = {streams, stream_count};
 
+	generateShadowBuffer(destination, indices, index_count, vertex_count, hasher, allocator);
+}
+
+void meshopt_generatePositionRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	using namespace meshopt;
+
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	meshopt_Allocator allocator;
+	VertexCustomHasher hasher = {vertex_positions, vertex_positions_stride / sizeof(float), NULL, NULL};
+
 	size_t table_size = hashBuckets(vertex_count);
 	unsigned int* table = allocator.allocate<unsigned int>(table_size);
 	memset(table, -1, table_size * sizeof(unsigned int));
 
-	for (size_t i = 0; i < index_count; ++i)
+	for (size_t i = 0; i < vertex_count; ++i)
 	{
-		unsigned int index = indices[i];
-		assert(index < vertex_count);
+		unsigned int* entry = hashLookup(table, table_size, hasher, unsigned(i), ~0u);
 
-		if (remap[index] == ~0u)
-		{
-			unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
+		if (*entry == ~0u)
+			*entry = unsigned(i);
 
-			if (*entry == ~0u)
-				*entry = index;
-
-			remap[index] = *entry;
-		}
-
-		destination[i] = remap[index];
+		destination[i] = *entry;
 	}
 }
 
@@ -576,3 +606,99 @@ void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const un
 		memcpy(destination + i * 4, patch, sizeof(patch));
 	}
 }
+
+size_t meshopt_generateProvokingIndexBuffer(unsigned int* destination, unsigned int* reorder, const unsigned int* indices, size_t index_count, size_t vertex_count)
+{
+	assert(index_count % 3 == 0);
+	// writes an index buffer where the first index of every triangle is a fresh output vertex; reorder[new] holds the original vertex index and the output vertex count is returned
+	meshopt_Allocator allocator;
+
+	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
+	memset(remap, -1, vertex_count * sizeof(unsigned int));
+
+	// compute vertex valence; this is used to prioritize least used corner
+	// note: we use 8-bit counters for performance; for outlier vertices the valence is incorrect but that just affects the heuristic
+	unsigned char* valence = allocator.allocate<unsigned char>(vertex_count);
+	memset(valence, 0, vertex_count);
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices[i];
+		assert(index < vertex_count);
+
+		valence[index]++;
+	}
+
+	unsigned int reorder_offset = 0;
+
+	// assign provoking vertices; leave the rest for the next pass
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
+		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+		// try to rotate triangle such that provoking vertex hasn't been seen before
+		// if multiple vertices are new, prioritize the one with least valence
+		// this reduces the risk that a future triangle will have all three vertices seen
+		unsigned int va = remap[a] == ~0u ? valence[a] : ~0u;
+		unsigned int vb = remap[b] == ~0u ? valence[b] : ~0u;
+		unsigned int vc = remap[c] == ~0u ? valence[c] : ~0u;
+
+		if (vb != ~0u && vb <= va && vb <= vc)
+		{
+			// abc -> bca
+			unsigned int t = a;
+			a = b, b = c, c = t;
+		}
+		else if (vc != ~0u && vc <= va && vc <= vb)
+		{
+			// abc -> cab
+			unsigned int t = c;
+			c = b, b = a, a = t;
+		}
+
+		unsigned int newidx = reorder_offset;
+
+		// now remap[a] = ~0u or all three vertices are old
+		// recording remap[a] makes it possible to remap future references to the same index, conserving space
+		if (remap[a] == ~0u)
+			remap[a] = newidx;
+
+		// we need to clone the provoking vertex to get a unique index
+		// if all three are used the choice is arbitrary since no future triangle will be able to reuse any of these
+		reorder[reorder_offset++] = a;
+
+		// note: first vertex is final, the other two will be fixed up in next pass
+		destination[i + 0] = newidx;
+		destination[i + 1] = b;
+		destination[i + 2] = c;
+
+		// update vertex valences for corner heuristic
+		valence[a]--;
+		valence[b]--;
+		valence[c]--;
+	}
+
+	// remap or clone non-provoking vertices (iterating to skip provoking vertices)
+	int step = 1; // toggles between 1 and 2 via step ^= 3, so i visits 1, 2, 4, 5, 7, 8, ...
+
+	for (size_t i = 1; i < index_count; i += step, step ^= 3)
+	{
+		unsigned int index = destination[i];
+
+		if (remap[index] == ~0u)
+		{
+			// we haven't seen the vertex before as a provoking vertex
+			// to maintain the reference to the original vertex we need to clone it
+			unsigned int newidx = reorder_offset;
+
+			remap[index] = newidx;
+			reorder[reorder_offset++] = index;
+		}
+
+		destination[i] = remap[index];
+	}
+
+	assert(reorder_offset <= vertex_count + index_count / 3); // one output vertex per triangle from the first pass, plus at most one per original vertex from the second
+	return reorder_offset;
+}
diff --git a/Source/ThirdParty/meshoptimizer/meshoptimizer.h b/Source/ThirdParty/meshoptimizer/meshoptimizer.h
index 6c8dcd7e8..c9239bc30 100644
--- a/Source/ThirdParty/meshoptimizer/meshoptimizer.h
+++ b/Source/ThirdParty/meshoptimizer/meshoptimizer.h
@@ -1,7 +1,7 @@
 /**
- * meshoptimizer - version 0.21
+ * meshoptimizer - version 1.0
  *
- * Copyright (C) 2016-2024, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+ * Copyright (C) 2016-2025, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
  * Report bugs and download new versions at https://github.com/zeux/meshoptimizer
  *
  * This library is distributed under the MIT License. See notice at the end of this file.
@@ -12,7 +12,7 @@
 #include <stddef.h>
 
 /* Version macro; major * 1000 + minor * 10 + patch */
-#define MESHOPTIMIZER_VERSION 210 /* 0.21 */
+#define MESHOPTIMIZER_VERSION 1000 /* 1.0 */
 
 /* If no API is defined, assume default */
 #ifndef MESHOPTIMIZER_API
@@ -29,11 +29,14 @@
 #endif
 
 /* Experimental APIs have unstable interface and might have implementation that's not fully tested or optimized */
+#ifndef MESHOPTIMIZER_EXPERIMENTAL
 #define MESHOPTIMIZER_EXPERIMENTAL MESHOPTIMIZER_API
+#endif
 
 /* C interface */
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
 /**
@@ -71,6 +74,19 @@ MESHOPTIMIZER_API size_t meshopt_generateVertexRemap(unsigned int* destination,
  */
 MESHOPTIMIZER_API size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count);
 
+/**
+ * Generates a vertex remap table from the vertex buffer and an optional index buffer and returns number of unique vertices
+ * As a result, all vertices that are equivalent map to the same (new) location, with no gaps in the resulting sequence.
+ * Equivalence is checked in two steps: vertex positions are compared for equality, and then the user-specified equality function is called (if provided).
+ * Resulting remap table maps old vertices to new vertices and can be used in meshopt_remapVertexBuffer/meshopt_remapIndexBuffer.
+ *
+ * destination must contain enough space for the resulting remap table (vertex_count elements)
+ * indices can be NULL if the input is unindexed
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * callback can be NULL if no additional equality check is needed; otherwise, it should return 1 if vertices with specified indices are equivalent and 0 if they are not
+ */
+MESHOPTIMIZER_API size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, int (*callback)(void*, unsigned int, unsigned int), void* context);
+
 /**
  * Generates vertex buffer from the source vertex buffer and remap table generated by meshopt_generateVertexRemap
  *
@@ -108,6 +124,16 @@ MESHOPTIMIZER_API void meshopt_generateShadowIndexBuffer(unsigned int* destinati
  */
 MESHOPTIMIZER_API void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count);
 
+/**
+ * Generates a remap table that maps all vertices with the same position to the same (existing) index.
+ * Similarly to meshopt_generateShadowIndexBuffer, this can be helpful to pre-process meshes for position-only rendering.
+ * This can also be used to implement algorithms that require positional-only connectivity, such as hierarchical simplification.
+ *
+ * destination must contain enough space for the resulting remap table (vertex_count elements)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ */
+MESHOPTIMIZER_API void meshopt_generatePositionRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+
 /**
  * Generate index buffer that can be used as a geometry shader input with triangle adjacency topology
  * Each triangle is converted into a 6-vertex patch with the following layout:
@@ -137,10 +163,23 @@ MESHOPTIMIZER_API void meshopt_generateAdjacencyIndexBuffer(unsigned int* destin
  */
 MESHOPTIMIZER_API void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 
+/**
+ * Generates an index buffer that can be used for visibility buffer rendering and returns the size of the reorder table
+ * Each triangle's provoking vertex index is equal to primitive id; this allows passing it to the fragment shader using flat/nointerpolation attribute.
+ * This is important for performance on hardware where primitive id can't be accessed efficiently in fragment shader.
+ * The reorder table stores the original vertex id for each vertex in the new index buffer, and should be used in the vertex shader to load vertex data.
+ * The provoking vertex is assumed to be the first vertex in the triangle; if this is not the case (OpenGL), rotate each triangle (abc -> bca) before rendering.
+ * For maximum efficiency the input index buffer should be optimized for vertex cache first.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ * reorder must contain enough space for the worst case reorder table (vertex_count + index_count/3 elements)
+ */
+MESHOPTIMIZER_API size_t meshopt_generateProvokingIndexBuffer(unsigned int* destination, unsigned int* reorder, const unsigned int* indices, size_t index_count, size_t vertex_count);
+
 /**
  * Vertex transform cache optimizer
  * Reorders indices to reduce the number of GPU vertex shader invocations
- * If index buffer contains multiple ranges for multiple draw calls, this functions needs to be called on each range individually.
+ * If index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually.
  *
  * destination must contain enough space for the resulting index buffer (index_count elements)
  */
@@ -159,7 +198,7 @@ MESHOPTIMIZER_API void meshopt_optimizeVertexCacheStrip(unsigned int* destinatio
  * Vertex transform cache optimizer for FIFO caches
  * Reorders indices to reduce the number of GPU vertex shader invocations
  * Generally takes ~3x less time to optimize meshes but produces inferior results compared to meshopt_optimizeVertexCache
- * If index buffer contains multiple ranges for multiple draw calls, this functions needs to be called on each range individually.
+ * If index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually.
  *
  * destination must contain enough space for the resulting index buffer (index_count elements)
  * cache_size should be less than the actual GPU cache size to avoid cache thrashing
@@ -169,7 +208,7 @@ MESHOPTIMIZER_API void meshopt_optimizeVertexCacheFifo(unsigned int* destination
 /**
  * Overdraw optimizer
  * Reorders indices to reduce the number of GPU vertex shader invocations and the pixel overdraw
- * If index buffer contains multiple ranges for multiple draw calls, this functions needs to be called on each range individually.
+ * If index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually.
  *
  * destination must contain enough space for the resulting index buffer (index_count elements)
  * indices must contain index data that is the result of meshopt_optimizeVertexCache (*not* the original mesh indices!)
@@ -182,7 +221,7 @@ MESHOPTIMIZER_API void meshopt_optimizeOverdraw(unsigned int* destination, const
  * Vertex fetch cache optimizer
  * Reorders vertices and changes indices to reduce the amount of GPU memory fetches during vertex processing
  * Returns the number of unique vertices, which is the same as input vertex count unless some vertices are unused
- * This functions works for a single vertex stream; for multiple vertex streams, use meshopt_optimizeVertexFetchRemap + meshopt_remapVertexBuffer for each stream.
+ * This function works for a single vertex stream; for multiple vertex streams, use meshopt_optimizeVertexFetchRemap + meshopt_remapVertexBuffer for each stream.
  *
  * destination must contain enough space for the resulting vertex buffer (vertex_count elements)
  * indices is used both as an input and as an output index buffer
@@ -212,7 +251,8 @@ MESHOPTIMIZER_API size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t
 MESHOPTIMIZER_API size_t meshopt_encodeIndexBufferBound(size_t index_count, size_t vertex_count);
 
 /**
- * Set index encoder format version
+ * Set index encoder format version (defaults to 1)
+ *
  * version must specify the data format version to encode; valid values are 0 (decodable by all library versions) and 1 (decodable by 0.14+)
  */
 MESHOPTIMIZER_API void meshopt_encodeIndexVersion(int version);
@@ -227,6 +267,13 @@ MESHOPTIMIZER_API void meshopt_encodeIndexVersion(int version);
  */
 MESHOPTIMIZER_API int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size);
 
+/**
+ * Get encoded index format version
+ * Returns format version of the encoded index buffer/sequence, or -1 if the buffer header is invalid
+ * Note that a non-negative value doesn't guarantee that the buffer will be decoded correctly if the input is malformed.
+ */
+MESHOPTIMIZER_API int meshopt_decodeIndexVersion(const unsigned char* buffer, size_t buffer_size);
+
 /**
  * Index sequence encoder
  * Encodes index sequence into an array of bytes that is generally smaller and compresses better compared to original.
@@ -254,15 +301,31 @@ MESHOPTIMIZER_API int meshopt_decodeIndexSequence(void* destination, size_t inde
  * Returns encoded data size on success, 0 on error; the only error condition is if buffer doesn't have enough space
  * This function works for a single vertex stream; for multiple vertex streams, call meshopt_encodeVertexBuffer for each stream.
  * Note that all vertex_size bytes of each vertex are encoded verbatim, including padding which should be zero-initialized.
+ * For maximum efficiency the vertex buffer being encoded has to be quantized and optimized for locality of reference (cache/fetch) first.
  *
  * buffer must contain enough space for the encoded vertex buffer (use meshopt_encodeVertexBufferBound to compute worst case size)
+ * vertex_size must be a multiple of 4 (and <= 256)
  */
 MESHOPTIMIZER_API size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size);
 MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size);
 
 /**
- * Set vertex encoder format version
- * version must specify the data format version to encode; valid values are 0 (decodable by all library versions)
+ * Vertex buffer encoder
+ * Encodes vertex data just like meshopt_encodeVertexBuffer, but allows overriding the compression level.
+ * For compression level to take effect, the vertex encoding version must be set to 1.
+ * The default compression level implied by meshopt_encodeVertexBuffer is 2.
+ *
+ * buffer must contain enough space for the encoded vertex buffer (use meshopt_encodeVertexBufferBound to compute worst case size)
+ * vertex_size must be a multiple of 4 (and <= 256)
+ * level should be in the range [0, 3] with 0 being the fastest and 3 being the slowest and producing the best compression ratio.
+ * version should be -1 to use the default version (specified via meshopt_encodeVertexVersion), or 0/1 to override the version; per above, level won't take effect if version is 0.
+ */
+MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level, int version);
+
+/**
+ * Set vertex encoder format version (defaults to 1)
+ *
+ * version must specify the data format version to encode; valid values are 0 (decodable by all library versions) and 1 (decodable by 0.23+)
  */
 MESHOPTIMIZER_API void meshopt_encodeVertexVersion(int version);
 
@@ -273,32 +336,44 @@ MESHOPTIMIZER_API void meshopt_encodeVertexVersion(int version);
  * The decoder is safe to use for untrusted input, but it may produce garbage data.
  *
  * destination must contain enough space for the resulting vertex buffer (vertex_count * vertex_size bytes)
+ * vertex_size must be a multiple of 4 (and <= 256)
  */
 MESHOPTIMIZER_API int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size);
 
+/**
+ * Get encoded vertex format version
+ * Returns format version of the encoded vertex buffer, or -1 if the buffer header is invalid
+ * Note that a non-negative value doesn't guarantee that the buffer will be decoded correctly if the input is malformed.
+ */
+MESHOPTIMIZER_API int meshopt_decodeVertexVersion(const unsigned char* buffer, size_t buffer_size);
+
 /**
  * Vertex buffer filters
  * These functions can be used to filter output of meshopt_decodeVertexBuffer in-place.
  *
- * meshopt_decodeFilterOct decodes octahedral encoding of a unit vector with K-bit (K <= 16) signed X/Y as an input; Z must store 1.0f.
+ * meshopt_decodeFilterOct decodes octahedral encoding of a unit vector with K-bit signed X/Y as an input; Z must store 1.0f.
  * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8. W is preserved as is.
  *
- * meshopt_decodeFilterQuat decodes 3-component quaternion encoding with K-bit (4 <= K <= 16) component encoding and a 2-bit component index indicating which component to reconstruct.
+ * meshopt_decodeFilterQuat decodes 3-component quaternion encoding with K-bit component encoding and a 2-bit component index indicating which component to reconstruct.
  * Each component is stored as an 16-bit integer; stride must be equal to 8.
  *
  * meshopt_decodeFilterExp decodes exponential encoding of floating-point data with 8-bit exponent and 24-bit integer mantissa as 2^E*M.
  * Each 32-bit component is decoded in isolation; stride must be divisible by 4.
+ *
+ * meshopt_decodeFilterColor decodes RGBA colors from YCoCg (+A) color encoding where RGB is converted to YCoCg space with K-bit component encoding, and A is stored using K-1 bits.
+ * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8.
  */
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride);
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterQuat(void* buffer, size_t count, size_t stride);
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride);
+MESHOPTIMIZER_API void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride);
+MESHOPTIMIZER_API void meshopt_decodeFilterQuat(void* buffer, size_t count, size_t stride);
+MESHOPTIMIZER_API void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride);
+MESHOPTIMIZER_API void meshopt_decodeFilterColor(void* buffer, size_t count, size_t stride);
 
 /**
  * Vertex buffer filter encoders
  * These functions can be used to encode data in a format that meshopt_decodeFilter can decode
  *
- * meshopt_encodeFilterOct encodes unit vectors with K-bit (K <= 16) signed X/Y as an output.
- * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8. W is preserved as is.
+ * meshopt_encodeFilterOct encodes unit vectors with K-bit (2 <= K <= 16) signed X/Y as an output.
+ * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8. Z will store 1.0f, W is preserved as is.
  * Input data must contain 4 floats for every vector (count*4 total).
  *
  * meshopt_encodeFilterQuat encodes unit quaternions with K-bit (4 <= K <= 16) component encoding.
@@ -308,6 +383,10 @@ MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterExp(void* buffer, size_t cou
  * meshopt_encodeFilterExp encodes arbitrary (finite) floating-point data with 8-bit exponent and K-bit integer mantissa (1 <= K <= 24).
  * Exponent can be shared between all components of a given vector as defined by stride or all values of a given component; stride must be divisible by 4.
  * Input data must contain stride/4 floats for every vector (count*stride/4 total).
+ *
+ * meshopt_encodeFilterColor encodes RGBA color data by converting RGB to YCoCg color space with K-bit (2 <= K <= 16) component encoding; A is stored using K-1 bits.
+ * Each component is stored as an 8-bit or 16-bit integer; stride must be equal to 4 or 8.
+ * Input data must contain 4 floats for every color (count*4 total).
  */
 enum meshopt_EncodeExpMode
 {
@@ -317,11 +396,14 @@ enum meshopt_EncodeExpMode
 	meshopt_EncodeExpSharedVector,
 	/* When encoding exponents, use shared value for each component of all vectors (best compression) */
 	meshopt_EncodeExpSharedComponent,
+	/* When encoding exponents, use separate values for each component, but clamp to 0 (good quality if very small values are not important) */
+	meshopt_EncodeExpClamped,
 };
 
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data);
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterQuat(void* destination, size_t count, size_t stride, int bits, const float* data);
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterExp(void* destination, size_t count, size_t stride, int bits, const float* data, enum meshopt_EncodeExpMode mode);
+MESHOPTIMIZER_API void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data);
+MESHOPTIMIZER_API void meshopt_encodeFilterQuat(void* destination, size_t count, size_t stride, int bits, const float* data);
+MESHOPTIMIZER_API void meshopt_encodeFilterExp(void* destination, size_t count, size_t stride, int bits, const float* data, enum meshopt_EncodeExpMode mode);
+MESHOPTIMIZER_API void meshopt_encodeFilterColor(void* destination, size_t count, size_t stride, int bits, const float* data);
 
 /**
  * Simplification options
@@ -334,16 +416,34 @@ enum
 	meshopt_SimplifySparse = 1 << 1,
 	/* Treat error limit and resulting error as absolute instead of relative to mesh extents. */
 	meshopt_SimplifyErrorAbsolute = 1 << 2,
+	/* Remove disconnected parts of the mesh during simplification incrementally, regardless of the topological restrictions inside components. */
+	meshopt_SimplifyPrune = 1 << 3,
+	/* Produce more regular triangle sizes and shapes during simplification, at some cost to geometric and attribute quality. */
+	meshopt_SimplifyRegularize = 1 << 4,
+	/* Experimental: Allow collapses across attribute discontinuities, except for vertices that are tagged with meshopt_SimplifyVertex_Protect in vertex_lock. */
+	meshopt_SimplifyPermissive = 1 << 5,
+};
+
+/**
+ * Experimental: Simplification vertex flags/locks, for use in `vertex_lock` arrays in simplification APIs
+ */
+enum
+{
+	/* Do not move this vertex. */
+	meshopt_SimplifyVertex_Lock = 1 << 0,
+	/* Protect attribute discontinuity at this vertex; must be used together with meshopt_SimplifyPermissive option. */
+	meshopt_SimplifyVertex_Protect = 1 << 1,
 };
 
 /**
  * Mesh simplifier
  * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible
  * The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error.
- * If not all attributes from the input mesh are required, it's recommended to reindex the mesh using meshopt_generateShadowIndexBuffer prior to simplification.
+ * If not all attributes from the input mesh are needed, it's recommended to reindex the mesh without them prior to simplification.
  * Returns the number of indices after simplification, with destination containing new index data
+ *
  * The resulting index buffer references vertices from the original vertex buffer.
- * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
  *
  * destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)!
  * vertex_positions should have float3 position in the first 12 bytes of each vertex
@@ -354,45 +454,94 @@ enum
 MESHOPTIMIZER_API size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* result_error);
 
 /**
- * Experimental: Mesh simplifier with attribute metric
- * The algorithm ehnahces meshopt_simplify by incorporating attribute values into the error metric used to prioritize simplification order; see meshopt_simplify documentation for details.
- * Note that the number of attributes affects memory requirements and running time; this algorithm requires ~1.5x more memory and time compared to meshopt_simplify when using 4 scalar attributes.
+ * Mesh simplifier with attribute metric
+ * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible.
+ * Similar to meshopt_simplify, but incorporates attribute values into the error metric used to prioritize simplification order.
+ * The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error.
+ * If not all attributes from the input mesh are needed, it's recommended to reindex the mesh without them prior to simplification.
+ * Returns the number of indices after simplification, with destination containing new index data
  *
+ * The resulting index buffer references vertices from the original vertex buffer.
+ * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ * Note that the number of attributes with non-zero weights affects memory requirements and running time.
+ *
+ * destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)!
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
  * vertex_attributes should have attribute_count floats for each vertex
- * attribute_weights should have attribute_count floats in total; the weights determine relative priority of attributes between each other and wrt position. The recommended weight range is [1e-3..1e-1], assuming attribute data is in [0..1] range.
- * attribute_count must be <= 16
+ * attribute_weights should have attribute_count floats in total; the weights determine relative priority of attributes between each other and wrt position
+ * attribute_count must be <= 32
  * vertex_lock can be NULL; when it's not NULL, it should have a value for each vertex; 1 denotes vertices that can't be moved
- * TODO target_error/result_error currently use combined distance+attribute error; this may change in the future
+ * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1]
+ * options must be a bitmask composed of meshopt_SimplifyX options; 0 is a safe default
+ * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification
  */
-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error);
+MESHOPTIMIZER_API size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error);
 
 /**
- * Experimental: Mesh simplifier (sloppy)
+ * Mesh simplifier with position/attribute update
+ * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible.
+ * Similar to meshopt_simplifyWithAttributes, but destructively updates positions and attribute values for optimal appearance.
+ * The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error.
+ * If not all attributes from the input mesh are needed, it's recommended to reindex the mesh without them prior to simplification.
+ * Returns the number of indices after simplification; indices are destructively updated with new index data
+ *
+ * The updated index buffer references vertices from the original vertex buffer, however the vertex positions and attributes are updated in-place.
+ * Creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended; if the original vertex data is needed, it should be copied before simplification.
+ * Note that the number of attributes with non-zero weights affects memory requirements and running time. Attributes with zero weights are not updated.
+ *
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * vertex_attributes should have attribute_count floats for each vertex
+ * attribute_weights should have attribute_count floats in total; the weights determine relative priority of attributes between each other and wrt position
+ * attribute_count must be <= 32
+ * vertex_lock can be NULL; when it's not NULL, it should have a value for each vertex; 1 denotes vertices that can't be moved
+ * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1]
+ * options must be a bitmask composed of meshopt_SimplifyX options; 0 is a safe default
+ * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification
+ */
+MESHOPTIMIZER_API size_t meshopt_simplifyWithUpdate(unsigned int* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error);
+
+/**
+ * Mesh simplifier (sloppy)
  * Reduces the number of triangles in the mesh, sacrificing mesh appearance for simplification performance
  * The algorithm doesn't preserve mesh topology but can stop short of the target goal based on target error.
  * Returns the number of indices after simplification, with destination containing new index data
  * The resulting index buffer references vertices from the original vertex buffer.
- * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
  *
  * destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)!
  * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * vertex_lock can be NULL; when it's not NULL, it should have a value for each vertex; 1 denotes vertices that can't be moved and should be set consistently for all indices with the same position
  * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1]
  * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification
  */
-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error);
+MESHOPTIMIZER_API size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error);
 
 /**
- * Experimental: Point cloud simplifier
+ * Mesh simplifier (pruner)
+ * Reduces the number of triangles in the mesh by removing small isolated parts of the mesh
+ * Returns the number of indices after simplification, with destination containing new index data
+ * The resulting index buffer references vertices from the original vertex buffer.
+ * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ *
+ * destination must contain enough space for the target index buffer, worst case is index_count elements
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1]
+ */
+MESHOPTIMIZER_API size_t meshopt_simplifyPrune(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error);
+
+/**
+ * Point cloud simplifier
  * Reduces the number of points in the cloud to reach the given target
  * Returns the number of points after simplification, with destination containing new index data
  * The resulting index buffer references vertices from the original vertex buffer.
- * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
  *
  * destination must contain enough space for the target index buffer (target_vertex_count elements)
  * vertex_positions should have float3 position in the first 12 bytes of each vertex
- * vertex_colors should can be NULL; when it's not NULL, it should have float3 color in the first 12 bytes of each vertex
+ * vertex_colors can be NULL; when it's not NULL, it should have float3 color in the first 12 bytes of each vertex
+ * color_weight determines relative priority of color wrt position; 1.0 is a safe default
  */
-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_colors, size_t vertex_colors_stride, float color_weight, size_t target_vertex_count);
+MESHOPTIMIZER_API size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_colors, size_t vertex_colors_stride, float color_weight, size_t target_vertex_count);
 
 /**
  * Returns the error scaling factor used by the simplifier to convert between absolute and relative extents
@@ -440,6 +589,19 @@ struct meshopt_VertexCacheStatistics
  */
 MESHOPTIMIZER_API struct meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size);
 
+struct meshopt_VertexFetchStatistics
+{
+	unsigned int bytes_fetched;
+	float overfetch; /* fetched bytes / vertex buffer size; best case 1.0 (each byte is fetched once) */
+};
+
+/**
+ * Vertex fetch cache analyzer
+ * Returns cache hit statistics using a simplified direct mapped model
+ * Results may not match actual GPU performance
+ */
+MESHOPTIMIZER_API struct meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size);
+
 struct meshopt_OverdrawStatistics
 {
 	unsigned int pixels_covered;
@@ -456,26 +618,34 @@ struct meshopt_OverdrawStatistics
  */
 MESHOPTIMIZER_API struct meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 
-struct meshopt_VertexFetchStatistics
+struct meshopt_CoverageStatistics
 {
-	unsigned int bytes_fetched;
-	float overfetch; /* fetched bytes / vertex buffer size; best case 1.0 (each byte is fetched once) */
+	float coverage[3];
+	float extent; /* viewport size in mesh coordinates */
 };
 
 /**
- * Vertex fetch cache analyzer
- * Returns cache hit statistics using a simplified direct mapped model
- * Results may not match actual GPU performance
+ * Coverage analyzer
+ * Returns coverage statistics (ratio of viewport pixels covered from each axis) using a software rasterizer
+ *
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
  */
-MESHOPTIMIZER_API struct meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size);
+MESHOPTIMIZER_API struct meshopt_CoverageStatistics meshopt_analyzeCoverage(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 
+/**
+ * Meshlet is a small mesh cluster (subset) that consists of:
+ * - triangles, an 8-bit micro triangle (index) buffer, that for each triangle specifies three local vertices to use;
+ * - vertices, a 32-bit vertex indirection buffer, that for each local vertex specifies which mesh vertex to fetch vertex attributes from.
+ *
+ * For efficiency, meshlet triangles and vertices are packed into two large arrays; this structure contains offsets and counts to access the data.
+ */
 struct meshopt_Meshlet
 {
 	/* offsets within meshlet_vertices and meshlet_triangles arrays with meshlet data */
 	unsigned int vertex_offset;
 	unsigned int triangle_offset;
 
-	/* number of vertices and triangles used in the meshlet; data is stored in consecutive range defined by offset and count */
+	/* number of vertices and triangles used in the meshlet; data is stored in consecutive range [offset..offset+count) for vertices and [offset..offset+count*3) for triangles */
 	unsigned int vertex_count;
 	unsigned int triangle_count;
 };
@@ -484,14 +654,15 @@ struct meshopt_Meshlet
  * Meshlet builder
  * Splits the mesh into a set of meshlets where each meshlet has a micro index buffer indexing into meshlet vertices that refer to the original vertex buffer
  * The resulting data can be used to render meshes using NVidia programmable mesh shading pipeline, or in other cluster-based renderers.
+ * When targeting mesh shading hardware, for maximum efficiency meshlets should be further optimized using meshopt_optimizeMeshlet.
  * When using buildMeshlets, vertex positions need to be provided to minimize the size of the resulting clusters.
  * When using buildMeshletsScan, for maximum efficiency the index buffer being converted has to be optimized for vertex cache first.
  *
  * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound
- * meshlet_vertices must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_vertices
- * meshlet_triangles must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_triangles * 3
+ * meshlet_vertices must contain enough space for all meshlets, worst case is index_count elements (*not* vertex_count!)
+ * meshlet_triangles must contain enough space for all meshlets, worst case is index_count elements
  * vertex_positions should have float3 position in the first 12 bytes of each vertex
- * max_vertices and max_triangles must not exceed implementation limits (max_vertices <= 255 - not 256!, max_triangles <= 512; max_triangles must be divisible by 4)
+ * max_vertices and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512)
  * cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency
  */
 MESHOPTIMIZER_API size_t meshopt_buildMeshlets(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight);
@@ -499,14 +670,41 @@ MESHOPTIMIZER_API size_t meshopt_buildMeshletsScan(struct meshopt_Meshlet* meshl
 MESHOPTIMIZER_API size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles);
 
 /**
- * Experimental: Meshlet optimizer
- * Reorders meshlet vertices and triangles to maximize locality to improve rasterizer throughput
+ * Meshlet builder with flexible cluster sizes
+ * Splits the mesh into a set of meshlets, similarly to meshopt_buildMeshlets, but allows specifying the minimum and maximum number of triangles per meshlet.
+ * Clusters between min and max triangle counts are split when the cluster size would have exceeded the expected cluster size by more than split_factor.
  *
- * meshlet_triangles and meshlet_vertices must refer to meshlet triangle and vertex index data; when buildMeshlets* is used, these
- * need to be computed from meshlet's vertex_offset and triangle_offset
- * triangle_count and vertex_count must not exceed implementation limits (vertex_count <= 255 - not 256!, triangle_count <= 512)
+ * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound using min_triangles (*not* max!)
+ * meshlet_vertices must contain enough space for all meshlets, worst case is index_count elements (*not* vertex_count!)
+ * meshlet_triangles must contain enough space for all meshlets, worst case is index_count elements
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * max_vertices, min_triangles and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; min_triangles <= max_triangles)
+ * cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency
+ * split_factor should be set to a non-negative value; when greater than 0, clusters that have large bounds may be split unless they are under the min_triangles threshold
  */
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t triangle_count, size_t vertex_count);
+MESHOPTIMIZER_API size_t meshopt_buildMeshletsFlex(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor);
+
+/**
+ * Meshlet builder that produces clusters optimized for raytracing
+ * Splits the mesh into a set of meshlets, similarly to meshopt_buildMeshlets, but optimizes cluster subdivision for raytracing and allows specifying the minimum and maximum number of triangles per meshlet.
+ *
+ * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound using min_triangles (*not* max!)
+ * meshlet_vertices must contain enough space for all meshlets, worst case is index_count elements (*not* vertex_count!)
+ * meshlet_triangles must contain enough space for all meshlets, worst case is index_count elements
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * max_vertices, min_triangles and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; min_triangles <= max_triangles)
+ * fill_weight allows prioritizing clusters that are closer to maximum size at some cost to SAH quality; 0.5 is a safe default
+ */
+MESHOPTIMIZER_API size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight);
+
+/**
+ * Meshlet optimizer
+ * Reorders meshlet vertices and triangles to maximize locality which can improve rasterizer throughput or ray tracing performance when using fast-build modes.
+ *
+ * meshlet_triangles and meshlet_vertices must refer to meshlet data; when buildMeshlets* is used, these need to be computed from meshlet's vertex_offset and triangle_offset
+ * triangle_count and vertex_count must not exceed implementation limits (vertex_count <= 256, triangle_count <= 512)
+ */
+MESHOPTIMIZER_API void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t triangle_count, size_t vertex_count);
 
 struct meshopt_Bounds
 {
@@ -544,11 +742,35 @@ struct meshopt_Bounds
  * Real-Time Rendering 4th Edition, section 19.3).
  *
  * vertex_positions should have float3 position in the first 12 bytes of each vertex
- * index_count/3 should be less than or equal to 512 (the function assumes clusters of limited size)
+ * vertex_count should specify the number of vertices in the entire mesh, not cluster or meshlet
+ * index_count/3 and triangle_count must not exceed implementation limits (<= 512)
  */
 MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices, const unsigned char* meshlet_triangles, size_t triangle_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 
+/**
+ * Sphere bounds generator
+ * Creates bounding sphere around a set of points or a set of spheres; returns the center and radius of the sphere, with other fields of the result set to 0.
+ *
+ * positions should have float3 position in the first 12 bytes of each element
+ * radii can be NULL; when it's not NULL, it should have a non-negative float radius in the first 4 bytes of each element
+ */
+MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeSphereBounds(const float* positions, size_t count, size_t positions_stride, const float* radii, size_t radii_stride);
+
+/**
+ * Cluster partitioner
+ * Partitions clusters into groups of similar size, prioritizing grouping clusters that share vertices or are close to each other.
+ * When vertex positions are not provided, only clusters that share vertices will be grouped together, which may result in small partitions for some inputs.
+ *
+ * destination must contain enough space for the resulting partition data (cluster_count elements)
+ * destination[i] will contain the partition id for cluster i, with the total number of partitions returned by the function
+ * cluster_indices should have the vertex indices referenced by each cluster, stored sequentially
+ * cluster_index_counts should have the number of indices in each cluster; sum of all cluster_index_counts must be equal to total_index_count
+ * vertex_positions can be NULL; when it's not NULL, it should have float3 position in the first 12 bytes of each vertex
+ * target_partition_size is a target size for each partition, in clusters; the resulting partitions may be smaller or larger (up to target + target/3)
+ */
+MESHOPTIMIZER_API size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size);
+
 /**
  * Spatial sorter
  * Generates a remap table that can be used to reorder points for spatial locality.
@@ -560,13 +782,44 @@ MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeMeshletBounds(const unsig
 MESHOPTIMIZER_API void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 
 /**
- * Experimental: Spatial sorter
+ * Spatial sorter
  * Reorders triangles for spatial locality, and generates a new index buffer. The resulting index buffer can be used with other functions like optimizeVertexCache.
  *
  * destination must contain enough space for the resulting index buffer (index_count elements)
  * vertex_positions should have float3 position in the first 12 bytes of each vertex
  */
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+MESHOPTIMIZER_API void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+
+/**
+ * Spatial clusterizer
+ * Reorders points into clusters optimized for spatial locality, and generates a new index buffer.
+ * Ensures the output can be split into cluster_size chunks where each chunk has good positional locality. Only the last chunk may be smaller than cluster_size.
+ *
+ * destination must contain enough space for the resulting index buffer (vertex_count elements)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ */
+MESHOPTIMIZER_API void meshopt_spatialClusterPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t cluster_size);
+
+/**
+ * Quantize a float into half-precision (as defined by IEEE-754 fp16) floating point value
+ * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
+ * Representable magnitude range: [6e-5; 65504]
+ * Maximum relative reconstruction error: 5e-4
+ */
+MESHOPTIMIZER_API unsigned short meshopt_quantizeHalf(float v);
+
+/**
+ * Quantize a float into a floating point value with a limited number of significant mantissa bits, preserving the IEEE-754 fp32 binary representation
+ * Preserves infinities/NaN, flushes denormals to zero, rounds to nearest
+ * Assumes N is in a valid mantissa precision range, which is 1..23
+ */
+MESHOPTIMIZER_API float meshopt_quantizeFloat(float v, int N);
+
+/**
+ * Reverse quantization of a half-precision (as defined by IEEE-754 fp16) floating point value
+ * Preserves Inf/NaN, flushes denormals to zero
+ */
+MESHOPTIMIZER_API float meshopt_dequantizeHalf(unsigned short h);
 
 /**
  * Set allocation callbacks
@@ -574,13 +827,13 @@ MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortTriangles(unsigned int* desti
  * Note that all algorithms only allocate memory for temporary use.
  * allocate/deallocate are always called in a stack-like order - last pointer to be allocated is deallocated first.
  */
-MESHOPTIMIZER_API void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV *allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV *deallocate)(void*));
+MESHOPTIMIZER_API void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*));
 
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
 
-/* Quantization into commonly supported data formats */
+/* Quantization into fixed point normalized formats; these are only available as inline C++ functions */
 #ifdef __cplusplus
 /**
  * Quantize a float in [0..1] range into an N-bit fixed point unorm value
@@ -595,27 +848,6 @@ inline int meshopt_quantizeUnorm(float v, int N);
  * Maximum reconstruction error: 1/2^N
  */
 inline int meshopt_quantizeSnorm(float v, int N);
-
-/**
- * Quantize a float into half-precision (as defined by IEEE-754 fp16) floating point value
- * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
- * Representable magnitude range: [6e-5; 65504]
- * Maximum relative reconstruction error: 5e-4
- */
-MESHOPTIMIZER_API unsigned short meshopt_quantizeHalf(float v);
-
-/**
- * Quantize a float into a floating point value with a limited number of significant mantissa bits, preserving the IEEE-754 fp32 binary representation
- * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
- * Assumes N is in a valid mantissa precision range, which is 1..23
- */
-MESHOPTIMIZER_API float meshopt_quantizeFloat(float v, int N);
-
-/**
- * Reverse quantization of a half-precision (as defined by IEEE-754 fp16) floating point value
- * Preserves Inf/NaN, flushes denormals to zero
- */
-MESHOPTIMIZER_API float meshopt_dequantizeHalf(unsigned short h);
 #endif
 
 /**
@@ -631,6 +863,10 @@ template <typename T>
 inline size_t meshopt_generateVertexRemap(unsigned int* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size);
 template <typename T>
 inline size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count);
+template <typename F>
+inline size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, F callback);
+template <typename T, typename F>
+inline size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, F callback);
 template <typename T>
 inline void meshopt_remapIndexBuffer(T* destination, const T* indices, size_t index_count, const unsigned int* remap);
 template <typename T>
@@ -642,6 +878,8 @@ inline void meshopt_generateAdjacencyIndexBuffer(T* destination, const T* indice
 template <typename T>
 inline void meshopt_generateTessellationIndexBuffer(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 template <typename T>
+inline size_t meshopt_generateProvokingIndexBuffer(T* destination, unsigned int* reorder, const T* indices, size_t index_count, size_t vertex_count);
+template <typename T>
 inline void meshopt_optimizeVertexCache(T* destination, const T* indices, size_t index_count, size_t vertex_count);
 template <typename T>
 inline void meshopt_optimizeVertexCacheStrip(T* destination, const T* indices, size_t index_count, size_t vertex_count);
@@ -661,29 +899,44 @@ template <typename T>
 inline size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count);
 template <typename T>
 inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size);
+inline size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level);
 template <typename T>
 inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = NULL);
 template <typename T>
 inline size_t meshopt_simplifyWithAttributes(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = NULL);
 template <typename T>
+inline size_t meshopt_simplifyWithUpdate(T* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = NULL);
+template <typename T>
 inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error = NULL);
 template <typename T>
+inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error = NULL);
+template <typename T>
+inline size_t meshopt_simplifyPrune(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error);
+template <typename T>
 inline size_t meshopt_stripify(T* destination, const T* indices, size_t index_count, size_t vertex_count, T restart_index);
 template <typename T>
 inline size_t meshopt_unstripify(T* destination, const T* indices, size_t index_count, T restart_index);
 template <typename T>
-inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int buffer_size);
+inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size);
+template <typename T>
+inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size);
 template <typename T>
 inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 template <typename T>
-inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size);
+inline meshopt_CoverageStatistics meshopt_analyzeCoverage(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 template <typename T>
 inline size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight);
 template <typename T>
 inline size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles);
 template <typename T>
+inline size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor);
+template <typename T>
+inline size_t meshopt_buildMeshletsSpatial(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight);
+template <typename T>
 inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 template <typename T>
+inline size_t meshopt_partitionClusters(unsigned int* destination, const T* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size);
+template <typename T>
 inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 #endif
 
@@ -717,31 +970,39 @@ inline int meshopt_quantizeSnorm(float v, int N)
 class meshopt_Allocator
 {
 public:
-	template <typename T>
-	struct StorageT
+	struct Storage
 	{
-		static void* (MESHOPTIMIZER_ALLOC_CALLCONV *allocate)(size_t);
-		static void (MESHOPTIMIZER_ALLOC_CALLCONV *deallocate)(void*);
+		void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t);
+		void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*);
 	};
 
-	typedef StorageT<void> Storage;
+#ifdef MESHOPTIMIZER_ALLOC_EXPORT
+	MESHOPTIMIZER_API static Storage& storage();
+#else
+	static Storage& storage()
+	{
+		static Storage s = {::operator new, ::operator delete };
+		return s;
+	}
+#endif
 
 	meshopt_Allocator()
-		: blocks()
-		, count(0)
+	    : blocks()
+	    , count(0)
 	{
 	}
 
 	~meshopt_Allocator()
 	{
 		for (size_t i = count; i > 0; --i)
-			Storage::deallocate(blocks[i - 1]);
+			storage().deallocate(blocks[i - 1]);
 	}
 
-	template <typename T> T* allocate(size_t size)
+	template <typename T>
+	T* allocate(size_t size)
 	{
 		assert(count < sizeof(blocks) / sizeof(blocks[0]));
-		T* result = static_cast<T*>(Storage::allocate(size > size_t(-1) / sizeof(T) ? size_t(-1) : size * sizeof(T)));
+		T* result = static_cast<T*>(storage().allocate(size > size_t(-1) / sizeof(T) ? size_t(-1) : size * sizeof(T)));
 		blocks[count++] = result;
 		return result;
 	}
@@ -749,7 +1010,7 @@ public:
 	void deallocate(void* ptr)
 	{
 		assert(count > 0 && blocks[count - 1] == ptr);
-		Storage::deallocate(ptr);
+		storage().deallocate(ptr);
 		count--;
 	}
 
@@ -757,10 +1018,6 @@ private:
 	void* blocks[24];
 	size_t count;
 };
-
-// This makes sure that allocate/deallocate are lazily generated in translation units that need them and are deduplicated by the linker
-template <typename T> void* (MESHOPTIMIZER_ALLOC_CALLCONV *meshopt_Allocator::StorageT<T>::allocate)(size_t) = operator new;
-template <typename T> void (MESHOPTIMIZER_ALLOC_CALLCONV *meshopt_Allocator::StorageT<T>::deallocate)(void*) = operator delete;
 #endif
 
 /* Inline implementation for C++ templated wrappers */
@@ -782,7 +1039,7 @@ struct meshopt_IndexAdapter<T, false>
 	{
 		size_t size = count > size_t(-1) / sizeof(unsigned int) ? size_t(-1) : count * sizeof(unsigned int);
 
-		data = static_cast<unsigned int*>(meshopt_Allocator::Storage::allocate(size));
+		data = static_cast<unsigned int*>(meshopt_Allocator::storage().allocate(size));
 
 		if (input)
 		{
@@ -799,7 +1056,7 @@ struct meshopt_IndexAdapter<T, false>
 				result[i] = T(data[i]);
 		}
 
-		meshopt_Allocator::Storage::deallocate(data);
+		meshopt_Allocator::storage().deallocate(data);
 	}
 };
 
@@ -830,6 +1087,30 @@ inline size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const
 	return meshopt_generateVertexRemapMulti(destination, indices ? in.data : NULL, index_count, vertex_count, streams, stream_count);
 }
 
+template <typename F>
+inline size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, F callback)
+{
+	struct Call // adapts the callable F to a plain function taking (void* context, unsigned int, unsigned int)
+	{
+		static int compare(void* context, unsigned int lhs, unsigned int rhs) { return (*static_cast<F*>(context))(lhs, rhs) ? 1 : 0; } // context is the &callback passed below; result is normalized to 0/1
+	};
+
+	return meshopt_generateVertexRemapCustom(destination, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride, &Call::compare, &callback); // callback is a by-value parameter, so its address stays valid for the duration of this call
+}
+
+template <typename T, typename F>
+inline size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, F callback)
+{
+	struct Call // adapts the callable F to a plain function taking (void* context, unsigned int, unsigned int)
+	{
+		static int compare(void* context, unsigned int lhs, unsigned int rhs) { return (*static_cast<F*>(context))(lhs, rhs) ? 1 : 0; } // context is the &callback passed below; result is normalized to 0/1
+	};
+
+	meshopt_IndexAdapter<T> in(NULL, indices, indices ? index_count : 0); // input-only adapter (no result buffer); adapts zero elements when indices is NULL
+
+	return meshopt_generateVertexRemapCustom(destination, indices ? in.data : NULL, index_count, vertex_positions, vertex_count, vertex_positions_stride, &Call::compare, &callback); // NULL indices are forwarded as NULL while index_count is passed through unchanged
+}
+
 template <typename T>
 inline void meshopt_remapIndexBuffer(T* destination, const T* indices, size_t index_count, const unsigned int* remap)
 {
@@ -875,6 +1156,19 @@ inline void meshopt_generateTessellationIndexBuffer(T* destination, const T* ind
 	meshopt_generateTessellationIndexBuffer(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
 }
 
+template <typename T>
+inline size_t meshopt_generateProvokingIndexBuffer(T* destination, unsigned int* reorder, const T* indices, size_t index_count, size_t vertex_count)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count); // input-only adapter over the source indices
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count); // output adapter; results are narrowed back to T when it goes out of scope
+
+	size_t bound = vertex_count + (index_count / 3); // upper bound used only by the range check below
+	assert(bound == 0 || size_t(T(bound - 1)) == bound - 1); // bound - 1 must fit in T; an empty mesh (bound == 0) has nothing to check and bound - 1 would wrap
+	(void)bound; // silences the unused-variable warning when assert is compiled out
+
+	return meshopt_generateProvokingIndexBuffer(out.data, reorder, in.data, index_count, vertex_count);
+}
+
 template <typename T>
 inline void meshopt_optimizeVertexCache(T* destination, const T* indices, size_t index_count, size_t vertex_count)
 {
@@ -961,6 +1255,11 @@ inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const
 	return meshopt_decodeIndexSequence(destination, index_count, sizeof(T), buffer, buffer_size);
 }
 
+inline size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level)
+{
+	return meshopt_encodeVertexBufferLevel(buffer, buffer_size, vertices, vertex_count, vertex_size, level, -1); // NOTE(review): forwards to the 7-argument overload; the trailing -1 presumably selects the default encoding version - confirm against the C declaration
+}
+
 template <typename T>
 inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* result_error)
 {
@@ -979,13 +1278,39 @@ inline size_t meshopt_simplifyWithAttributes(T* destination, const T* indices, s
 	return meshopt_simplifyWithAttributes(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, vertex_attributes, vertex_attributes_stride, attribute_weights, attribute_count, vertex_lock, target_index_count, target_error, options, result_error);
 }
 
+template <typename T>
+inline size_t meshopt_simplifyWithUpdate(T* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error)
+{
+	meshopt_IndexAdapter<T> inout(indices, indices, index_count);
+	// the same buffer is given to the adapter twice because the callee receives a single index buffer (inout.data) that it reads and updates
+	return meshopt_simplifyWithUpdate(inout.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, vertex_attributes, vertex_attributes_stride, attribute_weights, attribute_count, vertex_lock, target_index_count, target_error, options, result_error);
+}
+
 template <typename T>
 inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error)
 {
 	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
 	meshopt_IndexAdapter<T> out(destination, NULL, index_count);
 
-	return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_index_count, target_error, result_error);
+	return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, NULL, target_index_count, target_error, result_error);
+}
+
+template <typename T>
+inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count);
+	// overload that additionally takes vertex_lock; every argument is forwarded unchanged apart from the index buffers, which go through the adapters
+	return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, vertex_lock, target_index_count, target_error, result_error);
+}
+
+template <typename T>
+inline size_t meshopt_simplifyPrune(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, NULL, index_count);
+	// index-type-generic wrapper: only the index buffers are routed through the adapters, all other arguments are forwarded as is
+	return meshopt_simplifyPrune(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_error);
 }
 
 template <typename T>
@@ -1007,11 +1332,19 @@ inline size_t meshopt_unstripify(T* destination, const T* indices, size_t index_
 }
 
 template <typename T>
-inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int buffer_size)
+inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size)
 {
 	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
 
-	return meshopt_analyzeVertexCache(in.data, index_count, vertex_count, cache_size, warp_size, buffer_size);
+	return meshopt_analyzeVertexCache(in.data, index_count, vertex_count, cache_size, warp_size, primgroup_size);
+}
+
+template <typename T>
+inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	// read-only wrapper: there is no destination buffer, so only an input adapter is needed
+	return meshopt_analyzeVertexFetch(in.data, index_count, vertex_count, vertex_size);
 }
 
 template <typename T>
@@ -1023,11 +1356,11 @@ inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size
 }
 
 template <typename T>
-inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size)
+inline meshopt_CoverageStatistics meshopt_analyzeCoverage(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
 {
 	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
 
-	return meshopt_analyzeVertexFetch(in.data, index_count, vertex_count, vertex_size);
+	return meshopt_analyzeCoverage(in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
 }
 
 template <typename T>
@@ -1046,6 +1379,22 @@ inline size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int*
 	return meshopt_buildMeshletsScan(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_count, max_vertices, max_triangles);
 }
 
+template <typename T>
+inline size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	// index-type-generic wrapper: the meshlet outputs are passed straight through, only the input indices go through the adapter
+	return meshopt_buildMeshletsFlex(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, min_triangles, max_triangles, cone_weight, split_factor);
+}
+
+template <typename T>
+inline size_t meshopt_buildMeshletsSpatial(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+	// index-type-generic wrapper: the meshlet outputs are passed straight through, only the input indices go through the adapter
+	return meshopt_buildMeshletsSpatial(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, min_triangles, max_triangles, fill_weight);
+}
+
 template <typename T>
 inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
 {
@@ -1054,6 +1403,14 @@ inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t inde
 	return meshopt_computeClusterBounds(in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
 }
 
+template <typename T>
+inline size_t meshopt_partitionClusters(unsigned int* destination, const T* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size)
+{
+	meshopt_IndexAdapter<T> in(NULL, cluster_indices, total_index_count);
+	// destination is always unsigned int (one entry per cluster is written by the callee), so only the concatenated cluster indices need the adapter
+	return meshopt_partitionClusters(destination, in.data, total_index_count, cluster_index_counts, cluster_count, vertex_positions, vertex_count, vertex_positions_stride, target_partition_size);
+}
+
 template <typename T>
 inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
 {
@@ -1065,7 +1422,7 @@ inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_
 #endif
 
 /**
- * Copyright (c) 2016-2024 Arseny Kapoulkine
+ * Copyright (c) 2016-2025 Arseny Kapoulkine
  *
  * Permission is hereby granted, free of charge, to any person
  * obtaining a copy of this software and associated documentation
diff --git a/Source/ThirdParty/meshoptimizer/overdrawoptimizer.cpp b/Source/ThirdParty/meshoptimizer/overdrawoptimizer.cpp
index cc22dbcff..682b924a9 100644
--- a/Source/ThirdParty/meshoptimizer/overdrawoptimizer.cpp
+++ b/Source/ThirdParty/meshoptimizer/overdrawoptimizer.cpp
@@ -10,24 +10,24 @@
 namespace meshopt
 {
 
-static void calculateSortData(float* sort_data, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_positions_stride, const unsigned int* clusters, size_t cluster_count)
+static void calculateSortData(float* sort_data, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned int* clusters, size_t cluster_count)
 {
 	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
 
 	float mesh_centroid[3] = {};
 
-	for (size_t i = 0; i < index_count; ++i)
+	for (size_t i = 0; i < vertex_count; ++i)
 	{
-		const float* p = vertex_positions + vertex_stride_float * indices[i];
+		const float* p = vertex_positions + vertex_stride_float * i;
 
 		mesh_centroid[0] += p[0];
 		mesh_centroid[1] += p[1];
 		mesh_centroid[2] += p[2];
 	}
 
-	mesh_centroid[0] /= index_count;
-	mesh_centroid[1] /= index_count;
-	mesh_centroid[2] /= index_count;
+	mesh_centroid[0] /= float(vertex_count);
+	mesh_centroid[1] /= float(vertex_count);
+	mesh_centroid[2] /= float(vertex_count);
 
 	for (size_t cluster = 0; cluster < cluster_count; ++cluster)
 	{
@@ -306,7 +306,7 @@ void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* ind
 
 	// fill sort data
 	float* sort_data = allocator.allocate<float>(cluster_count);
-	calculateSortData(sort_data, indices, index_count, vertex_positions, vertex_positions_stride, clusters, cluster_count);
+	calculateSortData(sort_data, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride, clusters, cluster_count);
 
 	// sort clusters using sort data
 	unsigned short* sort_keys = allocator.allocate<unsigned short>(cluster_count);
diff --git a/Source/ThirdParty/meshoptimizer/partition.cpp b/Source/ThirdParty/meshoptimizer/partition.cpp
new file mode 100644
index 000000000..4119a53ed
--- /dev/null
+++ b/Source/ThirdParty/meshoptimizer/partition.cpp
@@ -0,0 +1,624 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <math.h>
+#include <string.h>
+
+// This work is based on:
+// Takio Kurita. An efficient agglomerative clustering algorithm using a heap. 1991
+namespace meshopt
+{
+
+// To avoid excessive recursion for malformed inputs, we switch to bisection after some depth
+const int kMergeDepthCutoff = 40;
+
+struct ClusterAdjacency
+{
+	unsigned int* offsets; // cluster_count + 1 entries; the neighbors of cluster i occupy [offsets[i], offsets[i + 1])
+	unsigned int* clusters; // ids of the adjacent clusters
+	unsigned int* shared; // parallel to clusters: number of vertices shared with that neighbor
+};
+
+static void filterClusterIndices(unsigned int* data, unsigned int* offsets, const unsigned int* cluster_indices, const unsigned int* cluster_index_counts, size_t cluster_count, unsigned char* used, size_t vertex_count, size_t total_index_count)
+{
+	(void)vertex_count;
+	(void)total_index_count;
+	// vertex_count and total_index_count are only read by the asserts in this function
+	size_t cluster_start = 0;
+	size_t cluster_write = 0;
+	// writes each cluster's unique indices to data and its start to offsets (cluster_count + 1 entries); used must be all zero on entry and is left all zero
+	for (size_t i = 0; i < cluster_count; ++i)
+	{
+		offsets[i] = unsigned(cluster_write);
+
+		// copy cluster indices, skipping duplicates
+		for (size_t j = 0; j < cluster_index_counts[i]; ++j)
+		{
+			unsigned int v = cluster_indices[cluster_start + j];
+			assert(v < vertex_count);
+			// branchless dedup: v is always stored, but the write cursor only advances when v has not been seen in this cluster yet
+			data[cluster_write] = v;
+			cluster_write += 1 - used[v];
+			used[v] = 1;
+		}
+
+		// reset used flags for the next cluster
+		for (size_t j = offsets[i]; j < cluster_write; ++j)
+			used[data[j]] = 0;
+
+		cluster_start += cluster_index_counts[i];
+	}
+
+	assert(cluster_start == total_index_count);
+	assert(cluster_write <= total_index_count);
+	offsets[cluster_count] = unsigned(cluster_write);
+}
+
+static float computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_positions_stride, float* out_center)
+{
+	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
+	// returns the sphere radius and writes the center (3 floats) to out_center; the center is the vertex average, so the sphere is not necessarily minimal
+	float center[3] = {0, 0, 0};
+
+	// approximate center of the cluster by averaging all vertex positions
+	for (size_t j = 0; j < index_count; ++j)
+	{
+		const float* p = vertex_positions + indices[j] * vertex_stride_float;
+
+		center[0] += p[0];
+		center[1] += p[1];
+		center[2] += p[2];
+	}
+
+	// note: technically clusters can't be empty per meshopt_partitionClusters but we check for a division by zero in case that changes
+	if (index_count)
+	{
+		center[0] /= float(index_count);
+		center[1] /= float(index_count);
+		center[2] /= float(index_count);
+	}
+
+	// compute the radius as the largest distance from the center to any vertex of this cluster (tracked squared, sqrt taken once at the end)
+	float radiussq = 0;
+
+	for (size_t j = 0; j < index_count; ++j)
+	{
+		const float* p = vertex_positions + indices[j] * vertex_stride_float;
+
+		float d2 = (p[0] - center[0]) * (p[0] - center[0]) + (p[1] - center[1]) * (p[1] - center[1]) + (p[2] - center[2]) * (p[2] - center[2]);
+
+		radiussq = radiussq < d2 ? d2 : radiussq;
+	}
+
+	memcpy(out_center, center, sizeof(center));
+	return sqrtf(radiussq);
+}
+
+static void buildClusterAdjacency(ClusterAdjacency& adjacency, const unsigned int* cluster_indices, const unsigned int* cluster_offsets, size_t cluster_count, size_t vertex_count, meshopt_Allocator& allocator)
+{
+	unsigned int* ref_offsets = allocator.allocate<unsigned int>(vertex_count + 1);
+	// fills adjacency with, for every cluster, the clusters that share at least one vertex with it and how many vertices they share
+	// compute number of clusters referenced by each vertex
+	memset(ref_offsets, 0, vertex_count * sizeof(unsigned int));
+
+	for (size_t i = 0; i < cluster_count; ++i)
+	{
+		for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j)
+			ref_offsets[cluster_indices[j]]++;
+	}
+
+	// compute (worst-case) number of adjacent clusters for each cluster
+	size_t total_adjacency = 0;
+
+	for (size_t i = 0; i < cluster_count; ++i)
+	{
+		size_t count = 0;
+
+		// worst case is every vertex has a disjoint cluster list (the -1 excludes cluster i itself from each vertex's list)
+		for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j)
+			count += ref_offsets[cluster_indices[j]] - 1;
+
+		// ... but only every other cluster can be adjacent in the end
+		total_adjacency += count < cluster_count - 1 ? count : cluster_count - 1;
+	}
+
+	// we can now allocate adjacency buffers
+	adjacency.offsets = allocator.allocate<unsigned int>(cluster_count + 1);
+	adjacency.clusters = allocator.allocate<unsigned int>(total_adjacency);
+	adjacency.shared = allocator.allocate<unsigned int>(total_adjacency);
+
+	// convert ref counts to offsets
+	size_t total_refs = 0;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		size_t count = ref_offsets[i];
+		ref_offsets[i] = unsigned(total_refs);
+		total_refs += count;
+	}
+
+	unsigned int* ref_data = allocator.allocate<unsigned int>(total_refs);
+
+	// fill cluster refs for each vertex
+	for (size_t i = 0; i < cluster_count; ++i)
+	{
+		for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j)
+			ref_data[ref_offsets[cluster_indices[j]]++] = unsigned(i);
+	}
+
+	// after the previous pass, ref_offsets contain the end of the data for each vertex; shift it forward to get the start
+	memmove(ref_offsets + 1, ref_offsets, vertex_count * sizeof(unsigned int));
+	ref_offsets[0] = 0;
+
+	// fill cluster adjacency for each cluster...
+	adjacency.offsets[0] = 0;
+
+	for (size_t i = 0; i < cluster_count; ++i)
+	{
+		unsigned int* adj = adjacency.clusters + adjacency.offsets[i];
+		unsigned int* shd = adjacency.shared + adjacency.offsets[i];
+		size_t count = 0;
+
+		for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j)
+		{
+			unsigned int v = cluster_indices[j];
+
+			// merge the entire cluster list of each vertex into current list
+			for (size_t k = ref_offsets[v]; k < ref_offsets[v + 1]; ++k)
+			{
+				unsigned int c = ref_data[k];
+				assert(c < cluster_count);
+
+				if (c == unsigned(i))
+					continue;
+
+				// if the cluster is already in the list, increment the shared count
+				bool found = false;
+				for (size_t l = 0; l < count; ++l)
+					if (adj[l] == c)
+					{
+						found = true;
+						shd[l]++;
+						break;
+					}
+
+				// .. or append a new cluster
+				if (!found)
+				{
+					adj[count] = c;
+					shd[count] = 1;
+					count++;
+				}
+			}
+		}
+
+		// mark the end of the adjacency list; the next cluster will start there as well
+		adjacency.offsets[i + 1] = adjacency.offsets[i] + unsigned(count);
+	}
+
+	assert(adjacency.offsets[cluster_count] <= total_adjacency);
+
+	// ref_offsets can't be deallocated as it was allocated before the adjacency buffers, which the caller keeps using (NOTE(review): presumably the allocator only frees in reverse allocation order - confirm)
+	allocator.deallocate(ref_data);
+}
+
+struct ClusterGroup
+{
+	int group; // id of the root cluster of the group this cluster currently belongs to; -1 while disassociated
+	int next; // next cluster in the group's linked list; -1 terminates the list
+	unsigned int size; // number of clusters in the group; 0 unless root
+	unsigned int vertices; // vertex count of the group; exact for singleton groups, an estimate after merges
+
+	float center[3]; // bounding sphere center; only computed when vertex positions are available
+	float radius; // bounding sphere radius; only computed when vertex positions are available
+};
+
+struct GroupOrder
+{
+	unsigned int id; // id of the group's root cluster
+	int order; // heap key; smaller values are popped first
+};
+
+static void heapPush(GroupOrder* heap, size_t size, GroupOrder item)
+{
+	// min-heap insert keyed on order; size is the element count before insertion and is passed by value, so the caller must bump its own count
+	heap[size++] = item; // the new element goes at the end (breaks heap invariant)
+
+	// bubble up the new element to its correct position
+	size_t i = size - 1;
+	while (i > 0 && heap[i].order < heap[(i - 1) / 2].order)
+	{
+		size_t p = (i - 1) / 2;
+
+		GroupOrder temp = heap[i];
+		heap[i] = heap[p];
+		heap[p] = temp;
+		i = p;
+	}
+}
+
+static GroupOrder heapPop(GroupOrder* heap, size_t size)
+{
+	assert(size > 0);
+	GroupOrder top = heap[0];
+	// removes and returns the element with the smallest order; size is the count before removal and is passed by value, so the caller must shrink its own count
+	// move the last element to the top (breaks heap invariant)
+	heap[0] = heap[--size];
+
+	// bubble down the new top element to its correct position
+	size_t i = 0;
+	while (i * 2 + 1 < size)
+	{
+		// find the smallest child
+		size_t j = i * 2 + 1;
+		j += (j + 1 < size && heap[j + 1].order < heap[j].order);
+
+		// if the parent is already smaller than both children, we're done
+		if (heap[j].order >= heap[i].order)
+			break;
+
+		// otherwise, swap the parent and child and continue
+		GroupOrder temp = heap[i];
+		heap[i] = heap[j];
+		heap[j] = temp;
+		i = j;
+	}
+
+	return top;
+}
+
+static unsigned int countShared(const ClusterGroup* groups, int group1, int group2, const ClusterAdjacency& adjacency)
+{
+	unsigned int total = 0;
+	// sums the shared vertex counts over every pair (cluster of group1, cluster of group2); both groups are walked through their next links
+	for (int i1 = group1; i1 >= 0; i1 = groups[i1].next)
+		for (int i2 = group2; i2 >= 0; i2 = groups[i2].next)
+		{
+			for (unsigned int adj = adjacency.offsets[i1]; adj < adjacency.offsets[i1 + 1]; ++adj)
+				if (adjacency.clusters[adj] == unsigned(i2))
+				{
+					total += adjacency.shared[adj];
+					break;
+				}
+		}
+
+	return total;
+}
+
+static void mergeBounds(ClusterGroup& target, const ClusterGroup& source)
+{
+	float r1 = target.radius, r2 = source.radius;
+	float dx = source.center[0] - target.center[0], dy = source.center[1] - target.center[1], dz = source.center[2] - target.center[2];
+	float d = sqrtf(dx * dx + dy * dy + dz * dz);
+	// grows target's sphere in place so that it also encloses source's sphere; in this first case source already contains target, so its sphere is adopted
+	if (d + r1 < r2)
+	{
+		target.center[0] = source.center[0];
+		target.center[1] = source.center[1];
+		target.center[2] = source.center[2];
+		target.radius = source.radius;
+		return;
+	}
+	// source sticks out of target: build the sphere spanning both; otherwise target already contains source and is left unchanged
+	if (d + r2 > r1)
+	{
+		float k = d > 0 ? (d + r2 - r1) / (2 * d) : 0.f; // fraction of the center-to-center vector to move by; guarded against d == 0
+
+		target.center[0] += dx * k;
+		target.center[1] += dy * k;
+		target.center[2] += dz * k;
+		target.radius = (d + r2 + r1) / 2;
+	}
+}
+
+static float boundsScore(const ClusterGroup& target, const ClusterGroup& source)
+{
+	float r1 = target.radius, r2 = source.radius;
+	float dx = source.center[0] - target.center[0], dy = source.center[1] - target.center[1], dz = source.center[2] - target.center[2];
+	float d = sqrtf(dx * dx + dy * dy + dz * dz);
+	// mr is the radius the merged sphere would have: source contains target, target contains source, or a sphere spanning both
+	float mr = d + r1 < r2 ? r2 : (d + r2 < r1 ? r1 : (d + r2 + r1) / 2);
+	// ratio of target's radius to the merged radius: the less the merge grows target's sphere, the higher the score; 0 when the merged radius is 0
+	return mr > 0 ? r1 / mr : 0.f;
+}
+
+static int pickGroupToMerge(const ClusterGroup* groups, int id, const ClusterAdjacency& adjacency, size_t max_partition_size, bool use_bounds)
+{
+	assert(groups[id].size > 0);
+	// returns the root id of the adjacent group with the highest merge score that still fits under max_partition_size, or -1 if there is none
+	float group_rsqrt = 1.f / sqrtf(float(int(groups[id].vertices)));
+
+	int best_group = -1;
+	float best_score = 0;
+
+	for (int ci = id; ci >= 0; ci = groups[ci].next)
+	{
+		for (unsigned int adj = adjacency.offsets[ci]; adj != adjacency.offsets[ci + 1]; ++adj)
+		{
+			int other = groups[adjacency.clusters[adj]].group;
+			if (other < 0) // the neighbor is not associated with any group right now (group == -1), so it can't be merged
+				continue;
+
+			assert(groups[other].size > 0);
+			if (groups[id].size + groups[other].size > max_partition_size)
+				continue;
+
+			unsigned int shared = countShared(groups, id, other, adjacency);
+			float other_rsqrt = 1.f / sqrtf(float(int(groups[other].vertices)));
+
+			// normalize shared count by the expected boundary of each group (+ keeps scoring symmetric)
+			float score = float(int(shared)) * (group_rsqrt + other_rsqrt);
+
+			// incorporate spatial score to favor merging nearby groups
+			if (use_bounds)
+				score *= 1.f + 0.4f * boundsScore(groups[id], groups[other]);
+
+			if (score > best_score)
+			{
+				best_group = other;
+				best_score = score;
+			}
+		}
+	}
+
+	return best_group;
+}
+
+static void mergeLeaf(ClusterGroup* groups, unsigned int* order, size_t count, size_t target_partition_size, size_t max_partition_size)
+{
+	for (size_t i = 0; i < count; ++i)
+	{
+		unsigned int id = order[i];
+		if (groups[id].size == 0 || groups[id].size >= target_partition_size)
+			continue;
+		// group id is live (size > 0) and still below target: scan all other live groups in order[] for the best partner by bounding sphere score
+		float best_score = -1.f;
+		int best_group = -1;
+
+		for (size_t j = 0; j < count; ++j)
+		{
+			unsigned int other = order[j];
+			if (id == other || groups[other].size == 0)
+				continue;
+
+			if (groups[id].size + groups[other].size > max_partition_size)
+				continue;
+
+			// favor merging nearby groups
+			float score = boundsScore(groups[id], groups[other]);
+
+			if (score > best_score)
+			{
+				best_score = score;
+				best_group = other;
+			}
+		}
+
+		// merge id *into* best_group; that way, we may merge more groups into the same best_group, maximizing the chance of reaching target
+		if (best_group != -1)
+		{
+			// combine groups by linking them together
+			unsigned int tail = best_group;
+			while (groups[tail].next >= 0)
+				tail = groups[tail].next;
+
+			groups[tail].next = id;
+
+			// update group sizes; note, we omit vertices update for simplicity as it's not used for spatial merge
+			groups[best_group].size += groups[id].size;
+			groups[id].size = 0;
+
+			// merge bounding spheres
+			mergeBounds(groups[best_group], groups[id]);
+			groups[id].radius = 0.f;
+		}
+	}
+}
+
+static size_t mergePartition(unsigned int* order, size_t count, const ClusterGroup* groups, int axis, float pivot)
+{
+	size_t m = 0;
+	// reorders order[] in place so that groups with center[axis] < pivot come first; returns how many such groups there are
+	// invariant: elements in range [0, m) are < pivot, elements in range [m, i) are >= pivot
+	for (size_t i = 0; i < count; ++i)
+	{
+		float v = groups[order[i]].center[axis];
+
+		// swap(m, i) unconditionally
+		unsigned int t = order[m];
+		order[m] = order[i];
+		order[i] = t;
+
+		// when v >= pivot, we swap i with m without advancing it, preserving invariants
+		m += v < pivot;
+	}
+
+	return m;
+}
+
+static void mergeSpatial(ClusterGroup* groups, unsigned int* order, size_t count, size_t target_partition_size, size_t max_partition_size, size_t leaf_size, int depth)
+{
+	size_t total = 0;
+	for (size_t i = 0; i < count; ++i)
+		total += groups[order[i]].size;
+	// stop splitting once all groups in this range would fit into one partition or the range is at most leaf_size long; mergeLeaf does the actual merging
+	if (total <= max_partition_size || count <= leaf_size)
+		return mergeLeaf(groups, order, count, target_partition_size, max_partition_size);
+
+	float mean[3] = {};
+	float vars[3] = {};
+	float runc = 1, runs = 1;
+
+	// gather statistics on the points in the subtree using Welford's algorithm
+	for (size_t i = 0; i < count; ++i, runc += 1.f, runs = 1.f / runc)
+	{
+		const float* point = groups[order[i]].center;
+
+		for (int k = 0; k < 3; ++k)
+		{
+			float delta = point[k] - mean[k];
+			mean[k] += delta * runs;
+			vars[k] += delta * (point[k] - mean[k]);
+		}
+	}
+	// vars holds sums of squared deviations that are never divided by count; that is enough here because the axes are only compared to each other
+	// split axis is one where the variance is largest
+	int axis = (vars[0] >= vars[1] && vars[0] >= vars[2]) ? 0 : (vars[1] >= vars[2] ? 1 : 2);
+
+	float split = mean[axis];
+	size_t middle = mergePartition(order, count, groups, axis, split);
+
+	// enforce balance for degenerate partitions
+	// this also ensures recursion depth is bounded on pathological inputs
+	if (middle <= leaf_size / 2 || count - middle <= leaf_size / 2 || depth >= kMergeDepthCutoff)
+		middle = count / 2;
+
+	// recursion depth is logarithmic and bounded due to max depth check above
+	mergeSpatial(groups, order, middle, target_partition_size, max_partition_size, leaf_size, depth + 1);
+	mergeSpatial(groups, order + middle, count - middle, target_partition_size, max_partition_size, leaf_size, depth + 1);
+}
+
+} // namespace meshopt
+
+size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size)
+{
+	using namespace meshopt;
+	// writes a partition id for every cluster into destination[0..cluster_count) and returns the number of partitions; vertex_positions may be NULL
+	assert((vertex_positions == NULL || vertex_positions_stride >= 12) && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+	assert(target_partition_size > 0);
+	// merges are allowed to overshoot the target by up to a third (integer division) before a candidate is rejected
+	size_t max_partition_size = target_partition_size + target_partition_size / 3;
+
+	meshopt_Allocator allocator;
+
+	unsigned char* used = allocator.allocate<unsigned char>(vertex_count);
+	memset(used, 0, vertex_count);
+
+	unsigned int* cluster_newindices = allocator.allocate<unsigned int>(total_index_count);
+	unsigned int* cluster_offsets = allocator.allocate<unsigned int>(cluster_count + 1);
+
+	// make new cluster index list that filters out duplicate indices
+	filterClusterIndices(cluster_newindices, cluster_offsets, cluster_indices, cluster_index_counts, cluster_count, used, vertex_count, total_index_count);
+	cluster_indices = cluster_newindices;
+
+	// build cluster adjacency along with edge weights (shared vertex count)
+	ClusterAdjacency adjacency = {};
+	buildClusterAdjacency(adjacency, cluster_indices, cluster_offsets, cluster_count, vertex_count, allocator);
+
+	ClusterGroup* groups = allocator.allocate<ClusterGroup>(cluster_count);
+	memset(groups, 0, sizeof(ClusterGroup) * cluster_count);
+
+	GroupOrder* order = allocator.allocate<GroupOrder>(cluster_count);
+	size_t pending = 0;
+
+	// create a singleton group for each cluster and order them by priority
+	for (size_t i = 0; i < cluster_count; ++i)
+	{
+		groups[i].group = int(i);
+		groups[i].next = -1;
+		groups[i].size = 1;
+		groups[i].vertices = cluster_offsets[i + 1] - cluster_offsets[i];
+		assert(groups[i].vertices > 0);
+
+		// compute bounding sphere for each cluster if positions are provided
+		if (vertex_positions)
+			groups[i].radius = computeClusterBounds(cluster_indices + cluster_offsets[i], cluster_offsets[i + 1] - cluster_offsets[i], vertex_positions, vertex_positions_stride, groups[i].center);
+		// without positions, center and radius keep the zeros written by the memset above
+		GroupOrder item = {};
+		item.id = unsigned(i);
+		item.order = groups[i].vertices;
+
+		heapPush(order, pending++, item);
+	}
+
+	// iteratively merge the smallest group with the best group
+	while (pending)
+	{
+		GroupOrder top = heapPop(order, pending--);
+
+		// this group was merged into another group earlier
+		if (groups[top.id].size == 0)
+			continue;
+
+		// disassociate clusters from the group to prevent them from being merged again; we will re-associate them if the group is reinserted
+		for (int i = top.id; i >= 0; i = groups[i].next)
+		{
+			assert(groups[i].group == int(top.id));
+			groups[i].group = -1;
+		}
+
+		// the group is large enough, emit as is
+		if (groups[top.id].size >= target_partition_size)
+			continue;
+		// top's own clusters have group == -1 at this point, so pickGroupToMerge skips them and cannot return top itself
+		int best_group = pickGroupToMerge(groups, top.id, adjacency, max_partition_size, /* use_bounds= */ vertex_positions);
+
+		// we can't grow the group any more, emit as is
+		if (best_group == -1)
+			continue;
+
+		// compute shared vertices to adjust the total vertices estimate after merging
+		unsigned int shared = countShared(groups, top.id, best_group, adjacency);
+
+		// combine groups by linking them together
+		unsigned int tail = top.id;
+		while (groups[tail].next >= 0)
+			tail = groups[tail].next;
+
+		groups[tail].next = best_group;
+
+		// update group sizes; note, the vertex update is a O(1) approximation which avoids recomputing the true size
+		groups[top.id].size += groups[best_group].size;
+		groups[top.id].vertices += groups[best_group].vertices;
+		groups[top.id].vertices = (groups[top.id].vertices > shared) ? groups[top.id].vertices - shared : 1;
+
+		groups[best_group].size = 0;
+		groups[best_group].vertices = 0;
+
+		// merge bounding spheres if bounds are available
+		if (vertex_positions)
+		{
+			mergeBounds(groups[top.id], groups[best_group]);
+			groups[best_group].radius = 0;
+		}
+
+		// re-associate all clusters back to the merged group
+		for (int i = top.id; i >= 0; i = groups[i].next)
+			groups[i].group = int(top.id);
+
+		top.order = groups[top.id].vertices;
+		heapPush(order, pending++, top);
+	}
+
+	// if vertex positions are provided, we do a final pass to see if we can merge small groups based on spatial locality alone
+	if (vertex_positions)
+	{
+		unsigned int* merge_order = reinterpret_cast<unsigned int*>(order);
+		size_t merge_offset = 0;
+		// the heap is empty once the loop above finishes, so its storage is reused for the list of surviving group roots (at most cluster_count ids)
+		for (size_t i = 0; i < cluster_count; ++i)
+			if (groups[i].size)
+				merge_order[merge_offset++] = unsigned(i);
+
+		mergeSpatial(groups, merge_order, merge_offset, target_partition_size, max_partition_size, /* leaf_size= */ 8, 0);
+	}
+
+	// output each remaining group
+	size_t next_group = 0;
+
+	for (size_t i = 0; i < cluster_count; ++i)
+	{
+		if (groups[i].size == 0)
+			continue;
+
+		for (int j = int(i); j >= 0; j = groups[j].next)
+			destination[j] = unsigned(next_group);
+
+		next_group++;
+	}
+
+	assert(next_group <= cluster_count);
+	return next_group;
+}
diff --git a/Source/ThirdParty/meshoptimizer/overdrawanalyzer.cpp b/Source/ThirdParty/meshoptimizer/rasterizer.cpp
similarity index 62%
rename from Source/ThirdParty/meshoptimizer/overdrawanalyzer.cpp
rename to Source/ThirdParty/meshoptimizer/rasterizer.cpp
index 31cf6f146..bd788ffdb 100644
--- a/Source/ThirdParty/meshoptimizer/overdrawanalyzer.cpp
+++ b/Source/ThirdParty/meshoptimizer/rasterizer.cpp
@@ -18,14 +18,6 @@ struct OverdrawBuffer
 	unsigned int overdraw[kViewport][kViewport][2];
 };
 
-#ifndef min
-#define min(a, b) ((a) < (b) ? (a) : (b))
-#endif
-
-#ifndef max
-#define max(a, b) ((a) > (b) ? (a) : (b))
-#endif
-
 static float computeDepthGradients(float& dzdx, float& dzdy, float x1, float y1, float z1, float x2, float y2, float z2, float x3, float y3, float z3)
 {
 	// z2 = z1 + dzdx * (x2 - x1) + dzdy * (y2 - y1)
@@ -36,8 +28,8 @@ static float computeDepthGradients(float& dzdx, float& dzdy, float x1, float y1,
 	float det = (x2 - x1) * (y3 - y1) - (y2 - y1) * (x3 - x1);
 	float invdet = (det == 0) ? 0 : 1 / det;
 
-	dzdx = (z2 - z1) * (y3 - y1) - (y2 - y1) * (z3 - z1) * invdet;
-	dzdy = (x2 - x1) * (z3 - z1) - (z2 - z1) * (x3 - x1) * invdet;
+	dzdx = ((z2 - z1) * (y3 - y1) - (y2 - y1) * (z3 - z1)) * invdet;
+	dzdy = ((x2 - x1) * (z3 - z1) - (z2 - z1) * (x3 - x1)) * invdet;
 
 	return det;
 }
@@ -76,11 +68,26 @@ static void rasterize(OverdrawBuffer* buffer, float v1x, float v1y, float v1z, f
 	// bounding rectangle, clipped against viewport
 	// since we rasterize pixels with covered centers, min >0.5 should round up
 	// as for max, due to top-left filling convention we will never rasterize right/bottom edges
-	// so max >= 0.5 should round down
-	int minx = max((min(X1, min(X2, X3)) + 7) >> 4, 0);
-	int maxx = min((max(X1, max(X2, X3)) + 7) >> 4, kViewport);
-	int miny = max((min(Y1, min(Y2, Y3)) + 7) >> 4, 0);
-	int maxy = min((max(Y1, max(Y2, Y3)) + 7) >> 4, kViewport);
+	// so max >= 0.5 should round down for inclusive bounds, and up for exclusive (in our case)
+	int minx = X1 < X2 ? X1 : X2;
+	minx = minx < X3 ? minx : X3;
+	minx = (minx + 7) >> 4;
+	minx = minx < 0 ? 0 : minx;
+
+	int miny = Y1 < Y2 ? Y1 : Y2;
+	miny = miny < Y3 ? miny : Y3;
+	miny = (miny + 7) >> 4;
+	miny = miny < 0 ? 0 : miny;
+
+	int maxx = X1 > X2 ? X1 : X2;
+	maxx = maxx > X3 ? maxx : X3;
+	maxx = (maxx + 7) >> 4;
+	maxx = maxx > kViewport ? kViewport : maxx;
+
+	int maxy = Y1 > Y2 ? Y1 : Y2;
+	maxy = maxy > Y3 ? maxy : Y3;
+	maxy = (maxy + 7) >> 4;
+	maxy = maxy > kViewport ? kViewport : maxy;
 
 	// deltas, 28.4 fixed point
 	int DX12 = X1 - X2;
@@ -139,22 +146,10 @@ static void rasterize(OverdrawBuffer* buffer, float v1x, float v1y, float v1z, f
 	}
 }
 
-} // namespace meshopt
-
-meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+static float transformTriangles(float* triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
 {
-	using namespace meshopt;
-
-	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
-	assert(vertex_positions_stride % sizeof(float) == 0);
-
-	meshopt_Allocator allocator;
-
 	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
 
-	meshopt_OverdrawStatistics result = {};
-
 	float minv[3] = {FLT_MAX, FLT_MAX, FLT_MAX};
 	float maxv[3] = {-FLT_MAX, -FLT_MAX, -FLT_MAX};
 
@@ -164,15 +159,20 @@ meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices,
 
 		for (int j = 0; j < 3; ++j)
 		{
-			minv[j] = min(minv[j], v[j]);
-			maxv[j] = max(maxv[j], v[j]);
+			float vj = v[j];
+
+			minv[j] = minv[j] > vj ? vj : minv[j];
+			maxv[j] = maxv[j] < vj ? vj : maxv[j];
 		}
 	}
 
-	float extent = max(maxv[0] - minv[0], max(maxv[1] - minv[1], maxv[2] - minv[2]));
-	float scale = kViewport / extent;
+	float extent = 0.f;
 
-	float* triangles = allocator.allocate<float>(index_count * 3);
+	extent = (maxv[0] - minv[0]) < extent ? extent : (maxv[0] - minv[0]);
+	extent = (maxv[1] - minv[1]) < extent ? extent : (maxv[1] - minv[1]);
+	extent = (maxv[2] - minv[2]) < extent ? extent : (maxv[2] - minv[2]);
+
+	float scale = kViewport / extent;
 
 	for (size_t i = 0; i < index_count; ++i)
 	{
@@ -186,31 +186,55 @@ meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices,
 		triangles[i * 3 + 2] = (v[2] - minv[2]) * scale;
 	}
 
+	return extent;
+}
+
+// Rasterizes every triangle of the transformed position array into buffer, projected along the given axis (0, 1 or 2).
+// triangles holds three floats per index, so corner k of the triangle starting at index i begins at triangles[3 * (i + k)].
+static void rasterizeTriangles(OverdrawBuffer* buffer, const float* triangles, size_t index_count, int axis)
+{
+	for (size_t first = 0; first < index_count; first += 3)
+	{
+		const float* a = triangles + 3 * first;
+		const float* b = a + 3;
+		const float* c = b + 3;
+
+		// each axis hands a different permutation of the stored components to rasterize() as its (x, y, z)
+		if (axis == 0)
+			// stored (z, y, x)
+			rasterize(buffer, a[2], a[1], a[0], b[2], b[1], b[0], c[2], c[1], c[0]);
+		else if (axis == 1)
+			// stored (x, z, y)
+			rasterize(buffer, a[0], a[2], a[1], b[0], b[2], b[1], c[0], c[2], c[1]);
+		else if (axis == 2)
+			// stored (y, x, z)
+			rasterize(buffer, a[1], a[0], a[2], b[1], b[0], b[2], c[1], c[0], c[2]);
+	}
+}
+
+} // namespace meshopt
+
+meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	meshopt_Allocator allocator;
+
+	meshopt_OverdrawStatistics result = {};
+
+	float* triangles = allocator.allocate<float>(index_count * 3);
+	transformTriangles(triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride);
+
 	OverdrawBuffer* buffer = allocator.allocate<OverdrawBuffer>(1);
 
 	for (int axis = 0; axis < 3; ++axis)
 	{
 		memset(buffer, 0, sizeof(OverdrawBuffer));
-
-		for (size_t i = 0; i < index_count; i += 3)
-		{
-			const float* vn0 = &triangles[3 * (i + 0)];
-			const float* vn1 = &triangles[3 * (i + 1)];
-			const float* vn2 = &triangles[3 * (i + 2)];
-
-			switch (axis)
-			{
-			case 0:
-				rasterize(buffer, vn0[2], vn0[1], vn0[0], vn1[2], vn1[1], vn1[0], vn2[2], vn2[1], vn2[0]);
-				break;
-			case 1:
-				rasterize(buffer, vn0[0], vn0[2], vn0[1], vn1[0], vn1[2], vn1[1], vn2[0], vn2[2], vn2[1]);
-				break;
-			case 2:
-				rasterize(buffer, vn0[1], vn0[0], vn0[2], vn1[1], vn1[0], vn1[2], vn2[1], vn2[0], vn2[2]);
-				break;
-			}
-		}
+		rasterizeTriangles(buffer, triangles, index_count, axis);
 
 		for (int y = 0; y < kViewport; ++y)
 			for (int x = 0; x < kViewport; ++x)
@@ -227,3 +251,39 @@ meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices,
 
 	return result;
 }
+
+// Reports, for each of the three axis projections, the fraction of the kViewport x kViewport raster touched by the mesh, plus the extent returned by transformTriangles.
+meshopt_CoverageStatistics meshopt_analyzeCoverage(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	meshopt_Allocator allocator;
+	meshopt_CoverageStatistics stats = {};
+
+	// three floats per index: transformed corner positions shared by all three projections
+	float* corners = allocator.allocate<float>(index_count * 3);
+	stats.extent = transformTriangles(corners, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride);
+
+	OverdrawBuffer* raster = allocator.allocate<OverdrawBuffer>(1);
+	const float pixel_count = float(kViewport * kViewport);
+
+	for (int axis = 0; axis < 3; ++axis)
+	{
+		memset(raster, 0, sizeof(OverdrawBuffer));
+		rasterizeTriangles(raster, corners, index_count, axis);
+
+		// a pixel counts as covered when either of its two overdraw entries is non-zero
+		unsigned int covered = 0;
+		for (int y = 0; y < kViewport; ++y)
+			for (int x = 0; x < kViewport; ++x)
+				covered += (raster->overdraw[y][x][0] != 0 || raster->overdraw[y][x][1] != 0) ? 1u : 0u;
+
+		stats.coverage[axis] = float(covered) / pixel_count;
+	}
+
+	return stats;
+}
diff --git a/Source/ThirdParty/meshoptimizer/simplifier.cpp b/Source/ThirdParty/meshoptimizer/simplifier.cpp
index e59b4afcd..14d4d42fe 100644
--- a/Source/ThirdParty/meshoptimizer/simplifier.cpp
+++ b/Source/ThirdParty/meshoptimizer/simplifier.cpp
@@ -27,6 +27,7 @@
 // Matthias Teschner, Bruno Heidelberger, Matthias Mueller, Danat Pomeranets, Markus Gross. Optimized Spatial Hashing for Collision Detection of Deformable Objects. 2003
 // Peter Van Sandt, Yannis Chronis, Jignesh M. Patel. Efficiently Searching In-Memory Sorted Arrays: Revenge of the Interpolation Search? 2019
 // Hugues Hoppe. New Quadric Metric for Simplifying Meshes with Appearance Attributes. 1999
+// Hugues Hoppe, Steve Marschner. Efficient Minimization of New Quadric Metric for Simplifying Meshes with Appearance Attributes. 2000
 namespace meshopt
 {
 
@@ -118,10 +119,17 @@ struct PositionHasher
 		unsigned int ri = sparse_remap ? sparse_remap[index] : index;
 		const unsigned int* key = reinterpret_cast<const unsigned int*>(vertex_positions + ri * vertex_stride_float);
 
+		unsigned int x = key[0], y = key[1], z = key[2];
+
+		// replace negative zero with zero
+		x = (x == 0x80000000) ? 0 : x;
+		y = (y == 0x80000000) ? 0 : y;
+		z = (z == 0x80000000) ? 0 : z;
+
 		// scramble bits to make sure that integer coordinates have entropy in lower bits
-		unsigned int x = key[0] ^ (key[0] >> 17);
-		unsigned int y = key[1] ^ (key[1] >> 17);
-		unsigned int z = key[2] ^ (key[2] >> 17);
+		x ^= x >> 17;
+		y ^= y >> 17;
+		z ^= z >> 17;
 
 		// Optimized Spatial Hashing for Collision Detection of Deformable Objects
 		return (x * 73856093) ^ (y * 19349663) ^ (z * 83492791);
@@ -132,7 +140,10 @@ struct PositionHasher
 		unsigned int li = sparse_remap ? sparse_remap[lhs] : lhs;
 		unsigned int ri = sparse_remap ? sparse_remap[rhs] : rhs;
 
-		return memcmp(vertex_positions + li * vertex_stride_float, vertex_positions + ri * vertex_stride_float, sizeof(float) * 3) == 0;
+		const float* lv = vertex_positions + li * vertex_stride_float;
+		const float* rv = vertex_positions + ri * vertex_stride_float;
+
+		return lv[0] == rv[0] && lv[1] == rv[1] && lv[2] == rv[2];
 	}
 };
 
@@ -208,6 +219,11 @@ static void buildPositionRemap(unsigned int* remap, unsigned int* wedge, const f
 		remap[index] = *entry;
 	}
 
+	allocator.deallocate(table);
+
+	if (!wedge)
+		return;
+
 	// build wedge table: for each vertex, which other vertex is the next wedge that also maps to the same vertex?
 	// entries in table form a (cyclic) wedge loop per vertex; for manifold vertices, wedge[i] == remap[i] == i
 	for (size_t i = 0; i < vertex_count; ++i)
@@ -221,22 +237,24 @@ static void buildPositionRemap(unsigned int* remap, unsigned int* wedge, const f
 			wedge[i] = wedge[r];
 			wedge[r] = unsigned(i);
 		}
-
-	allocator.deallocate(table);
 }
 
 static unsigned int* buildSparseRemap(unsigned int* indices, size_t index_count, size_t vertex_count, size_t* out_vertex_count, meshopt_Allocator& allocator)
 {
 	// use a bit set to compute the precise number of unique vertices
 	unsigned char* filter = allocator.allocate<unsigned char>((vertex_count + 7) / 8);
-	memset(filter, 0, (vertex_count + 7) / 8);
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices[i];
+		assert(index < vertex_count);
+		filter[index / 8] = 0;
+	}
 
 	size_t unique = 0;
 	for (size_t i = 0; i < index_count; ++i)
 	{
 		unsigned int index = indices[i];
-		assert(index < vertex_count);
-
 		unique += (filter[index / 8] & (1 << (index % 8))) == 0;
 		filter[index / 8] |= 1 << (index % 8);
 	}
@@ -255,7 +273,6 @@ static unsigned int* buildSparseRemap(unsigned int* indices, size_t index_count,
 	for (size_t i = 0; i < index_count; ++i)
 	{
 		unsigned int index = indices[i];
-
 		unsigned int* entry = hashLookup2(revremap, revremap_size, hasher, index, ~0u);
 
 		if (*entry == ~0u)
@@ -288,14 +305,14 @@ enum VertexKind
 };
 
 // manifold vertices can collapse onto anything
-// border/seam vertices can only be collapsed onto border/seam respectively
+// border/seam vertices can collapse onto border/seam respectively, or locked
 // complex vertices can collapse onto complex/locked
 // a rule of thumb is that collapsing kind A into kind B preserves the kind B in the target vertex
 // for example, while we could collapse Complex into Manifold, this would mean the target vertex isn't Manifold anymore
 const unsigned char kCanCollapse[Kind_Count][Kind_Count] = {
     {1, 1, 1, 1, 1},
-    {0, 1, 0, 0, 0},
-    {0, 0, 1, 0, 0},
+    {0, 1, 0, 0, 1},
+    {0, 0, 1, 0, 1},
     {0, 0, 0, 1, 1},
     {0, 0, 0, 0, 0},
 };
@@ -303,11 +320,13 @@ const unsigned char kCanCollapse[Kind_Count][Kind_Count] = {
 // if a vertex is manifold or seam, adjoining edges are guaranteed to have an opposite edge
 // note that for seam edges, the opposite edge isn't present in the attribute-based topology
 // but is present if you consider a position-only mesh variant
+// while many complex collapses have the opposite edge, since complex vertices collapse to the
+// same wedge, keeping opposite edges separate improves the quality by considering both targets
 const unsigned char kHasOpposite[Kind_Count][Kind_Count] = {
-    {1, 1, 1, 0, 1},
+    {1, 1, 1, 1, 1},
     {1, 0, 1, 0, 0},
     {1, 1, 1, 0, 1},
-    {0, 0, 0, 0, 0},
+    {1, 0, 0, 0, 0},
     {1, 0, 1, 0, 0},
 };
 
@@ -323,14 +342,33 @@ static bool hasEdge(const EdgeAdjacency& adjacency, unsigned int a, unsigned int
 	return false;
 }
 
+// Returns true when some vertex on the wedge loop of a has an edge whose `next` vertex shares a remap entry with b.
+static bool hasEdge(const EdgeAdjacency& adjacency, unsigned int a, unsigned int b, const unsigned int* remap, const unsigned int* wedge)
+{
+	unsigned int v = a;
+
+	do
+	{
+		// edges of v occupy the range [offsets[v], offsets[v + 1]) of adjacency.data
+		const EdgeAdjacency::Edge* edge = adjacency.data + adjacency.offsets[v];
+		const EdgeAdjacency::Edge* last = adjacency.data + adjacency.offsets[v + 1];
+
+		for (; edge != last; ++edge)
+			if (remap[edge->next] == remap[b])
+				return true;
+	} while ((v = wedge[v]) != a);
+
+	return false;
+}
+
 static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned int* loopback, size_t vertex_count, const EdgeAdjacency& adjacency, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_lock, const unsigned int* sparse_remap, unsigned int options)
 {
 	memset(loop, -1, vertex_count * sizeof(unsigned int));
 	memset(loopback, -1, vertex_count * sizeof(unsigned int));
 
 	// incoming & outgoing open edges: ~0u if no open edges, i if there are more than 1
-	// note that this is the same data as required in loop[] arrays; loop[] data is only valid for border/seam
-	// but here it's okay to fill the data out for other types of vertices as well
+	// note that this is the same data as required in loop[] arrays; loop[] data is only used for border/seam by default
+	// in permissive mode we also use it to guide complex-complex collapses, so we fill it for all vertices
 	unsigned int* openinc = loopback;
 	unsigned int* openout = loop;
 
@@ -369,12 +407,7 @@ static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned
 	{
 		if (remap[i] == i)
 		{
-			if (vertex_lock && vertex_lock[sparse_remap ? sparse_remap[i] : i])
-			{
-				// vertex is explicitly locked
-				result[i] = Kind_Locked;
-			}
-			else if (wedge[i] == i)
+			if (wedge[i] == i)
 			{
 				// no attribute seam, need to check if it's manifold
 				unsigned int openi = openinc[i], openo = openout[i];
@@ -386,6 +419,13 @@ static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned
 				{
 					result[i] = Kind_Manifold;
 				}
+				else if (openi != ~0u && openo != ~0u && remap[openi] == remap[openo] && openi != i)
+				{
+					// classify half-seams as seams (the branch below would mis-classify them as borders)
+					// half-seam is a single vertex that connects to both vertices of a potential seam
+					// treating these as seams allows collapsing the "full" seam vertex onto them
+					result[i] = Kind_Seam;
+				}
 				else if (openi != i && openo != i)
 				{
 					result[i] = Kind_Border;
@@ -407,7 +447,7 @@ static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned
 				if (openiv != ~0u && openiv != i && openov != ~0u && openov != i &&
 				    openiw != ~0u && openiw != w && openow != ~0u && openow != w)
 				{
-					if (remap[openiv] == remap[openow] && remap[openov] == remap[openiw])
+					if (remap[openiv] == remap[openow] && remap[openov] == remap[openiw] && remap[openiv] != remap[openov])
 					{
 						result[i] = Kind_Seam;
 					}
@@ -438,6 +478,58 @@ static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned
 		}
 	}
 
+	if (options & meshopt_SimplifyPermissive)
+		for (size_t i = 0; i < vertex_count; ++i)
+			if (result[i] == Kind_Seam || result[i] == Kind_Locked)
+			{
+				if (remap[i] != i)
+				{
+					// only process primary vertices; wedges will be updated to match the primary vertex
+					result[i] = result[remap[i]];
+					continue;
+				}
+
+				bool protect = false;
+
+				// vertex_lock may protect any wedge, not just the primary vertex, so we switch to complex only if no wedges are protected
+				unsigned int v = unsigned(i);
+				do
+				{
+					unsigned int rv = sparse_remap ? sparse_remap[v] : v;
+					protect |= vertex_lock && (vertex_lock[rv] & meshopt_SimplifyVertex_Protect) != 0;
+					v = wedge[v];
+				} while (v != i);
+
+				// protect if any adjoining edge doesn't have an opposite edge (indicating vertex is on the border)
+				do
+				{
+					const EdgeAdjacency::Edge* edges = &adjacency.data[adjacency.offsets[v]];
+					size_t count = adjacency.offsets[v + 1] - adjacency.offsets[v];
+
+					for (size_t j = 0; j < count; ++j)
+						protect |= !hasEdge(adjacency, edges[j].next, v, remap, wedge);
+					v = wedge[v];
+				} while (v != i);
+
+				result[i] = protect ? result[i] : int(Kind_Complex);
+			}
+
+	if (vertex_lock)
+	{
+		// vertex_lock may lock any wedge, not just the primary vertex, so we need to lock the primary vertex and relock any wedges
+		for (size_t i = 0; i < vertex_count; ++i)
+		{
+			unsigned int ri = sparse_remap ? sparse_remap[i] : unsigned(i);
+
+			if (vertex_lock[ri] & meshopt_SimplifyVertex_Lock)
+				result[remap[i]] = Kind_Locked;
+		}
+
+		for (size_t i = 0; i < vertex_count; ++i)
+			if (result[remap[i]] == Kind_Locked)
+				result[i] = Kind_Locked;
+	}
+
 	if (options & meshopt_SimplifyLockBorder)
 		for (size_t i = 0; i < vertex_count; ++i)
 			if (result[i] == Kind_Border)
@@ -454,7 +546,7 @@ struct Vector3
 	float x, y, z;
 };
 
-static float rescalePositions(Vector3* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, const unsigned int* sparse_remap = NULL)
+static float rescalePositions(Vector3* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, const unsigned int* sparse_remap = NULL, float* out_offset = NULL)
 {
 	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
 
@@ -500,10 +592,17 @@ static float rescalePositions(Vector3* result, const float* vertex_positions_dat
 		}
 	}
 
+	if (out_offset)
+	{
+		out_offset[0] = minv[0];
+		out_offset[1] = minv[1];
+		out_offset[2] = minv[2];
+	}
+
 	return extent;
 }
 
-static void rescaleAttributes(float* result, const float* vertex_attributes_data, size_t vertex_count, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned int* sparse_remap)
+static void rescaleAttributes(float* result, const float* vertex_attributes_data, size_t vertex_count, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned int* attribute_remap, const unsigned int* sparse_remap)
 {
 	size_t vertex_attributes_stride_float = vertex_attributes_stride / sizeof(float);
 
@@ -513,18 +612,61 @@ static void rescaleAttributes(float* result, const float* vertex_attributes_data
 
 		for (size_t k = 0; k < attribute_count; ++k)
 		{
-			float a = vertex_attributes_data[ri * vertex_attributes_stride_float + k];
+			unsigned int rk = attribute_remap[k];
+			float a = vertex_attributes_data[ri * vertex_attributes_stride_float + rk];
 
-			result[i * attribute_count + k] = a * attribute_weights[k];
+			result[i * attribute_count + k] = a * attribute_weights[rk];
 		}
 	}
 }
 
-static const size_t kMaxAttributes = 16;
+// Writes simplified vertex data back into the caller's position/attribute arrays for every vertex flagged in vertex_update.
+// Positions are mapped back with vertex_scale and vertex_offset; attributes are divided by their weights.
+static void finalizeVertices(float* vertex_positions_data, size_t vertex_positions_stride, float* vertex_attributes_data, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, size_t vertex_count, const Vector3* vertex_positions, const float* vertex_attributes, const unsigned int* sparse_remap, const unsigned int* attribute_remap, float vertex_scale, const float* vertex_offset, const unsigned char* vertex_kind, const unsigned char* vertex_update, const unsigned char* vertex_lock)
+{
+	const size_t position_step = vertex_positions_stride / sizeof(float);
+	const size_t attribute_step = vertex_attributes_stride / sizeof(float);
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		if (!vertex_update[i])
+			continue;
+
+		// target is the index into the caller's arrays; vertices locked through vertex_lock are never written
+		const unsigned int target = sparse_remap ? sparse_remap[i] : unsigned(i);
+		if (vertex_lock && (vertex_lock[target] & meshopt_SimplifyVertex_Lock) != 0)
+			continue;
+
+		// positions of Kind_Locked vertices are kept as is: rewriting them may result in floating point drift
+		if (vertex_kind[i] != Kind_Locked)
+		{
+			const Vector3& scaled = vertex_positions[i];
+			float* position = vertex_positions_data + target * position_step;
+			position[0] = scaled.x * vertex_scale + vertex_offset[0];
+			position[1] = scaled.y * vertex_scale + vertex_offset[1];
+			position[2] = scaled.z * vertex_scale + vertex_offset[2];
+		}
+
+		if (attribute_count == 0)
+			continue;
+
+		const float* weighted = vertex_attributes + i * attribute_count;
+		float* attributes = vertex_attributes_data + target * attribute_step;
+
+		// undo the attribute weighting; attribute_remap[k] is the caller-side attribute index of internal slot k
+		for (size_t k = 0; k < attribute_count; ++k)
+		{
+			const unsigned int slot = attribute_remap[k];
+			attributes[slot] = weighted[k] / attribute_weights[slot];
+		}
+	}
+}
+
+static const size_t kMaxAttributes = 32;
 
 struct Quadric
 {
-	// a00*x^2 + a11*y^2 + a22*z^2 + 2*(a10*xy + a20*xz + a21*yz) + b0*x + b1*y + b2*z + c
+	// a00*x^2 + a11*y^2 + a22*z^2 + 2*a10*xy + 2*a20*xz + 2*a21*yz + 2*b0*x + 2*b1*y + 2*b2*z + c
 	float a00, a11, a22;
 	float a10, a20, a21;
 	float b0, b1, b2, c;
@@ -586,6 +728,14 @@ static void quadricAdd(Quadric& Q, const Quadric& R)
 	Q.w += R.w;
 }
 
+static void quadricAdd(QuadricGrad& G, const QuadricGrad& R) // accumulates a single gradient: G += R, component by component
+{
+	G.gx += R.gx;
+	G.gy += R.gy;
+	G.gz += R.gz;
+	G.gw += R.gw;
+}
+
 static void quadricAdd(QuadricGrad* G, const QuadricGrad* R, size_t attribute_count)
 {
 	for (size_t k = 0; k < attribute_count; ++k)
@@ -597,7 +747,7 @@ static void quadricAdd(QuadricGrad* G, const QuadricGrad* R, size_t attribute_co
 	}
 }
 
-static float quadricError(const Quadric& Q, const Vector3& v)
+static float quadricEval(const Quadric& Q, const Vector3& v)
 {
 	float rx = Q.b0;
 	float ry = Q.b1;
@@ -620,6 +770,12 @@ static float quadricError(const Quadric& Q, const Vector3& v)
 	r += ry * v.y;
 	r += rz * v.z;
 
+	return r;
+}
+
+static float quadricError(const Quadric& Q, const Vector3& v)
+{
+	float r = quadricEval(Q, v);
 	float s = Q.w == 0.f ? 0.f : 1.f / Q.w;
 
 	return fabsf(r) * s;
@@ -627,26 +783,7 @@ static float quadricError(const Quadric& Q, const Vector3& v)
 
 static float quadricError(const Quadric& Q, const QuadricGrad* G, size_t attribute_count, const Vector3& v, const float* va)
 {
-	float rx = Q.b0;
-	float ry = Q.b1;
-	float rz = Q.b2;
-
-	rx += Q.a10 * v.y;
-	ry += Q.a21 * v.z;
-	rz += Q.a20 * v.x;
-
-	rx *= 2;
-	ry *= 2;
-	rz *= 2;
-
-	rx += Q.a00 * v.x;
-	ry += Q.a11 * v.y;
-	rz += Q.a22 * v.z;
-
-	float r = Q.c;
-	r += rx * v.x;
-	r += ry * v.y;
-	r += rz * v.z;
+	float r = quadricEval(Q, v);
 
 	// see quadricFromAttributes for general derivation; here we need to add the parts of (eval(pos) - attr)^2 that depend on attr
 	for (size_t k = 0; k < attribute_count; ++k)
@@ -654,14 +791,11 @@ static float quadricError(const Quadric& Q, const QuadricGrad* G, size_t attribu
 		float a = va[k];
 		float g = v.x * G[k].gx + v.y * G[k].gy + v.z * G[k].gz + G[k].gw;
 
-		r += a * a * Q.w;
-		r -= 2 * a * g;
+		r += a * (a * Q.w - 2 * g);
 	}
 
-	// TODO: weight normalization is breaking attribute error somehow
-	float s = 1; // Q.w == 0.f ? 0.f : 1.f / Q.w;
-
-	return fabsf(r) * s;
+	// note: unlike position error, we do not normalize by Q.w to retain edge scaling as described in quadricFromAttributes
+	return fabsf(r);
 }
 
 static void quadricFromPlane(Quadric& Q, float a, float b, float c, float d, float w)
@@ -684,6 +818,17 @@ static void quadricFromPlane(Quadric& Q, float a, float b, float c, float d, flo
 	Q.w = w;
 }
 
+// quadric of the weighted squared distance to point (x, y, z), i.e. w * |p - (x, y, z)|^2
+static void quadricFromPoint(Quadric& Q, float x, float y, float z, float w)
+{
+	Q.w = Q.a00 = Q.a11 = Q.a22 = w;
+	Q.a10 = Q.a20 = Q.a21 = 0.f;
+	Q.c = w * (x * x + y * y + z * z);
+	Q.b0 = w * -x;
+	Q.b1 = w * -y;
+	Q.b2 = w * -z;
+}
+
 static void quadricFromTriangle(Quadric& Q, const Vector3& p0, const Vector3& p1, const Vector3& p2, float weight)
 {
 	Vector3 p10 = {p1.x - p0.x, p1.y - p0.y, p1.z - p0.z};
@@ -702,20 +847,24 @@ static void quadricFromTriangle(Quadric& Q, const Vector3& p0, const Vector3& p1
 static void quadricFromTriangleEdge(Quadric& Q, const Vector3& p0, const Vector3& p1, const Vector3& p2, float weight)
 {
 	Vector3 p10 = {p1.x - p0.x, p1.y - p0.y, p1.z - p0.z};
-	float length = normalize(p10);
 
-	// p20p = length of projection of p2-p0 onto normalize(p1 - p0)
+	// edge length; keep squared length around for projection correction
+	float lengthsq = p10.x * p10.x + p10.y * p10.y + p10.z * p10.z;
+	float length = sqrtf(lengthsq);
+
+	// p20p = length of projection of p2-p0 onto p1-p0; note that p10 is unnormalized so we need to correct it later
 	Vector3 p20 = {p2.x - p0.x, p2.y - p0.y, p2.z - p0.z};
 	float p20p = p20.x * p10.x + p20.y * p10.y + p20.z * p10.z;
 
-	// normal = altitude of triangle from point p2 onto edge p1-p0
-	Vector3 normal = {p20.x - p10.x * p20p, p20.y - p10.y * p20p, p20.z - p10.z * p20p};
-	normalize(normal);
+	// perp = perpendicular vector from p2 to line segment p1-p0
+	// note: since p10 is unnormalized we need to correct the projection; we scale p20 instead to take advantage of normalize below
+	Vector3 perp = {p20.x * lengthsq - p10.x * p20p, p20.y * lengthsq - p10.y * p20p, p20.z * lengthsq - p10.z * p20p};
+	normalize(perp);
 
-	float distance = normal.x * p0.x + normal.y * p0.y + normal.z * p0.z;
+	float distance = perp.x * p0.x + perp.y * p0.y + perp.z * p0.z;
 
 	// note: the weight is scaled linearly with edge length; this has to match the triangle weight
-	quadricFromPlane(Q, normal.x, normal.y, normal.z, -distance, length * weight);
+	quadricFromPlane(Q, perp.x, perp.y, perp.z, -distance, length * weight);
 }
 
 static void quadricFromAttributes(Quadric& Q, QuadricGrad* G, const Vector3& p0, const Vector3& p1, const Vector3& p2, const float* va0, const float* va1, const float* va2, size_t attribute_count)
@@ -728,16 +877,21 @@ static void quadricFromAttributes(Quadric& Q, QuadricGrad* G, const Vector3& p0,
 	Vector3 p10 = {p1.x - p0.x, p1.y - p0.y, p1.z - p0.z};
 	Vector3 p20 = {p2.x - p0.x, p2.y - p0.y, p2.z - p0.z};
 
-	// weight is scaled linearly with edge length
+	// normal = cross(p1 - p0, p2 - p0)
 	Vector3 normal = {p10.y * p20.z - p10.z * p20.y, p10.z * p20.x - p10.x * p20.z, p10.x * p20.y - p10.y * p20.x};
-	float area = sqrtf(normal.x * normal.x + normal.y * normal.y + normal.z * normal.z);
-	float w = sqrtf(area); // TODO this needs more experimentation
+	float area = sqrtf(normal.x * normal.x + normal.y * normal.y + normal.z * normal.z) * 0.5f;
+
+	// quadric is weighted with the square of edge length (= area)
+	// this equalizes the units with the positional error (which, after normalization, is a square of distance)
+	// as a result, a change in weighted attribute of 1 along distance d is approximately equivalent to a change in position of d
+	float w = area;
 
 	// we compute gradients using barycentric coordinates; barycentric coordinates can be computed as follows:
 	// v = (d11 * d20 - d01 * d21) / denom
 	// w = (d00 * d21 - d01 * d20) / denom
 	// u = 1 - v - w
 	// here v0, v1 are triangle edge vectors, v2 is a vector from point to triangle corner, and dij = dot(vi, vj)
+	// note: v2 and d20/d21 can not be evaluated here as v2 is effectively an unknown variable; we need these only as variables for derivation of gradients
 	const Vector3& v0 = p10;
 	const Vector3& v1 = p20;
 	float d00 = v0.x * v0.x + v0.y * v0.y + v0.z * v0.z;
@@ -747,7 +901,7 @@ static void quadricFromAttributes(Quadric& Q, QuadricGrad* G, const Vector3& p0,
 	float denomr = denom == 0 ? 0.f : 1.f / denom;
 
 	// precompute gradient factors
-	// these are derived by directly computing derivative of eval(pos) = a0 * u + a1 * v + a2 * w and factoring out common factors that are shared between attributes
+	// these are derived by directly computing derivative of eval(pos) = a0 * u + a1 * v + a2 * w and factoring out expressions that are shared between attributes
 	float gx1 = (d11 * v0.x - d01 * v1.x) * denomr;
 	float gx2 = (d00 * v1.x - d01 * v0.x) * denomr;
 	float gy1 = (d11 * v0.y - d01 * v1.y) * denomr;
@@ -772,6 +926,7 @@ static void quadricFromAttributes(Quadric& Q, QuadricGrad* G, const Vector3& p0,
 
 		// quadric encodes (eval(pos)-attr)^2; this means that the resulting expansion needs to compute, for example, pos.x * pos.y * K
 		// since quadrics already encode factors for pos.x * pos.y, we can accumulate almost everything in basic quadric fields
+		// note: for simplicity we scale all factors by weight here instead of outside the loop
 		Q.a00 += w * (gx * gx);
 		Q.a11 += w * (gy * gy);
 		Q.a22 += w * (gz * gz);
@@ -794,7 +949,112 @@ static void quadricFromAttributes(Quadric& Q, QuadricGrad* G, const Vector3& p0,
 	}
 }
 
-static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* remap)
+// fills G with the normalized triangle normal and the plane offset -dot(n, p0), both scaled by the triangle area
+static void quadricVolumeGradient(QuadricGrad& G, const Vector3& p0, const Vector3& p1, const Vector3& p2)
+{
+	Vector3 e1 = {p1.x - p0.x, p1.y - p0.y, p1.z - p0.z};
+	Vector3 e2 = {p2.x - p0.x, p2.y - p0.y, p2.z - p0.z};
+	// n = cross(e1, e2); area is half of the value normalize() returns for it
+	Vector3 n = {e1.y * e2.z - e1.z * e2.y, e1.z * e2.x - e1.x * e2.z, e1.x * e2.y - e1.y * e2.x};
+	float area = normalize(n) * 0.5f;
+
+	G.gx = n.x * area;
+	G.gy = n.y * area;
+	G.gz = n.z * area;
+	G.gw = (-p0.x * n.x - p0.y * n.y - p0.z * n.z) * area;
+}
+
+static bool quadricSolve(Vector3& p, const Quadric& Q, const QuadricGrad& GV)
+{
+	// solve A*p = -b where A is the quadric matrix and b is the linear term; returns false when a pivot of A is too small to trust p
+	float a00 = Q.a00, a11 = Q.a11, a22 = Q.a22;
+	float a10 = Q.a10, a20 = Q.a20, a21 = Q.a21;
+	float x0 = -Q.b0, x1 = -Q.b1, x2 = -Q.b2;
+	// pivots are compared against a threshold that scales with the quadric weight
+	float eps = 1e-6f * Q.w;
+
+	// LDL decomposition: A = LDL^T (d0..d2 form the diagonal of D; l10, l20, l21 are the entries of L below its unit diagonal)
+	float d0 = a00;
+	float l10 = a10 / d0;
+	float l20 = a20 / d0;
+
+	float d1 = a11 - a10 * l10;
+	float dl21 = a21 - a20 * l10;
+	float l21 = dl21 / d1;
+
+	float d2 = a22 - a20 * l20 - dl21 * l21;
+
+	// solve L*y = x (forward substitution)
+	float y0 = x0;
+	float y1 = x1 - l10 * y0;
+	float y2 = x2 - l20 * y0 - l21 * y1;
+
+	// solve D*z = y
+	float z0 = y0 / d0;
+	float z1 = y1 / d1;
+	float z2 = y2 / d2;
+
+	// augment system with linear constraint GV using Lagrange multiplier: the extra row is (gx, gy, gz, 0) with right-hand side -gw
+	float a30 = GV.gx, a31 = GV.gy, a32 = GV.gz;
+	float x3 = -GV.gw;
+
+	float l30 = a30 / d0;
+	float dl31 = a31 - a30 * l10;
+	float l31 = dl31 / d1;
+	float dl32 = a32 - a30 * l20 - dl31 * l21;
+	float l32 = dl32 / d2;
+	float d3 = 0.f - a30 * l30 - dl31 * l31 - dl32 * l32;
+
+	float y3 = x3 - l30 * y0 - l31 * y1 - l32 * y2;
+	float z3 = fabsf(d3) > eps ? y3 / d3 : 0.f; // if d3 is zero, we can ignore the constraint
+
+	// substitute L^T*p = z (back substitution); lambda is the Lagrange multiplier and is zero when the constraint is ignored
+	float lambda = z3;
+	float pz = z2 - l32 * lambda;
+	float py = z1 - l21 * pz - l31 * lambda;
+	float px = z0 - l10 * py - l20 * pz - l30 * lambda;
+
+	p.x = px;
+	p.y = py;
+	p.z = pz;
+	// p is written unconditionally; the return value reports whether all three pivots of A passed the threshold
+	return fabsf(d0) > eps && fabsf(d1) > eps && fabsf(d2) > eps;
+}
+
+// Folds attribute quadric A and attribute gradients G into vertex quadric Q, scaling both contributions by the vertex weight Q.w.
+static void quadricReduceAttributes(Quadric& Q, const Quadric& A, const QuadricGrad* G, size_t attribute_count)
+{
+	const float vw = Q.w;
+	// add the attribute quadric; it is multiplied by the vertex weight to minimize normalized error
+	Q.a00 += vw * A.a00;
+	Q.a11 += vw * A.a11;
+	Q.a22 += vw * A.a22;
+	Q.a10 += vw * A.a10;
+	Q.a20 += vw * A.a20;
+	Q.a21 += vw * A.a21;
+	Q.b0 += vw * A.b0;
+	Q.b1 += vw * A.b1;
+	Q.b2 += vw * A.b2;
+
+	// subtract the attribute gradient terms from the linear system (BB^T/a); a zero attribute weight turns this into a no-op
+	const float scale = (A.w != 0) ? vw / A.w : 0.f;
+
+	for (size_t k = 0; k < attribute_count; ++k)
+	{
+		const float gx = G[k].gx, gy = G[k].gy, gz = G[k].gz, gw = G[k].gw;
+		Q.a00 -= scale * (gx * gx);
+		Q.a11 -= scale * (gy * gy);
+		Q.a22 -= scale * (gz * gz);
+		Q.a10 -= scale * (gx * gy);
+		Q.a20 -= scale * (gx * gz);
+		Q.a21 -= scale * (gy * gz);
+		Q.b0 -= scale * (gx * gw);
+		Q.b1 -= scale * (gy * gw);
+		Q.b2 -= scale * (gz * gw);
+	}
+}
+
+static void fillFaceQuadrics(Quadric* vertex_quadrics, QuadricGrad* volume_gradients, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* remap)
 {
 	for (size_t i = 0; i < index_count; i += 3)
 	{
@@ -808,6 +1068,36 @@ static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indic
 		quadricAdd(vertex_quadrics[remap[i0]], Q);
 		quadricAdd(vertex_quadrics[remap[i1]], Q);
 		quadricAdd(vertex_quadrics[remap[i2]], Q);
+
+		if (volume_gradients)
+		{
+			QuadricGrad GV;
+			quadricVolumeGradient(GV, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2]);
+
+			quadricAdd(volume_gradients[remap[i0]], GV);
+			quadricAdd(volume_gradients[remap[i1]], GV);
+			quadricAdd(volume_gradients[remap[i2]], GV);
+		}
+	}
+}
+
+static void fillVertexQuadrics(Quadric* vertex_quadrics, const Vector3* vertex_positions, size_t vertex_count, const unsigned int* remap, unsigned int options)
+{
+	// by default, we use a very small weight to improve triangulation and numerical stability without affecting the shape or error
+	float factor = (options & meshopt_SimplifyRegularize) ? 1e-1f : 1e-7f;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		if (remap[i] != i)
+			continue;
+
+		const Vector3& p = vertex_positions[i];
+		float w = vertex_quadrics[i].w * factor;
+
+		Quadric Q;
+		quadricFromPoint(Q, p.x, p.y, p.z, w);
+
+		quadricAdd(vertex_quadrics[i], Q);
 	}
 }
 
@@ -837,15 +1127,11 @@ static void fillEdgeQuadrics(Quadric* vertex_quadrics, const unsigned int* indic
 			if ((k1 == Kind_Border || k1 == Kind_Seam) && loopback[i1] != i0)
 				continue;
 
-			// seam edges should occur twice (i0->i1 and i1->i0) - skip redundant edges
-			if (kHasOpposite[k0][k1] && remap[i1] > remap[i0])
-				continue;
-
 			unsigned int i2 = indices[i + next[e + 1]];
 
 			// we try hard to maintain border edge geometry; seam edges can move more freely
 			// due to topological restrictions on collapses, seam quadrics slightly improves collapse structure but aren't critical
-			const float kEdgeWeightSeam = 1.f;
+			const float kEdgeWeightSeam = 0.5f; // applied twice due to opposite edges
 			const float kEdgeWeightBorder = 10.f;
 
 			float edgeWeight = (k0 == Kind_Border || k1 == Kind_Border) ? kEdgeWeightBorder : kEdgeWeightSeam;
@@ -853,13 +1139,20 @@ static void fillEdgeQuadrics(Quadric* vertex_quadrics, const unsigned int* indic
 			Quadric Q;
 			quadricFromTriangleEdge(Q, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], edgeWeight);
 
+			Quadric QT;
+			quadricFromTriangle(QT, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], edgeWeight);
+
+			// mix edge quadric with triangle quadric to stabilize collapses in both directions; both quadrics inherit edge weight so that their error is added
+			QT.w = 0;
+			quadricAdd(Q, QT);
+
 			quadricAdd(vertex_quadrics[remap[i0]], Q);
 			quadricAdd(vertex_quadrics[remap[i1]], Q);
 		}
 	}
 }
 
-static void fillAttributeQuadrics(Quadric* attribute_quadrics, QuadricGrad* attribute_gradients, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const float* vertex_attributes, size_t attribute_count, const unsigned int* remap)
+static void fillAttributeQuadrics(Quadric* attribute_quadrics, QuadricGrad* attribute_gradients, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const float* vertex_attributes, size_t attribute_count)
 {
 	for (size_t i = 0; i < index_count; i += 3)
 	{
@@ -871,14 +1164,13 @@ static void fillAttributeQuadrics(Quadric* attribute_quadrics, QuadricGrad* attr
 		QuadricGrad G[kMaxAttributes];
 		quadricFromAttributes(QA, G, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], &vertex_attributes[i0 * attribute_count], &vertex_attributes[i1 * attribute_count], &vertex_attributes[i2 * attribute_count], attribute_count);
 
-		// TODO: This blends together attribute weights across attribute discontinuities, which is probably not a great idea
-		quadricAdd(attribute_quadrics[remap[i0]], QA);
-		quadricAdd(attribute_quadrics[remap[i1]], QA);
-		quadricAdd(attribute_quadrics[remap[i2]], QA);
+		quadricAdd(attribute_quadrics[i0], QA);
+		quadricAdd(attribute_quadrics[i1], QA);
+		quadricAdd(attribute_quadrics[i2], QA);
 
-		quadricAdd(&attribute_gradients[remap[i0] * attribute_count], G, attribute_count);
-		quadricAdd(&attribute_gradients[remap[i1] * attribute_count], G, attribute_count);
-		quadricAdd(&attribute_gradients[remap[i2] * attribute_count], G, attribute_count);
+		quadricAdd(&attribute_gradients[i0 * attribute_count], G, attribute_count);
+		quadricAdd(&attribute_gradients[i1 * attribute_count], G, attribute_count);
+		quadricAdd(&attribute_gradients[i2 * attribute_count], G, attribute_count);
 	}
 }
 
@@ -922,6 +1214,30 @@ static bool hasTriangleFlips(const EdgeAdjacency& adjacency, const Vector3* vert
 			continue;
 
 		// early-out when at least one triangle flips due to a collapse
+		if (hasTriangleFlip(vertex_positions[a], vertex_positions[b], v0, v1))
+		{
+#if TRACE >= 2
+			printf("edge block %d -> %d: flip welded %d %d %d\n", i0, i1, a, i0, b);
+#endif
+
+			return true;
+		}
+	}
+
+	return false;
+}
+
+static bool hasTriangleFlips(const EdgeAdjacency& adjacency, const Vector3* vertex_positions, unsigned int i0, const Vector3& v1)
+{
+	const Vector3& v0 = vertex_positions[i0];
+
+	const EdgeAdjacency::Edge* edges = &adjacency.data[adjacency.offsets[i0]];
+	size_t count = adjacency.offsets[i0 + 1] - adjacency.offsets[i0];
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		unsigned int a = edges[i].next, b = edges[i].prev;
+
 		if (hasTriangleFlip(vertex_positions[a], vertex_positions[b], v0, v1))
 			return true;
 	}
@@ -929,6 +1245,46 @@ static bool hasTriangleFlips(const EdgeAdjacency& adjacency, const Vector3* vert
 	return false;
 }
 
+static float getNeighborhoodRadius(const EdgeAdjacency& adjacency, const Vector3* vertex_positions, unsigned int i0)
+{
+	const Vector3& v0 = vertex_positions[i0];
+
+	const EdgeAdjacency::Edge* edges = &adjacency.data[adjacency.offsets[i0]];
+	size_t count = adjacency.offsets[i0 + 1] - adjacency.offsets[i0];
+
+	float result = 0.f;
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		unsigned int a = edges[i].next, b = edges[i].prev;
+
+		const Vector3& va = vertex_positions[a];
+		const Vector3& vb = vertex_positions[b];
+
+		float da = (va.x - v0.x) * (va.x - v0.x) + (va.y - v0.y) * (va.y - v0.y) + (va.z - v0.z) * (va.z - v0.z);
+		float db = (vb.x - v0.x) * (vb.x - v0.x) + (vb.y - v0.y) * (vb.y - v0.y) + (vb.z - v0.z) * (vb.z - v0.z);
+
+		result = result < da ? da : result;
+		result = result < db ? db : result;
+	}
+
+	return sqrtf(result);
+}
+
+static unsigned int getComplexTarget(unsigned int v, unsigned int target, const unsigned int* remap, const unsigned int* loop, const unsigned int* loopback)
+{
+	unsigned int r = remap[target];
+
+	// use loop metadata to guide complex collapses towards the correct wedge
+	// this works for edges on attribute discontinuities because loop/loopback track the single half-edge without a pair, similar to seams
+	if (loop[v] != ~0u && remap[loop[v]] == r)
+		return loop[v];
+	else if (loopback[v] != ~0u && remap[loopback[v]] == r)
+		return loopback[v];
+	else
+		return target;
+}
+
 static size_t boundEdgeCollapses(const EdgeAdjacency& adjacency, size_t vertex_count, size_t index_count, unsigned char* vertex_kind)
 {
 	size_t dual_count = 0;
@@ -947,7 +1303,7 @@ static size_t boundEdgeCollapses(const EdgeAdjacency& adjacency, size_t vertex_c
 	return (index_count - dual_count / 2) + 3;
 }
 
-static size_t pickEdgeCollapses(Collapse* collapses, size_t collapse_capacity, const unsigned int* indices, size_t index_count, const unsigned int* remap, const unsigned char* vertex_kind, const unsigned int* loop)
+static size_t pickEdgeCollapses(Collapse* collapses, size_t collapse_capacity, const unsigned int* indices, size_t index_count, const unsigned int* remap, const unsigned char* vertex_kind, const unsigned int* loop, const unsigned int* loopback)
 {
 	size_t collapse_count = 0;
 
@@ -983,8 +1339,10 @@ static size_t pickEdgeCollapses(Collapse* collapses, size_t collapse_capacity, c
 
 			// two vertices are on a border or a seam, but there's no direct edge between them
 			// this indicates that they belong to two different edge loops and we should not collapse this edge
-			// loop[] tracks half edges so we only need to check i0->i1
-			if (k0 == k1 && (k0 == Kind_Border || k0 == Kind_Seam) && loop[i0] != i1)
+			// loop[] and loopback[] track half edges so we only need to check one of them
+			if ((k0 == Kind_Border || k0 == Kind_Seam) && k1 != Kind_Manifold && loop[i0] != i1)
+				continue;
+			if ((k1 == Kind_Border || k1 == Kind_Seam) && k0 != Kind_Manifold && loopback[i1] != i0)
 				continue;
 
 			// edge can be collapsed in either direction - we will pick the one with minimum error
@@ -1009,7 +1367,7 @@ static size_t pickEdgeCollapses(Collapse* collapses, size_t collapse_capacity, c
 	return collapse_count;
 }
 
-static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const Vector3* vertex_positions, const float* vertex_attributes, const Quadric* vertex_quadrics, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap)
+static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const Vector3* vertex_positions, const float* vertex_attributes, const Quadric* vertex_quadrics, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_kind, const unsigned int* loop, const unsigned int* loopback)
 {
 	for (size_t i = 0; i < collapse_count; ++i)
 	{
@@ -1017,40 +1375,94 @@ static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const
 
 		unsigned int i0 = c.v0;
 		unsigned int i1 = c.v1;
-
-		// most edges are bidirectional which means we need to evaluate errors for two collapses
-		// to keep this code branchless we just use the same edge for unidirectional edges
-		unsigned int j0 = c.bidi ? i1 : i0;
-		unsigned int j1 = c.bidi ? i0 : i1;
+		bool bidi = c.bidi;
 
 		float ei = quadricError(vertex_quadrics[remap[i0]], vertex_positions[i1]);
-		float ej = quadricError(vertex_quadrics[remap[j0]], vertex_positions[j1]);
+		float ej = bidi ? quadricError(vertex_quadrics[remap[i1]], vertex_positions[i0]) : FLT_MAX;
+
+#if TRACE >= 3
+		float di = ei, dj = ej;
+#endif
 
 		if (attribute_count)
 		{
-			ei += quadricError(attribute_quadrics[remap[i0]], &attribute_gradients[remap[i0] * attribute_count], attribute_count, vertex_positions[i1], &vertex_attributes[i1 * attribute_count]);
-			ej += quadricError(attribute_quadrics[remap[j0]], &attribute_gradients[remap[j0] * attribute_count], attribute_count, vertex_positions[j1], &vertex_attributes[j1 * attribute_count]);
+			ei += quadricError(attribute_quadrics[i0], &attribute_gradients[i0 * attribute_count], attribute_count, vertex_positions[i1], &vertex_attributes[i1 * attribute_count]);
+			ej += bidi ? quadricError(attribute_quadrics[i1], &attribute_gradients[i1 * attribute_count], attribute_count, vertex_positions[i0], &vertex_attributes[i0 * attribute_count]) : 0;
+
+			// seam edges need to aggregate attribute errors between primary and secondary edges, as attribute quadrics are separate
+			if (vertex_kind[i0] == Kind_Seam)
+			{
+				// for seam collapses we need to find the seam pair; this is a bit tricky since we need to rely on edge loops as target vertex may be locked (and thus have more than two wedges)
+				unsigned int s0 = wedge[i0];
+				unsigned int s1 = loop[i0] == i1 ? loopback[s0] : loop[s0];
+
+				assert(wedge[s0] == i0); // s0 may be equal to i0 for half-seams
+				assert(s1 != ~0u && remap[s1] == remap[i1]);
+
+				// note: this should never happen due to the assertion above, but when disabled if we ever hit this case we'll get a memory safety issue; for now play it safe
+				s1 = (s1 != ~0u) ? s1 : wedge[i1];
+
+				ei += quadricError(attribute_quadrics[s0], &attribute_gradients[s0 * attribute_count], attribute_count, vertex_positions[s1], &vertex_attributes[s1 * attribute_count]);
+				ej += bidi ? quadricError(attribute_quadrics[s1], &attribute_gradients[s1 * attribute_count], attribute_count, vertex_positions[s0], &vertex_attributes[s0 * attribute_count]) : 0;
+			}
+			else
+			{
+				// complex edges can have multiple wedges, so we need to aggregate errors for all wedges based on the selected target
+				if (vertex_kind[i0] == Kind_Complex)
+					for (unsigned int v = wedge[i0]; v != i0; v = wedge[v])
+					{
+						unsigned int t = getComplexTarget(v, i1, remap, loop, loopback);
+
+						ei += quadricError(attribute_quadrics[v], &attribute_gradients[v * attribute_count], attribute_count, vertex_positions[t], &vertex_attributes[t * attribute_count]);
+					}
+
+				if (vertex_kind[i1] == Kind_Complex && bidi)
+					for (unsigned int v = wedge[i1]; v != i1; v = wedge[v])
+					{
+						unsigned int t = getComplexTarget(v, i0, remap, loop, loopback);
+
+						ej += quadricError(attribute_quadrics[v], &attribute_gradients[v * attribute_count], attribute_count, vertex_positions[t], &vertex_attributes[t * attribute_count]);
+					}
+			}
 		}
 
-		// pick edge direction with minimal error
-		c.v0 = ei <= ej ? i0 : j0;
-		c.v1 = ei <= ej ? i1 : j1;
-		c.error = ei <= ej ? ei : ej;
+		// pick edge direction with minimal error (branchless)
+		bool rev = bidi & (ej < ei);
+
+		c.v0 = rev ? i1 : i0;
+		c.v1 = rev ? i0 : i1;
+		c.error = ej < ei ? ej : ei;
+
+#if TRACE >= 3
+		if (bidi)
+			printf("edge eval %d -> %d: error %f (pos %f, attr %f); reverse %f (pos %f, attr %f)\n",
+			    rev ? i1 : i0, rev ? i0 : i1,
+			    sqrtf(rev ? ej : ei), sqrtf(rev ? dj : di), sqrtf(rev ? ej - dj : ei - di),
+			    sqrtf(rev ? ei : ej), sqrtf(rev ? di : dj), sqrtf(rev ? ei - di : ej - dj));
+		else
+			printf("edge eval %d -> %d: error %f (pos %f, attr %f)\n", i0, i1, sqrtf(c.error), sqrtf(di), sqrtf(ei - di));
+#endif
 	}
 }
 
 static void sortEdgeCollapses(unsigned int* sort_order, const Collapse* collapses, size_t collapse_count)
 {
-	const int sort_bits = 11;
+	// we use counting sort to order collapses by error; since the exact sort order is not as critical,
+	// only top 12 bits of exponent+mantissa (8 bits of exponent and 4 bits of mantissa) are used.
+	// to avoid excessive stack usage, we clamp the exponent range as collapses with errors much higher than 1 are not useful.
+	const unsigned int sort_bits = 12;
+	const unsigned int sort_bins = 2048 + 512; // exponent range [-127, 32)
 
 	// fill histogram for counting sort
-	unsigned int histogram[1 << sort_bits];
+	unsigned int histogram[sort_bins];
 	memset(histogram, 0, sizeof(histogram));
 
 	for (size_t i = 0; i < collapse_count; ++i)
 	{
 		// skip sign bit since error is non-negative
-		unsigned int key = (collapses[i].errorui << 1) >> (32 - sort_bits);
+		unsigned int error = collapses[i].errorui;
+		unsigned int key = (error << 1) >> (32 - sort_bits);
+		key = key < sort_bins ? key : sort_bins - 1;
 
 		histogram[key]++;
 	}
@@ -1058,7 +1470,7 @@ static void sortEdgeCollapses(unsigned int* sort_order, const Collapse* collapse
 	// compute offsets based on histogram data
 	size_t histogram_sum = 0;
 
-	for (size_t i = 0; i < 1 << sort_bits; ++i)
+	for (size_t i = 0; i < sort_bins; ++i)
 	{
 		size_t count = histogram[i];
 		histogram[i] = unsigned(histogram_sum);
@@ -1071,13 +1483,15 @@ static void sortEdgeCollapses(unsigned int* sort_order, const Collapse* collapse
 	for (size_t i = 0; i < collapse_count; ++i)
 	{
 		// skip sign bit since error is non-negative
-		unsigned int key = (collapses[i].errorui << 1) >> (32 - sort_bits);
+		unsigned int error = collapses[i].errorui;
+		unsigned int key = (error << 1) >> (32 - sort_bits);
+		key = key < sort_bins ? key : sort_bins - 1;
 
 		sort_order[histogram[key]++] = unsigned(i);
 	}
 }
 
-static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char* collapse_locked, Quadric* vertex_quadrics, Quadric* attribute_quadrics, QuadricGrad* attribute_gradients, size_t attribute_count, const Collapse* collapses, size_t collapse_count, const unsigned int* collapse_order, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_kind, const Vector3* vertex_positions, const EdgeAdjacency& adjacency, size_t triangle_collapse_goal, float error_limit, float& result_error)
+static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char* collapse_locked, const Collapse* collapses, size_t collapse_count, const unsigned int* collapse_order, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_kind, const unsigned int* loop, const unsigned int* loopback, const Vector3* vertex_positions, const EdgeAdjacency& adjacency, size_t triangle_collapse_goal, float error_limit, float& result_error)
 {
 	size_t edge_collapses = 0;
 	size_t triangle_collapses = 0;
@@ -1087,7 +1501,7 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char*
 	size_t edge_collapse_goal = triangle_collapse_goal / 2;
 
 #if TRACE
-	size_t stats[4] = {};
+	size_t stats[7] = {};
 #endif
 
 	for (size_t i = 0; i < collapse_count; ++i)
@@ -1097,10 +1511,16 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char*
 		TRACESTATS(0);
 
 		if (c.error > error_limit)
+		{
+			TRACESTATS(4);
 			break;
+		}
 
 		if (triangle_collapses >= triangle_collapse_goal)
+		{
+			TRACESTATS(5);
 			break;
+		}
 
 		// we limit the error in each pass based on the error of optimal last collapse; since many collapses will be locked
 		// as they will share vertices with other successfull collapses, we need to increase the acceptable error by some factor
@@ -1108,8 +1528,11 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char*
 
 		// on average, each collapse is expected to lock 6 other collapses; to avoid degenerate passes on meshes with odd
 		// topology, we only abort if we got over 1/6 collapses accordingly.
-		if (c.error > error_goal && triangle_collapses > triangle_collapse_goal / 6)
+		if (c.error > error_goal && c.error > result_error && triangle_collapses > triangle_collapse_goal / 6)
+		{
+			TRACESTATS(6);
 			break;
+		}
 
 		unsigned int i0 = c.v0;
 		unsigned int i1 = c.v1;
@@ -1117,6 +1540,8 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char*
 		unsigned int r0 = remap[i0];
 		unsigned int r1 = remap[i1];
 
+		unsigned char kind = vertex_kind[i0];
+
 		// we don't collapse vertices that had source or target vertex involved in a collapse
 		// it's important to not move the vertices twice since it complicates the tracking/remapping logic
 		// it's important to not move other vertices towards a moved vertex to preserve error since we don't re-rank collapses mid-pass
@@ -1135,35 +1560,41 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char*
 			continue;
 		}
 
+#if TRACE >= 2
+		printf("edge commit %d -> %d: kind %d->%d, error %f\n", i0, i1, vertex_kind[i0], vertex_kind[i1], sqrtf(c.error));
+#endif
+
 		assert(collapse_remap[r0] == r0);
 		assert(collapse_remap[r1] == r1);
 
-		quadricAdd(vertex_quadrics[r1], vertex_quadrics[r0]);
-
-		if (attribute_count)
-		{
-			quadricAdd(attribute_quadrics[r1], attribute_quadrics[r0]);
-			quadricAdd(&attribute_gradients[r1 * attribute_count], &attribute_gradients[r0 * attribute_count], attribute_count);
-		}
-
-		if (vertex_kind[i0] == Kind_Complex)
+		if (kind == Kind_Complex)
 		{
+			// remap all vertices in the complex to the target vertex
 			unsigned int v = i0;
 
 			do
 			{
-				collapse_remap[v] = r1;
+				unsigned int t = getComplexTarget(v, i1, remap, loop, loopback);
+
+				collapse_remap[v] = t;
 				v = wedge[v];
 			} while (v != i0);
 		}
-		else if (vertex_kind[i0] == Kind_Seam)
+		else if (kind == Kind_Seam)
 		{
-			// remap v0 to v1 and seam pair of v0 to seam pair of v1
+			// for seam collapses we need to move the seam pair together; this is a bit tricky since we need to rely on edge loops as target vertex may be locked (and thus have more than two wedges)
 			unsigned int s0 = wedge[i0];
-			unsigned int s1 = wedge[i1];
+			unsigned int s1 = loop[i0] == i1 ? loopback[s0] : loop[s0];
+			assert(wedge[s0] == i0); // s0 may be equal to i0 for half-seams
+			assert(s1 != ~0u && remap[s1] == r1);
 
-			assert(s0 != i0 && s1 != i1);
-			assert(wedge[s0] == i0 && wedge[s1] == i1);
+			// additional asserts to verify that the seam pair is consistent
+			assert(kind != vertex_kind[i1] || s1 == wedge[i1]);
+			assert(loop[i0] == i1 || loopback[i0] == i1);
+			assert(loop[s0] == s1 || loopback[s0] == s1);
+
+			// note: this should never happen due to the assertion above, but when disabled if we ever hit this case we'll get a memory safety issue; for now play it safe
+			s1 = (s1 != ~0u) ? s1 : wedge[i1];
 
 			collapse_remap[i0] = i1;
 			collapse_remap[s0] = s1;
@@ -1175,28 +1606,205 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char*
 			collapse_remap[i0] = i1;
 		}
 
+		// note: we technically don't need to lock r1 if it's a locked vertex, as it can't move and its quadric won't be used
+		// however, this results in slightly worse error on some meshes because the locked collapses get an unfair advantage wrt scheduling
 		collapse_locked[r0] = 1;
 		collapse_locked[r1] = 1;
 
 		// border edges collapse 1 triangle, other edges collapse 2 or more
-		triangle_collapses += (vertex_kind[i0] == Kind_Border) ? 1 : 2;
+		triangle_collapses += (kind == Kind_Border) ? 1 : 2;
 		edge_collapses++;
 
 		result_error = result_error < c.error ? c.error : result_error;
 	}
 
 #if TRACE
-	float error_goal_perfect = edge_collapse_goal < collapse_count ? collapses[collapse_order[edge_collapse_goal]].error : 0.f;
+	float error_goal_last = edge_collapse_goal < collapse_count ? 1.5f * collapses[collapse_order[edge_collapse_goal]].error : FLT_MAX;
+	float error_goal_limit = error_goal_last < error_limit ? error_goal_last : error_limit;
 
-	printf("removed %d triangles, error %e (goal %e); evaluated %d/%d collapses (done %d, skipped %d, invalid %d)\n",
-	    int(triangle_collapses), sqrtf(result_error), sqrtf(error_goal_perfect),
-	    int(stats[0]), int(collapse_count), int(edge_collapses), int(stats[1]), int(stats[2]));
+	printf("removed %d triangles, error %e (goal %e); evaluated %d/%d collapses (done %d, skipped %d, invalid %d); %s\n",
+	    int(triangle_collapses), sqrtf(result_error), sqrtf(error_goal_limit),
+	    int(stats[0]), int(collapse_count), int(edge_collapses), int(stats[1]), int(stats[2]),
+	    stats[4] ? "error limit" : (stats[5] ? "count limit" : (stats[6] ? "error goal" : "out of collapses")));
 #endif
 
 	return edge_collapses;
 }
 
-static size_t remapIndexBuffer(unsigned int* indices, size_t index_count, const unsigned int* collapse_remap)
+static void updateQuadrics(const unsigned int* collapse_remap, size_t vertex_count, Quadric* vertex_quadrics, QuadricGrad* volume_gradients, Quadric* attribute_quadrics, QuadricGrad* attribute_gradients, size_t attribute_count, const Vector3* vertex_positions, const unsigned int* remap, float& vertex_error)
+{
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		if (collapse_remap[i] == i)
+			continue;
+
+		unsigned int i0 = unsigned(i);
+		unsigned int i1 = collapse_remap[i];
+
+		unsigned int r0 = remap[i0];
+		unsigned int r1 = remap[i1];
+
+		// ensure we only update vertex_quadrics once: primary vertex must be moved if any wedge is moved
+		if (i0 == r0)
+		{
+			quadricAdd(vertex_quadrics[r1], vertex_quadrics[r0]);
+
+			if (volume_gradients)
+				quadricAdd(volume_gradients[r1], volume_gradients[r0]);
+		}
+
+		if (attribute_count)
+		{
+			quadricAdd(attribute_quadrics[i1], attribute_quadrics[i0]);
+			quadricAdd(&attribute_gradients[i1 * attribute_count], &attribute_gradients[i0 * attribute_count], attribute_count);
+
+			if (i0 == r0)
+			{
+				// when attributes are used, distance error needs to be recomputed as collapses don't track it; it is safe to do this after the quadric adjustment
+				float derr = quadricError(vertex_quadrics[r0], vertex_positions[r1]);
+				vertex_error = vertex_error < derr ? derr : vertex_error;
+			}
+		}
+	}
+}
+
+static void solvePositions(Vector3* vertex_positions, size_t vertex_count, const Quadric* vertex_quadrics, const QuadricGrad* volume_gradients, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap, const unsigned int* wedge, const EdgeAdjacency& adjacency, const unsigned char* vertex_kind, const unsigned char* vertex_update)
+{
+#if TRACE
+	size_t stats[6] = {};
+#endif
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		if (!vertex_update[i])
+			continue;
+
+		// moving vertices on an attribute discontinuity may result in extrapolating UV outside of the chart bounds
+		// moving vertices on a border requires a stronger edge quadric to preserve the border geometry
+		if (vertex_kind[i] == Kind_Locked || vertex_kind[i] == Kind_Seam || vertex_kind[i] == Kind_Border)
+			continue;
+
+		if (remap[i] != i)
+		{
+			vertex_positions[i] = vertex_positions[remap[i]];
+			continue;
+		}
+
+		TRACESTATS(0);
+
+		const Vector3& vp = vertex_positions[i];
+
+		Quadric Q = vertex_quadrics[i];
+		QuadricGrad GV = {};
+
+		// add a point quadric for regularization to stabilize the solution
+		Quadric R;
+		quadricFromPoint(R, vp.x, vp.y, vp.z, Q.w * 1e-4f);
+		quadricAdd(Q, R);
+
+		if (attribute_count)
+		{
+			// optimal point simultaneously minimizes attribute quadrics for all wedges
+			unsigned int v = unsigned(i);
+			do
+			{
+				quadricReduceAttributes(Q, attribute_quadrics[v], &attribute_gradients[v * attribute_count], attribute_count);
+				v = wedge[v];
+			} while (v != i);
+
+			// minimizing attribute quadrics results in volume loss so we incorporate volume gradient as a constraint
+			if (volume_gradients)
+				GV = volume_gradients[i];
+		}
+
+		Vector3 p;
+		if (!quadricSolve(p, Q, GV))
+		{
+			TRACESTATS(2);
+			continue;
+		}
+
+		// reject updates that move the vertex too far from its neighborhood
+		// this detects and fixes most cases when the quadric is not well-defined
+		float nr = getNeighborhoodRadius(adjacency, vertex_positions, unsigned(i));
+		float dp = (p.x - vp.x) * (p.x - vp.x) + (p.y - vp.y) * (p.y - vp.y) + (p.z - vp.z) * (p.z - vp.z);
+
+		if (dp > nr * nr)
+		{
+			TRACESTATS(3);
+			continue;
+		}
+
+		// reject updates that would flip a neighboring triangle, as we do for edge collapse
+		if (hasTriangleFlips(adjacency, vertex_positions, unsigned(i), p))
+		{
+			TRACESTATS(4);
+			continue;
+		}
+
+		// reject updates that increase positional error too much; allow some tolerance to improve attribute quality
+		if (quadricError(vertex_quadrics[i], p) > quadricError(vertex_quadrics[i], vp) * 1.5f + 1e-6f)
+		{
+			TRACESTATS(5);
+			continue;
+		}
+
+		TRACESTATS(1);
+		vertex_positions[i] = p;
+	}
+
+#if TRACE
+	printf("updated %d/%d positions; failed solve %d bounds %d flip %d error %d\n", int(stats[1]), int(stats[0]), int(stats[2]), int(stats[3]), int(stats[4]), int(stats[5]));
+#endif
+}
+
+static void solveAttributes(Vector3* vertex_positions, float* vertex_attributes, size_t vertex_count, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_kind, const unsigned char* vertex_update)
+{
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		if (!vertex_update[i])
+			continue;
+
+		if (remap[i] != i)
+			continue;
+
+		for (size_t k = 0; k < attribute_count; ++k)
+		{
+			unsigned int shared = ~0u;
+
+			// for complex vertices, preserve attribute continuity and use highest weight wedge if values were shared
+			if (vertex_kind[i] == Kind_Complex)
+			{
+				shared = unsigned(i);
+
+				for (unsigned int v = wedge[i]; v != i; v = wedge[v])
+					if (vertex_attributes[v * attribute_count + k] != vertex_attributes[i * attribute_count + k])
+						shared = ~0u;
+					else if (shared != ~0u && attribute_quadrics[v].w > attribute_quadrics[shared].w)
+						shared = v;
+			}
+
+			// update attributes for all wedges
+			unsigned int v = unsigned(i);
+			do
+			{
+				unsigned int r = (shared == ~0u) ? v : shared;
+
+				const Vector3& p = vertex_positions[i]; // same for all wedges
+				const Quadric& A = attribute_quadrics[r];
+				const QuadricGrad& G = attribute_gradients[r * attribute_count + k];
+
+				float iw = A.w == 0 ? 0.f : 1.f / A.w;
+				float av = (G.gx * p.x + G.gy * p.y + G.gz * p.z + G.gw) * iw;
+
+				vertex_attributes[v * attribute_count + k] = av;
+				v = wedge[v];
+			} while (v != i);
+		}
+	}
+}
+
+static size_t remapIndexBuffer(unsigned int* indices, size_t index_count, const unsigned int* collapse_remap, const unsigned int* remap)
 {
 	size_t write = 0;
 
@@ -1211,7 +1819,14 @@ static size_t remapIndexBuffer(unsigned int* indices, size_t index_count, const
 		assert(collapse_remap[v1] == v1);
 		assert(collapse_remap[v2] == v2);
 
-		if (v0 != v1 && v0 != v2 && v1 != v2)
+		// collapse zero area triangles even if they are not topologically degenerate
+		// this is required to cleanup manifold->seam collapses when a vertex is collapsed onto a seam pair
+		// as well as complex collapses and some other cases where cross wedge collapses are performed
+		unsigned int r0 = remap[v0];
+		unsigned int r1 = remap[v1];
+		unsigned int r2 = remap[v2];
+
+		if (r0 != r1 && r0 != r2 && r1 != r2)
 		{
 			indices[write + 0] = v0;
 			indices[write + 1] = v1;
@@ -1227,17 +1842,183 @@ static void remapEdgeLoops(unsigned int* loop, size_t vertex_count, const unsign
 {
 	for (size_t i = 0; i < vertex_count; ++i)
 	{
+		// note: this is a no-op for vertices that were remapped
+		// ideally we would clear the loop entries for those for consistency, even though they aren't going to be used
+		// however, the remapping process needs loop information for remapped vertices, so this would require a separate pass
 		if (loop[i] != ~0u)
 		{
 			unsigned int l = loop[i];
 			unsigned int r = collapse_remap[l];
 
 			// i == r is a special case when the seam edge is collapsed in a direction opposite to where loop goes
-			loop[i] = (i == r) ? loop[l] : r;
+			if (i == r)
+				loop[i] = (loop[l] != ~0u) ? collapse_remap[loop[l]] : ~0u;
+			else
+				loop[i] = r;
 		}
 	}
 }
 
+static unsigned int follow(unsigned int* parents, unsigned int index) // returns the root of the set that contains index
+{
+	while (index != parents[index]) // a root is an entry that is its own parent
+	{
+		unsigned int parent = parents[index];
+		parents[index] = parents[parent]; // re-point this entry at its grandparent so later lookups walk a shorter chain
+		index = parent;
+	}
+
+	return index;
+}
+
+static size_t buildComponents(unsigned int* components, size_t vertex_count, const unsigned int* indices, size_t index_count, const unsigned int* remap)
+{ // labels every vertex with a sequential connected-component id in components[] and returns the number of components
+	for (size_t i = 0; i < vertex_count; ++i)
+		components[i] = unsigned(i); // every vertex starts out as the root of its own component
+
+	// compute a unique (but not sequential!) index for each component via union-find
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		static const int next[4] = {1, 2, 0, 1}; // next[e] is the corner that follows corner e in the triangle
+
+		for (int e = 0; e < 3; ++e)
+		{
+			unsigned int i0 = indices[i + e];
+			unsigned int i1 = indices[i + next[e]];
+
+			unsigned int r0 = remap[i0]; // connectivity is tracked on remapped vertices, not on the raw indices
+			unsigned int r1 = remap[i1];
+
+			r0 = follow(components, r0);
+			r1 = follow(components, r1);
+
+			// merge components with larger indices into components with smaller indices
+			// this guarantees that the root of the component is always the one with the smallest index
+			if (r0 != r1)
+				components[r0 < r1 ? r1 : r0] = r0 < r1 ? r0 : r1;
+		}
+	}
+
+	// make sure each element points to the component root *before* we renumber the components
+	for (size_t i = 0; i < vertex_count; ++i)
+		if (remap[i] == i) // other vertices take their id from remap[i] in the renumbering pass below
+			components[i] = follow(components, unsigned(i));
+
+	unsigned int next_component = 0;
+
+	// renumber components using sequential indices
+	// a sequential pass is sufficient because component root always has the smallest index
+	// note: it is unsafe to use follow() in this pass because we're replacing component links with sequential indices inplace
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		if (remap[i] == i)
+		{
+			unsigned int root = components[i];
+			assert(root <= i); // make sure we already computed the component for non-roots
+			components[i] = (root == i) ? next_component++ : components[root]; // roots get a fresh id; the rest copy the id already stored at their root
+		}
+		else
+		{
+			assert(remap[i] < i); // make sure we already computed the component
+			components[i] = components[remap[i]]; // inherit the id of the vertex this one is remapped to
+		}
+	}
+
+	return next_component;
+}
+
+static void measureComponents(float* component_errors, size_t component_count, const unsigned int* components, const Vector3* vertex_positions, size_t vertex_count)
+{ // on return component_errors[c] holds, for each component c, the squared maximum distance of its vertices from their average position
+	memset(component_errors, 0, component_count * 4 * sizeof(float)); // buffer must hold 4 floats per component while measuring: x, y, z sums and a count
+
+	// compute approximate sphere center for each component as an average
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		unsigned int c = components[i];
+		assert(components[i] < component_count);
+
+		Vector3 v = vertex_positions[i]; // copy avoids aliasing issues
+
+		component_errors[c * 4 + 0] += v.x;
+		component_errors[c * 4 + 1] += v.y;
+		component_errors[c * 4 + 2] += v.z;
+		component_errors[c * 4 + 3] += 1; // weight
+	}
+
+	// complete the center computation, and reinitialize [3] as a radius
+	for (size_t i = 0; i < component_count; ++i)
+	{
+		float w = component_errors[i * 4 + 3];
+		float iw = w == 0.f ? 0.f : 1.f / w; // a component with no vertices keeps a zero center instead of dividing by zero
+
+		component_errors[i * 4 + 0] *= iw;
+		component_errors[i * 4 + 1] *= iw;
+		component_errors[i * 4 + 2] *= iw;
+		component_errors[i * 4 + 3] = 0; // radius
+	}
+
+	// compute squared radius for each component
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		unsigned int c = components[i];
+
+		float dx = vertex_positions[i].x - component_errors[c * 4 + 0];
+		float dy = vertex_positions[i].y - component_errors[c * 4 + 1];
+		float dz = vertex_positions[i].z - component_errors[c * 4 + 2];
+		float r = dx * dx + dy * dy + dz * dz; // squared distance from this vertex to the component center
+
+		component_errors[c * 4 + 3] = component_errors[c * 4 + 3] < r ? r : component_errors[c * 4 + 3]; // running maximum
+	}
+
+	// we've used the output buffer as scratch space, so we need to move the results to proper indices
+	for (size_t i = 0; i < component_count; ++i)
+	{
+#if TRACE >= 2
+		printf("component %d: center %f %f %f, error %e\n", int(i),
+		    component_errors[i * 4 + 0], component_errors[i * 4 + 1], component_errors[i * 4 + 2], sqrtf(component_errors[i * 4 + 3]));
+#endif
+		// note: we keep the squared error to make it match quadric error metric
+		component_errors[i] = component_errors[i * 4 + 3]; // in-place compaction is safe: slot i is never past the slot being read (i * 4 + 3)
+	}
+}
+
+static size_t pruneComponents(unsigned int* indices, size_t index_count, const unsigned int* components, const float* component_errors, size_t component_count, float error_cutoff, float& nexterror)
+{ // compacts indices in place, keeping only triangles whose component error exceeds error_cutoff; returns the new index count
+	(void)component_count; // only read inside the TRACE block below; the cast presumably silences unused-parameter warnings otherwise
+
+	size_t write = 0;
+	float min_error = FLT_MAX; // smallest error among kept components; stays FLT_MAX if nothing is kept
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int v0 = indices[i + 0], v1 = indices[i + 1], v2 = indices[i + 2];
+		unsigned int c = components[v0]; // all three corners are asserted to share one component, so v0 is representative
+		assert(c == components[v1] && c == components[v2]);
+
+		if (component_errors[c] > error_cutoff)
+		{
+			min_error = min_error > component_errors[c] ? component_errors[c] : min_error;
+
+			indices[write + 0] = v0; // write never runs ahead of i, so unread triangles are not overwritten
+			indices[write + 1] = v1;
+			indices[write + 2] = v2;
+			write += 3;
+		}
+	}
+
+#if TRACE
+	size_t pruned_components = 0;
+	for (size_t i = 0; i < component_count; ++i)
+		pruned_components += (component_errors[i] >= nexterror && component_errors[i] <= error_cutoff); // counts errors in [previous nexterror, cutoff]
+
+	printf("pruned %d triangles in %d components (goal %e); next %e\n", int((index_count - write) / 3), int(pruned_components), sqrtf(error_cutoff), min_error < FLT_MAX ? sqrtf(min_error) : min_error * 2);
+#endif
+
+	// update next error with the smallest error of the remaining components
+	nexterror = min_error;
+	return write;
+}
+
 struct CellHasher
 {
 	const unsigned int* vertex_ids;
@@ -1299,7 +2080,7 @@ struct TriangleHasher
 	}
 };
 
-static void computeVertexIds(unsigned int* vertex_ids, const Vector3* vertex_positions, size_t vertex_count, int grid_size)
+static void computeVertexIds(unsigned int* vertex_ids, const Vector3* vertex_positions, const unsigned char* vertex_lock, size_t vertex_count, int grid_size)
 {
 	assert(grid_size >= 1 && grid_size <= 1024);
 	float cell_scale = float(grid_size - 1);
@@ -1312,7 +2093,10 @@ static void computeVertexIds(unsigned int* vertex_ids, const Vector3* vertex_pos
 		int yi = int(v.y * cell_scale + 0.5f);
 		int zi = int(v.z * cell_scale + 0.5f);
 
-		vertex_ids[i] = (xi << 20) | (yi << 10) | zi;
+		if (vertex_lock && (vertex_lock[i] & meshopt_SimplifyVertex_Lock))
+			vertex_ids[i] = (1 << 30) | unsigned(i);
+		else
+			vertex_ids[i] = (xi << 20) | (yi << 10) | zi;
 	}
 }
 
@@ -1541,17 +2325,17 @@ static float interpolate(float y, float x0, float y0, float x1, float y1, float
 	// three point interpolation from "revenge of interpolation search" paper
 	float num = (y1 - y) * (x1 - x2) * (x1 - x0) * (y2 - y0);
 	float den = (y2 - y) * (x1 - x2) * (y0 - y1) + (y0 - y) * (x1 - x0) * (y1 - y2);
-	return x1 + num / den;
+	return x1 + (den == 0.f ? 0.f : num / den);
 }
 
 } // namespace meshopt
 
-#ifndef NDEBUG
-// Note: this is only exposed for debug visualization purposes; do *not* use these in debug builds
-MESHOPTIMIZER_API unsigned char* meshopt_simplifyDebugKind = NULL;
-MESHOPTIMIZER_API unsigned int* meshopt_simplifyDebugLoop = NULL;
-MESHOPTIMIZER_API unsigned int* meshopt_simplifyDebugLoopBack = NULL;
-#endif
+// Note: these internal option bits are only exposed for development purposes; do *not* use them in application code
+enum
+{
+	meshopt_SimplifyInternalSolve = 1 << 29, // set by meshopt_simplifyWithUpdate; makes meshopt_simplifyEdge write solved vertex data back to the input buffers
+	meshopt_SimplifyInternalDebug = 1 << 30 // makes meshopt_simplifyEdge pack vertex kind and loop bits into the top bits of output indices (skipped for sparse runs)
+};
 
 size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes_data, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* out_result_error)
 {
@@ -1561,10 +2345,13 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
 	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 	assert(target_index_count <= index_count);
-	assert((options & ~(meshopt_SimplifyLockBorder | meshopt_SimplifySparse | meshopt_SimplifyErrorAbsolute)) == 0);
+	assert(target_error >= 0);
+	assert((options & ~(meshopt_SimplifyLockBorder | meshopt_SimplifySparse | meshopt_SimplifyErrorAbsolute | meshopt_SimplifyPrune | meshopt_SimplifyRegularize | meshopt_SimplifyPermissive | meshopt_SimplifyInternalSolve | meshopt_SimplifyInternalDebug)) == 0);
 	assert(vertex_attributes_stride >= attribute_count * sizeof(float) && vertex_attributes_stride <= 256);
 	assert(vertex_attributes_stride % sizeof(float) == 0);
 	assert(attribute_count <= kMaxAttributes);
+	for (size_t i = 0; i < attribute_count; ++i)
+		assert(attribute_weights[i] >= 0);
 
 	meshopt_Allocator allocator;
 
@@ -1584,6 +2371,7 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
 	updateEdgeAdjacency(adjacency, result, index_count, vertex_count, NULL);
 
 	// build position remap that maps each vertex to the one with identical position
+	// wedge table stores next vertex with identical position for each vertex
 	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
 	unsigned int* wedge = allocator.allocate<unsigned int>(vertex_count);
 	buildPositionRemap(remap, wedge, vertex_positions_data, vertex_count, vertex_positions_stride, sparse_remap, allocator);
@@ -1610,14 +2398,23 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
 #endif
 
 	Vector3* vertex_positions = allocator.allocate<Vector3>(vertex_count);
-	float vertex_scale = rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride, sparse_remap);
+	float vertex_offset[3] = {};
+	float vertex_scale = rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride, sparse_remap, vertex_offset);
 
 	float* vertex_attributes = NULL;
+	unsigned int attribute_remap[kMaxAttributes];
 
 	if (attribute_count)
 	{
+		// remap attributes to only include ones with weight > 0 to minimize memory/compute overhead for quadrics
+		size_t attributes_used = 0;
+		for (size_t i = 0; i < attribute_count; ++i)
+			if (attribute_weights[i] > 0)
+				attribute_remap[attributes_used++] = unsigned(i);
+
+		attribute_count = attributes_used;
 		vertex_attributes = allocator.allocate<float>(vertex_count * attribute_count);
-		rescaleAttributes(vertex_attributes, vertex_attributes_data, vertex_count, vertex_attributes_stride, attribute_weights, attribute_count, sparse_remap);
+		rescaleAttributes(vertex_attributes, vertex_attributes_data, vertex_count, vertex_attributes_stride, attribute_weights, attribute_count, attribute_remap, sparse_remap);
 	}
 
 	Quadric* vertex_quadrics = allocator.allocate<Quadric>(vertex_count);
@@ -1625,6 +2422,7 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
 
 	Quadric* attribute_quadrics = NULL;
 	QuadricGrad* attribute_gradients = NULL;
+	QuadricGrad* volume_gradients = NULL;
 
 	if (attribute_count)
 	{
@@ -1633,13 +2431,42 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
 
 		attribute_gradients = allocator.allocate<QuadricGrad>(vertex_count * attribute_count);
 		memset(attribute_gradients, 0, vertex_count * attribute_count * sizeof(QuadricGrad));
+
+		if (options & meshopt_SimplifyInternalSolve)
+		{
+			volume_gradients = allocator.allocate<QuadricGrad>(vertex_count);
+			memset(volume_gradients, 0, vertex_count * sizeof(QuadricGrad));
+		}
 	}
 
-	fillFaceQuadrics(vertex_quadrics, result, index_count, vertex_positions, remap);
+	fillFaceQuadrics(vertex_quadrics, volume_gradients, result, index_count, vertex_positions, remap);
+	fillVertexQuadrics(vertex_quadrics, vertex_positions, vertex_count, remap, options);
 	fillEdgeQuadrics(vertex_quadrics, result, index_count, vertex_positions, remap, vertex_kind, loop, loopback);
 
 	if (attribute_count)
-		fillAttributeQuadrics(attribute_quadrics, attribute_gradients, result, index_count, vertex_positions, vertex_attributes, attribute_count, remap);
+		fillAttributeQuadrics(attribute_quadrics, attribute_gradients, result, index_count, vertex_positions, vertex_attributes, attribute_count);
+
+	unsigned int* components = NULL;
+	float* component_errors = NULL;
+	size_t component_count = 0;
+	float component_nexterror = 0;
+
+	if (options & meshopt_SimplifyPrune)
+	{
+		components = allocator.allocate<unsigned int>(vertex_count);
+		component_count = buildComponents(components, vertex_count, result, index_count, remap);
+
+		component_errors = allocator.allocate<float>(component_count * 4); // overallocate for temporary use inside measureComponents
+		measureComponents(component_errors, component_count, components, vertex_positions, vertex_count);
+
+		component_nexterror = FLT_MAX;
+		for (size_t i = 0; i < component_count; ++i)
+			component_nexterror = component_nexterror > component_errors[i] ? component_errors[i] : component_nexterror;
+
+#if TRACE
+		printf("components: %d (min error %e)\n", int(component_count), sqrtf(component_nexterror));
+#endif
+	}
 
 #if TRACE
 	size_t pass_count = 0;
@@ -1654,6 +2481,7 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
 
 	size_t result_count = index_count;
 	float result_error = 0;
+	float vertex_error = 0;
 
 	// target_error input is linear; we need to adjust it to match quadricError units
 	float error_scale = (options & meshopt_SimplifyErrorAbsolute) ? vertex_scale : 1.f;
@@ -1664,14 +2492,18 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
 		// note: throughout the simplification process adjacency structure reflects welded topology for result-in-progress
 		updateEdgeAdjacency(adjacency, result, result_count, vertex_count, remap);
 
-		size_t edge_collapse_count = pickEdgeCollapses(edge_collapses, collapse_capacity, result, result_count, remap, vertex_kind, loop);
+		size_t edge_collapse_count = pickEdgeCollapses(edge_collapses, collapse_capacity, result, result_count, remap, vertex_kind, loop, loopback);
 		assert(edge_collapse_count <= collapse_capacity);
 
 		// no edges can be collapsed any more due to topology restrictions
 		if (edge_collapse_count == 0)
 			break;
 
-		rankEdgeCollapses(edge_collapses, edge_collapse_count, vertex_positions, vertex_attributes, vertex_quadrics, attribute_quadrics, attribute_gradients, attribute_count, remap);
+#if TRACE
+		printf("pass %d:%c", int(pass_count++), TRACE >= 2 ? '\n' : ' ');
+#endif
+
+		rankEdgeCollapses(edge_collapses, edge_collapse_count, vertex_positions, vertex_attributes, vertex_quadrics, attribute_quadrics, attribute_gradients, attribute_count, remap, wedge, vertex_kind, loop, loopback);
 
 		sortEdgeCollapses(collapse_order, edge_collapses, edge_collapse_count);
 
@@ -1682,39 +2514,101 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
 
 		memset(collapse_locked, 0, vertex_count);
 
-#if TRACE
-		printf("pass %d: ", int(pass_count++));
-#endif
-
-		size_t collapses = performEdgeCollapses(collapse_remap, collapse_locked, vertex_quadrics, attribute_quadrics, attribute_gradients, attribute_count, edge_collapses, edge_collapse_count, collapse_order, remap, wedge, vertex_kind, vertex_positions, adjacency, triangle_collapse_goal, error_limit, result_error);
+		size_t collapses = performEdgeCollapses(collapse_remap, collapse_locked, edge_collapses, edge_collapse_count, collapse_order, remap, wedge, vertex_kind, loop, loopback, vertex_positions, adjacency, triangle_collapse_goal, error_limit, result_error);
 
 		// no edges can be collapsed any more due to hitting the error limit or triangle collapse limit
 		if (collapses == 0)
 			break;
 
+		updateQuadrics(collapse_remap, vertex_count, vertex_quadrics, volume_gradients, attribute_quadrics, attribute_gradients, attribute_count, vertex_positions, remap, vertex_error);
+
+		// updateQuadrics will update vertex error if we use attributes, but if we don't then result_error and vertex_error are equivalent
+		vertex_error = attribute_count == 0 ? result_error : vertex_error;
+
+		// note: we update loops following edge collapses, but after this we might still have stale loop data
+		// this can happen when a triangle with a loop edge gets collapsed along a non-loop edge
+		// that works since a loop that points to a vertex that is no longer connected is not affecting collapse logic
 		remapEdgeLoops(loop, vertex_count, collapse_remap);
 		remapEdgeLoops(loopback, vertex_count, collapse_remap);
 
-		size_t new_count = remapIndexBuffer(result, result_count, collapse_remap);
-		assert(new_count < result_count);
+		result_count = remapIndexBuffer(result, result_count, collapse_remap, remap);
 
+		if ((options & meshopt_SimplifyPrune) && result_count > target_index_count && component_nexterror <= vertex_error)
+			result_count = pruneComponents(result, result_count, components, component_errors, component_count, vertex_error, component_nexterror);
+	}
+
+	// at this point, component_nexterror might be stale: component it references may have been removed through a series of edge collapses
+	bool component_nextstale = true;
+
+	// we're done with the regular simplification but we're still short of the target; try pruning more aggressively towards error_limit
+	while ((options & meshopt_SimplifyPrune) && result_count > target_index_count && component_nexterror <= error_limit)
+	{
+#if TRACE
+		printf("pass %d: cleanup; ", int(pass_count++));
+#endif
+
+		float component_cutoff = component_nexterror * 1.5f < error_limit ? component_nexterror * 1.5f : error_limit;
+
+		// track maximum error in eligible components as we are increasing resulting error
+		float component_maxerror = 0;
+		for (size_t i = 0; i < component_count; ++i)
+			if (component_errors[i] > component_maxerror && component_errors[i] <= component_cutoff)
+				component_maxerror = component_errors[i];
+
+		size_t new_count = pruneComponents(result, result_count, components, component_errors, component_count, component_cutoff, component_nexterror);
+		if (new_count == result_count && !component_nextstale)
+			break;
+
+		component_nextstale = false; // pruneComponents guarantees next error is up to date
 		result_count = new_count;
+		result_error = result_error < component_maxerror ? component_maxerror : result_error;
+		vertex_error = vertex_error < component_maxerror ? component_maxerror : vertex_error;
 	}
 
 #if TRACE
-	printf("result: %d triangles, error: %e; total %d passes\n", int(result_count / 3), sqrtf(result_error), int(pass_count));
+	printf("result: %d triangles, error: %e (pos %.3e); total %d passes\n", int(result_count / 3), sqrtf(result_error), sqrtf(vertex_error), int(pass_count));
 #endif
 
-#ifndef NDEBUG
-	if (meshopt_simplifyDebugKind)
-		memcpy(meshopt_simplifyDebugKind, vertex_kind, vertex_count);
+	// if solve is requested, update input buffers destructively from internal data
+	if (options & meshopt_SimplifyInternalSolve)
+	{
+		unsigned char* vertex_update = collapse_locked; // reuse as scratch space
+		memset(vertex_update, 0, vertex_count);
 
-	if (meshopt_simplifyDebugLoop)
-		memcpy(meshopt_simplifyDebugLoop, loop, vertex_count * sizeof(unsigned int));
+		// limit quadric solve to vertices that are still used in the result
+		for (size_t i = 0; i < result_count; ++i)
+		{
+			unsigned int v = result[i];
 
-	if (meshopt_simplifyDebugLoopBack)
-		memcpy(meshopt_simplifyDebugLoopBack, loopback, vertex_count * sizeof(unsigned int));
-#endif
+			// mark the vertex for finalizeVertices and root vertex for solve*
+			vertex_update[remap[v]] = vertex_update[v] = 1;
+		}
+
+		// edge adjacency may be stale as we haven't updated it after last series of edge collapses
+		updateEdgeAdjacency(adjacency, result, result_count, vertex_count, remap);
+
+		solvePositions(vertex_positions, vertex_count, vertex_quadrics, volume_gradients, attribute_quadrics, attribute_gradients, attribute_count, remap, wedge, adjacency, vertex_kind, vertex_update);
+
+		if (attribute_count)
+			solveAttributes(vertex_positions, vertex_attributes, vertex_count, attribute_quadrics, attribute_gradients, attribute_count, remap, wedge, vertex_kind, vertex_update);
+
+		finalizeVertices(const_cast<float*>(vertex_positions_data), vertex_positions_stride, const_cast<float*>(vertex_attributes_data), vertex_attributes_stride, attribute_weights, attribute_count, vertex_count, vertex_positions, vertex_attributes, sparse_remap, attribute_remap, vertex_scale, vertex_offset, vertex_kind, vertex_update, vertex_lock);
+	}
+
+	// if debug visualization data is requested, fill it instead of index data; for simplicity, this doesn't work with sparsity
+	if ((options & meshopt_SimplifyInternalDebug) && !sparse_remap)
+	{
+		assert(Kind_Count <= 8 && vertex_count < (1 << 28)); // 3 bit kind, 1 bit loop
+
+		for (size_t i = 0; i < result_count; i += 3)
+		{
+			unsigned int a = result[i + 0], b = result[i + 1], c = result[i + 2];
+
+			result[i + 0] |= (vertex_kind[a] << 28) | (unsigned(loop[a] == b || loopback[b] == a) << 31);
+			result[i + 1] |= (vertex_kind[b] << 28) | (unsigned(loop[b] == c || loopback[c] == b) << 31);
+			result[i + 2] |= (vertex_kind[c] << 28) | (unsigned(loop[c] == a || loopback[a] == c) << 31);
+		}
+	}
 
 	// convert resulting indices back into the dense space of the larger mesh
 	if (sparse_remap)
@@ -1730,15 +2624,24 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
 
 size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* out_result_error)
 {
+	assert((options & meshopt_SimplifyInternalSolve) == 0); // the solve flag rewrites the input vertex buffers, which are const here; use meshopt_simplifyWithUpdate instead
+
 	return meshopt_simplifyEdge(destination, indices, index_count, vertex_positions_data, vertex_count, vertex_positions_stride, NULL, 0, NULL, 0, NULL, target_index_count, target_error, options, out_result_error);
 }
 
 size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes_data, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* out_result_error)
 {
+	assert((options & meshopt_SimplifyInternalSolve) == 0); // the solve flag rewrites the input vertex buffers, which are const here; use meshopt_simplifyWithUpdate instead
+
 	return meshopt_simplifyEdge(destination, indices, index_count, vertex_positions_data, vertex_count, vertex_positions_stride, vertex_attributes_data, vertex_attributes_stride, attribute_weights, attribute_count, vertex_lock, target_index_count, target_error, options, out_result_error);
 }
 
-size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* out_result_error)
+size_t meshopt_simplifyWithUpdate(unsigned int* indices, size_t index_count, float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes_data, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* out_result_error)
+{ // simplifies indices in place and, through the internal solve flag, also lets meshopt_simplifyEdge rewrite the position/attribute buffers
+	return meshopt_simplifyEdge(indices, indices, index_count, vertex_positions_data, vertex_count, vertex_positions_stride, vertex_attributes_data, vertex_attributes_stride, attribute_weights, attribute_count, vertex_lock, target_index_count, target_error, options | meshopt_SimplifyInternalSolve, out_result_error); // indices is both destination and source
+}
+
+size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* out_result_error)
 {
 	using namespace meshopt;
 
@@ -1766,15 +2669,15 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind
 	const int kInterpolationPasses = 5;
 
 	// invariant: # of triangles in min_grid <= target_count
-	int min_grid = int(1.f / (target_error < 1e-3f ? 1e-3f : target_error));
+	int min_grid = int(1.f / (target_error < 1e-3f ? 1e-3f : (target_error < 1.f ? target_error : 1.f)));
 	int max_grid = 1025;
 	size_t min_triangles = 0;
 	size_t max_triangles = index_count / 3;
 
 	// when we're error-limited, we compute the triangle count for the min. size; this accelerates convergence and provides the correct answer when we can't use a larger grid
-	if (min_grid > 1)
+	if (min_grid > 1 || vertex_lock)
 	{
-		computeVertexIds(vertex_ids, vertex_positions, vertex_count, min_grid);
+		computeVertexIds(vertex_ids, vertex_positions, vertex_lock, vertex_count, min_grid);
 		min_triangles = countTriangles(vertex_ids, indices, index_count);
 	}
 
@@ -1790,7 +2693,7 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind
 		int grid_size = next_grid_size;
 		grid_size = (grid_size <= min_grid) ? min_grid + 1 : (grid_size >= max_grid ? max_grid - 1 : grid_size);
 
-		computeVertexIds(vertex_ids, vertex_positions, vertex_count, grid_size);
+		computeVertexIds(vertex_ids, vertex_positions, vertex_lock, vertex_count, grid_size);
 		size_t triangles = countTriangles(vertex_ids, indices, index_count);
 
 #if TRACE
@@ -1800,7 +2703,7 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind
 		    (triangles <= target_index_count / 3) ? "under" : "over");
 #endif
 
-		float tip = interpolate(float(target_index_count / 3), float(min_grid), float(min_triangles), float(grid_size), float(triangles), float(max_grid), float(max_triangles));
+		float tip = interpolate(float(size_t(target_index_count / 3)), float(min_grid), float(min_triangles), float(grid_size), float(triangles), float(max_grid), float(max_triangles));
 
 		if (triangles <= target_index_count / 3)
 		{
@@ -1832,7 +2735,7 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind
 
 	unsigned int* vertex_cells = allocator.allocate<unsigned int>(vertex_count);
 
-	computeVertexIds(vertex_ids, vertex_positions, vertex_count, min_grid);
+	computeVertexIds(vertex_ids, vertex_positions, vertex_lock, vertex_count, min_grid);
 	size_t cell_count = fillVertexCells(table, table_size, vertex_cells, vertex_ids, vertex_count);
 
 	// build a quadric for each target cell
@@ -1853,15 +2756,15 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind
 	for (size_t i = 0; i < cell_count; ++i)
 		result_error = result_error < cell_errors[i] ? cell_errors[i] : result_error;
 
-	// collapse triangles!
-	// note that we need to filter out triangles that we've already output because we very frequently generate redundant triangles between cells :(
+	// vertex collapses often result in duplicate triangles; we need a table to filter them out
 	size_t tritable_size = hashBuckets2(min_triangles);
 	unsigned int* tritable = allocator.allocate<unsigned int>(tritable_size);
 
+	// note: this is the first and last write to destination, which allows aliasing destination with indices
 	size_t write = filterTriangles(destination, tritable, tritable_size, indices, index_count, vertex_cells, cell_remap);
 
 #if TRACE
-	printf("result: %d cells, %d triangles (%d unfiltered), error %e\n", int(cell_count), int(write / 3), int(min_triangles), sqrtf(result_error));
+	printf("result: grid size %d, %d cells, %d triangles (%d unfiltered), error %e\n", min_grid, int(cell_count), int(write / 3), int(min_triangles), sqrtf(result_error));
 #endif
 
 	if (out_result_error)
@@ -1870,6 +2773,40 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind
 	return write;
 }
 
+size_t meshopt_simplifyPrune(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, float target_error)
+{ // drops triangles of connected components whose measured error is within target_error; returns the resulting index count
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+	assert(target_error >= 0);
+
+	meshopt_Allocator allocator;
+
+	unsigned int* result = destination;
+	if (result != indices) // destination may alias indices, in which case no copy is needed
+		memcpy(result, indices, index_count * sizeof(unsigned int));
+
+	// build position remap that maps each vertex to the one with identical position
+	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
+	buildPositionRemap(remap, NULL, vertex_positions_data, vertex_count, vertex_positions_stride, NULL, allocator); // no wedge table and no sparse remap are requested here
+
+	Vector3* vertex_positions = allocator.allocate<Vector3>(vertex_count);
+	rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride, NULL); // return value unused; NOTE(review): target_error is presumably relative to mesh extents after this rescale - confirm
+
+	unsigned int* components = allocator.allocate<unsigned int>(vertex_count);
+	size_t component_count = buildComponents(components, vertex_count, indices, index_count, remap);
+
+	float* component_errors = allocator.allocate<float>(component_count * 4); // overallocate for temporary use inside measureComponents
+	measureComponents(component_errors, component_count, components, vertex_positions, vertex_count);
+
+	float component_nexterror = 0; // required by pruneComponents as an output; the value is not used afterwards
+	size_t result_count = pruneComponents(result, index_count, components, component_errors, component_count, target_error * target_error, component_nexterror); // component errors are squared, so the cutoff is squared to match
+
+	return result_count;
+}
+
 size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_colors, size_t vertex_colors_stride, float color_weight, size_t target_vertex_count)
 {
 	using namespace meshopt;
@@ -1922,7 +2859,7 @@ size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_pos
 		int grid_size = next_grid_size;
 		grid_size = (grid_size <= min_grid) ? min_grid + 1 : (grid_size >= max_grid ? max_grid - 1 : grid_size);
 
-		computeVertexIds(vertex_ids, vertex_positions, vertex_count, grid_size);
+		computeVertexIds(vertex_ids, vertex_positions, NULL, vertex_count, grid_size);
 		size_t vertices = countVertexCells(table, table_size, vertex_ids, vertex_count);
 
 #if TRACE
@@ -1959,7 +2896,7 @@ size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_pos
 	// build vertex->cell association by mapping all vertices with the same quantized position to the same cell
 	unsigned int* vertex_cells = allocator.allocate<unsigned int>(vertex_count);
 
-	computeVertexIds(vertex_ids, vertex_positions, vertex_count, min_grid);
+	computeVertexIds(vertex_ids, vertex_positions, NULL, vertex_count, min_grid);
 	size_t cell_count = fillVertexCells(table, table_size, vertex_cells, vertex_ids, vertex_count);
 
 	// accumulate points into a reservoir for each target cell
@@ -1972,7 +2909,10 @@ size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_pos
 	unsigned int* cell_remap = allocator.allocate<unsigned int>(cell_count);
 	float* cell_errors = allocator.allocate<float>(cell_count);
 
-	fillCellRemap(cell_remap, cell_errors, cell_count, vertex_cells, cell_reservoirs, vertex_positions, vertex_colors, vertex_colors_stride, color_weight * color_weight, vertex_count);
+	// we scale the color weight to bring it to the same scale as position so that error addition makes sense
+	float color_weight_scaled = color_weight * (min_grid == 1 ? 1.f : 1.f / (min_grid - 1));
+
+	fillCellRemap(cell_remap, cell_errors, cell_count, vertex_cells, cell_reservoirs, vertex_positions, vertex_colors, vertex_colors_stride, color_weight_scaled * color_weight_scaled, vertex_count);
 
 	// copy results to the output
 	assert(cell_count <= target_vertex_count);
diff --git a/Source/ThirdParty/meshoptimizer/spatialorder.cpp b/Source/ThirdParty/meshoptimizer/spatialorder.cpp
index 7b1a06945..8a785fcd5 100644
--- a/Source/ThirdParty/meshoptimizer/spatialorder.cpp
+++ b/Source/ThirdParty/meshoptimizer/spatialorder.cpp
@@ -10,18 +10,19 @@
 namespace meshopt
 {
 
-// "Insert" two 0 bits after each of the 10 low bits of x
-inline unsigned int part1By2(unsigned int x)
+// "Insert" two 0 bits after each of the 20 low bits of x
+inline unsigned long long part1By2(unsigned long long x)
 {
-	x &= 0x000003ff;                  // x = ---- ---- ---- ---- ---- --98 7654 3210
-	x = (x ^ (x << 16)) & 0xff0000ff; // x = ---- --98 ---- ---- ---- ---- 7654 3210
-	x = (x ^ (x << 8)) & 0x0300f00f;  // x = ---- --98 ---- ---- 7654 ---- ---- 3210
-	x = (x ^ (x << 4)) & 0x030c30c3;  // x = ---- --98 ---- 76-- --54 ---- 32-- --10
-	x = (x ^ (x << 2)) & 0x09249249;  // x = ---- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
+	x &= 0x000fffffull;                          // x = ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- jihg fedc ba98 7654 3210
+	x = (x ^ (x << 32)) & 0x000f00000000ffffull; // x = ---- ---- ---- jihg ---- ---- ---- ---- ---- ---- ---- ---- fedc ba98 7654 3210
+	x = (x ^ (x << 16)) & 0x000f0000ff0000ffull; // x = ---- ---- ---- jihg ---- ---- ---- ---- fedc ba98 ---- ---- ---- ---- 7654 3210
+	x = (x ^ (x << 8)) & 0x000f00f00f00f00full;  // x = ---- ---- ---- jihg ---- ---- fedc ---- ---- ba98 ---- ---- 7654 ---- ---- 3210
+	x = (x ^ (x << 4)) & 0x00c30c30c30c30c3ull;  // x = ---- ---- ji-- --hg ---- fe-- --dc ---- ba-- --98 ---- 76-- --54 ---- 32-- --10
+	x = (x ^ (x << 2)) & 0x0249249249249249ull;  // x = ---- --j- -i-- h--g --f- -e-- d--c --b- -a-- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
 	return x;
 }
 
-static void computeOrder(unsigned int* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride)
+static void computeOrder(unsigned long long* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, bool morton)
 {
 	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
 
@@ -47,66 +48,171 @@ static void computeOrder(unsigned int* result, const float* vertex_positions_dat
 	extent = (maxv[1] - minv[1]) < extent ? extent : (maxv[1] - minv[1]);
 	extent = (maxv[2] - minv[2]) < extent ? extent : (maxv[2] - minv[2]);
 
-	float scale = extent == 0 ? 0.f : 1.f / extent;
+	// rescale each axis to 16 bits to get 48-bit Morton codes
+	float scale = extent == 0 ? 0.f : 65535.f / extent;
 
 	// generate Morton order based on the position inside a unit cube
 	for (size_t i = 0; i < vertex_count; ++i)
 	{
 		const float* v = vertex_positions_data + i * vertex_stride_float;
 
-		int x = int((v[0] - minv[0]) * scale * 1023.f + 0.5f);
-		int y = int((v[1] - minv[1]) * scale * 1023.f + 0.5f);
-		int z = int((v[2] - minv[2]) * scale * 1023.f + 0.5f);
+		int x = int((v[0] - minv[0]) * scale + 0.5f);
+		int y = int((v[1] - minv[1]) * scale + 0.5f);
+		int z = int((v[2] - minv[2]) * scale + 0.5f);
 
-		result[i] = part1By2(x) | (part1By2(y) << 1) | (part1By2(z) << 2);
+		if (morton)
+			result[i] = part1By2(x) | (part1By2(y) << 1) | (part1By2(z) << 2);
+		else
+			result[i] = ((unsigned long long)x << 0) | ((unsigned long long)y << 20) | ((unsigned long long)z << 40);
 	}
 }
 
-static void computeHistogram(unsigned int (&hist)[1024][3], const unsigned int* data, size_t count)
+static void radixSort10(unsigned int* destination, const unsigned int* source, const unsigned short* keys, size_t count)
 {
+	unsigned int hist[1024]; // one bucket per possible 10-bit key value
 	memset(hist, 0, sizeof(hist));
 
-	// compute 3 10-bit histograms in parallel
+	// compute histogram (assume keys are 10-bit)
 	for (size_t i = 0; i < count; ++i)
-	{
-		unsigned int id = data[i];
+		hist[keys[i]]++; // counted by element id; matches the reorder loop below provided source[] is a permutation of [0, count) - NOTE(review): confirm for all callers
 
-		hist[(id >> 0) & 1023][0]++;
-		hist[(id >> 10) & 1023][1]++;
-		hist[(id >> 20) & 1023][2]++;
-	}
-
-	unsigned int sumx = 0, sumy = 0, sumz = 0;
+	unsigned int sum = 0; // running count of keys smaller than the current bucket
 
 	// replace histogram data with prefix histogram sums in-place
 	for (int i = 0; i < 1024; ++i)
 	{
-		unsigned int hx = hist[i][0], hy = hist[i][1], hz = hist[i][2];
-
-		hist[i][0] = sumx;
-		hist[i][1] = sumy;
-		hist[i][2] = sumz;
-
-		sumx += hx;
-		sumy += hy;
-		sumz += hz;
+		unsigned int h = hist[i];
+		hist[i] = sum; // first output slot for key value i
+		sum += h;
 	}
 
-	assert(sumx == count && sumy == count && sumz == count);
+	assert(sum == count); // every element landed in exactly one bucket
+
+	// reorder values
+	for (size_t i = 0; i < count; ++i)
+	{
+		unsigned int id = keys[source[i]]; // key of the element currently at position i of the input order
+
+		destination[hist[id]++] = source[i]; // stable: elements with equal keys keep their relative source order
+	}
 }
 
-static void radixPass(unsigned int* destination, const unsigned int* source, const unsigned int* keys, size_t count, unsigned int (&hist)[1024][3], int pass)
+static void computeHistogram(unsigned int (&hist)[256][2], const unsigned short* data, size_t count)
 {
-	int bitoff = pass * 10;
+	memset(hist, 0, sizeof(hist));
+
+	// compute 2 8-bit histograms in parallel
+	for (size_t i = 0; i < count; ++i)
+	{
+		unsigned long long id = data[i];
+
+		hist[(id >> 0) & 255][0]++; // column 0 counts the low byte of each 16-bit key
+		hist[(id >> 8) & 255][1]++; // column 1 counts the high byte
+	}
+
+	unsigned int sum0 = 0, sum1 = 0;
+
+	// replace histogram data with prefix histogram sums in-place
+	for (int i = 0; i < 256; ++i)
+	{
+		unsigned int h0 = hist[i][0], h1 = hist[i][1];
+
+		hist[i][0] = sum0; // exclusive prefix sum: first output slot for low-byte value i (consumed by radixPass with pass 0)
+		hist[i][1] = sum1; // same for high-byte value i (consumed by radixPass with pass 1)
+
+		sum0 += h0;
+		sum1 += h1;
+	}
+
+	assert(sum0 == count && sum1 == count); // each key contributes once to each column
+}
+
+static void radixPass(unsigned int* destination, const unsigned int* source, const unsigned short* keys, size_t count, unsigned int (&hist)[256][2], int pass)
+{ // one 8-bit counting-sort pass: scatters source[] into destination[] by byte `pass` of keys[source[i]], advancing the slots in hist column `pass`
+	int bitoff = pass * 8; // pass 0 selects the low byte, pass 1 the high byte
 
 	for (size_t i = 0; i < count; ++i)
 	{
-		unsigned int id = (keys[source[i]] >> bitoff) & 1023;
+		unsigned int id = unsigned(keys[source[i]] >> bitoff) & 255; // 8-bit digit of this element's key for the current pass
 
 		destination[hist[id][pass]++] = source[i];
 	}
 }
 
+static void partitionPoints(unsigned int* target, const unsigned int* order, const unsigned char* sides, size_t split, size_t count)
+{ // copies order[0..count) into target, placing entries whose side flag is 0 at [0, split) and the rest at [split, count), keeping relative order within each side
+	size_t l = 0, r = split; // next free slot in the left / right part of target
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		unsigned char side = sides[order[i]]; // flag is looked up by point id, not by position; expected to be 0 or 1 (as written by splitPoints)
+		target[side ? r : l] = order[i];
+		l += 1; // branchless cursor update: together with the next two lines exactly one of l / r advances by one
+		l -= side;
+		r += side;
+	}
+
+	assert(l == split && r == count); // holds when exactly `split` of the entries are flagged 0
+}
+
+static void splitPoints(unsigned int* destination, unsigned int* orderx, unsigned int* ordery, unsigned int* orderz, const unsigned long long* keys, size_t count, void* scratch, size_t cluster_size)
+{ // recursively splits `count` points along the axis with the largest key extent until a piece has at most cluster_size points, then copies that piece to destination
+	if (count <= cluster_size)
+	{
+		memcpy(destination, orderx, count * sizeof(unsigned int)); // leaf: emit this piece's point ids in the order held by orderx
+		return;
+	}
+
+	unsigned int* axes[3] = {orderx, ordery, orderz};
+
+	int bestk = -1;
+	unsigned int bestdim = 0;
+
+	for (int k = 0; k < 3; ++k)
+	{
+		const unsigned int mask = (1 << 20) - 1; // each axis occupies a 20-bit field of the key, starting at bit k * 20
+		unsigned int dim = (unsigned(keys[axes[k][count - 1]] >> (k * 20)) & mask) - (unsigned(keys[axes[k][0]] >> (k * 20)) & mask); // last minus first key on axis k; this is the extent because the caller passes axes[k] sorted by that field
+
+		if (dim >= bestdim) // >= so that bestk is always assigned, even when every extent is 0; ties go to the later axis
+		{
+			bestk = k;
+			bestdim = dim;
+		}
+	}
+
+	assert(bestk >= 0);
+
+	// split roughly in half, with the left split always being aligned to cluster size
+	size_t split = ((count / 2) + cluster_size - 1) / cluster_size * cluster_size;
+	assert(split > 0 && split < count);
+
+	// mark sides of split for partitioning
+	unsigned char* sides = static_cast<unsigned char*>(scratch) + count * sizeof(unsigned int); // flags sit after the first `count` indices of scratch (used as temp below) and are indexed by point id
+
+	for (size_t i = 0; i < split; ++i)
+		sides[axes[bestk][i]] = 0; // first `split` points along the chosen axis go left
+
+	for (size_t i = split; i < count; ++i)
+		sides[axes[bestk][i]] = 1; // the remainder goes right
+
+	// partition all axes into two sides, maintaining order
+	unsigned int* temp = static_cast<unsigned int*>(scratch); // holds a copy of the axis being partitioned
+
+	for (int k = 0; k < 3; ++k)
+	{
+		if (k == bestk)
+			continue; // the chosen axis is already partitioned: its first `split` entries are the left side by construction
+
+		unsigned int* axis = axes[k];
+		memcpy(temp, axis, sizeof(unsigned int) * count);
+		partitionPoints(axis, temp, sides, split, count); // rewrites axis in place from the copy
+	}
+
+	// recursion depth is logarithmic and bounded as we always split in approximately half
+	splitPoints(destination, orderx, ordery, orderz, keys, split, scratch, cluster_size);
+	splitPoints(destination + split, orderx + split, ordery + split, orderz + split, keys, count - split, scratch, cluster_size);
+}
+
 } // namespace meshopt
 
 void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
@@ -118,21 +224,26 @@ void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_pos
 
 	meshopt_Allocator allocator;
 
-	unsigned int* keys = allocator.allocate<unsigned int>(vertex_count);
-	computeOrder(keys, vertex_positions, vertex_count, vertex_positions_stride);
+	unsigned long long* keys = allocator.allocate<unsigned long long>(vertex_count);
+	computeOrder(keys, vertex_positions, vertex_count, vertex_positions_stride, /* morton= */ true);
 
-	unsigned int hist[1024][3];
-	computeHistogram(hist, keys, vertex_count);
-
-	unsigned int* scratch = allocator.allocate<unsigned int>(vertex_count);
+	unsigned int* scratch = allocator.allocate<unsigned int>(vertex_count * 2); // 4b for order + 2b for keys
+	unsigned short* keyk = (unsigned short*)(scratch + vertex_count);
 
 	for (size_t i = 0; i < vertex_count; ++i)
 		destination[i] = unsigned(i);
 
-	// 3-pass radix sort computes the resulting order into scratch
-	radixPass(scratch, destination, keys, vertex_count, hist, 0);
-	radixPass(destination, scratch, keys, vertex_count, hist, 1);
-	radixPass(scratch, destination, keys, vertex_count, hist, 2);
+	unsigned int* order[] = {scratch, destination};
+
+	// 5-pass radix sort computes the resulting order into scratch
+	for (int k = 0; k < 5; ++k)
+	{
+		// copy 10-bit key segments into keyk to reduce cache pressure during radix pass
+		for (size_t i = 0; i < vertex_count; ++i)
+			keyk[i] = (unsigned short)((keys[i] >> (k * 10)) & 1023);
+
+		radixSort10(order[k % 2], order[(k + 1) % 2], keyk, vertex_count);
+	}
 
 	// since our remap table is mapping old=>new, we need to reverse it
 	for (size_t i = 0; i < vertex_count; ++i)
@@ -192,3 +303,39 @@ void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int*
 		destination[r * 3 + 2] = c;
 	}
 }
+
+void meshopt_spatialClusterPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t cluster_size)
+{ // sorts the point ids independently along x, y and z, then lets splitPoints fill destination with the ids grouped into pieces of at most cluster_size
+	using namespace meshopt;
+
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+	assert(cluster_size > 0);
+
+	meshopt_Allocator allocator;
+
+	unsigned long long* keys = allocator.allocate<unsigned long long>(vertex_count);
+	computeOrder(keys, vertex_positions, vertex_count, vertex_positions_stride, /* morton= */ false); // morton=false packs the quantized x / y / z at bit offsets 0 / 20 / 40
+
+	unsigned int* order = allocator.allocate<unsigned int>(vertex_count * 3); // three index arrays of vertex_count entries, one per axis
+	unsigned int* scratch = allocator.allocate<unsigned int>(vertex_count * 2); // 4b for order + 1b for side or 2b for keys
+	unsigned short* keyk = reinterpret_cast<unsigned short*>(scratch + vertex_count); // second half of scratch doubles as per-axis key storage
+
+	for (int k = 0; k < 3; ++k)
+	{
+		// copy 16-bit key segments into keyk to reduce cache pressure during radix pass
+		for (size_t i = 0; i < vertex_count; ++i)
+			keyk[i] = (unsigned short)(keys[i] >> (k * 20)); // the cast keeps the low 16 bits of axis k's field
+
+		unsigned int hist[256][2];
+		computeHistogram(hist, keyk, vertex_count);
+
+		for (size_t i = 0; i < vertex_count; ++i)
+			order[k * vertex_count + i] = unsigned(i); // start from the identity order
+
+		radixPass(scratch, order + k * vertex_count, keyk, vertex_count, hist, 0); // low byte: order -> scratch
+		radixPass(order + k * vertex_count, scratch, keyk, vertex_count, hist, 1); // high byte: scratch -> order, leaving axis k sorted by its 16-bit key
+	}
+
+	splitPoints(destination, order, order + vertex_count, order + 2 * vertex_count, keys, vertex_count, scratch, cluster_size);
+}
diff --git a/Source/ThirdParty/meshoptimizer/stripifier.cpp b/Source/ThirdParty/meshoptimizer/stripifier.cpp
index d57fb512b..4043195ae 100644
--- a/Source/ThirdParty/meshoptimizer/stripifier.cpp
+++ b/Source/ThirdParty/meshoptimizer/stripifier.cpp
@@ -10,14 +10,14 @@
 namespace meshopt
 {
 
-static unsigned int findStripFirst(const unsigned int buffer[][3], unsigned int buffer_size, const unsigned int* valence)
+static unsigned int findStripFirst(const unsigned int buffer[][3], unsigned int buffer_size, const unsigned char* valence)
 {
 	unsigned int index = 0;
 	unsigned int iv = ~0u;
 
 	for (size_t i = 0; i < buffer_size; ++i)
 	{
-		unsigned int va = valence[buffer[i][0]], vb = valence[buffer[i][1]], vc = valence[buffer[i][2]];
+		unsigned char va = valence[buffer[i][0]], vb = valence[buffer[i][1]], vc = valence[buffer[i][2]];
 		unsigned int v = (va < vb && va < vc) ? va : (vb < vc ? vb : vc);
 
 		if (v < iv)
@@ -71,8 +71,9 @@ size_t meshopt_stripify(unsigned int* destination, const unsigned int* indices,
 	size_t strip_size = 0;
 
 	// compute vertex valence; this is used to prioritize starting triangle for strips
-	unsigned int* valence = allocator.allocate<unsigned int>(vertex_count);
-	memset(valence, 0, vertex_count * sizeof(unsigned int));
+	// note: we use 8-bit counters for performance; for outlier vertices the valence is incorrect but that just affects the heuristic
+	unsigned char* valence = allocator.allocate<unsigned char>(vertex_count);
+	memset(valence, 0, vertex_count);
 
 	for (size_t i = 0; i < index_count; ++i)
 	{
@@ -151,7 +152,7 @@ size_t meshopt_stripify(unsigned int* destination, const unsigned int* indices,
 		{
 			// if we didn't find anything, we need to find the next new triangle
 			// we use a heuristic to maximize the strip length
-			unsigned int i = findStripFirst(buffer, buffer_size, &valence[0]);
+			unsigned int i = findStripFirst(buffer, buffer_size, valence);
 			unsigned int a = buffer[i][0], b = buffer[i][1], c = buffer[i][2];
 
 			// ordered removal from the buffer
diff --git a/Source/ThirdParty/meshoptimizer/vertexcodec.cpp b/Source/ThirdParty/meshoptimizer/vertexcodec.cpp
index 94f7a1adc..7085cce32 100644
--- a/Source/ThirdParty/meshoptimizer/vertexcodec.cpp
+++ b/Source/ThirdParty/meshoptimizer/vertexcodec.cpp
@@ -60,6 +60,15 @@
 #define SIMD_LATENCYOPT
 #endif
 
+// In switch dispatch, marking default case as unreachable allows to remove redundant bounds checks
+#if defined(__GNUC__)
+#define SIMD_UNREACHABLE() __builtin_unreachable()
+#elif defined(_MSC_VER)
+#define SIMD_UNREACHABLE() __assume(false)
+#else
+#define SIMD_UNREACHABLE() assert(!"Unreachable")
+#endif
+
 #endif // !MESHOPTIMIZER_NO_SIMD
 
 #ifdef SIMD_SSE
@@ -90,6 +99,14 @@
 #include <wasm_simd128.h>
 #endif
 
+#ifndef TRACE
+#define TRACE 0
+#endif
+
+#if TRACE
+#include <stdio.h>
+#endif
+
 #ifdef SIMD_WASM
 #define wasmx_splat_v32x4(v, i) wasm_i32x4_shuffle(v, v, i, i, i, i)
 #define wasmx_unpacklo_v8x16(a, b) wasm_i8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)
@@ -105,50 +122,76 @@ namespace meshopt
 
 const unsigned char kVertexHeader = 0xa0;
 
-static int gEncodeVertexVersion = 0;
+static int gEncodeVertexVersion = 1;
+const int kDecodeVertexVersion = 1;
 
 const size_t kVertexBlockSizeBytes = 8192;
 const size_t kVertexBlockMaxSize = 256;
 const size_t kByteGroupSize = 16;
 const size_t kByteGroupDecodeLimit = 24;
-const size_t kTailMaxSize = 32;
+const size_t kTailMinSizeV0 = 32;
+const size_t kTailMinSizeV1 = 24;
+
+static const int kBitsV0[4] = {0, 2, 4, 8};
+static const int kBitsV1[5] = {0, 1, 2, 4, 8};
+
+const int kEncodeDefaultLevel = 2;
 
 static size_t getVertexBlockSize(size_t vertex_size)
 {
-	// make sure the entire block fits into the scratch buffer
-	size_t result = kVertexBlockSizeBytes / vertex_size;
-
-	// align to byte group size; we encode each byte as a byte group
-	// if vertex block is misaligned, it results in wasted bytes, so just truncate the block size
-	result &= ~(kByteGroupSize - 1);
+	// make sure the entire block fits into the scratch buffer and is aligned to byte group size
+	// note: the block size is implicitly part of the format, so we can't change it without breaking compatibility
+	size_t result = (kVertexBlockSizeBytes / vertex_size) & ~(kByteGroupSize - 1);
 
 	return (result < kVertexBlockMaxSize) ? result : kVertexBlockMaxSize;
 }
 
-inline unsigned char zigzag8(unsigned char v)
+inline unsigned int rotate(unsigned int v, int r)
 {
-	return ((signed char)(v) >> 7) ^ (v << 1);
+	return (v << r) | (v >> ((32 - r) & 31)); // 32-bit rotate left by r (for r in [0, 31]); the & 31 turns the right shift into 0 instead of 32 when r == 0
 }
 
-inline unsigned char unzigzag8(unsigned char v)
+template <typename T>
+inline T zigzag(T v)
 {
-	return -(v & 1) ^ (v >> 1);
+	return (0 - (v >> (sizeof(T) * 8 - 1))) ^ (v << 1); // top bit of v selects an all-ones or all-zeros mask that is XORed into v << 1; assumes T is unsigned, as in the encodeDeltas1 instantiations
 }
 
+template <typename T>
+inline T unzigzag(T v)
+{
+	return (0 - (v & 1)) ^ (v >> 1); // inverse of zigzag: the low bit selects the XOR mask, the remaining bits carry the value
+}
+
+#if TRACE
+struct Stats
+{
+	size_t size; // encoded bytes, accumulated per vertex byte offset in encodeVertexBlock
+	size_t header;  // bytes for header
+	size_t bitg[9]; // bytes for bit groups, indexed by bits per value (0..8)
+	size_t bitc[8]; // bit consistency: how many bits are shared between all bytes in a group
+	size_t ctrl[4]; // number of control groups, indexed by the 2-bit control code
+};
+
+static Stats* bytestats = NULL; // entry that encodeBytes accumulates into; encodeVertexBlock points it at vertexstats[k] while encoding byte k
+static Stats vertexstats[256]; // indexed by byte offset within a vertex
+#endif
+
 static bool encodeBytesGroupZero(const unsigned char* buffer)
 {
-	for (size_t i = 0; i < kByteGroupSize; ++i)
-		if (buffer[i])
-			return false;
+	assert(kByteGroupSize == sizeof(unsigned long long) * 2); // the two-word test below must cover exactly one byte group
 
-	return true;
+	unsigned long long v[2];
+	memcpy(v, buffer, sizeof(v)); // copy instead of casting the pointer - presumably to stay safe for unaligned buffers
+
+	return (v[0] | v[1]) == 0; // true only when all kByteGroupSize bytes are zero
 }
 
 static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits)
 {
-	assert(bits >= 1 && bits <= 8);
+	assert(bits >= 0 && bits <= 8);
 
-	if (bits == 1)
+	if (bits == 0)
 		return encodeBytesGroupZero(buffer) ? 0 : size_t(-1);
 
 	if (bits == 8)
@@ -166,9 +209,10 @@ static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits)
 
 static unsigned char* encodeBytesGroup(unsigned char* data, const unsigned char* buffer, int bits)
 {
-	assert(bits >= 1 && bits <= 8);
+	assert(bits >= 0 && bits <= 8);
+	assert(kByteGroupSize % 8 == 0);
 
-	if (bits == 1)
+	if (bits == 0)
 		return data;
 
 	if (bits == 8)
@@ -196,21 +240,27 @@ static unsigned char* encodeBytesGroup(unsigned char* data, const unsigned char*
 			byte |= enc;
 		}
 
+		// encode 1-bit groups in reverse bit order
+		// this makes them faster to decode alongside other groups
+		if (bits == 1)
+			byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32);
+
 		*data++ = byte;
 	}
 
 	for (size_t i = 0; i < kByteGroupSize; ++i)
 	{
-		if (buffer[i] >= sentinel)
-		{
-			*data++ = buffer[i];
-		}
+		unsigned char v = buffer[i];
+
+		// branchless append of out-of-range values
+		*data = v;
+		data += v >= sentinel;
 	}
 
 	return data;
 }
 
-static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, const unsigned char* buffer, size_t buffer_size)
+static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, const unsigned char* buffer, size_t buffer_size, const int bits[4])
 {
 	assert(buffer_size % kByteGroupSize == 0);
 
@@ -226,69 +276,301 @@ static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end,
 
 	memset(header, 0, header_size);
 
+	int last_bits = -1;
+
 	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
 	{
 		if (size_t(data_end - data) < kByteGroupDecodeLimit)
 			return NULL;
 
-		int best_bits = 8;
-		size_t best_size = encodeBytesGroupMeasure(buffer + i, 8);
+		int best_bitk = 3;
+		size_t best_size = encodeBytesGroupMeasure(buffer + i, bits[best_bitk]);
 
-		for (int bits = 1; bits < 8; bits *= 2)
+		for (int bitk = 0; bitk < 3; ++bitk)
 		{
-			size_t size = encodeBytesGroupMeasure(buffer + i, bits);
+			size_t size = encodeBytesGroupMeasure(buffer + i, bits[bitk]);
 
-			if (size < best_size)
+			// favor consistent bit selection across groups, but never replace literals
+			if (size < best_size || (size == best_size && bits[bitk] == last_bits && bits[best_bitk] != 8))
 			{
-				best_bits = bits;
+				best_bitk = bitk;
 				best_size = size;
 			}
 		}
 
-		int bitslog2 = (best_bits == 1) ? 0 : (best_bits == 2 ? 1 : (best_bits == 4 ? 2 : 3));
-		assert((1 << bitslog2) == best_bits);
-
 		size_t header_offset = i / kByteGroupSize;
+		header[header_offset / 4] |= best_bitk << ((header_offset % 4) * 2);
 
-		header[header_offset / 4] |= bitslog2 << ((header_offset % 4) * 2);
-
+		int best_bits = bits[best_bitk];
 		unsigned char* next = encodeBytesGroup(data, buffer + i, best_bits);
 
 		assert(data + best_size == next);
 		data = next;
+		last_bits = best_bits;
+
+#if TRACE
+		bytestats->bitg[best_bits] += best_size;
+#endif
 	}
 
+#if TRACE
+	bytestats->header += header_size;
+#endif
+
 	return data;
 }
 
-static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data_end, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
+template <typename T, bool Xor>
+static void encodeDeltas1(unsigned char* buffer, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, const unsigned char last_vertex[256], size_t k, int rot)
+{ // for each vertex, writes to buffer[i] one byte of the delta against the previous vertex, where deltas are computed on the sizeof(T)-byte little-endian field that contains byte k
+	size_t k0 = k & ~(sizeof(T) - 1); // byte offset of the start of that field (sizeof(T) is a power of two for the types used by encodeDeltas)
+	int ks = (k & (sizeof(T) - 1)) * 8; // bit offset of byte k inside the field
+
+	T p = last_vertex[k0]; // previous value, seeded from last_vertex
+	for (size_t j = 1; j < sizeof(T); ++j)
+		p |= T(last_vertex[k0 + j]) << (j * 8);
+
+	const unsigned char* vertex = vertex_data + k0;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		T v = vertex[0]; // current value, assembled byte by byte in little-endian order
+		for (size_t j = 1; j < sizeof(T); ++j)
+			v |= vertex[j] << (j * 8);
+
+		T d = Xor ? T(rotate(v ^ p, rot)) : zigzag(T(v - p)); // Xor: rotated bitwise difference; otherwise zigzag-coded arithmetic difference
+
+		buffer[i] = (unsigned char)(d >> ks); // keep only the byte of the delta that corresponds to byte k
+		p = v;
+		vertex += vertex_size; // advance to the same field in the next vertex
+	}
+}
+
+static void encodeDeltas(unsigned char* buffer, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, const unsigned char last_vertex[256], size_t k, int channel)
+{ // dispatches to encodeDeltas1 based on the low 2 bits of channel
+	switch (channel & 3)
+	{
+	case 0: // byte-wise deltas, zigzag coded
+		return encodeDeltas1<unsigned char, false>(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, 0);
+	case 1: // 16-bit deltas, zigzag coded
+		return encodeDeltas1<unsigned short, false>(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, 0);
+	case 2: // 32-bit XOR deltas, rotated left by the amount stored in channel >> 4
+		return encodeDeltas1<unsigned int, true>(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, channel >> 4);
+	default:
+		assert(!"Unsupported channel encoding"); // unreachable
+	}
+}
+
+static int estimateBits(unsigned char v)
+{
+	return v <= 15 ? (v <= 3 ? (v == 0 ? 0 : 2) : 4) : 8; // bucket for v: 0 -> 0, 1..3 -> 2, 4..15 -> 4, 16..255 -> 8
+}
+
+static int estimateRotate(const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, size_t k, size_t group_size)
+{ // returns the left-rotation in [0, 8) that minimizes the summed estimateBits cost of the per-group XOR masks for the 32-bit little-endian value at byte offset k
+	size_t sizes[8] = {}; // accumulated cost for each candidate rotation
+
+	const unsigned char* vertex = vertex_data + k;
+	unsigned int last = vertex[0] | (vertex[1] << 8) | (vertex[2] << 16) | (vertex[3] << 24); // seeded with the first vertex's own value, so its delta is 0
+
+	for (size_t i = 0; i < vertex_count; i += group_size)
+	{
+		unsigned int bitg = 0; // OR of every XOR delta in this group: a set bit means that bit changed somewhere in the group
+
+		// calculate bit consistency mask for the group
+		for (size_t j = 0; j < group_size && i + j < vertex_count; ++j)
+		{
+			unsigned int v = vertex[0] | (vertex[1] << 8) | (vertex[2] << 16) | (vertex[3] << 24);
+			unsigned int d = v ^ last;
+
+			bitg |= d;
+			last = v;
+			vertex += vertex_size;
+		}
+
+#if TRACE
+		for (int j = 0; j < 32; ++j)
+			vertexstats[k + (j / 8)].bitc[j % 8] += (i + group_size < vertex_count ? group_size : vertex_count - i) * (1 - ((bitg >> j) & 1));
+#endif
+
+		for (int j = 0; j < 8; ++j)
+		{
+			unsigned int bitr = rotate(bitg, j); // group mask as it would look after rotating deltas left by j
+
+			sizes[j] += estimateBits((unsigned char)(bitr >> 0)) + estimateBits((unsigned char)(bitr >> 8)); // cost is summed over the four bytes of the rotated mask
+			sizes[j] += estimateBits((unsigned char)(bitr >> 16)) + estimateBits((unsigned char)(bitr >> 24));
+		}
+	}
+
+	int best_rot = 0;
+	for (int rot = 1; rot < 8; ++rot)
+		best_rot = (sizes[rot] < sizes[best_rot]) ? rot : best_rot; // strict < keeps the smaller rotation on ties
+
+	return best_rot;
+}
+
+static int estimateChannel(const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, size_t k, size_t vertex_block_size, size_t block_skip, int max_channel, int xor_rot)
+{ // returns the channel mode in [0, max_channel) with the smallest measured size for the 4 bytes starting at offset k; mode 2 is returned with xor_rot packed into bits 4 and up
+	unsigned char block[kVertexBlockMaxSize]; // delta bytes for one vertex byte of one block
+	assert(vertex_block_size <= kVertexBlockMaxSize);
+
+	unsigned char last_vertex[256] = {};
+
+	size_t sizes[3] = {}; // accumulated size per channel mode
+	assert(max_channel <= 3);
+
+	for (size_t i = 0; i < vertex_count; i += vertex_block_size * block_skip) // only every block_skip-th block is measured
+	{
+		size_t block_size = i + vertex_block_size < vertex_count ? vertex_block_size : vertex_count - i;
+		size_t block_size_aligned = (block_size + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
+
+		memcpy(last_vertex, vertex_data + (i == 0 ? 0 : i - 1) * vertex_size, vertex_size); // predecessor is the vertex right before the block; for the first block it is vertex 0 itself
+
+		// we sometimes encode elements we didn't fill when rounding to kByteGroupSize
+		if (block_size < block_size_aligned)
+			memset(block + block_size, 0, block_size_aligned - block_size);
+
+		for (int channel = 0; channel < max_channel; ++channel)
+			for (size_t j = 0; j < 4; ++j)
+			{
+				encodeDeltas(block, vertex_data + i * vertex_size, block_size, vertex_size, last_vertex, k + j, channel | (xor_rot << 4)); // deltas of byte k + j under this channel mode
+
+				for (size_t ig = 0; ig < block_size; ig += kByteGroupSize)
+				{
+					// to maximize encoding performance we only evaluate 1/2/4/8 bit groups
+					size_t size1 = encodeBytesGroupMeasure(block + ig, 1);
+					size_t size2 = encodeBytesGroupMeasure(block + ig, 2);
+					size_t size4 = encodeBytesGroupMeasure(block + ig, 4);
+					size_t size8 = encodeBytesGroupMeasure(block + ig, 8);
+
+					size_t best_size = size1 < size2 ? size1 : size2;
+					best_size = best_size < size4 ? best_size : size4;
+					best_size = best_size < size8 ? best_size : size8;
+
+					sizes[channel] += best_size; // cheapest of the four widths for this group
+				}
+			}
+	}
+
+	int best_channel = 0;
+	for (int channel = 1; channel < max_channel; ++channel)
+		best_channel = (sizes[channel] < sizes[best_channel]) ? channel : best_channel; // strict < keeps the lower-numbered mode on ties
+
+	return best_channel == 2 ? best_channel | (xor_rot << 4) : best_channel;
+}
+
+static bool estimateControlZero(const unsigned char* buffer, size_t vertex_count_aligned)
+{ // returns true when every byte group in buffer[0, vertex_count_aligned) is entirely zero
+	for (size_t i = 0; i < vertex_count_aligned; i += kByteGroupSize)
+		if (!encodeBytesGroupZero(buffer + i))
+			return false; // stop at the first group with a non-zero byte
+
+	return true;
+}
+
+static int estimateControl(const unsigned char* buffer, size_t vertex_count, size_t vertex_count_aligned, int level)
+{ // picks the 2-bit control code for one delta byte stream: 2 = all zero, 3 = literal, 0 / 1 = bit groups (encodeVertexBlock passes kBitsV1 + code to encodeBytes)
+	if (estimateControlZero(buffer, vertex_count_aligned))
+		return 2; // zero encoding
+
+	if (level == 0)
+		return 1; // 1248 encoding in level 0 for encoding speed
+
+	// round number of groups to 4 to get number of header bytes
+	size_t header_size = (vertex_count_aligned / kByteGroupSize + 3) / 4;
+
+	size_t est_bytes0 = header_size, est_bytes1 = header_size; // both estimates start from the group header cost
+
+	for (size_t i = 0; i < vertex_count_aligned; i += kByteGroupSize)
+	{
+		// assumes kBitsV1[] = {0, 1, 2, 4, 8} for performance
+		size_t size0 = encodeBytesGroupMeasure(buffer + i, 0);
+		size_t size1 = encodeBytesGroupMeasure(buffer + i, 1);
+		size_t size2 = encodeBytesGroupMeasure(buffer + i, 2);
+		size_t size4 = encodeBytesGroupMeasure(buffer + i, 4);
+		size_t size8 = encodeBytesGroupMeasure(buffer + i, 8);
+
+		// both control modes have access to 1/2/4 bit encoding
+		size_t size12 = size1 < size2 ? size1 : size2;
+		size_t size124 = size12 < size4 ? size12 : size4;
+
+		// each control mode has access to 0/8 bit encoding respectively
+		est_bytes0 += size124 < size0 ? size124 : size0; // code 0: widths 0/1/2/4
+		est_bytes1 += size124 < size8 ? size124 : size8; // code 1: widths 1/2/4/8
+	}
+
+	// pick shortest control entry but prefer literal encoding
+	if (est_bytes0 < vertex_count || est_bytes1 < vertex_count) // literal costs vertex_count bytes, so a bit-group mode must be strictly smaller to win
+		return est_bytes0 < est_bytes1 ? 0 : 1;
+	else
+		return 3; // literal encoding
+}
+
+static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data_end, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256], const unsigned char* channels, int version, int level)
 {
 	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
+	assert(vertex_size % 4 == 0);
 
 	unsigned char buffer[kVertexBlockMaxSize];
 	assert(sizeof(buffer) % kByteGroupSize == 0);
 
+	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
+
 	// we sometimes encode elements we didn't fill when rounding to kByteGroupSize
 	memset(buffer, 0, sizeof(buffer));
 
+	size_t control_size = version == 0 ? 0 : vertex_size / 4;
+	if (size_t(data_end - data) < control_size)
+		return NULL;
+
+	unsigned char* control = data;
+	data += control_size;
+
+	memset(control, 0, control_size);
+
 	for (size_t k = 0; k < vertex_size; ++k)
 	{
-		size_t vertex_offset = k;
+		encodeDeltas(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, version == 0 ? 0 : channels[k / 4]);
 
-		unsigned char p = last_vertex[k];
+#if TRACE
+		const unsigned char* olddata = data;
+		bytestats = &vertexstats[k];
+#endif
 
-		for (size_t i = 0; i < vertex_count; ++i)
+		int ctrl = 0;
+
+		if (version != 0)
 		{
-			buffer[i] = zigzag8(vertex_data[vertex_offset] - p);
+			ctrl = estimateControl(buffer, vertex_count, vertex_count_aligned, level);
 
-			p = vertex_data[vertex_offset];
+			assert(unsigned(ctrl) < 4);
+			control[k / 4] |= ctrl << ((k % 4) * 2);
 
-			vertex_offset += vertex_size;
+#if TRACE
+			vertexstats[k].ctrl[ctrl]++;
+#endif
 		}
 
-		data = encodeBytes(data, data_end, buffer, (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1));
-		if (!data)
-			return NULL;
+		if (ctrl == 3)
+		{
+			// literal encoding
+			if (size_t(data_end - data) < vertex_count)
+				return NULL;
+
+			memcpy(data, buffer, vertex_count);
+			data += vertex_count;
+		}
+		else if (ctrl != 2) // non-zero encoding
+		{
+			data = encodeBytes(data, data_end, buffer, vertex_count_aligned, version == 0 ? kBitsV0 : kBitsV1 + ctrl);
+			if (!data)
+				return NULL;
+		}
+
+#if TRACE
+		bytestats = NULL;
+		vertexstats[k].size += data - olddata;
+#endif
 	}
 
 	memcpy(last_vertex, &vertex_data[vertex_size * (vertex_count - 1)], vertex_size);
@@ -297,7 +579,7 @@ static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data
 }
 
 #if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_AVX) && !defined(SIMD_WASM))
-static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bitslog2)
+static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bits)
 {
 #define READ() byte = *data++
 #define NEXT(bits) enc = byte >> (8 - bits), byte <<= bits, encv = *data_var, *buffer++ = (enc == (1 << bits) - 1) ? encv : enc, data_var += (enc == (1 << bits) - 1)
@@ -305,12 +587,24 @@ static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned
 	unsigned char byte, enc, encv;
 	const unsigned char* data_var;
 
-	switch (bitslog2)
+	switch (bits)
 	{
 	case 0:
 		memset(buffer, 0, kByteGroupSize);
 		return data;
 	case 1:
+		data_var = data + 2;
+
+		// 2 groups with 8 1-bit values in each byte (reversed from the order in other groups)
+		READ();
+		byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32);
+		NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1);
+		READ();
+		byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32);
+		NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1);
+
+		return data_var;
+	case 2:
 		data_var = data + 4;
 
 		// 4 groups with 4 2-bit values in each byte
@@ -320,7 +614,7 @@ static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned
 		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
 
 		return data_var;
-	case 2:
+	case 4:
 		data_var = data + 8;
 
 		// 8 groups with 2 4-bit values in each byte
@@ -334,11 +628,11 @@ static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned
 		READ(), NEXT(4), NEXT(4);
 
 		return data_var;
-	case 3:
+	case 8:
 		memcpy(buffer, data, kByteGroupSize);
 		return data + kByteGroupSize;
 	default:
-		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
+		assert(!"Unexpected bit length"); // unreachable
 		return data;
 	}
 
@@ -346,18 +640,16 @@ static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned
 #undef NEXT
 }
 
-static const unsigned char* decodeBytes(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
+static const unsigned char* decodeBytes(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size, const int* bits)
 {
 	assert(buffer_size % kByteGroupSize == 0);
 
-	const unsigned char* header = data;
-
 	// round number of groups to 4 to get number of header bytes
 	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
-
 	if (size_t(data_end - data) < header_size)
 		return NULL;
 
+	const unsigned char* header = data;
 	data += header_size;
 
 	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
@@ -366,43 +658,109 @@ static const unsigned char* decodeBytes(const unsigned char* data, const unsigne
 			return NULL;
 
 		size_t header_offset = i / kByteGroupSize;
+		int bitsk = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;
 
-		int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;
-
-		data = decodeBytesGroup(data, buffer + i, bitslog2);
+		data = decodeBytesGroup(data, buffer + i, bits[bitsk]);
 	}
 
 	return data;
 }
 
-static const unsigned char* decodeVertexBlock(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
+template <typename T, bool Xor>
+static void decodeDeltas1(const unsigned char* buffer, unsigned char* transposed, size_t vertex_count, size_t vertex_size, const unsigned char* last_vertex, int rot)
 {
-	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
-
-	unsigned char buffer[kVertexBlockMaxSize];
-	unsigned char transposed[kVertexBlockSizeBytes];
-
-	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
-
-	for (size_t k = 0; k < vertex_size; ++k)
+	for (size_t k = 0; k < 4; k += sizeof(T))
 	{
-		data = decodeBytes(data, data_end, buffer, vertex_count_aligned);
-		if (!data)
-			return NULL;
-
 		size_t vertex_offset = k;
 
-		unsigned char p = last_vertex[k];
+		T p = last_vertex[0];
+		for (size_t j = 1; j < sizeof(T); ++j)
+			p |= last_vertex[j] << (8 * j);
 
 		for (size_t i = 0; i < vertex_count; ++i)
 		{
-			unsigned char v = unzigzag8(buffer[i]) + p;
+			T v = buffer[i];
+			for (size_t j = 1; j < sizeof(T); ++j)
+				v |= buffer[i + vertex_count * j] << (8 * j);
+
+			v = Xor ? T(rotate(v, rot)) ^ p : unzigzag(v) + p;
+
+			for (size_t j = 0; j < sizeof(T); ++j)
+				transposed[vertex_offset + j] = (unsigned char)(v >> (j * 8));
 
-			transposed[vertex_offset] = v;
 			p = v;
 
 			vertex_offset += vertex_size;
 		}
+
+		buffer += vertex_count * sizeof(T);
+		last_vertex += sizeof(T);
+	}
+}
+
+static const unsigned char* decodeVertexBlock(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256], const unsigned char* channels, int version)
+{
+	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
+
+	unsigned char buffer[kVertexBlockMaxSize * 4];
+	unsigned char transposed[kVertexBlockSizeBytes];
+
+	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
+	assert(vertex_count <= vertex_count_aligned);
+
+	size_t control_size = version == 0 ? 0 : vertex_size / 4;
+	if (size_t(data_end - data) < control_size)
+		return NULL;
+
+	const unsigned char* control = data;
+	data += control_size;
+
+	for (size_t k = 0; k < vertex_size; k += 4)
+	{
+		unsigned char ctrl_byte = version == 0 ? 0 : control[k / 4];
+
+		for (size_t j = 0; j < 4; ++j)
+		{
+			int ctrl = (ctrl_byte >> (j * 2)) & 3;
+
+			if (ctrl == 3)
+			{
+				// literal encoding
+				if (size_t(data_end - data) < vertex_count)
+					return NULL;
+
+				memcpy(buffer + j * vertex_count, data, vertex_count);
+				data += vertex_count;
+			}
+			else if (ctrl == 2)
+			{
+				// zero encoding
+				memset(buffer + j * vertex_count, 0, vertex_count);
+			}
+			else
+			{
+				data = decodeBytes(data, data_end, buffer + j * vertex_count, vertex_count_aligned, version == 0 ? kBitsV0 : kBitsV1 + ctrl);
+				if (!data)
+					return NULL;
+			}
+		}
+
+		int channel = version == 0 ? 0 : channels[k / 4];
+
+		switch (channel & 3)
+		{
+		case 0:
+			decodeDeltas1<unsigned char, false>(buffer, transposed + k, vertex_count, vertex_size, last_vertex + k, 0);
+			break;
+		case 1:
+			decodeDeltas1<unsigned short, false>(buffer, transposed + k, vertex_count, vertex_size, last_vertex + k, 0);
+			break;
+		case 2:
+			decodeDeltas1<unsigned int, true>(buffer, transposed + k, vertex_count, vertex_size, last_vertex + k, (32 - (channel >> 4)) & 31);
+			break;
+		default:
+			return NULL; // invalid channel type
+		}
 	}
 
 	memcpy(vertex_data, transposed, vertex_count * vertex_size);
@@ -447,7 +805,7 @@ static bool gDecodeBytesGroupInitialized = decodeBytesGroupBuildTables();
 
 #ifdef SIMD_SSE
 SIMD_TARGET
-static __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1)
+inline __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1)
 {
 	__m128i sm0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask0]));
 	__m128i sm1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask1]));
@@ -459,11 +817,12 @@ static __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1)
 }
 
 SIMD_TARGET
-static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
+inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits)
 {
-	switch (bitslog2)
+	switch (hbits)
 	{
 	case 0:
+	case 4:
 	{
 		__m128i result = _mm_setzero_si128();
 
@@ -473,6 +832,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 	}
 
 	case 1:
+	case 6:
 	{
 #ifdef __GNUC__
 		typedef int __attribute__((aligned(1))) unaligned_int;
@@ -505,7 +865,6 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 		unsigned char mask1 = (unsigned char)(mask16 >> 8);
 
 		__m128i shuf = decodeShuffleMask(mask0, mask1);
-
 		__m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));
 
 		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
@@ -518,6 +877,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 	}
 
 	case 2:
+	case 7:
 	{
 #ifdef SIMD_LATENCYOPT
 		unsigned long long data64;
@@ -541,7 +901,6 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 		unsigned char mask1 = (unsigned char)(mask16 >> 8);
 
 		__m128i shuf = decodeShuffleMask(mask0, mask1);
-
 		__m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));
 
 		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
@@ -554,6 +913,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 	}
 
 	case 3:
+	case 8:
 	{
 		__m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
 
@@ -562,26 +922,46 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 		return data + 16;
 	}
 
+	case 5:
+	{
+		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 2));
+
+		unsigned char mask0 = data[0];
+		unsigned char mask1 = data[1];
+
+		__m128i shuf = decodeShuffleMask(mask0, mask1);
+		__m128i result = _mm_shuffle_epi8(rest, shuf);
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+		return data + 2 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+	}
+
 	default:
-		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
-		return data;
+		SIMD_UNREACHABLE(); // unreachable
 	}
 }
 #endif
 
 #ifdef SIMD_AVX
-static const __m128i decodeBytesGroupConfig[] = {
-    _mm_set1_epi8(3),
-    _mm_set1_epi8(15),
-    _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24),
-    _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56),
+static const __m128i kDecodeBytesGroupConfig[8][2] = {
+    {_mm_setzero_si128(), _mm_setzero_si128()},
+    {_mm_set1_epi8(3), _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24)},
+    {_mm_set1_epi8(15), _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56)},
+    {_mm_setzero_si128(), _mm_setzero_si128()},
+    {_mm_setzero_si128(), _mm_setzero_si128()},
+    {_mm_set1_epi8(1), _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)},
+    {_mm_set1_epi8(3), _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24)},
+    {_mm_set1_epi8(15), _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56)},
 };
 
-static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
+SIMD_TARGET
+inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits)
 {
-	switch (bitslog2)
+	switch (hbits)
 	{
 	case 0:
+	case 4:
 	{
 		__m128i result = _mm_setzero_si128();
 
@@ -590,16 +970,19 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 		return data;
 	}
 
-	case 1:
-	case 2:
+	case 5: // 1-bit
+	case 1: // 2-bit
+	case 6:
+	case 2: // 4-bit
+	case 7:
 	{
-		const unsigned char* skip = data + (bitslog2 << 2);
+		const unsigned char* skip = data + (2 << (hbits < 3 ? hbits : hbits - 5));
 
 		__m128i selb = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
 		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(skip));
 
-		__m128i sent = decodeBytesGroupConfig[bitslog2 - 1];
-		__m128i ctrl = decodeBytesGroupConfig[bitslog2 + 1];
+		__m128i sent = kDecodeBytesGroupConfig[hbits][0];
+		__m128i ctrl = kDecodeBytesGroupConfig[hbits][1];
 
 		__m128i selw = _mm_shuffle_epi32(selb, 0x44);
 		__m128i sel = _mm_and_si128(sent, _mm_multishift_epi64_epi8(ctrl, selw));
@@ -613,6 +996,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 	}
 
 	case 3:
+	case 8:
 	{
 		__m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
 
@@ -622,14 +1006,14 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 	}
 
 	default:
-		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
-		return data;
+		SIMD_UNREACHABLE(); // unreachable
 	}
 }
 #endif
 
 #ifdef SIMD_NEON
-static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8_t rest0, uint8x8_t rest1)
+SIMD_TARGET
+inline uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8_t rest0, uint8x8_t rest1)
 {
 	uint8x8_t sm0 = vld1_u8(kDecodeBytesGroupShuffle[mask0]);
 	uint8x8_t sm1 = vld1_u8(kDecodeBytesGroupShuffle[mask1]);
@@ -640,7 +1024,8 @@ static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8
 	return vcombine_u8(r0, r1);
 }
 
-static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1)
+SIMD_TARGET
+inline void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1)
 {
 	// magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
 	const uint64_t magic = 0x000103070f1f3f80ull;
@@ -651,11 +1036,13 @@ static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& m
 	mask1 = uint8_t((vgetq_lane_u64(mask2, 1) * magic) >> 56);
 }
 
-static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
+SIMD_TARGET
+inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits)
 {
-	switch (bitslog2)
+	switch (hbits)
 	{
 	case 0:
+	case 4:
 	{
 		uint8x16_t result = vdupq_n_u8(0);
 
@@ -665,6 +1052,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 	}
 
 	case 1:
+	case 6:
 	{
 #ifdef SIMD_LATENCYOPT
 		unsigned int data32;
@@ -702,6 +1090,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 	}
 
 	case 2:
+	case 7:
 	{
 #ifdef SIMD_LATENCYOPT
 		unsigned long long data64;
@@ -736,6 +1125,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 	}
 
 	case 3:
+	case 8:
 	{
 		uint8x16_t result = vld1q_u8(data);
 
@@ -744,30 +1134,42 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 		return data + 16;
 	}
 
+	case 5:
+	{
+		unsigned char mask0 = data[0];
+		unsigned char mask1 = data[1];
+
+		uint8x8_t rest0 = vld1_u8(data + 2);
+		uint8x8_t rest1 = vld1_u8(data + 2 + kDecodeBytesGroupCount[mask0]);
+
+		uint8x16_t result = shuffleBytes(mask0, mask1, rest0, rest1);
+
+		vst1q_u8(buffer, result);
+
+		return data + 2 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+	}
+
 	default:
-		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
-		return data;
+		SIMD_UNREACHABLE(); // unreachable
 	}
 }
 #endif
 
 #ifdef SIMD_WASM
 SIMD_TARGET
-static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1)
+inline v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1)
 {
 	v128_t sm0 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask0]);
 	v128_t sm1 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask1]);
 
-	v128_t sm1off = wasm_v128_load(&kDecodeBytesGroupCount[mask0]);
-	sm1off = wasm_i8x16_shuffle(sm1off, sm1off, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
-
+	v128_t sm1off = wasm_v128_load8_splat(&kDecodeBytesGroupCount[mask0]);
 	v128_t sm1r = wasm_i8x16_add(sm1, sm1off);
 
 	return wasmx_unpacklo_v64x2(sm0, sm1r);
 }
 
 SIMD_TARGET
-static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1)
+inline void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1)
 {
 	// magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
 	const uint64_t magic = 0x000103070f1f3f80ull;
@@ -777,11 +1179,12 @@ static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1
 }
 
 SIMD_TARGET
-static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
+inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits)
 {
-	switch (bitslog2)
+	switch (hbits)
 	{
 	case 0:
+	case 4:
 	{
 		v128_t result = wasm_i8x16_splat(0);
 
@@ -791,6 +1194,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 	}
 
 	case 1:
+	case 6:
 	{
 		v128_t sel2 = wasm_v128_load(data);
 		v128_t rest = wasm_v128_load(data + 4);
@@ -805,7 +1209,6 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 		wasmMoveMask(mask, mask0, mask1);
 
 		v128_t shuf = decodeShuffleMask(mask0, mask1);
-
 		v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask);
 
 		wasm_v128_store(buffer, result);
@@ -814,6 +1217,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 	}
 
 	case 2:
+	case 7:
 	{
 		v128_t sel4 = wasm_v128_load(data);
 		v128_t rest = wasm_v128_load(data + 8);
@@ -827,7 +1231,6 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 		wasmMoveMask(mask, mask0, mask1);
 
 		v128_t shuf = decodeShuffleMask(mask0, mask1);
-
 		v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask);
 
 		wasm_v128_store(buffer, result);
@@ -836,6 +1239,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 	}
 
 	case 3:
+	case 8:
 	{
 		v128_t result = wasm_v128_load(data);
 
@@ -844,16 +1248,30 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 		return data + 16;
 	}
 
+	case 5:
+	{
+		v128_t rest = wasm_v128_load(data + 2);
+
+		unsigned char mask0 = data[0];
+		unsigned char mask1 = data[1];
+
+		v128_t shuf = decodeShuffleMask(mask0, mask1);
+		v128_t result = wasm_i8x16_swizzle(rest, shuf);
+
+		wasm_v128_store(buffer, result);
+
+		return data + 2 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+	}
+
 	default:
-		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
-		return data;
+		SIMD_UNREACHABLE(); // unreachable
 	}
 }
 #endif
 
 #if defined(SIMD_SSE) || defined(SIMD_AVX)
 SIMD_TARGET
-static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3)
+inline void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3)
 {
 	__m128i t0 = _mm_unpacklo_epi8(x0, x1);
 	__m128i t1 = _mm_unpackhi_epi8(x0, x1);
@@ -867,17 +1285,33 @@ static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3)
 }
 
 SIMD_TARGET
-static __m128i unzigzag8(__m128i v)
+inline __m128i unzigzag8(__m128i v)
 {
 	__m128i xl = _mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi8(1)));
 	__m128i xr = _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(127));
 
 	return _mm_xor_si128(xl, xr);
 }
+
+SIMD_TARGET
+inline __m128i unzigzag16(__m128i v)
+{
+	// zigzag decode of each 16-bit lane: (v >> 1) ^ -(v & 1)
+	__m128i magnitude = _mm_srli_epi16(v, 1);
+	__m128i signmask = _mm_sub_epi16(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi16(1)));
+	return _mm_xor_si128(signmask, magnitude);
+}
+
+SIMD_TARGET
+inline __m128i rotate32(__m128i v, int r) // rotates each 32-bit lane of v left by r bits
+{
+	return _mm_or_si128(_mm_slli_epi32(v, r), _mm_srli_epi32(v, 32 - r)); // (v << r) | (v >> (32 - r)) per 32-bit lane
+}
 #endif
 
 #ifdef SIMD_NEON
-static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3)
+SIMD_TARGET
+inline void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3)
 {
 	uint8x16x2_t t01 = vzipq_u8(x0, x1);
 	uint8x16x2_t t23 = vzipq_u8(x2, x3);
@@ -891,18 +1325,64 @@ static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_
 	x3 = vreinterpretq_u8_u16(x23.val[1]);
 }
 
-static uint8x16_t unzigzag8(uint8x16_t v)
+SIMD_TARGET
+inline uint8x16_t unzigzag8(uint8x16_t v)
 {
 	uint8x16_t xl = vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(vandq_u8(v, vdupq_n_u8(1)))));
 	uint8x16_t xr = vshrq_n_u8(v, 1);
 
 	return veorq_u8(xl, xr);
 }
+
+SIMD_TARGET
+inline uint8x16_t unzigzag16(uint8x16_t v)
+{
+	// zigzag decode of each 16-bit lane: (v >> 1) ^ -(v & 1)
+	uint16x8_t lanes = vreinterpretq_u16_u8(v);
+	uint16x8_t magnitude = vshrq_n_u16(lanes, 1);
+	int16x8_t signmask = vnegq_s16(vreinterpretq_s16_u16(vandq_u16(lanes, vdupq_n_u16(1))));
+	return veorq_u8(vreinterpretq_u8_s16(signmask), vreinterpretq_u8_u16(magnitude));
+}
+
+SIMD_TARGET
+inline uint8x16_t rotate32(uint8x16_t v, int r) // rotates each 32-bit lane of v left by r bits
+{
+	uint32x4_t v32 = vreinterpretq_u32_u8(v); // view the 16 bytes as four 32-bit lanes
+	return vreinterpretq_u8_u32(vorrq_u32(vshlq_u32(v32, vdupq_n_s32(r)), vshlq_u32(v32, vdupq_n_s32(r - 32)))); // second vshlq_u32 gets a negative count (r - 32), which NEON treats as a right shift by 32 - r
+}
+
+template <int Channel>
+SIMD_TARGET inline uint8x8_t rebase(uint8x8_t npi, uint8x16_t r0, uint8x16_t r1, uint8x16_t r2, uint8x16_t r3) // folds r0..r3 into npi with the channel's combine op: 8-bit add (0), 16-bit add (1) or xor (2)
+{
+	switch (Channel)
+	{
+	case 0: // wrapping 8-bit additions
+	{
+		uint8x16_t rsum = vaddq_u8(vaddq_u8(r0, r1), vaddq_u8(r2, r3)); // lane-wise sum of the four registers
+		uint8x8_t rsumx = vadd_u8(vget_low_u8(rsum), vget_high_u8(rsum)); // fold the high 8 bytes onto the low 8 bytes
+		return vadd_u8(vadd_u8(npi, rsumx), vext_u8(rsumx, rsumx, 4)); // add rsumx and rsumx rotated by 4 bytes to npi
+	}
+	case 1: // wrapping 16-bit additions
+	{
+		uint16x8_t rsum = vaddq_u16(vaddq_u16(vreinterpretq_u16_u8(r0), vreinterpretq_u16_u8(r1)), vaddq_u16(vreinterpretq_u16_u8(r2), vreinterpretq_u16_u8(r3))); // lane-wise sum of the four registers
+		uint16x4_t rsumx = vadd_u16(vget_low_u16(rsum), vget_high_u16(rsum)); // fold the high half onto the low half
+		return vreinterpret_u8_u16(vadd_u16(vadd_u16(vreinterpret_u16_u8(npi), rsumx), vext_u16(rsumx, rsumx, 2))); // add rsumx and rsumx rotated by two 16-bit lanes (4 bytes) to npi
+	}
+	case 2: // xor
+	{
+		uint8x16_t rsum = veorq_u8(veorq_u8(r0, r1), veorq_u8(r2, r3)); // lane-wise xor of the four registers
+		uint8x8_t rsumx = veor_u8(vget_low_u8(rsum), vget_high_u8(rsum)); // fold the high 8 bytes onto the low 8 bytes
+		return veor_u8(veor_u8(npi, rsumx), vext_u8(rsumx, rsumx, 4)); // xor rsumx and rsumx rotated by 4 bytes into npi
+	}
+	default: // any other Channel value: npi is returned unchanged
+		return npi;
+	}
+}
 #endif
 
 #ifdef SIMD_WASM
 SIMD_TARGET
-static void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3)
+inline void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3)
 {
 	v128_t t0 = wasmx_unpacklo_v8x16(x0, x1);
 	v128_t t1 = wasmx_unpackhi_v8x16(x0, x1);
@@ -916,44 +1396,57 @@ static void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3)
 }
 
 SIMD_TARGET
-static v128_t unzigzag8(v128_t v)
+inline v128_t unzigzag8(v128_t v)
 {
 	v128_t xl = wasm_i8x16_neg(wasm_v128_and(v, wasm_i8x16_splat(1)));
 	v128_t xr = wasm_u8x16_shr(v, 1);
 
 	return wasm_v128_xor(xl, xr);
 }
+
+SIMD_TARGET
+inline v128_t unzigzag16(v128_t v)
+{
+	// zigzag decode of each 16-bit lane: (v >> 1) ^ -(v & 1)
+	v128_t magnitude = wasm_u16x8_shr(v, 1);
+	v128_t signmask = wasm_i16x8_neg(wasm_v128_and(v, wasm_i16x8_splat(1)));
+	return wasm_v128_xor(signmask, magnitude);
+}
+
+SIMD_TARGET
+inline v128_t rotate32(v128_t v, int r) // rotates each 32-bit lane of v left by r bits
+{
+	return wasm_v128_or(wasm_i32x4_shl(v, r), wasm_u32x4_shr(v, 32 - r)); // right shift must be logical (u32x4): the arithmetic i32x4 shift would smear the sign bit into the rotated value
+}
 #endif
 
 #if defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
 SIMD_TARGET
-static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
+static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size, int hshift)
 {
 	assert(buffer_size % kByteGroupSize == 0);
 	assert(kByteGroupSize == 16);
 
-	const unsigned char* header = data;
-
 	// round number of groups to 4 to get number of header bytes
 	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
-
 	if (size_t(data_end - data) < header_size)
 		return NULL;
 
+	const unsigned char* header = data;
 	data += header_size;
 
 	size_t i = 0;
 
-	// fast-path: process 4 groups at a time, do a shared bounds check - each group reads <=24b
+	// fast-path: process 4 groups at a time, do a shared bounds check
 	for (; i + kByteGroupSize * 4 <= buffer_size && size_t(data_end - data) >= kByteGroupDecodeLimit * 4; i += kByteGroupSize * 4)
 	{
 		size_t header_offset = i / kByteGroupSize;
 		unsigned char header_byte = header[header_offset / 4];
 
-		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, (header_byte >> 0) & 3);
-		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, (header_byte >> 2) & 3);
-		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, (header_byte >> 4) & 3);
-		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, (header_byte >> 6) & 3);
+		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, hshift + ((header_byte >> 0) & 3));
+		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, hshift + ((header_byte >> 2) & 3));
+		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, hshift + ((header_byte >> 4) & 3));
+		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, hshift + ((header_byte >> 6) & 3));
 	}
 
 	// slow-path: process remaining groups
@@ -963,17 +1456,102 @@ static const unsigned char* decodeBytesSimd(const unsigned char* data, const uns
 			return NULL;
 
 		size_t header_offset = i / kByteGroupSize;
+		unsigned char header_byte = header[header_offset / 4];
 
-		int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;
-
-		data = decodeBytesGroupSimd(data, buffer + i, bitslog2);
+		data = decodeBytesGroupSimd(data, buffer + i, hshift + ((header_byte >> ((header_offset % 4) * 2)) & 3));
 	}
 
 	return data;
 }
 
+template <int Channel>
+SIMD_TARGET static void
+decodeDeltas4Simd(const unsigned char* buffer, unsigned char* transposed, size_t vertex_count_aligned, size_t vertex_size, unsigned char last_vertex[4], int rot) // rebuilds 4-byte values from four byte planes in buffer and stores one every vertex_size bytes starting at transposed
+{
+#if defined(SIMD_SSE) || defined(SIMD_AVX)
+#define TEMP __m128i
+#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex))
+#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
+#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
+#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
+#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
+#endif
+
+#ifdef SIMD_NEON
+#define TEMP uint8x8_t
+#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast<uint32_t*>(last_vertex), vdup_n_u32(0), 0))
+#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned)
+#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
+#define FIXD(i) t##i = pi = Channel == 0 ? vadd_u8(pi, t##i) : (Channel == 1 ? vreinterpret_u8_u16(vadd_u16(vreinterpret_u16_u8(pi), vreinterpret_u16_u8(t##i))) : veor_u8(pi, t##i))
+#define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
+#endif
+
+#ifdef SIMD_WASM
+#define TEMP v128_t
+#define PREP() v128_t pi = wasm_v128_load(last_vertex)
+#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
+#define GRP4(i) t0 = r##i, t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
+#define FIXD(i) t##i = pi = Channel == 0 ? wasm_i8x16_add(pi, t##i) : (Channel == 1 ? wasm_i16x8_add(pi, t##i) : wasm_v128_xor(pi, t##i))
+#define SAVE(i) wasm_v128_store32_lane(savep, t##i, 0), savep += vertex_size
+#endif
+
+	// UNZR undoes the per-channel value transform: 8-bit unzigzag (0), 16-bit unzigzag (1), otherwise a 32-bit rotate by rot
+#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
+
+	PREP(); // pi is the running previous value, seeded from the bytes at last_vertex
+
+	unsigned char* savep = transposed; // output cursor; every SAVE stores 32 bits and advances it by vertex_size
+
+	for (size_t j = 0; j < vertex_count_aligned; j += 16) // 16 bytes of each plane (16 stored values) per iteration
+	{
+		LOAD(0); // plane i is read from buffer + j + i * vertex_count_aligned
+		LOAD(1);
+		LOAD(2);
+		LOAD(3);
+
+		transpose8(r0, r1, r2, r3); // presumably interleaves the planes so each 32-bit lane holds one value's 4 bytes - TODO confirm against transpose8
+
+		TEMP t0, t1, t2, t3;
+		TEMP npi = pi; // value of pi at loop entry; only consumed by the rebase path below
+
+		// for each register: undo the value transform, split into four 32-bit values, accumulate each into pi, store
+		UNZR(0);
+		GRP4(0);
+		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
+		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
+
+		UNZR(1);
+		GRP4(1);
+		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
+		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
+
+		UNZR(2);
+		GRP4(2);
+		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
+		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
+
+		UNZR(3);
+		GRP4(3);
+		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
+		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
+
+#if defined(SIMD_LATENCYOPT) && defined(SIMD_NEON) && (defined(__APPLE__) || defined(_WIN32))
+		// instead of relying on accumulated pi, recompute it from scratch from r0..r3; this shortens dependency between loop iterations
+		pi = rebase<Channel>(npi, r0, r1, r2, r3);
+#else
+		(void)npi; // npi is unused on this path; the cast silences the unused-variable warning
+#endif
+
+#undef UNZR
+#undef TEMP
+#undef PREP
+#undef LOAD
+#undef GRP4
+#undef FIXD
+#undef SAVE
+	}
+}
+
 SIMD_TARGET
-static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
+static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256], const unsigned char* channels, int version)
 {
 	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
 
@@ -982,84 +1560,61 @@ static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, con
 
 	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
 
+	size_t control_size = version == 0 ? 0 : vertex_size / 4;
+	if (size_t(data_end - data) < control_size)
+		return NULL;
+
+	const unsigned char* control = data;
+	data += control_size;
+
 	for (size_t k = 0; k < vertex_size; k += 4)
 	{
+		unsigned char ctrl_byte = version == 0 ? 0 : control[k / 4];
+
 		for (size_t j = 0; j < 4; ++j)
 		{
-			data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned);
-			if (!data)
-				return NULL;
+			int ctrl = (ctrl_byte >> (j * 2)) & 3;
+
+			if (ctrl == 3)
+			{
+				// literal encoding; safe to over-copy due to tail
+				if (size_t(data_end - data) < vertex_count_aligned)
+					return NULL;
+
+				memcpy(buffer + j * vertex_count_aligned, data, vertex_count_aligned);
+				data += vertex_count;
+			}
+			else if (ctrl == 2)
+			{
+				// zero encoding
+				memset(buffer + j * vertex_count_aligned, 0, vertex_count_aligned);
+			}
+			else
+			{
+				// for v0, headers are mapped to 0..3; for v1, headers are mapped to 4..8
+				int hshift = version == 0 ? 0 : 4 + ctrl;
+
+				data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned, hshift);
+				if (!data)
+					return NULL;
+			}
 		}
 
-#if defined(SIMD_SSE) || defined(SIMD_AVX)
-#define TEMP __m128i
-#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex + k))
-#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
-#define GRP4(i) t0 = _mm_shuffle_epi32(r##i, 0), t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
-#define FIXD(i) t##i = pi = _mm_add_epi8(pi, t##i)
-#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
-#endif
+		int channel = version == 0 ? 0 : channels[k / 4];
 
-#ifdef SIMD_NEON
-#define TEMP uint8x8_t
-#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast<uint32_t*>(last_vertex + k), vdup_n_u32(0), 0))
-#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned)
-#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
-#define FIXD(i) t##i = pi = vadd_u8(pi, t##i)
-#define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
-#endif
-
-#ifdef SIMD_WASM
-#define TEMP v128_t
-#define PREP() v128_t pi = wasm_v128_load(last_vertex + k)
-#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
-#define GRP4(i) t0 = wasmx_splat_v32x4(r##i, 0), t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
-#define FIXD(i) t##i = pi = wasm_i8x16_add(pi, t##i)
-#define SAVE(i) *reinterpret_cast<int*>(savep) = wasm_i32x4_extract_lane(t##i, 0), savep += vertex_size
-#endif
-
-		PREP();
-
-		unsigned char* savep = transposed + k;
-
-		for (size_t j = 0; j < vertex_count_aligned; j += 16)
+		switch (channel & 3)
 		{
-			LOAD(0);
-			LOAD(1);
-			LOAD(2);
-			LOAD(3);
-
-			r0 = unzigzag8(r0);
-			r1 = unzigzag8(r1);
-			r2 = unzigzag8(r2);
-			r3 = unzigzag8(r3);
-
-			transpose8(r0, r1, r2, r3);
-
-			TEMP t0, t1, t2, t3;
-
-			GRP4(0);
-			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
-			SAVE(0), SAVE(1), SAVE(2), SAVE(3);
-
-			GRP4(1);
-			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
-			SAVE(0), SAVE(1), SAVE(2), SAVE(3);
-
-			GRP4(2);
-			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
-			SAVE(0), SAVE(1), SAVE(2), SAVE(3);
-
-			GRP4(3);
-			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
-			SAVE(0), SAVE(1), SAVE(2), SAVE(3);
-
-#undef TEMP
-#undef PREP
-#undef LOAD
-#undef GRP4
-#undef FIXD
-#undef SAVE
+		case 0:
+			decodeDeltas4Simd<0>(buffer, transposed + k, vertex_count_aligned, vertex_size, last_vertex + k, 0);
+			break;
+		case 1:
+			decodeDeltas4Simd<1>(buffer, transposed + k, vertex_count_aligned, vertex_size, last_vertex + k, 0);
+			break;
+		case 2:
+			decodeDeltas4Simd<2>(buffer, transposed + k, vertex_count_aligned, vertex_size, last_vertex + k, (32 - (channel >> 4)) & 31);
+			break;
+		default:
+			return NULL; // invalid channel type
 		}
 	}
 
@@ -1088,23 +1643,29 @@ static unsigned int cpuid = getCpuFeatures();
 
 } // namespace meshopt
 
-size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size)
+size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level, int version)
 {
 	using namespace meshopt;
 
 	assert(vertex_size > 0 && vertex_size <= 256);
 	assert(vertex_size % 4 == 0);
+	assert(level >= 0 && level <= 9); // only a subset of this range is used right now
+	assert(version < 0 || unsigned(version) <= kDecodeVertexVersion);
+
+	version = version < 0 ? gEncodeVertexVersion : version;
+
+#if TRACE
+	memset(vertexstats, 0, sizeof(vertexstats));
+#endif
 
 	const unsigned char* vertex_data = static_cast<const unsigned char*>(vertices);
 
 	unsigned char* data = buffer;
 	unsigned char* data_end = buffer + buffer_size;
 
-	if (size_t(data_end - data) < 1 + vertex_size)
+	if (size_t(data_end - data) < 1)
 		return 0;
 
-	int version = gEncodeVertexVersion;
-
 	*data++ = (unsigned char)(kVertexHeader | version);
 
 	unsigned char first_vertex[256] = {};
@@ -1116,40 +1677,110 @@ size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, con
 
 	size_t vertex_block_size = getVertexBlockSize(vertex_size);
 
+	unsigned char channels[64] = {};
+	if (version != 0 && level > 1 && vertex_count > 1)
+		for (size_t k = 0; k < vertex_size; k += 4)
+		{
+			int rot = level >= 3 ? estimateRotate(vertex_data, vertex_count, vertex_size, k, /* group_size= */ 16) : 0;
+			int channel = estimateChannel(vertex_data, vertex_count, vertex_size, k, vertex_block_size, /* block_skip= */ 3, /* max_channels= */ level >= 3 ? 3 : 2, rot);
+
+			assert(unsigned(channel) < 2 || ((channel & 3) == 2 && unsigned(channel >> 4) < 8));
+			channels[k / 4] = (unsigned char)channel;
+		}
+
 	size_t vertex_offset = 0;
 
 	while (vertex_offset < vertex_count)
 	{
 		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;
 
-		data = encodeVertexBlock(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
+		data = encodeVertexBlock(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex, channels, version, level);
 		if (!data)
 			return 0;
 
 		vertex_offset += block_size;
 	}
 
-	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;
+	size_t tail_size = vertex_size + (version == 0 ? 0 : vertex_size / 4);
+	size_t tail_size_min = version == 0 ? kTailMinSizeV0 : kTailMinSizeV1;
+	size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size;
 
-	if (size_t(data_end - data) < tail_size)
+	if (size_t(data_end - data) < tail_size_pad)
 		return 0;
 
-	// write first vertex to the end of the stream and pad it to 32 bytes; this is important to simplify bounds checks in decoder
-	if (vertex_size < kTailMaxSize)
+	if (tail_size < tail_size_pad)
 	{
-		memset(data, 0, kTailMaxSize - vertex_size);
-		data += kTailMaxSize - vertex_size;
+		memset(data, 0, tail_size_pad - tail_size);
+		data += tail_size_pad - tail_size;
 	}
 
 	memcpy(data, first_vertex, vertex_size);
 	data += vertex_size;
 
+	if (version != 0)
+	{
+		memcpy(data, channels, vertex_size / 4);
+		data += vertex_size / 4;
+	}
+
 	assert(data >= buffer + tail_size);
 	assert(data <= buffer + buffer_size);
 
+#if TRACE
+	size_t total_size = data - buffer;
+
+	for (size_t k = 0; k < vertex_size; ++k)
+	{
+		const Stats& vsk = vertexstats[k];
+
+		printf("%2d: %7d bytes [%4.1f%%] %.1f bpv", int(k), int(vsk.size), double(vsk.size) / double(total_size) * 100, double(vsk.size) / double(vertex_count) * 8);
+
+		size_t total_k = vsk.header + vsk.bitg[1] + vsk.bitg[2] + vsk.bitg[4] + vsk.bitg[8];
+		double total_kr = total_k ? 1.0 / double(total_k) : 0;
+
+		if (version != 0)
+		{
+			int channel = channels[k / 4];
+
+			if ((channel & 3) == 2 && k % 4 == 0)
+				printf(" | ^%d", channel >> 4);
+			else
+				printf(" | %2s", channel == 0 ? "1" : (channel == 1 && k % 2 == 0 ? "2" : "."));
+		}
+
+		printf(" | hdr [%5.1f%%] bitg [1 %4.1f%% 2 %4.1f%% 4 %4.1f%% 8 %4.1f%%]",
+		    double(vsk.header) * total_kr * 100,
+		    double(vsk.bitg[1]) * total_kr * 100, double(vsk.bitg[2]) * total_kr * 100,
+		    double(vsk.bitg[4]) * total_kr * 100, double(vsk.bitg[8]) * total_kr * 100);
+
+		size_t total_ctrl = vsk.ctrl[0] + vsk.ctrl[1] + vsk.ctrl[2] + vsk.ctrl[3];
+
+		if (total_ctrl)
+		{
+			printf(" | ctrl %3.0f%% %3.0f%% %3.0f%% %3.0f%%",
+			    double(vsk.ctrl[0]) / double(total_ctrl) * 100, double(vsk.ctrl[1]) / double(total_ctrl) * 100,
+			    double(vsk.ctrl[2]) / double(total_ctrl) * 100, double(vsk.ctrl[3]) / double(total_ctrl) * 100);
+		}
+
+		if (level >= 3)
+			printf(" | bitc [%3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%%]",
+			    double(vsk.bitc[0]) / double(vertex_count) * 100, double(vsk.bitc[1]) / double(vertex_count) * 100,
+			    double(vsk.bitc[2]) / double(vertex_count) * 100, double(vsk.bitc[3]) / double(vertex_count) * 100,
+			    double(vsk.bitc[4]) / double(vertex_count) * 100, double(vsk.bitc[5]) / double(vertex_count) * 100,
+			    double(vsk.bitc[6]) / double(vertex_count) * 100, double(vsk.bitc[7]) / double(vertex_count) * 100);
+
+		printf("\n");
+	}
+#endif
+
 	return data - buffer;
 }
 
+size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size)
+{
+	return meshopt_encodeVertexBufferLevel(buffer, buffer_size, vertices, vertex_count, vertex_size, meshopt::kEncodeDefaultLevel, meshopt::gEncodeVertexVersion);
+}
+
 size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size)
 {
 	using namespace meshopt;
@@ -1160,21 +1791,42 @@ size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size)
 	size_t vertex_block_size = getVertexBlockSize(vertex_size);
 	size_t vertex_block_count = (vertex_count + vertex_block_size - 1) / vertex_block_size;
 
+	size_t vertex_block_control_size = vertex_size / 4;
 	size_t vertex_block_header_size = (vertex_block_size / kByteGroupSize + 3) / 4;
 	size_t vertex_block_data_size = vertex_block_size;
 
-	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;
+	size_t tail_size = vertex_size + (vertex_size / 4);
+	size_t tail_size_min = kTailMinSizeV0 > kTailMinSizeV1 ? kTailMinSizeV0 : kTailMinSizeV1;
+	size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size;
+	assert(tail_size_pad >= kByteGroupDecodeLimit);
 
-	return 1 + vertex_block_count * vertex_size * (vertex_block_header_size + vertex_block_data_size) + tail_size;
+	return 1 + vertex_block_count * vertex_size * (vertex_block_control_size + vertex_block_header_size + vertex_block_data_size) + tail_size_pad;
 }
 
 void meshopt_encodeVertexVersion(int version)
 {
-	assert(unsigned(version) <= 0);
+	assert(unsigned(version) <= unsigned(meshopt::kDecodeVertexVersion));
 
 	meshopt::gEncodeVertexVersion = version;
 }
 
+int meshopt_decodeVertexVersion(const unsigned char* buffer, size_t buffer_size)
+{
+	if (buffer_size < 1)
+		return -1;
+
+	unsigned char header = buffer[0];
+
+	if ((header & 0xf0) != meshopt::kVertexHeader)
+		return -1;
+
+	int version = header & 0x0f;
+	if (version > meshopt::kDecodeVertexVersion)
+		return -1;
+
+	return version;
+}
+
 int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size)
 {
 	using namespace meshopt;
@@ -1182,7 +1834,7 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve
 	assert(vertex_size > 0 && vertex_size <= 256);
 	assert(vertex_size % 4 == 0);
 
-	const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256]) = NULL;
+	const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256], const unsigned char*, int) = NULL;
 
 #if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
 	decode = (cpuid & (1 << 9)) ? decodeVertexBlockSimd : decodeVertexBlock;
@@ -1202,7 +1854,7 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve
 	const unsigned char* data = buffer;
 	const unsigned char* data_end = buffer + buffer_size;
 
-	if (size_t(data_end - data) < 1 + vertex_size)
+	if (size_t(data_end - data) < 1)
 		return -2;
 
 	unsigned char data_header = *data++;
@@ -1211,11 +1863,22 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve
 		return -1;
 
 	int version = data_header & 0x0f;
-	if (version > 0)
+	if (version > kDecodeVertexVersion)
 		return -1;
 
+	size_t tail_size = vertex_size + (version == 0 ? 0 : vertex_size / 4);
+	size_t tail_size_min = version == 0 ? kTailMinSizeV0 : kTailMinSizeV1;
+	size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size;
+
+	if (size_t(data_end - data) < tail_size_pad)
+		return -2;
+
+	const unsigned char* tail = data_end - tail_size;
+
 	unsigned char last_vertex[256];
-	memcpy(last_vertex, data_end - vertex_size, vertex_size);
+	memcpy(last_vertex, tail, vertex_size);
+
+	const unsigned char* channels = version == 0 ? NULL : tail + vertex_size;
 
 	size_t vertex_block_size = getVertexBlockSize(vertex_size);
 
@@ -1225,16 +1888,14 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve
 	{
 		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;
 
-		data = decode(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
+		data = decode(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex, channels, version);
 		if (!data)
 			return -2;
 
 		vertex_offset += block_size;
 	}
 
-	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;
-
-	if (size_t(data_end - data) != tail_size)
+	if (size_t(data_end - data) != tail_size_pad)
 		return -3;
 
 	return 0;
@@ -1246,3 +1907,4 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve
 #undef SIMD_WASM
 #undef SIMD_FALLBACK
 #undef SIMD_TARGET
+#undef SIMD_LATENCYOPT
diff --git a/Source/ThirdParty/meshoptimizer/vertexfilter.cpp b/Source/ThirdParty/meshoptimizer/vertexfilter.cpp
index 4b5f444f0..3fd836083 100644
--- a/Source/ThirdParty/meshoptimizer/vertexfilter.cpp
+++ b/Source/ThirdParty/meshoptimizer/vertexfilter.cpp
@@ -109,28 +109,33 @@ static void decodeFilterOct(T* data, size_t count)
 
 static void decodeFilterQuat(short* data, size_t count)
 {
-	const float scale = 1.f / sqrtf(2.f);
+	const float scale = 32767.f / sqrtf(2.f);
 
 	for (size_t i = 0; i < count; ++i)
 	{
 		// recover scale from the high byte of the component
 		int sf = data[i * 4 + 3] | 3;
-		float ss = scale / float(sf);
+		float s = float(sf);
 
-		// convert x/y/z to [-1..1] (scaled...)
-		float x = float(data[i * 4 + 0]) * ss;
-		float y = float(data[i * 4 + 1]) * ss;
-		float z = float(data[i * 4 + 2]) * ss;
+		// convert x/y/z to floating point (unscaled! implied scale of 1/sqrt(2.f) * 1/sf)
+		float x = float(data[i * 4 + 0]);
+		float y = float(data[i * 4 + 1]);
+		float z = float(data[i * 4 + 2]);
 
-		// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
-		float ww = 1.f - x * x - y * y - z * z;
+		// reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors
+		float ws = s * s;
+		float ww = ws * 2.f - x * x - y * y - z * z;
 		float w = sqrtf(ww >= 0.f ? ww : 0.f);
 
+		// compute final scale; note that all computations above are unscaled
+		// we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range
+		float ss = scale / s;
+
 		// rounded signed float->int
-		int xf = int(x * 32767.f + (x >= 0.f ? 0.5f : -0.5f));
-		int yf = int(y * 32767.f + (y >= 0.f ? 0.5f : -0.5f));
-		int zf = int(z * 32767.f + (z >= 0.f ? 0.5f : -0.5f));
-		int wf = int(w * 32767.f + 0.5f);
+		int xf = int(x * ss + (x >= 0.f ? 0.5f : -0.5f));
+		int yf = int(y * ss + (y >= 0.f ? 0.5f : -0.5f));
+		int zf = int(z * ss + (z >= 0.f ? 0.5f : -0.5f));
+		int wf = int(w * ss + 0.5f);
 
 		int qc = data[i * 4 + 3] & 3;
 
@@ -165,6 +170,47 @@ static void decodeFilterExp(unsigned int* data, size_t count)
 		data[i] = u.ui;
 	}
 }
+
+template <typename ST, typename T>
+static void decodeFilterColor(T* data, size_t count)
+{
+	const float max = float((1 << (sizeof(T) * 8)) - 1);
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		// recover scale from alpha high bit
+		int as = data[i * 4 + 3];
+		as |= as >> 1;
+		as |= as >> 2;
+		as |= as >> 4;
+		as |= as >> 8; // noop for 8-bit
+
+		// convert to RGB in fixed point (co/cg are sign extended)
+		int y = data[i * 4 + 0], co = ST(data[i * 4 + 1]), cg = ST(data[i * 4 + 2]);
+
+		int r = y + co - cg;
+		int g = y + cg;
+		int b = y - co - cg;
+
+		// expand alpha by one bit to match other components
+		int a = data[i * 4 + 3];
+		a = ((a << 1) & as) | (a & 1);
+
+		// compute scaling factor
+		float ss = max / float(as);
+
+		// rounded float->int
+		int rf = int(float(r) * ss + 0.5f);
+		int gf = int(float(g) * ss + 0.5f);
+		int bf = int(float(b) * ss + 0.5f);
+		int af = int(float(a) * ss + 0.5f);
+
+		data[i * 4 + 0] = T(rf);
+		data[i * 4 + 1] = T(gf);
+		data[i * 4 + 2] = T(bf);
+		data[i * 4 + 3] = T(af);
+	}
+}
 #endif
 
 #if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
@@ -201,7 +247,7 @@ inline uint64_t rotateleft64(uint64_t v, int x)
 #endif
 
 #ifdef SIMD_SSE
-static void decodeFilterOctSimd(signed char* data, size_t count)
+static void decodeFilterOctSimd8(signed char* data, size_t count)
 {
 	const __m128 sign = _mm_set1_ps(-0.f);
 
@@ -246,7 +292,7 @@ static void decodeFilterOctSimd(signed char* data, size_t count)
 	}
 }
 
-static void decodeFilterOctSimd(short* data, size_t count)
+static void decodeFilterOctSimd16(short* data, size_t count)
 {
 	const __m128 sign = _mm_set1_ps(-0.f);
 
@@ -295,8 +341,9 @@ static void decodeFilterOctSimd(short* data, size_t count)
 		__m128i res_1 = _mm_unpackhi_epi16(xzr, y0r);
 
 		// patch in .w
-		res_0 = _mm_or_si128(res_0, _mm_and_si128(_mm_castps_si128(n4_0), _mm_set1_epi64x(0xffff000000000000)));
-		res_1 = _mm_or_si128(res_1, _mm_and_si128(_mm_castps_si128(n4_1), _mm_set1_epi64x(0xffff000000000000)));
+		__m128i maskw = _mm_set_epi32(0xffff0000, 0, 0xffff0000, 0);
+		res_0 = _mm_or_si128(res_0, _mm_and_si128(_mm_castps_si128(n4_0), maskw));
+		res_1 = _mm_or_si128(res_1, _mm_and_si128(_mm_castps_si128(n4_1), maskw));
 
 		_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 0) * 4]), res_0);
 		_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 2) * 4]), res_1);
@@ -305,7 +352,7 @@ static void decodeFilterOctSimd(short* data, size_t count)
 
 static void decodeFilterQuatSimd(short* data, size_t count)
 {
-	const float scale = 1.f / sqrtf(2.f);
+	const float scale = 32767.f / sqrtf(2.f);
 
 	for (size_t i = 0; i < count; i += 4)
 	{
@@ -324,24 +371,27 @@ static void decodeFilterQuatSimd(short* data, size_t count)
 
 		// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
 		__m128i sf = _mm_or_si128(cf, _mm_set1_epi32(3));
-		__m128 ss = _mm_div_ps(_mm_set1_ps(scale), _mm_cvtepi32_ps(sf));
+		__m128 s = _mm_cvtepi32_ps(sf);
 
-		// convert x/y/z to [-1..1] (scaled...)
-		__m128 x = _mm_mul_ps(_mm_cvtepi32_ps(xf), ss);
-		__m128 y = _mm_mul_ps(_mm_cvtepi32_ps(yf), ss);
-		__m128 z = _mm_mul_ps(_mm_cvtepi32_ps(zf), ss);
+		// convert x/y/z to floating point (unscaled! implied scale of 1/sqrt(2.f) * 1/sf)
+		__m128 x = _mm_cvtepi32_ps(xf);
+		__m128 y = _mm_cvtepi32_ps(yf);
+		__m128 z = _mm_cvtepi32_ps(zf);
 
-		// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
-		__m128 ww = _mm_sub_ps(_mm_set1_ps(1.f), _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z))));
+		// reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors
+		__m128 ws = _mm_mul_ps(s, _mm_add_ps(s, s)); // s*2s instead of 2*(s*s) to work around clang bug with integer multiplication
+		__m128 ww = _mm_sub_ps(ws, _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z))));
 		__m128 w = _mm_sqrt_ps(_mm_max_ps(ww, _mm_setzero_ps()));
 
-		__m128 s = _mm_set1_ps(32767.f);
+		// compute final scale; note that all computations above are unscaled
+		// we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range
+		__m128 ss = _mm_div_ps(_mm_set1_ps(scale), s);
 
 		// rounded signed float->int
-		__m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s));
-		__m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s));
-		__m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s));
-		__m128i wr = _mm_cvtps_epi32(_mm_mul_ps(w, s));
+		__m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, ss));
+		__m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, ss));
+		__m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, ss));
+		__m128i wr = _mm_cvtps_epi32(_mm_mul_ps(w, ss));
 
 		// mix x/z and w/y to make 16-bit unpack easier
 		__m128i xzr = _mm_or_si128(_mm_and_si128(xr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(zr, 16));
@@ -385,6 +435,105 @@ static void decodeFilterExpSimd(unsigned int* data, size_t count)
 		_mm_storeu_ps(reinterpret_cast<float*>(&data[i]), r);
 	}
 }
+
+static void decodeFilterColorSimd8(unsigned char* data, size_t count)
+{
+	for (size_t i = 0; i < count; i += 4)
+	{
+		__m128i c4 = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[i * 4]));
+
+		// unpack y/co/cg/a (co/cg are sign extended with arithmetic shifts)
+		__m128i yf = _mm_and_si128(c4, _mm_set1_epi32(0xff));
+		__m128i cof = _mm_srai_epi32(_mm_slli_epi32(c4, 16), 24);
+		__m128i cgf = _mm_srai_epi32(_mm_slli_epi32(c4, 8), 24);
+		__m128i af = _mm_srli_epi32(c4, 24);
+
+		// recover scale from alpha high bit
+		__m128i as = af;
+		as = _mm_or_si128(as, _mm_srli_epi32(as, 1));
+		as = _mm_or_si128(as, _mm_srli_epi32(as, 2));
+		as = _mm_or_si128(as, _mm_srli_epi32(as, 4));
+
+		// expand alpha by one bit to match other components
+		af = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(af, 1), as), _mm_and_si128(af, _mm_set1_epi32(1)));
+
+		// compute scaling factor
+		__m128 ss = _mm_mul_ps(_mm_set1_ps(255.f), _mm_rcp_ps(_mm_cvtepi32_ps(as)));
+
+		// convert to RGB in fixed point
+		__m128i rf = _mm_add_epi32(yf, _mm_sub_epi32(cof, cgf));
+		__m128i gf = _mm_add_epi32(yf, cgf);
+		__m128i bf = _mm_sub_epi32(yf, _mm_add_epi32(cof, cgf));
+
+		// rounded signed float->int
+		__m128i rr = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(rf), ss));
+		__m128i gr = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(gf), ss));
+		__m128i br = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(bf), ss));
+		__m128i ar = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(af), ss));
+
+		// repack rgba into final value
+		__m128i res = rr;
+		res = _mm_or_si128(res, _mm_slli_epi32(gr, 8));
+		res = _mm_or_si128(res, _mm_slli_epi32(br, 16));
+		res = _mm_or_si128(res, _mm_slli_epi32(ar, 24));
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[i * 4]), res);
+	}
+}
+
+static void decodeFilterColorSimd16(unsigned short* data, size_t count)
+{
+	for (size_t i = 0; i < count; i += 4)
+	{
+		__m128i c4_0 = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[(i + 0) * 4]));
+		__m128i c4_1 = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[(i + 2) * 4]));
+
+		// gather both y/co 16-bit pairs in each 32-bit lane
+		__m128i c4_yco = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(c4_0), _mm_castsi128_ps(c4_1), _MM_SHUFFLE(2, 0, 2, 0)));
+		__m128i c4_cga = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(c4_0), _mm_castsi128_ps(c4_1), _MM_SHUFFLE(3, 1, 3, 1)));
+
+		// unpack y/co/cg/a components (co/cg are sign extended with arithmetic shifts)
+		__m128i yf = _mm_and_si128(c4_yco, _mm_set1_epi32(0xffff));
+		__m128i cof = _mm_srai_epi32(c4_yco, 16);
+		__m128i cgf = _mm_srai_epi32(_mm_slli_epi32(c4_cga, 16), 16);
+		__m128i af = _mm_srli_epi32(c4_cga, 16);
+
+		// recover scale from alpha high bit
+		__m128i as = af;
+		as = _mm_or_si128(as, _mm_srli_epi32(as, 1));
+		as = _mm_or_si128(as, _mm_srli_epi32(as, 2));
+		as = _mm_or_si128(as, _mm_srli_epi32(as, 4));
+		as = _mm_or_si128(as, _mm_srli_epi32(as, 8));
+
+		// expand alpha by one bit to match other components
+		af = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(af, 1), as), _mm_and_si128(af, _mm_set1_epi32(1)));
+
+		// compute scaling factor
+		__m128 ss = _mm_div_ps(_mm_set1_ps(65535.f), _mm_cvtepi32_ps(as));
+
+		// convert to RGB in fixed point
+		__m128i rf = _mm_add_epi32(yf, _mm_sub_epi32(cof, cgf));
+		__m128i gf = _mm_add_epi32(yf, cgf);
+		__m128i bf = _mm_sub_epi32(yf, _mm_add_epi32(cof, cgf));
+
+		// rounded signed float->int
+		__m128i rr = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(rf), ss));
+		__m128i gr = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(gf), ss));
+		__m128i br = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(bf), ss));
+		__m128i ar = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(af), ss));
+
+		// mix r/b and g/a to make 16-bit unpack easier
+		__m128i rbr = _mm_or_si128(_mm_and_si128(rr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(br, 16));
+		__m128i gar = _mm_or_si128(_mm_and_si128(gr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(ar, 16));
+
+		// pack r/g/b/a using 16-bit unpacks
+		__m128i res_0 = _mm_unpacklo_epi16(rbr, gar);
+		__m128i res_1 = _mm_unpackhi_epi16(rbr, gar);
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 0) * 4]), res_0);
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 2) * 4]), res_1);
+	}
+}
 #endif
 
 #if defined(SIMD_NEON) && !defined(__aarch64__) && !defined(_M_ARM64)
@@ -401,10 +550,17 @@ inline float32x4_t vdivq_f32(float32x4_t x, float32x4_t y)
 	r = vmulq_f32(r, vrecpsq_f32(y, r)); // refine rcp estimate
 	return vmulq_f32(x, r);
 }
+
+#ifndef __ARM_FEATURE_FMA
+inline float32x4_t vfmaq_f32(float32x4_t x, float32x4_t y, float32x4_t z)
+{
+	return vaddq_f32(x, vmulq_f32(y, z));
+}
+#endif
 #endif
 
 #ifdef SIMD_NEON
-static void decodeFilterOctSimd(signed char* data, size_t count)
+static void decodeFilterOctSimd8(signed char* data, size_t count)
 {
 	const int32x4_t sign = vdupq_n_s32(0x80000000);
 
@@ -431,29 +587,27 @@ static void decodeFilterOctSimd(signed char* data, size_t count)
 		y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));
 
 		// compute normal length & scale
-		float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
+		float32x4_t ll = vfmaq_f32(vfmaq_f32(vmulq_f32(x, x), y, y), z, z);
 		float32x4_t rl = vrsqrteq_f32(ll);
 		float32x4_t s = vmulq_f32(vdupq_n_f32(127.f), rl);
 
 		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
-		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		// note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
 		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
 
-		int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
-		int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
-		int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
+		int32x4_t xr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, x, s));
+		int32x4_t yr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, y, s));
+		int32x4_t zr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, z, s));
 
 		// combine xr/yr/zr into final value
-		int32x4_t res = vandq_s32(n4, vdupq_n_s32(0xff000000));
-		res = vorrq_s32(res, vandq_s32(xr, vdupq_n_s32(0xff)));
-		res = vorrq_s32(res, vshlq_n_s32(vandq_s32(yr, vdupq_n_s32(0xff)), 8));
-		res = vorrq_s32(res, vshlq_n_s32(vandq_s32(zr, vdupq_n_s32(0xff)), 16));
+		int32x4_t res = vsliq_n_s32(xr, vsliq_n_s32(yr, zr, 8), 8);
+		res = vbslq_s32(vdupq_n_u32(0xff000000), n4, res);
 
 		vst1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]), res);
 	}
 }
 
-static void decodeFilterOctSimd(short* data, size_t count)
+static void decodeFilterOctSimd16(short* data, size_t count)
 {
 	const int32x4_t sign = vdupq_n_s32(0x80000000);
 
@@ -485,21 +639,25 @@ static void decodeFilterOctSimd(short* data, size_t count)
 		y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));
 
 		// compute normal length & scale
-		float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
+		float32x4_t ll = vfmaq_f32(vfmaq_f32(vmulq_f32(x, x), y, y), z, z);
+#if !defined(__aarch64__) && !defined(_M_ARM64)
 		float32x4_t rl = vrsqrteq_f32(ll);
 		rl = vmulq_f32(rl, vrsqrtsq_f32(vmulq_f32(rl, ll), rl)); // refine rsqrt estimate
 		float32x4_t s = vmulq_f32(vdupq_n_f32(32767.f), rl);
+#else
+		float32x4_t s = vdivq_f32(vdupq_n_f32(32767.f), vsqrtq_f32(ll));
+#endif
 
 		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
 		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
 		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
 
-		int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
-		int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
-		int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
+		int32x4_t xr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, x, s));
+		int32x4_t yr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, y, s));
+		int32x4_t zr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, z, s));
 
 		// mix x/z and y/0 to make 16-bit unpack easier
-		int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
+		int32x4_t xzr = vsliq_n_s32(xr, zr, 16);
 		int32x4_t y0r = vandq_s32(yr, vdupq_n_s32(0xffff));
 
 		// pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
@@ -517,7 +675,7 @@ static void decodeFilterOctSimd(short* data, size_t count)
 
 static void decodeFilterQuatSimd(short* data, size_t count)
 {
-	const float scale = 1.f / sqrtf(2.f);
+	const float scale = 32767.f / sqrtf(2.f);
 
 	for (size_t i = 0; i < count; i += 4)
 	{
@@ -536,43 +694,52 @@ static void decodeFilterQuatSimd(short* data, size_t count)
 
 		// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
 		int32x4_t sf = vorrq_s32(cf, vdupq_n_s32(3));
-		float32x4_t ss = vdivq_f32(vdupq_n_f32(scale), vcvtq_f32_s32(sf));
+		float32x4_t s = vcvtq_f32_s32(sf);
 
-		// convert x/y/z to [-1..1] (scaled...)
-		float32x4_t x = vmulq_f32(vcvtq_f32_s32(xf), ss);
-		float32x4_t y = vmulq_f32(vcvtq_f32_s32(yf), ss);
-		float32x4_t z = vmulq_f32(vcvtq_f32_s32(zf), ss);
+		// convert x/y/z to floating point (unscaled! implied scale of 1/sqrt(2.f) * 1/sf)
+		float32x4_t x = vcvtq_f32_s32(xf);
+		float32x4_t y = vcvtq_f32_s32(yf);
+		float32x4_t z = vcvtq_f32_s32(zf);
 
-		// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
-		float32x4_t ww = vsubq_f32(vdupq_n_f32(1.f), vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z))));
+		// reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors
+		float32x4_t ws = vmulq_f32(s, s);
+		float32x4_t ww = vsubq_f32(vaddq_f32(ws, ws), vfmaq_f32(vfmaq_f32(vmulq_f32(x, x), y, y), z, z));
 		float32x4_t w = vsqrtq_f32(vmaxq_f32(ww, vdupq_n_f32(0.f)));
 
-		float32x4_t s = vdupq_n_f32(32767.f);
+		// compute final scale; note that all computations above are unscaled
+		// we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range
+		float32x4_t ss = vdivq_f32(vdupq_n_f32(scale), s);
 
 		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
 		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
 		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
 
-		int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
-		int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
-		int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
-		int32x4_t wr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(w, s), fsnap));
+		int32x4_t xr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, x, ss));
+		int32x4_t yr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, y, ss));
+		int32x4_t zr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, z, ss));
+		int32x4_t wr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, w, ss));
 
 		// mix x/z and w/y to make 16-bit unpack easier
-		int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
-		int32x4_t wyr = vorrq_s32(vandq_s32(wr, vdupq_n_s32(0xffff)), vshlq_n_s32(yr, 16));
+		int32x4_t xzr = vsliq_n_s32(xr, zr, 16);
+		int32x4_t wyr = vsliq_n_s32(wr, yr, 16);
 
 		// pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
-		int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[0]);
-		int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[1]);
+		uint64x2_t res_0 = vreinterpretq_u64_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[0]);
+		uint64x2_t res_1 = vreinterpretq_u64_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[1]);
+
+		// store results to stack so that we can rotate using scalar instructions
+		// TODO: volatile works around LLVM mis-optimizing code; https://github.com/llvm/llvm-project/issues/166808
+		volatile uint64_t res[4];
+		vst1q_u64(const_cast<uint64_t*>(&res[0]), res_0);
+		vst1q_u64(const_cast<uint64_t*>(&res[2]), res_1);
 
 		// rotate and store
-		uint64_t* out = (uint64_t*)&data[i * 4];
+		uint64_t* out = reinterpret_cast<uint64_t*>(&data[i * 4]);
 
-		out[0] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 0), vgetq_lane_s32(cf, 0) << 4);
-		out[1] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 1), vgetq_lane_s32(cf, 1) << 4);
-		out[2] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 0), vgetq_lane_s32(cf, 2) << 4);
-		out[3] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 1), vgetq_lane_s32(cf, 3) << 4);
+		out[0] = rotateleft64(res[0], data[(i + 0) * 4 + 3] << 4);
+		out[1] = rotateleft64(res[1], data[(i + 1) * 4 + 3] << 4);
+		out[2] = rotateleft64(res[2], data[(i + 2) * 4 + 3] << 4);
+		out[3] = rotateleft64(res[3], data[(i + 3) * 4 + 3] << 4);
 	}
 }
 
@@ -595,10 +762,112 @@ static void decodeFilterExpSimd(unsigned int* data, size_t count)
 		vst1q_f32(reinterpret_cast<float*>(&data[i]), r);
 	}
 }
+
+static void decodeFilterColorSimd8(unsigned char* data, size_t count) // NEON: decodes 8-bit Y/Co/Cg/A colors to R/G/B/A in place, 4 colors (16 bytes) per iteration
+{
+	for (size_t i = 0; i < count; i += 4) // NOTE(review): no tail handling here; presumably the caller only passes a multiple of 4 - confirm
+	{
+		int32x4_t c4 = vld1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]));
+
+		// unpack y/co/cg/a from bytes 0/1/2/3 of each 32-bit lane (co/cg are sign extended with arithmetic shifts, y/a are zero extended)
+		int32x4_t yf = vandq_s32(c4, vdupq_n_s32(0xff));
+		int32x4_t cof = vshrq_n_s32(vshlq_n_s32(c4, 16), 24);
+		int32x4_t cgf = vshrq_n_s32(vshlq_n_s32(c4, 8), 24);
+		int32x4_t af = vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(c4), 24));
+
+		// recover scale from alpha high bit: OR-ing with shifts by 1/2/4 smears the highest set bit of the 8-bit alpha into all lower bits
+		int32x4_t as = af;
+		as = vorrq_s32(as, vshrq_n_s32(as, 1));
+		as = vorrq_s32(as, vshrq_n_s32(as, 2));
+		as = vorrq_s32(as, vshrq_n_s32(as, 4));
+
+		// expand alpha by one bit to match other components: shift left (the mask drops the old high bit) and copy the lowest bit into the new low bit
+		af = vorrq_s32(vandq_s32(vshlq_n_s32(af, 1), as), vandq_s32(af, vdupq_n_s32(1)));
+
+		// compute scaling factor 255 / as; vrecpeq_f32 is a reciprocal estimate (the 16-bit variant uses vdivq_f32 instead)
+		float32x4_t ss = vmulq_f32(vdupq_n_f32(255.f), vrecpeq_f32(vcvtq_f32_s32(as)));
+
+		// convert to RGB in fixed point: r = y + co - cg, g = y + cg, b = y - co - cg
+		int32x4_t rf = vaddq_s32(yf, vsubq_s32(cof, cgf));
+		int32x4_t gf = vaddq_s32(yf, cgf);
+		int32x4_t bf = vsubq_s32(yf, vaddq_s32(cof, cgf));
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
+		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
+
+		int32x4_t rr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(rf), ss));
+		int32x4_t gr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(gf), ss));
+		int32x4_t br = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(bf), ss));
+		int32x4_t ar = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(af), ss));
+
+		// repack rgba into final value: each shift-left-insert keeps the low 8 bits of its first operand, giving bytes r, g, b, a from low to high
+		int32x4_t res = vsliq_n_s32(rr, vsliq_n_s32(gr, vsliq_n_s32(br, ar, 8), 8), 8);
+
+		vst1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]), res);
+	}
+}
+
+static void decodeFilterColorSimd16(unsigned short* data, size_t count) // NEON: decodes 16-bit Y/Co/Cg/A colors to R/G/B/A in place, 4 colors (32 bytes) per iteration
+{
+	for (size_t i = 0; i < count; i += 4) // NOTE(review): no tail handling here; presumably the caller only passes a multiple of 4 - confirm
+	{
+		int32x4_t c4_0 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4])); // colors i+0 and i+1
+		int32x4_t c4_1 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4])); // colors i+2 and i+3
+
+		// de-interleave 32-bit lanes: even lanes hold the y/co 16-bit pair of each color, odd lanes hold the cg/a pair
+		int32x4_t c4_yco = vuzpq_s32(c4_0, c4_1).val[0];
+		int32x4_t c4_cga = vuzpq_s32(c4_0, c4_1).val[1];
+
+		// unpack y/co/cg/a components (co/cg are sign extended with arithmetic shifts, y/a are zero extended)
+		int32x4_t yf = vandq_s32(c4_yco, vdupq_n_s32(0xffff));
+		int32x4_t cof = vshrq_n_s32(c4_yco, 16);
+		int32x4_t cgf = vshrq_n_s32(vshlq_n_s32(c4_cga, 16), 16);
+		int32x4_t af = vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(c4_cga), 16));
+
+		// recover scale from alpha high bit: OR-ing with shifts by 1/2/4/8 smears the highest set bit of the 16-bit alpha into all lower bits
+		int32x4_t as = af;
+		as = vorrq_s32(as, vshrq_n_s32(as, 1));
+		as = vorrq_s32(as, vshrq_n_s32(as, 2));
+		as = vorrq_s32(as, vshrq_n_s32(as, 4));
+		as = vorrq_s32(as, vshrq_n_s32(as, 8));
+
+		// expand alpha by one bit to match other components: shift left (the mask drops the old high bit) and copy the lowest bit into the new low bit
+		af = vorrq_s32(vandq_s32(vshlq_n_s32(af, 1), as), vandq_s32(af, vdupq_n_s32(1)));
+
+		// compute scaling factor 65535 / as
+		float32x4_t ss = vdivq_f32(vdupq_n_f32(65535.f), vcvtq_f32_s32(as));
+
+		// convert to RGB in fixed point: r = y + co - cg, g = y + cg, b = y - co - cg
+		int32x4_t rf = vaddq_s32(yf, vsubq_s32(cof, cgf));
+		int32x4_t gf = vaddq_s32(yf, cgf);
+		int32x4_t bf = vsubq_s32(yf, vaddq_s32(cof, cgf));
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
+
+		int32x4_t rr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(rf), ss));
+		int32x4_t gr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(gf), ss));
+		int32x4_t br = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(bf), ss));
+		int32x4_t ar = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(af), ss));
+
+		// mix r/b and g/a to make 16-bit unpack easier: low 16 bits of each lane hold r (or g), high 16 bits hold b (or a)
+		int32x4_t rbr = vsliq_n_s32(rr, br, 16);
+		int32x4_t gar = vsliq_n_s32(gr, ar, 16);
+
+		// pack r/g/b/a using 16-bit unpacks: interleaving rbr with gar yields r, g, b, a per color; val[0] is colors i+0/i+1, val[1] is colors i+2/i+3
+		int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(rbr), vreinterpretq_s16_s32(gar)).val[0]);
+		int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(rbr), vreinterpretq_s16_s32(gar)).val[1]);
+
+		vst1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4]), res_0);
+		vst1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4]), res_1);
+	}
+}
 #endif
 
 #ifdef SIMD_WASM
-static void decodeFilterOctSimd(signed char* data, size_t count)
+static void decodeFilterOctSimd8(signed char* data, size_t count)
 {
 	const v128_t sign = wasm_f32x4_splat(-0.f);
 
@@ -647,10 +916,11 @@ static void decodeFilterOctSimd(signed char* data, size_t count)
 	}
 }
 
-static void decodeFilterOctSimd(short* data, size_t count)
+static void decodeFilterOctSimd16(short* data, size_t count)
 {
 	const v128_t sign = wasm_f32x4_splat(-0.f);
-	const v128_t zmask = wasm_i32x4_splat(0x7fff);
+	// TODO: volatile here works around LLVM mis-optimizing code; https://github.com/llvm/llvm-project/issues/149457
+	volatile v128_t zmask = wasm_i32x4_splat(0x7fff);
 
 	for (size_t i = 0; i < count; i += 4)
 	{
@@ -711,7 +981,7 @@ static void decodeFilterOctSimd(short* data, size_t count)
 
 static void decodeFilterQuatSimd(short* data, size_t count)
 {
-	const float scale = 1.f / sqrtf(2.f);
+	const float scale = 32767.f / sqrtf(2.f);
 
 	for (size_t i = 0; i < count; i += 4)
 	{
@@ -730,28 +1000,31 @@ static void decodeFilterQuatSimd(short* data, size_t count)
 
 		// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
 		v128_t sf = wasm_v128_or(cf, wasm_i32x4_splat(3));
-		v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(scale), wasm_f32x4_convert_i32x4(sf));
+		v128_t s = wasm_f32x4_convert_i32x4(sf);
 
-		// convert x/y/z to [-1..1] (scaled...)
-		v128_t x = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(xf), ss);
-		v128_t y = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(yf), ss);
-		v128_t z = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(zf), ss);
+		// convert x/y/z to floating point (unscaled! implied scale of 1/sqrt(2.f) * 1/sf)
+		v128_t x = wasm_f32x4_convert_i32x4(xf);
+		v128_t y = wasm_f32x4_convert_i32x4(yf);
+		v128_t z = wasm_f32x4_convert_i32x4(zf);
 
-		// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
+		// reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors
 		// note: i32x4_max with 0 is equivalent to f32x4_max
-		v128_t ww = wasm_f32x4_sub(wasm_f32x4_splat(1.f), wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z))));
+		v128_t ws = wasm_f32x4_mul(s, s);
+		v128_t ww = wasm_f32x4_sub(wasm_f32x4_add(ws, ws), wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z))));
 		v128_t w = wasm_f32x4_sqrt(wasm_i32x4_max(ww, wasm_i32x4_splat(0)));
 
-		v128_t s = wasm_f32x4_splat(32767.f);
+		// compute final scale; note that all computations above are unscaled
+		// we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range
+		v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(scale), s);
 
 		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
 		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
 		const v128_t fsnap = wasm_f32x4_splat(3 << 22);
 
-		v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap);
-		v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap);
-		v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap);
-		v128_t wr = wasm_f32x4_add(wasm_f32x4_mul(w, s), fsnap);
+		v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, ss), fsnap);
+		v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, ss), fsnap);
+		v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, ss), fsnap);
+		v128_t wr = wasm_f32x4_add(wasm_f32x4_mul(w, ss), fsnap);
 
 		// mix x/z and w/y to make 16-bit unpack easier
 		v128_t xzr = wasm_v128_or(wasm_v128_and(xr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(zr, 16));
@@ -762,8 +1035,7 @@ static void decodeFilterQuatSimd(short* data, size_t count)
 		v128_t res_1 = wasmx_unpackhi_v16x8(wyr, xzr);
 
 		// compute component index shifted left by 4 (and moved into i32x4 slot)
-		// TODO: volatile here works around LLVM mis-optimizing code; https://github.com/emscripten-core/emscripten/issues/11449
-		volatile v128_t cm = wasm_i32x4_shl(cf, 4);
+		v128_t cm = wasm_i32x4_shl(cf, 4);
 
 		// rotate and store
 		uint64_t* out = reinterpret_cast<uint64_t*>(&data[i * 4]);
@@ -794,6 +1066,117 @@ static void decodeFilterExpSimd(unsigned int* data, size_t count)
 		wasm_v128_store(&data[i], r);
 	}
 }
+
+static void decodeFilterColorSimd8(unsigned char* data, size_t count) // WASM SIMD: decodes 8-bit Y/Co/Cg/A colors to R/G/B/A in place, 4 colors (16 bytes) per iteration
+{
+	// TODO: volatile here works around LLVM mis-optimizing code; https://github.com/llvm/llvm-project/issues/149457
+	volatile v128_t zero = wasm_i32x4_splat(0);
+
+	for (size_t i = 0; i < count; i += 4) // NOTE(review): no tail handling here; presumably the caller only passes a multiple of 4 - confirm
+	{
+		v128_t c4 = wasm_v128_load(&data[i * 4]);
+
+		// unpack y/co/cg/a from bytes 0/1/2/3 of each 32-bit lane (co/cg are sign extended with arithmetic shifts, y/a are zero extended)
+		v128_t yf = wasm_v128_and(c4, wasm_i32x4_splat(0xff));
+		v128_t cof = wasm_i32x4_shr(wasm_i32x4_shl(c4, 16), 24);
+		v128_t cgf = wasm_i32x4_shr(wasm_i32x4_shl(c4, 8), 24);
+		v128_t af = wasm_v128_or(zero, wasm_u32x4_shr(c4, 24)); // OR with zero leaves the value unchanged; presumably it only routes through the volatile for the workaround above
+
+		// recover scale from alpha high bit: OR-ing with shifts by 1/2/4 smears the highest set bit of the 8-bit alpha into all lower bits
+		v128_t as = af;
+		as = wasm_v128_or(as, wasm_i32x4_shr(as, 1));
+		as = wasm_v128_or(as, wasm_i32x4_shr(as, 2));
+		as = wasm_v128_or(as, wasm_i32x4_shr(as, 4));
+
+		// expand alpha by one bit to match other components: shift left (the mask drops the old high bit) and copy the lowest bit into the new low bit
+		af = wasm_v128_or(wasm_v128_and(wasm_i32x4_shl(af, 1), as), wasm_v128_and(af, wasm_i32x4_splat(1)));
+
+		// compute scaling factor 255 / as
+		v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(255.f), wasm_f32x4_convert_i32x4(as));
+
+		// convert to RGB in fixed point: r = y + co - cg, g = y + cg, b = y - co - cg
+		v128_t rf = wasm_i32x4_add(yf, wasm_i32x4_sub(cof, cgf));
+		v128_t gf = wasm_i32x4_add(yf, cgf);
+		v128_t bf = wasm_i32x4_sub(yf, wasm_i32x4_add(cof, cgf));
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
+		const v128_t fsnap = wasm_f32x4_splat(3 << 22);
+
+		v128_t rr = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(rf), ss), fsnap);
+		v128_t gr = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(gf), ss), fsnap);
+		v128_t br = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(bf), ss), fsnap);
+		v128_t ar = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(af), ss), fsnap);
+
+		// repack rgba into final value: bytes r, g, b, a from low to high; alpha needs no mask because the shift by 24 discards its upper bits
+		v128_t res = wasm_v128_and(rr, wasm_i32x4_splat(0xff));
+		res = wasm_v128_or(res, wasm_i32x4_shl(wasm_v128_and(gr, wasm_i32x4_splat(0xff)), 8));
+		res = wasm_v128_or(res, wasm_i32x4_shl(wasm_v128_and(br, wasm_i32x4_splat(0xff)), 16));
+		res = wasm_v128_or(res, wasm_i32x4_shl(ar, 24));
+
+		wasm_v128_store(&data[i * 4], res);
+	}
+}
+
+static void decodeFilterColorSimd16(unsigned short* data, size_t count) // WASM SIMD: decodes 16-bit Y/Co/Cg/A colors to R/G/B/A in place, 4 colors (32 bytes) per iteration
+{
+	// TODO: volatile here works around LLVM mis-optimizing code; https://github.com/llvm/llvm-project/issues/149457
+	volatile v128_t zero = wasm_i32x4_splat(0);
+
+	for (size_t i = 0; i < count; i += 4) // NOTE(review): no tail handling here; presumably the caller only passes a multiple of 4 - confirm
+	{
+		v128_t c4_0 = wasm_v128_load(&data[(i + 0) * 4]); // colors i+0 and i+1
+		v128_t c4_1 = wasm_v128_load(&data[(i + 2) * 4]); // colors i+2 and i+3
+
+		// gather both y/co 16-bit pairs in each 32-bit lane (and likewise the cg/a pairs in a second vector)
+		v128_t c4_yco = wasmx_unziplo_v32x4(c4_0, c4_1);
+		v128_t c4_cga = wasmx_unziphi_v32x4(c4_0, c4_1);
+
+		// unpack y/co/cg/a components (co/cg are sign extended with arithmetic shifts, y/a are zero extended)
+		v128_t yf = wasm_v128_and(c4_yco, wasm_i32x4_splat(0xffff));
+		v128_t cof = wasm_i32x4_shr(c4_yco, 16);
+		v128_t cgf = wasm_i32x4_shr(wasm_i32x4_shl(c4_cga, 16), 16);
+		v128_t af = wasm_v128_or(zero, wasm_u32x4_shr(c4_cga, 16)); // OR with zero leaves the value unchanged; presumably it only routes through the volatile for the workaround above
+
+		// recover scale from alpha high bit: OR-ing with shifts by 1/2/4/8 smears the highest set bit of the 16-bit alpha into all lower bits
+		v128_t as = af;
+		as = wasm_v128_or(as, wasm_i32x4_shr(as, 1));
+		as = wasm_v128_or(as, wasm_i32x4_shr(as, 2));
+		as = wasm_v128_or(as, wasm_i32x4_shr(as, 4));
+		as = wasm_v128_or(as, wasm_i32x4_shr(as, 8));
+
+		// expand alpha by one bit to match other components: shift left (the mask drops the old high bit) and copy the lowest bit into the new low bit
+		af = wasm_v128_or(wasm_v128_and(wasm_i32x4_shl(af, 1), as), wasm_v128_and(af, wasm_i32x4_splat(1)));
+
+		// compute scaling factor 65535 / as
+		v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(65535.f), wasm_f32x4_convert_i32x4(as));
+
+		// convert to RGB in fixed point: r = y + co - cg, g = y + cg, b = y - co - cg
+		v128_t rf = wasm_i32x4_add(yf, wasm_i32x4_sub(cof, cgf));
+		v128_t gf = wasm_i32x4_add(yf, cgf);
+		v128_t bf = wasm_i32x4_sub(yf, wasm_i32x4_add(cof, cgf));
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		const v128_t fsnap = wasm_f32x4_splat(3 << 22);
+
+		v128_t rr = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(rf), ss), fsnap);
+		v128_t gr = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(gf), ss), fsnap);
+		v128_t br = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(bf), ss), fsnap);
+		v128_t ar = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(af), ss), fsnap);
+
+		// mix r/b and g/a to make 16-bit unpack easier: low 16 bits of each lane hold r (or g), high 16 bits hold b (or a)
+		v128_t rbr = wasm_v128_or(wasm_v128_and(rr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(br, 16));
+		v128_t gar = wasm_v128_or(wasm_v128_and(gr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(ar, 16));
+
+		// pack r/g/b/a using 16-bit unpacks; res_0 is stored to colors i+0/i+1 and res_1 to colors i+2/i+3
+		v128_t res_0 = wasmx_unpacklo_v16x8(rbr, gar);
+		v128_t res_1 = wasmx_unpackhi_v16x8(rbr, gar);
+
+		wasm_v128_store(&data[(i + 0) * 4], res_0);
+		wasm_v128_store(&data[(i + 2) * 4], res_1);
+	}
+}
 #endif
 
 // optimized variant of frexp
@@ -807,7 +1190,7 @@ inline int optlog2(float v)
 
 	u.f = v;
 	// +1 accounts for implicit 1. in mantissa; denormalized numbers will end up clamped to min_exp by calling code
-	return u.ui == 0 ? 0 : int((u.ui >> 23) & 0xff) - 127 + 1;
+	return v == 0 ? 0 : int((u.ui >> 23) & 0xff) - 127 + 1;
 }
 
 // optimized variant of ldexp
@@ -833,9 +1216,9 @@ void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride)
 
 #if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
 	if (stride == 4)
-		dispatchSimd(decodeFilterOctSimd, static_cast<signed char*>(buffer), count, 4);
+		dispatchSimd(decodeFilterOctSimd8, static_cast<signed char*>(buffer), count, 4);
 	else
-		dispatchSimd(decodeFilterOctSimd, static_cast<short*>(buffer), count, 4);
+		dispatchSimd(decodeFilterOctSimd16, static_cast<short*>(buffer), count, 4);
 #else
 	if (stride == 4)
 		decodeFilterOct(static_cast<signed char*>(buffer), count);
@@ -871,10 +1254,29 @@ void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride)
 #endif
 }
 
+void meshopt_decodeFilterColor(void* buffer, size_t count, size_t stride) // decodes `count` filtered colors stored in `buffer`; stride is the byte size of one color (4 or 8)
+{
+	using namespace meshopt;
+
+	assert(stride == 4 || stride == 8);
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+	if (stride == 4) // 4 bytes per color: 8-bit components
+		dispatchSimd(decodeFilterColorSimd8, static_cast<unsigned char*>(buffer), count, 4);
+	else // 8 bytes per color: 16-bit components
+		dispatchSimd(decodeFilterColorSimd16, static_cast<unsigned short*>(buffer), count, 4);
+#else // no SIMD path compiled in: use the scalar template
+	if (stride == 4) // NOTE(review): the explicit signed template argument is presumably the type used to sign-extend co/cg - confirm against decodeFilterColor
+		decodeFilterColor<signed char>(static_cast<unsigned char*>(buffer), count);
+	else
+		decodeFilterColor<short>(static_cast<unsigned short*>(buffer), count);
+#endif
+}
+
 void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data)
 {
 	assert(stride == 4 || stride == 8);
-	assert(bits >= 1 && bits <= 16);
+	assert(bits >= 2 && bits <= 16);
 
 	signed char* d8 = static_cast<signed char*>(destination);
 	short* d16 = static_cast<short*>(destination);
@@ -1010,6 +1412,20 @@ void meshopt_encodeFilterExp(void* destination_, size_t count, size_t stride, in
 				component_exp[j] = (min_exp < e) ? e : min_exp;
 			}
 		}
+		else if (mode == meshopt_EncodeExpClamped)
+		{
+			for (size_t j = 0; j < stride_float; ++j)
+			{
+				int e = optlog2(v[j]);
+
+				component_exp[j] = (0 < e) ? e : 0;
+			}
+		}
+		else
+		{
+			// the code below assumes component_exp is initialized outside of the loop
+			assert(mode == meshopt_EncodeExpSharedComponent);
+		}
 
 		for (size_t j = 0; j < stride_float; ++j)
 		{
@@ -1020,7 +1436,6 @@ void meshopt_encodeFilterExp(void* destination_, size_t count, size_t stride, in
 
 			// compute renormalized rounded mantissa for each component
 			int mmask = (1 << 24) - 1;
-
 			int m = int(v[j] * optexp2(-exp) + (v[j] >= 0 ? 0.5f : -0.5f));
 
 			d[j] = (m & mmask) | (unsigned(exp) << 24);
@@ -1028,6 +1443,51 @@ void meshopt_encodeFilterExp(void* destination_, size_t count, size_t stride, in
 	}
 }
 
+void meshopt_encodeFilterColor(void* destination, size_t count, size_t stride, int bits, const float* data) // encodes `count` RGBA colors (4 floats each in `data`) as Y/Co/Cg/A integers in `destination`
+{
+	assert(stride == 4 || stride == 8);
+	assert(bits >= 2 && bits <= 16); // NOTE(review): with stride == 4 the stores below cast to unsigned char, so bits > 8 would be truncated - confirm callers keep bits <= 8 in that case
+
+	unsigned char* d8 = static_cast<unsigned char*>(destination);
+	unsigned short* d16 = static_cast<unsigned short*>(destination);
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		const float* c = &data[i * 4];
+
+		int fr = meshopt_quantizeUnorm(c[0], bits);
+		int fg = meshopt_quantizeUnorm(c[1], bits);
+		int fb = meshopt_quantizeUnorm(c[2], bits);
+
+		// YCoCg-R encoding with truncated Co/Cg (integer division by 2) ensures that decoding can be done using integers
+		int fco = (fr - fb) / 2;
+		int tmp = fb + fco;
+		int fcg = (fg - tmp) / 2;
+		int fy = tmp + fcg;
+
+		// validate that R/G/B can be reconstructed with K bit integers (the OR of the three values is negative or >= 2^K if any single one is out of range)
+		assert(unsigned((fy + fco - fcg) | (fy + fcg) | (fy - fco - fcg)) < (1u << bits));
+
+		// alpha: K-1-bit encoding with high bit set to 1; the SIMD decoders in this file recover the scale from that high bit
+		int fa = meshopt_quantizeUnorm(c[3], bits - 1) | (1 << (bits - 1));
+
+		if (stride == 4) // 8-bit components; the casts keep only the low 8 bits (two's complement for negative co/cg)
+		{
+			d8[i * 4 + 0] = (unsigned char)(fy);
+			d8[i * 4 + 1] = (unsigned char)(fco);
+			d8[i * 4 + 2] = (unsigned char)(fcg);
+			d8[i * 4 + 3] = (unsigned char)(fa);
+		}
+		else // 16-bit components; the casts keep only the low 16 bits
+		{
+			d16[i * 4 + 0] = (unsigned short)(fy);
+			d16[i * 4 + 1] = (unsigned short)(fco);
+			d16[i * 4 + 2] = (unsigned short)(fcg);
+			d16[i * 4 + 3] = (unsigned short)(fa);
+		}
+	}
+}
+
 #undef SIMD_SSE
 #undef SIMD_NEON
 #undef SIMD_WASM
diff --git a/Source/ThirdParty/meshoptimizer/vfetchanalyzer.cpp b/Source/ThirdParty/meshoptimizer/vfetchanalyzer.cpp
deleted file mode 100644
index 51dca873f..000000000
--- a/Source/ThirdParty/meshoptimizer/vfetchanalyzer.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
-#include "meshoptimizer.h"
-
-#include <assert.h>
-#include <string.h>
-
-meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size)
-{
-	assert(index_count % 3 == 0);
-	assert(vertex_size > 0 && vertex_size <= 256);
-
-	meshopt_Allocator allocator;
-
-	meshopt_VertexFetchStatistics result = {};
-
-	unsigned char* vertex_visited = allocator.allocate<unsigned char>(vertex_count);
-	memset(vertex_visited, 0, vertex_count);
-
-	const size_t kCacheLine = 64;
-	const size_t kCacheSize = 128 * 1024;
-
-	// simple direct mapped cache; on typical mesh data this is close to 4-way cache, and this model is a gross approximation anyway
-	size_t cache[kCacheSize / kCacheLine] = {};
-
-	for (size_t i = 0; i < index_count; ++i)
-	{
-		unsigned int index = indices[i];
-		assert(index < vertex_count);
-
-		vertex_visited[index] = 1;
-
-		size_t start_address = index * vertex_size;
-		size_t end_address = start_address + vertex_size;
-
-		size_t start_tag = start_address / kCacheLine;
-		size_t end_tag = (end_address + kCacheLine - 1) / kCacheLine;
-
-		assert(start_tag < end_tag);
-
-		for (size_t tag = start_tag; tag < end_tag; ++tag)
-		{
-			size_t line = tag % (sizeof(cache) / sizeof(cache[0]));
-
-			// we store +1 since cache is filled with 0 by default
-			result.bytes_fetched += (cache[line] != tag + 1) * kCacheLine;
-			cache[line] = tag + 1;
-		}
-	}
-
-	size_t unique_vertex_count = 0;
-
-	for (size_t i = 0; i < vertex_count; ++i)
-		unique_vertex_count += vertex_visited[i];
-
-	result.overfetch = unique_vertex_count == 0 ? 0 : float(result.bytes_fetched) / float(unique_vertex_count * vertex_size);
-
-	return result;
-}

From 27896b6410614ecb737e0165ab9ee65eeccfb740 Mon Sep 17 00:00:00 2001
From: Wojtek Figat <wojtek@figat.pl>
Date: Sun, 11 Jan 2026 22:06:30 +0100
Subject: [PATCH 25/33] Add view flag to hide particles drawing

#3840
---
 Flax.flaxproj                              |  2 +-
 Source/Editor/Options/InputOptions.cs      |  4 ++++
 Source/Editor/Viewport/EditorViewport.cs   |  5 ++++-
 Source/Engine/Graphics/Enums.h             | 11 ++++++++---
 Source/Engine/Particles/ParticleEffect.cpp |  8 ++++++--
 5 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/Flax.flaxproj b/Flax.flaxproj
index 100d4e9ff..74ab82f55 100644
--- a/Flax.flaxproj
+++ b/Flax.flaxproj
@@ -4,7 +4,7 @@
     "Major": 1,
     "Minor": 11,
     "Revision": 0,
-    "Build": 6805
+    "Build": 6806
   },
   "Company": "Flax",
   "Copyright": "Copyright (c) 2012-2025 Wojciech Figat. All rights reserved.",
diff --git a/Source/Editor/Options/InputOptions.cs b/Source/Editor/Options/InputOptions.cs
index ab473ebed..a759b7247 100644
--- a/Source/Editor/Options/InputOptions.cs
+++ b/Source/Editor/Options/InputOptions.cs
@@ -571,6 +571,10 @@ namespace FlaxEditor.Options
         [EditorDisplay("View Flags"), EditorOrder(3260)]
         public InputBinding DebugDraw = new InputBinding(KeyboardKeys.Alpha4, KeyboardKeys.Control, KeyboardKeys.Shift);
 
+        [DefaultValue(typeof(InputBinding), "None")]
+        [EditorDisplay("View Flags"), EditorOrder(3270)]
+        public InputBinding Particles = new InputBinding(KeyboardKeys.None);
+
         #endregion
 
         #region Interface
diff --git a/Source/Editor/Viewport/EditorViewport.cs b/Source/Editor/Viewport/EditorViewport.cs
index c16d3d9f5..2af065c68 100644
--- a/Source/Editor/Viewport/EditorViewport.cs
+++ b/Source/Editor/Viewport/EditorViewport.cs
@@ -1063,6 +1063,7 @@ namespace FlaxEditor.Viewport
             InputActions.Add(options => options.Fog, () => Task.ViewFlags ^= ViewFlags.Fog);
             InputActions.Add(options => options.SpecularLight, () => Task.ViewFlags ^= ViewFlags.SpecularLight);
             InputActions.Add(options => options.Decals, () => Task.ViewFlags ^= ViewFlags.Decals);
+            InputActions.Add(options => options.Particles, () => Task.ViewFlags ^= ViewFlags.Particles);
             InputActions.Add(options => options.CustomPostProcess, () => Task.ViewFlags ^= ViewFlags.CustomPostProcess);
             InputActions.Add(options => options.Bloom, () => Task.ViewFlags ^= ViewFlags.Bloom);
             InputActions.Add(options => options.ToneMapping, () => Task.ViewFlags ^= ViewFlags.ToneMapping);
@@ -2115,6 +2116,7 @@ namespace FlaxEditor.Viewport
             new ViewFlagOptions(ViewFlags.Fog, "Fog", Editor.Instance.Options.Options.Input.Fog),
             new ViewFlagOptions(ViewFlags.SpecularLight, "Specular Light", Editor.Instance.Options.Options.Input.SpecularLight),
             new ViewFlagOptions(ViewFlags.Decals, "Decals", Editor.Instance.Options.Options.Input.Decals),
+            new ViewFlagOptions(ViewFlags.Particles, "Particles", Editor.Instance.Options.Options.Input.Particles),
             new ViewFlagOptions(ViewFlags.CustomPostProcess, "Custom Post Process", Editor.Instance.Options.Options.Input.CustomPostProcess),
             new ViewFlagOptions(ViewFlags.Bloom, "Bloom", Editor.Instance.Options.Options.Input.Bloom),
             new ViewFlagOptions(ViewFlags.ToneMapping, "Tone Mapping", Editor.Instance.Options.Options.Input.ToneMapping),
@@ -2134,12 +2136,13 @@ namespace FlaxEditor.Viewport
             if (cm.Visible == false)
                 return;
             var ccm = (ContextMenu)cm;
+            var flags = Task.View.Flags;
             foreach (var e in ccm.Items)
             {
                 if (e is ContextMenuButton b && b.Tag != null)
                 {
                     var v = (ViewFlags)b.Tag;
-                    b.Icon = (Task.View.Flags & v) != 0 ? Style.Current.CheckBoxTick : SpriteHandle.Invalid;
+                    b.Icon = (flags & v) != 0 ? Style.Current.CheckBoxTick : SpriteHandle.Invalid;
                 }
             }
         }
diff --git a/Source/Engine/Graphics/Enums.h b/Source/Engine/Graphics/Enums.h
index f6af6c16b..107fe3533 100644
--- a/Source/Engine/Graphics/Enums.h
+++ b/Source/Engine/Graphics/Enums.h
@@ -1075,20 +1075,25 @@ API_ENUM(Attributes="Flags") enum class ViewFlags : uint64
     /// </summary>
     LightsDebug = 1 << 27,
 
+    /// <summary>
+    /// Shows/hides particle effects.
+    /// </summary>
+    Particles = 1 << 28,
+
     /// <summary>
     /// Default flags for Game.
     /// </summary>
-    DefaultGame = Reflections | DepthOfField | Fog | Decals | MotionBlur | SSR | AO | GI | DirectionalLights | PointLights | SpotLights | SkyLights | Shadows | SpecularLight | AntiAliasing | CustomPostProcess | Bloom | ToneMapping | EyeAdaptation | CameraArtifacts | LensFlares | ContactShadows | GlobalSDF | Sky,
+    DefaultGame = Reflections | DepthOfField | Fog | Decals | MotionBlur | SSR | AO | GI | DirectionalLights | PointLights | SpotLights | SkyLights | Shadows | SpecularLight | AntiAliasing | CustomPostProcess | Bloom | ToneMapping | EyeAdaptation | CameraArtifacts | LensFlares | ContactShadows | GlobalSDF | Sky | Particles,
 
     /// <summary>
     /// Default flags for Editor.
     /// </summary>
-    DefaultEditor = Reflections | Fog | Decals | DebugDraw | SSR | AO | GI | DirectionalLights | PointLights | SpotLights | SkyLights | Shadows | SpecularLight | AntiAliasing | CustomPostProcess | Bloom | ToneMapping | EyeAdaptation | CameraArtifacts | LensFlares | EditorSprites | ContactShadows | GlobalSDF | Sky,
+    DefaultEditor = Reflections | Fog | Decals | DebugDraw | SSR | AO | GI | DirectionalLights | PointLights | SpotLights | SkyLights | Shadows | SpecularLight | AntiAliasing | CustomPostProcess | Bloom | ToneMapping | EyeAdaptation | CameraArtifacts | LensFlares | EditorSprites | ContactShadows | GlobalSDF | Sky | Particles,
 
     /// <summary>
     /// Default flags for materials/models previews generating.
     /// </summary>
-    DefaultAssetPreview = Reflections | Decals | DirectionalLights | PointLights | SpotLights | SkyLights | SpecularLight | AntiAliasing | Bloom | ToneMapping | EyeAdaptation | CameraArtifacts | LensFlares | ContactShadows | Sky,
+    DefaultAssetPreview = Reflections | Decals | DirectionalLights | PointLights | SpotLights | SkyLights | SpecularLight | AntiAliasing | Bloom | ToneMapping | EyeAdaptation | CameraArtifacts | LensFlares | ContactShadows | Sky | Particles,
 };
 
 DECLARE_ENUM_OPERATORS(ViewFlags);
diff --git a/Source/Engine/Particles/ParticleEffect.cpp b/Source/Engine/Particles/ParticleEffect.cpp
index 6e94594b0..9592147a7 100644
--- a/Source/Engine/Particles/ParticleEffect.cpp
+++ b/Source/Engine/Particles/ParticleEffect.cpp
@@ -601,7 +601,9 @@ bool ParticleEffect::HasContentLoaded() const
 
 void ParticleEffect::Draw(RenderContext& renderContext)
 {
-    if (renderContext.View.Pass == DrawPass::GlobalSDF || renderContext.View.Pass == DrawPass::GlobalSurfaceAtlas)
+    if (renderContext.View.Pass == DrawPass::GlobalSDF || 
+        renderContext.View.Pass == DrawPass::GlobalSurfaceAtlas ||
+        EnumHasNoneFlags(renderContext.View.Flags, ViewFlags::Particles))
         return;
     _lastMinDstSqr = Math::Min(_lastMinDstSqr, Vector3::DistanceSquared(GetPosition(), renderContext.View.WorldPosition));
     RenderContextBatch renderContextBatch(renderContext);
@@ -610,10 +612,12 @@ void ParticleEffect::Draw(RenderContext& renderContext)
 
 void ParticleEffect::Draw(RenderContextBatch& renderContextBatch)
 {
+    const RenderView& mainView = renderContextBatch.GetMainContext().View;
+    if (EnumHasNoneFlags(mainView.Flags, ViewFlags::Particles))
+        return;
     Particles::DrawParticles(renderContextBatch, this);
 
     // Cull again against the main context (if using multiple ones) to skip caching draw distance from shadow projections
-    const RenderView& mainView = renderContextBatch.GetMainContext().View;
     const BoundingSphere bounds(_sphere.Center - mainView.Origin, _sphere.Radius);
     if (renderContextBatch.Contexts.Count() > 1 && !mainView.CullingFrustum.Intersects(bounds))
         return;

From e21cb9154a02a49f854166bc6b7e88751f1623f4 Mon Sep 17 00:00:00 2001
From: Wojtek Figat <wojtek@figat.pl>
Date: Sun, 11 Jan 2026 22:11:16 +0100
Subject: [PATCH 26/33] Adjust #3677 to evaluate product local path lazily

---
 Source/Editor/Modules/UIModule.cs | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/Source/Editor/Modules/UIModule.cs b/Source/Editor/Modules/UIModule.cs
index db1845b42..8a678370a 100644
--- a/Source/Editor/Modules/UIModule.cs
+++ b/Source/Editor/Modules/UIModule.cs
@@ -727,12 +727,15 @@ namespace FlaxEditor.Modules
             cm.AddSeparator();
             cm.AddButton("Plugins", () => Editor.Windows.PluginsWin.Show());
             cm.AddSeparator();
-            var childMenu = cm.AddChildMenu("Open product local folder");
+            var childMenu = cm.AddChildMenu("Open Product Local folder");
             childMenu.ContextMenu.AddButton("Editor", () => FileSystem.ShowFileExplorer(Globals.ProductLocalFolder));
-            string localAppData = Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData);
-            GameSettings settings = GameSettings.Load<GameSettings>();
-            string path = Path.Combine(localAppData, settings.CompanyName, settings.ProductName);
-            _menuToolsOpenLocalFolder = childMenu.ContextMenu.AddButton("Cooked game", () => FileSystem.ShowFileExplorer(path));
+            _menuToolsOpenLocalFolder = childMenu.ContextMenu.AddButton("Game", () =>
+            {
+                string localAppData = Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData);
+                GameSettings settings = GameSettings.Load<GameSettings>();
+                string path = Path.Combine(localAppData, settings.CompanyName, settings.ProductName);
+                FileSystem.ShowFileExplorer(path);
+            });
 
             // Window
             MenuWindow = MainMenu.AddButton("Window");

From 72bb2dd932310ef4e3b5e15620a472cb5b156047 Mon Sep 17 00:00:00 2001
From: Wojtek Figat <wojtek@figat.pl>
Date: Sun, 11 Jan 2026 22:17:50 +0100
Subject: [PATCH 27/33] Fix `Resize to Fit` to properly dirty state of level
 editor

#3670 #3735
---
 Source/Editor/SceneGraph/Actors/BoxColliderNode.cs | 1 +
 Source/Engine/Physics/Colliders/BoxCollider.cpp    | 6 +++---
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/Source/Editor/SceneGraph/Actors/BoxColliderNode.cs b/Source/Editor/SceneGraph/Actors/BoxColliderNode.cs
index c4fd47f71..4a7150972 100644
--- a/Source/Editor/SceneGraph/Actors/BoxColliderNode.cs
+++ b/Source/Editor/SceneGraph/Actors/BoxColliderNode.cs
@@ -42,6 +42,7 @@ namespace FlaxEditor.SceneGraph.Actors
                 if (value is BoxCollider collider)
                     collider.AutoResize(!_keepLocalOrientation);
             }
+            Presenter.OnModified();
         }
     }
 
diff --git a/Source/Engine/Physics/Colliders/BoxCollider.cpp b/Source/Engine/Physics/Colliders/BoxCollider.cpp
index 1e90cb91f..47e551b37 100644
--- a/Source/Engine/Physics/Colliders/BoxCollider.cpp
+++ b/Source/Engine/Physics/Colliders/BoxCollider.cpp
@@ -23,15 +23,15 @@ void BoxCollider::SetSize(const Float3& value)
 void BoxCollider::AutoResize(bool globalOrientation = true)
 {
     Actor* parent = GetParent();
-    if (Cast<Scene>(parent))
+    if (parent == nullptr || Cast<Scene>(parent))
         return;
 
     // Get bounds of all siblings (excluding itself)
     const Vector3 parentScale = parent->GetScale();
     if (parentScale.IsAnyZero())
-        return; // Avoid division by zero
+        return;
 
-    // Hacky way to get unrotated bounded box of parent.
+    // Hacky way to get unrotated bounded box of parent
     const Quaternion parentOrientation = parent->GetOrientation();
     parent->SetOrientation(Quaternion::Identity);
     BoundingBox parentBox = parent->GetBox();

From ab6dfca36edfa3a4cace7aaf812771dfc97e9f53 Mon Sep 17 00:00:00 2001
From: Wojtek Figat <wojtek@figat.pl>
Date: Sun, 11 Jan 2026 22:23:49 +0100
Subject: [PATCH 28/33] Fix broken `DrawWireTriangles`

#3650
---
 Source/Engine/Debug/DebugDraw.cpp | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/Source/Engine/Debug/DebugDraw.cpp b/Source/Engine/Debug/DebugDraw.cpp
index 7c798f88f..bea9e76f4 100644
--- a/Source/Engine/Debug/DebugDraw.cpp
+++ b/Source/Engine/Debug/DebugDraw.cpp
@@ -490,6 +490,18 @@ FORCE_INLINE DebugTriangle* AppendTriangles(int32 count, float duration, bool de
     return list->Get() + startIndex;
 }
 
+// Adds count uninitialized triangles to the wire list picked by depthTest and duration (> 0: Default, otherwise OneFrame); returns pointer at the old item count.
+FORCE_INLINE DebugTriangle* AppendWireTriangles(int32 count, float duration, bool depthTest)
+{
+    const bool timed = duration > 0;
+    Array<DebugTriangle>* list = depthTest
+        ? (timed ? &Context->DebugDrawDepthTest.DefaultWireTriangles : &Context->DebugDrawDepthTest.OneFrameWireTriangles)
+        : (timed ? &Context->DebugDrawDefault.DefaultWireTriangles : &Context->DebugDrawDefault.OneFrameWireTriangles);
+    const int32 firstNew = list->Count();
+    list->AddUninitialized(count);
+    return list->Get() + firstNew;
+}
+
 inline void DrawText3D(const DebugText3D& t, const RenderContext& renderContext, const Float3& viewUp, const Matrix& f, const Matrix& vp, const Viewport& viewport, GPUContext* context, GPUTextureView* target, GPUTextureView* depthBuffer)
 {
     Matrix w, fw, m;
@@ -1714,7 +1726,7 @@ void DebugDraw::DrawWireTriangles(const Span<Float3>& vertices, const Color& col
     DebugTriangle t;
     t.Color = Color32(color);
     t.TimeLeft = duration;
-    auto dst = AppendTriangles(vertices.Length() / 3, duration, depthTest);
+    auto dst = AppendWireTriangles(vertices.Length() / 3, duration, depthTest);
     const Float3 origin = Context->Origin;
     for (int32 i = 0; i < vertices.Length();)
     {
@@ -1736,7 +1748,7 @@ void DebugDraw::DrawWireTriangles(const Span<Float3>& vertices, const Span<int32
     DebugTriangle t;
     t.Color = Color32(color);
     t.TimeLeft = duration;
-    auto dst = AppendTriangles(indices.Length() / 3, duration, depthTest);
+    auto dst = AppendWireTriangles(indices.Length() / 3, duration, depthTest);
     const Float3 origin = Context->Origin;
     for (int32 i = 0; i < indices.Length();)
     {
@@ -1758,7 +1770,7 @@ void DebugDraw::DrawWireTriangles(const Span<Double3>& vertices, const Color& co
     DebugTriangle t;
     t.Color = Color32(color);
     t.TimeLeft = duration;
-    auto dst = AppendTriangles(vertices.Length() / 3, duration, depthTest);
+    auto dst = AppendWireTriangles(vertices.Length() / 3, duration, depthTest);
     const Double3 origin = Context->Origin;
     for (int32 i = 0; i < vertices.Length();)
     {
@@ -1780,7 +1792,7 @@ void DebugDraw::DrawWireTriangles(const Span<Double3>& vertices, const Span<int3
     DebugTriangle t;
     t.Color = Color32(color);
     t.TimeLeft = duration;
-    auto dst = AppendTriangles(indices.Length() / 3, duration, depthTest);
+    auto dst = AppendWireTriangles(indices.Length() / 3, duration, depthTest);
     const Double3 origin = Context->Origin;
     for (int32 i = 0; i < indices.Length();)
     {

From 890df659702badf50ee9983a3dceebdff8e47ae8 Mon Sep 17 00:00:00 2001
From: Wojtek Figat <wojtek@figat.pl>
Date: Sun, 11 Jan 2026 22:34:57 +0100
Subject: [PATCH 29/33] Add input `GamepadButtonDown` and `GamepadButtonUp`
 events

#3626
---
 Source/Engine/Input/Input.cpp | 15 +++++++++++++++
 Source/Engine/Input/Input.h   | 10 ++++++++++
 2 files changed, 25 insertions(+)

diff --git a/Source/Engine/Input/Input.cpp b/Source/Engine/Input/Input.cpp
index 8438977b1..7048140ef 100644
--- a/Source/Engine/Input/Input.cpp
+++ b/Source/Engine/Input/Input.cpp
@@ -80,6 +80,8 @@ Delegate<const Float2&, MouseButton> Input::MouseDoubleClick;
 Delegate<const Float2&, float> Input::MouseWheel;
 Delegate<const Float2&> Input::MouseMove;
 Action Input::MouseLeave;
+Delegate<InputGamepadIndex, GamepadButton> Input::GamepadButtonDown;
+Delegate<InputGamepadIndex, GamepadButton> Input::GamepadButtonUp;
 Delegate<const Float2&, int32> Input::TouchDown;
 Delegate<const Float2&, int32> Input::TouchMove;
 Delegate<const Float2&, int32> Input::TouchUp;
@@ -1027,6 +1029,19 @@ void InputService::Update()
             break;
         }
     }
+    // Fire GamepadButtonDown/GamepadButtonUp for every button (values 1..MAX-1) of each gamepad. TODO: route gamepad button events into global InputEvents queue to improve processing
+    for (int32 i = 0; i < Input::Gamepads.Count(); i++)
+    {
+        auto gamepad = Input::Gamepads[i];
+        for (int32 buttonIdx = 1; buttonIdx < (int32)GamepadButton::MAX; buttonIdx++)
+        {
+            GamepadButton button = (GamepadButton)buttonIdx;
+            if (gamepad->GetButtonDown(button))
+                Input::GamepadButtonDown((InputGamepadIndex)i, button);
+            else if (gamepad->GetButtonUp(button))
+                Input::GamepadButtonUp((InputGamepadIndex)i, button);
+        }
+    }
 
     // Update all actions
     for (int32 i = 0; i < Input::ActionMappings.Count(); i++)
diff --git a/Source/Engine/Input/Input.h b/Source/Engine/Input/Input.h
index 8cc1b2106..73e87f5f0 100644
--- a/Source/Engine/Input/Input.h
+++ b/Source/Engine/Input/Input.h
@@ -113,6 +113,16 @@ public:
     /// </summary>
     API_EVENT() static Action MouseLeave;
 
+    /// <summary>
+    /// Event fired when gamepad button goes down. Arguments: index of the gamepad and the button.
+    /// </summary>
+    API_EVENT() static Delegate<InputGamepadIndex, GamepadButton> GamepadButtonDown;
+
+    /// <summary>
+    /// Event fired when gamepad button goes up. Arguments: index of the gamepad and the button.
+    /// </summary>
+    API_EVENT() static Delegate<InputGamepadIndex, GamepadButton> GamepadButtonUp;
+
     /// <summary>
     /// Event fired when touch action begins.
     /// </summary>

From 5dbaf3f94e0c7a68ea0a822c46cee8645f7b0659 Mon Sep 17 00:00:00 2001
From: Wojtek Figat <wojtek@figat.pl>
Date: Sun, 11 Jan 2026 23:23:04 +0100
Subject: [PATCH 30/33] Fix to not crash after unhandled exception

#3570
---
 Source/Engine/Scripting/Scripting.cs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Source/Engine/Scripting/Scripting.cs b/Source/Engine/Scripting/Scripting.cs
index 7f9f2980c..229e411f3 100644
--- a/Source/Engine/Scripting/Scripting.cs
+++ b/Source/Engine/Scripting/Scripting.cs
@@ -137,8 +137,8 @@ namespace FlaxEngine
             {
                 Debug.LogError($"Unhandled Exception: {exception.Message}");
                 Debug.LogException(exception);
-                if (e.IsTerminating && !System.Diagnostics.Debugger.IsAttached)
-                    Platform.Fatal($"Unhandled Exception: {exception}");
+                // NOTE: Platform.Fatal is intentionally disabled here to not crash after an unhandled exception (#3570); it is only logged above. Old code:
+                //if (e.IsTerminating && !System.Diagnostics.Debugger.IsAttached) Platform.Fatal($"Unhandled Exception: {exception}");
             }
         }
 

From fc2f56aca6310d5edd81ecd6b0907801acd95683 Mon Sep 17 00:00:00 2001
From: Wojtek Figat <wojtek@figat.pl>
Date: Mon, 12 Jan 2026 00:31:59 +0100
Subject: [PATCH 31/33] Fix missing console platforms

#3746
---
 .../Tools/Flax.Build/Deps/Dependencies/AGS.cs |   1 -
 .../Dependencies/DirectXShaderCompiler.cs     |   1 -
 .../Deps/Dependencies/DirectXTex.cs           |   6 +
 .../Flax.Build/Deps/Dependencies/EnvDTE.cs    |  11 +-
 .../Flax.Build/Deps/Dependencies/NvCloth.cs   |  64 -----------
 .../Flax.Build/Deps/Dependencies/OpenAL.cs    |  10 ++
 .../Flax.Build/Deps/Dependencies/PhysX.cs     |  64 -----------
 .../Flax.Build/Deps/Dependencies/UVAtlas.cs   |  12 --
 .../Flax.Build/Deps/Dependencies/astc.cs      |  11 +-
 .../Flax.Build/Deps/Dependencies/curl.cs      |   2 +-
 .../Flax.Build/Deps/Dependencies/freetype.cs  |  64 -----------
 .../Flax.Build/Deps/Dependencies/mono.cs      |  12 ++
 .../Flax.Build/Deps/Dependencies/nethost.cs   |  30 -----
 .../Flax.Build/Deps/Dependencies/vorbis.cs    |  64 -----------
 Source/Tools/Flax.Build/Deps/Dependency.cs    | 106 ++++++++++++++++--
 15 files changed, 136 insertions(+), 322 deletions(-)

diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/AGS.cs b/Source/Tools/Flax.Build/Deps/Dependencies/AGS.cs
index c756bb28b..ff16bd1c1 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/AGS.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/AGS.cs
@@ -29,7 +29,6 @@ namespace Flax.Deps.Dependencies
                     return new[]
                     {
                         TargetArchitecture.x64,
-                        TargetArchitecture.ARM64,
                     };
                 default: return new TargetArchitecture[0];
                 }
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/DirectXShaderCompiler.cs b/Source/Tools/Flax.Build/Deps/Dependencies/DirectXShaderCompiler.cs
index 3c48290ee..f74494a30 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/DirectXShaderCompiler.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/DirectXShaderCompiler.cs
@@ -1,6 +1,5 @@
 // Copyright (c) Wojciech Figat. All rights reserved.
 
-using System;
 using System.IO;
 using System.Linq;
 using Flax.Build;
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/DirectXTex.cs b/Source/Tools/Flax.Build/Deps/Dependencies/DirectXTex.cs
index 3a842b48c..cfbb88870 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/DirectXTex.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/DirectXTex.cs
@@ -43,6 +43,12 @@ namespace Flax.Deps.Dependencies
                         TargetArchitecture.x64,
                         TargetArchitecture.ARM64,
                     };
+                case TargetPlatform.XboxOne:
+                case TargetPlatform.XboxScarlett:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                    };
                 default: return new TargetArchitecture[0];
                 }
             }
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/EnvDTE.cs b/Source/Tools/Flax.Build/Deps/Dependencies/EnvDTE.cs
index 32d783e81..3f9a2148b 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/EnvDTE.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/EnvDTE.cs
@@ -1,11 +1,8 @@
 // Copyright (c) Wojciech Figat. All rights reserved.
 
-using System;
 using System.IO;
 using System.IO.Compression;
-using System.Linq;
 using Flax.Build;
-using Flax.Build.Platforms;
 
 namespace Flax.Deps.Dependencies
 {
@@ -23,8 +20,8 @@ namespace Flax.Deps.Dependencies
                 switch (BuildPlatform)
                 {
                 case TargetPlatform.Windows:
-                return new[]
-                {
+                    return new[]
+                    {
                         TargetPlatform.Windows,
                     };
                 default: return new TargetPlatform[0];
@@ -40,8 +37,8 @@ namespace Flax.Deps.Dependencies
                 switch (BuildPlatform)
                 {
                 case TargetPlatform.Windows:
-                return new[]
-                {
+                    return new[]
+                    {
                         TargetArchitecture.x64,
                         TargetArchitecture.ARM64,
                     };
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/NvCloth.cs b/Source/Tools/Flax.Build/Deps/Dependencies/NvCloth.cs
index aa15aadac..f3c3ff210 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/NvCloth.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/NvCloth.cs
@@ -17,70 +17,6 @@ namespace Flax.Deps.Dependencies
     {
         private string root, nvCloth;
 
-        /// <inheritdoc />
-        public override TargetPlatform[] Platforms
-        {
-            get
-            {
-                switch (BuildPlatform)
-                {
-                case TargetPlatform.Windows:
-                    return new[]
-                    {
-                        TargetPlatform.Windows,
-                        TargetPlatform.XboxOne,
-                        TargetPlatform.XboxScarlett,
-                        TargetPlatform.PS4,
-                        TargetPlatform.PS5,
-                        TargetPlatform.Switch,
-                        TargetPlatform.Android,
-                    };
-                case TargetPlatform.Linux:
-                    return new[]
-                    {
-                        TargetPlatform.Linux,
-                    };
-                case TargetPlatform.Mac:
-                    return new[]
-                    {
-                        TargetPlatform.Mac,
-                        TargetPlatform.iOS,
-                    };
-                default: return new TargetPlatform[0];
-                }
-            }
-        }
-
-        /// <inheritdoc />
-        public override TargetArchitecture[] Architectures
-        {
-            get
-            {
-                switch (BuildPlatform)
-                {
-                case TargetPlatform.Windows:
-                    return new[]
-                    {
-                        TargetArchitecture.x64,
-                        TargetArchitecture.ARM64,
-                    };
-                case TargetPlatform.Linux:
-                    return new[]
-                    {
-                        TargetArchitecture.x64,
-                        //TargetArchitecture.ARM64,
-                    };
-                case TargetPlatform.Mac:
-                    return new[]
-                    {
-                        TargetArchitecture.x64,
-                        TargetArchitecture.ARM64,
-                    };
-                default: return new TargetArchitecture[0];
-                }
-            }
-        }
-
         /// <inheritdoc />
         public override void Build(BuildOptions options)
         {
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/OpenAL.cs b/Source/Tools/Flax.Build/Deps/Dependencies/OpenAL.cs
index 5e194edd4..37e446ce1 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/OpenAL.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/OpenAL.cs
@@ -70,6 +70,16 @@ namespace Flax.Deps.Dependencies
                         TargetArchitecture.x64,
                         TargetArchitecture.ARM64,
                     };
+                case TargetPlatform.iOS:
+                    return new[]
+                    {
+                        TargetArchitecture.ARM64,
+                    };
+                case TargetPlatform.Android:
+                    return new[]
+                    {
+                        TargetArchitecture.ARM64,
+                    };
                 default: return new TargetArchitecture[0];
                 }
             }
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/PhysX.cs b/Source/Tools/Flax.Build/Deps/Dependencies/PhysX.cs
index 46ad23381..18bb4e69f 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/PhysX.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/PhysX.cs
@@ -17,70 +17,6 @@ namespace Flax.Deps.Dependencies
     /// <seealso cref="Flax.Deps.Dependency" />
     class PhysX : Dependency
     {
-        /// <inheritdoc />
-        public override TargetPlatform[] Platforms
-        {
-            get
-            {
-                switch (BuildPlatform)
-                {
-                case TargetPlatform.Windows:
-                    return new[]
-                    {
-                        TargetPlatform.Windows,
-                        TargetPlatform.XboxOne,
-                        TargetPlatform.PS4,
-                        TargetPlatform.PS5,
-                        TargetPlatform.XboxScarlett,
-                        TargetPlatform.Android,
-                        TargetPlatform.Switch,
-                    };
-                case TargetPlatform.Linux:
-                    return new[]
-                    {
-                        TargetPlatform.Linux,
-                    };
-                case TargetPlatform.Mac:
-                    return new[]
-                    {
-                        TargetPlatform.Mac,
-                        TargetPlatform.iOS,
-                    };
-                default: return new TargetPlatform[0];
-                }
-            }
-        }
-
-        /// <inheritdoc />
-        public override TargetArchitecture[] Architectures
-        {
-            get
-            {
-                switch (BuildPlatform)
-                {
-                case TargetPlatform.Windows:
-                    return new[]
-                    {
-                        TargetArchitecture.x64,
-                        TargetArchitecture.ARM64,
-                    };
-                case TargetPlatform.Linux:
-                    return new[]
-                    {
-                        TargetArchitecture.x64,
-                        //TargetArchitecture.ARM64,
-                    };
-                case TargetPlatform.Mac:
-                    return new[]
-                    {
-                        TargetArchitecture.x64,
-                        TargetArchitecture.ARM64,
-                    };
-                default: return new TargetArchitecture[0];
-                }
-            }
-        }
-
         private string root;
         private string projectGenDir;
         private string projectGenPath;
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/UVAtlas.cs b/Source/Tools/Flax.Build/Deps/Dependencies/UVAtlas.cs
index f0d29dba9..19e314326 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/UVAtlas.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/UVAtlas.cs
@@ -42,18 +42,6 @@ namespace Flax.Deps.Dependencies
                         TargetArchitecture.x64,
                         TargetArchitecture.ARM64,
                     };
-                case TargetPlatform.Linux:
-                    return new[]
-                    {
-                        TargetArchitecture.x64,
-                        //TargetArchitecture.ARM64,
-                    };
-                case TargetPlatform.Mac:
-                    return new[]
-                    {
-                        TargetArchitecture.x64,
-                        TargetArchitecture.ARM64,
-                    };
                 default: return new TargetArchitecture[0];
                 }
             }
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/astc.cs b/Source/Tools/Flax.Build/Deps/Dependencies/astc.cs
index 40ae9d1e0..62a2b1097 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/astc.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/astc.cs
@@ -1,6 +1,5 @@
 // Copyright (c) Wojciech Figat. All rights reserved.
 
-using System.Collections.Generic;
 using System.IO;
 using Flax.Build;
 
@@ -76,7 +75,7 @@ namespace Flax.Deps.Dependencies
                     {
                     case TargetPlatform.Windows:
                     {
-                        string buildDir = Path.Combine(root, "build-" + architecture.ToString());
+                        string buildDir = Path.Combine(root, "build-" + architecture);
                         var isa = architecture == TargetArchitecture.ARM64 ? "-DASTCENC_ISA_NEON=ON" : "-DASTCENC_ISA_SSE2=ON";
                         var lib = architecture == TargetArchitecture.ARM64 ? "astcenc-neon-static.lib" : "astcenc-sse2-static.lib";
                         SetupDirectory(buildDir, true);
@@ -84,11 +83,11 @@ namespace Flax.Deps.Dependencies
                         BuildCmake(buildDir);
                         var depsFolder = GetThirdPartyFolder(options, platform, architecture);
                         Utilities.FileCopy(Path.Combine(buildDir, "Source/Release", lib), Path.Combine(depsFolder, "astcenc.lib"));
-                    }
-                    break;
+                        break;
+                    }
                     case TargetPlatform.Mac:
                     {
-                        string buildDir = Path.Combine(root, "build-" + architecture.ToString());
+                        string buildDir = Path.Combine(root, "build-" + architecture);
                         var isa = architecture == TargetArchitecture.ARM64 ? "-DASTCENC_ISA_NEON=ON" : "-DASTCENC_ISA_SSE2=ON";
                         var lib = architecture == TargetArchitecture.ARM64 ? "libastcenc-neon-static.a" : "libastcenc-sse2-static.a";
                         SetupDirectory(buildDir, true);
@@ -96,8 +95,8 @@ namespace Flax.Deps.Dependencies
                         BuildCmake(buildDir);
                         var depsFolder = GetThirdPartyFolder(options, platform, architecture);
                         Utilities.FileCopy(Path.Combine(buildDir, "Source", lib), Path.Combine(depsFolder, "libastcenc.a"));
+                        break;
                     }
-                    break;
                     }
                 }
             }
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/curl.cs b/Source/Tools/Flax.Build/Deps/Dependencies/curl.cs
index ceca92798..2d25fed3d 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/curl.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/curl.cs
@@ -107,7 +107,7 @@ namespace Flax.Deps.Dependencies
                     case TargetPlatform.Windows:
                     {
                         // Build for Windows
-                        var buildDir = Path.Combine(root, "build-" + architecture.ToString());
+                        var buildDir = Path.Combine(root, "build-" + architecture);
                         var solutionPath = Path.Combine(buildDir, "CURL.sln");
 
                         RunCmake(root, platform, architecture, $"-B\"{buildDir}\" -DBUILD_CURL_EXE=OFF -DBUILD_SHARED_LIBS=OFF -DCURL_STATIC_CRT=OFF");
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/freetype.cs b/Source/Tools/Flax.Build/Deps/Dependencies/freetype.cs
index ec3ab5e18..d43c73770 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/freetype.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/freetype.cs
@@ -15,70 +15,6 @@ namespace Flax.Deps.Dependencies
     /// <seealso cref="Flax.Deps.Dependency" />
     class freetype : Dependency
     {
-        /// <inheritdoc />
-        public override TargetPlatform[] Platforms
-        {
-            get
-            {
-                switch (BuildPlatform)
-                {
-                case TargetPlatform.Windows:
-                    return new[]
-                    {
-                        TargetPlatform.Windows,
-                        TargetPlatform.XboxOne,
-                        TargetPlatform.PS4,
-                        TargetPlatform.PS5,
-                        TargetPlatform.XboxScarlett,
-                        TargetPlatform.Android,
-                        TargetPlatform.Switch,
-                    };
-                case TargetPlatform.Linux:
-                    return new[]
-                    {
-                        TargetPlatform.Linux,
-                    };
-                case TargetPlatform.Mac:
-                    return new[]
-                    {
-                        TargetPlatform.Mac,
-                        TargetPlatform.iOS,
-                    };
-                default: return new TargetPlatform[0];
-                }
-            }
-        }
-
-        /// <inheritdoc />
-        public override TargetArchitecture[] Architectures
-        {
-            get
-            {
-                switch (BuildPlatform)
-                {
-                case TargetPlatform.Windows:
-                    return new[]
-                    {
-                        TargetArchitecture.x64,
-                        TargetArchitecture.ARM64,
-                    };
-                case TargetPlatform.Linux:
-                    return new[]
-                    {
-                        TargetArchitecture.x64,
-                        //TargetArchitecture.ARM64,
-                    };
-                case TargetPlatform.Mac:
-                    return new[]
-                    {
-                        TargetArchitecture.x64,
-                        TargetArchitecture.ARM64,
-                    };
-                default: return new TargetArchitecture[0];
-                }
-            }
-        }
-
         /// <inheritdoc />
         public override void Build(BuildOptions options)
         {
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/mono.cs b/Source/Tools/Flax.Build/Deps/Dependencies/mono.cs
index ad402d3d4..a90d1c2a0 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/mono.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/mono.cs
@@ -78,6 +78,18 @@ namespace Flax.Deps.Dependencies
                         TargetArchitecture.x64,
                         TargetArchitecture.ARM64,
                     };
+                case TargetPlatform.XboxOne:
+                case TargetPlatform.XboxScarlett:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                    };
+                case TargetPlatform.Switch:
+                case TargetPlatform.Android:
+                    return new[]
+                    {
+                        TargetArchitecture.ARM64,
+                    };
                 default: return new TargetArchitecture[0];
                 }
             }
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/nethost.cs b/Source/Tools/Flax.Build/Deps/Dependencies/nethost.cs
index 0ac16286f..f67244c9b 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/nethost.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/nethost.cs
@@ -43,36 +43,6 @@ namespace Flax.Deps.Dependencies
             }
         }
 
-        /// <inheritdoc />
-        public override TargetArchitecture[] Architectures
-        {
-            get
-            {
-                switch (BuildPlatform)
-                {
-                case TargetPlatform.Windows:
-                    return new[]
-                    {
-                        TargetArchitecture.x64,
-                        TargetArchitecture.ARM64,
-                    };
-                case TargetPlatform.Linux:
-                    return new[]
-                    {
-                        TargetArchitecture.x64,
-                        //TargetArchitecture.ARM64,
-                    };
-                case TargetPlatform.Mac:
-                    return new[]
-                    {
-                        TargetArchitecture.x64,
-                        TargetArchitecture.ARM64,
-                    };
-                default: return new TargetArchitecture[0];
-                }
-            }
-        }
-
         /// <inheritdoc />
         public override bool BuildByDefault => false;
 
diff --git a/Source/Tools/Flax.Build/Deps/Dependencies/vorbis.cs b/Source/Tools/Flax.Build/Deps/Dependencies/vorbis.cs
index c19fab782..15ca415da 100644
--- a/Source/Tools/Flax.Build/Deps/Dependencies/vorbis.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependencies/vorbis.cs
@@ -15,70 +15,6 @@ namespace Flax.Deps.Dependencies
     /// <seealso cref="Flax.Deps.Dependency" />
     class vorbis : Dependency
     {
-        /// <inheritdoc />
-        public override TargetPlatform[] Platforms
-        {
-            get
-            {
-                switch (BuildPlatform)
-                {
-                case TargetPlatform.Windows:
-                    return new[]
-                    {
-                        TargetPlatform.Windows,
-                        TargetPlatform.XboxOne,
-                        TargetPlatform.PS4,
-                        TargetPlatform.PS5,
-                        TargetPlatform.XboxScarlett,
-                        TargetPlatform.Android,
-                        TargetPlatform.Switch,
-                    };
-                case TargetPlatform.Linux:
-                    return new[]
-                    {
-                        TargetPlatform.Linux,
-                    };
-                case TargetPlatform.Mac:
-                    return new[]
-                    {
-                        TargetPlatform.Mac,
-                        TargetPlatform.iOS,
-                    };
-                default: return new TargetPlatform[0];
-                }
-            }
-        }
-
-        /// <inheritdoc />
-        public override TargetArchitecture[] Architectures
-        {
-            get
-            {
-                switch (BuildPlatform)
-                {
-                case TargetPlatform.Windows:
-                    return new[]
-                    {
-                        TargetArchitecture.x64,
-                        TargetArchitecture.ARM64,
-                    };
-                case TargetPlatform.Linux:
-                    return new[]
-                    {
-                        TargetArchitecture.x64,
-                        //TargetArchitecture.ARM64,
-                    };
-                case TargetPlatform.Mac:
-                    return new[]
-                    {
-                        TargetArchitecture.x64,
-                        TargetArchitecture.ARM64,
-                    };
-                default: return new TargetArchitecture[0];
-                }
-            }
-        }
-
         private struct Binary
         {
             public string Filename;
diff --git a/Source/Tools/Flax.Build/Deps/Dependency.cs b/Source/Tools/Flax.Build/Deps/Dependency.cs
index 381783e29..7286bf9f3 100644
--- a/Source/Tools/Flax.Build/Deps/Dependency.cs
+++ b/Source/Tools/Flax.Build/Deps/Dependency.cs
@@ -52,7 +52,6 @@ namespace Flax.Deps
         /// </summary>
         protected static TargetPlatform BuildPlatform => Platform.BuildPlatform.Target;
 
-
         private static Version? _cmakeVersion;
         protected static Version CMakeVersion
         {
@@ -60,11 +59,19 @@ namespace Flax.Deps
             {
                 if (_cmakeVersion == null)
                 {
-                    var versionOutput = Utilities.ReadProcessOutput("cmake", "--version");
-                    var versionStart = versionOutput.IndexOf("cmake version ") + "cmake version ".Length;
-                    var versionEnd = versionOutput.IndexOfAny(['-', '\n', '\r'], versionStart); // End of line or dash before Git hash
-                    var versionString = versionOutput.Substring(versionStart, versionEnd - versionStart);
-                    _cmakeVersion = new Version(versionString);
+                    try
+                    {
+                        var versionOutput = Utilities.ReadProcessOutput("cmake", "--version");
+                        var versionStart = versionOutput.IndexOf("cmake version ", StringComparison.Ordinal) + "cmake version ".Length;
+                        var versionEnd = versionOutput.IndexOfAny(['-', '\n', '\r'], versionStart); // End of line or dash before Git hash
+                        var versionString = versionOutput.Substring(versionStart, versionEnd - versionStart);
+                        _cmakeVersion = new Version(versionString);
+                    }
+                    catch (Exception)
+                    {
+                        // Assume old version by default (on any error while running cmake or parsing its version output)
+                        _cmakeVersion = new Version(3, 0);
+                    }
                 }
                 return _cmakeVersion;
             }
@@ -73,12 +80,95 @@ namespace Flax.Deps
         /// <summary>
         /// Gets the platforms list supported by this dependency to build on the current build platform (based on <see cref="Platform.BuildPlatform"/>).
         /// </summary>
-        public abstract TargetPlatform[] Platforms { get; }
+        public virtual TargetPlatform[] Platforms
+        {
+            get
+            {
+                // Default list for the most common build setup, selected by the BuildPlatform value (virtual, so it can be overridden)
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows: // BuildPlatform is Windows
+                    return new[]
+                    {
+                        TargetPlatform.Windows,
+                        TargetPlatform.XboxOne,
+                        TargetPlatform.XboxScarlett,
+                        TargetPlatform.PS4,
+                        TargetPlatform.PS5,
+                        TargetPlatform.Android,
+                        TargetPlatform.Switch,
+                    };
+                case TargetPlatform.Linux: // BuildPlatform is Linux
+                    return new[]
+                    {
+                        TargetPlatform.Linux,
+                    };
+                case TargetPlatform.Mac: // BuildPlatform is Mac
+                    return new[]
+                    {
+                        TargetPlatform.Mac,
+                        TargetPlatform.iOS,
+                    };
+                default: return new TargetPlatform[0]; // Any other BuildPlatform value: empty list
+                }
+            }
+        }
 
         /// <summary>
         /// Gets the architectures list supported by this dependency to build on the current build platform (based on <see cref="Platform.BuildPlatform"/>).
         /// </summary>
-        public abstract TargetArchitecture[] Architectures { get; }
+        public virtual TargetArchitecture[] Architectures
+        {
+            get
+            {
+                // Default architectures list, selected by the BuildPlatform value (virtual, so it can be overridden)
+                switch (BuildPlatform)
+                {
+                case TargetPlatform.Windows:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                case TargetPlatform.Linux:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        //TargetArchitecture.ARM64,
+                    };
+                case TargetPlatform.Mac:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                        TargetArchitecture.ARM64,
+                    };
+                case TargetPlatform.XboxOne: // NOTE(review): the Platforms getter only handles Windows/Linux/Mac for BuildPlatform - confirm the cases from here down can match
+                case TargetPlatform.XboxScarlett:
+                case TargetPlatform.PS4:
+                case TargetPlatform.PS5:
+                    return new[]
+                    {
+                        TargetArchitecture.x64,
+                    };
+                case TargetPlatform.Switch:
+                    return new[]
+                    {
+                        TargetArchitecture.ARM64,
+                    };
+                case TargetPlatform.Android:
+                    return new[]
+                    {
+                        TargetArchitecture.ARM64,
+                    };
+                case TargetPlatform.iOS:
+                    return new[]
+                    {
+                        TargetArchitecture.ARM64,
+                    };
+                default: return new TargetArchitecture[0]; // Any other BuildPlatform value: empty list
+                }
+            }
+        }
 
         /// <summary>
         /// True if build dependency by default, otherwise only when explicitly specified via command line.

From 788d8660b815c87fca0605d6c8cd464b862b758b Mon Sep 17 00:00:00 2001
From: Wojtek Figat <wojtek@figat.pl>
Date: Mon, 12 Jan 2026 20:47:03 +0100
Subject: [PATCH 32/33] Fix transparent materials flickering without DDGI

---
 Source/Shaders/GI/DDGI.hlsl | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Source/Shaders/GI/DDGI.hlsl b/Source/Shaders/GI/DDGI.hlsl
index 3e31c2e53..b88b846a6 100644
--- a/Source/Shaders/GI/DDGI.hlsl
+++ b/Source/Shaders/GI/DDGI.hlsl
@@ -305,6 +305,8 @@ float3 SampleDDGIIrradiance(DDGIData data, Texture2D<snorm float4> probesData, T
     uint cascadeIndex = DDGI_DEBUG_CASCADE;
 #else
     uint cascadeIndex = 0;
+    if (data.CascadesCount == 0)
+        return float3(0, 0, 0);
     for (; cascadeIndex < data.CascadesCount; cascadeIndex++)
     {
         // Get cascade data

From 05e7e6630c5f8b67a9b6addcdf061ee5de039ab1 Mon Sep 17 00:00:00 2001
From: Wojtek Figat <wojtek@figat.pl>
Date: Mon, 12 Jan 2026 21:28:45 +0100
Subject: [PATCH 33/33] Fix missing material graph references

#3839
---
 Source/Engine/Content/Assets/Material.cpp | 29 +++++++++++++++++++++++
 Source/Engine/Content/Assets/Material.h   |  3 +++
 2 files changed, 32 insertions(+)

diff --git a/Source/Engine/Content/Assets/Material.cpp b/Source/Engine/Content/Assets/Material.cpp
index 019fd9dd8..b4cf55d4d 100644
--- a/Source/Engine/Content/Assets/Material.cpp
+++ b/Source/Engine/Content/Assets/Material.cpp
@@ -41,6 +41,35 @@ bool Material::IsMaterialInstance() const
     return false;
 }
 
+#if USE_EDITOR
+
+void Material::GetReferences(Array<Guid>& assets, Array<String>& files) const
+{
+    ShaderAssetTypeBase<MaterialBase>::GetReferences(assets, files); // Base class references are gathered first
+
+    // Collect references from the material graph stored in the visject surface chunk (the chunk has to be loaded first)
+    if (!WaitForLoaded() && HasChunk(SHADER_FILE_CHUNK_VISJECT_SURFACE)) // Runs only when WaitForLoaded() returns false (presumably false = loaded; confirm)
+    {
+        ScopeLock lock(Locker); // Presumably keeps Locker held until the end of this scope - confirm
+        if (!LoadChunks(GET_CHUNK_FLAG(SHADER_FILE_CHUNK_VISJECT_SURFACE))) // Continues only when LoadChunks returns false (presumably false = no error; confirm)
+        {
+            const auto surfaceChunk = GetChunk(SHADER_FILE_CHUNK_VISJECT_SURFACE);
+            if (surfaceChunk) // Skip silently when the chunk pointer is null
+            {
+                MemoryReadStream stream(surfaceChunk->Get(), surfaceChunk->Size()); // Read the graph from the chunk's bytes
+                MaterialGraph graph; // Temporary graph, used only to gather references
+                if (!graph.Load(&stream, false)) // Gather only when Load returns false (same return convention as above)
+                {
+                    graph.GetReferences(assets); // Only 'assets' is extended here; 'files' is passed to the base call alone
+                }
+            }
+        }
+    }
+
+}
+
+#endif
+
 const MaterialInfo& Material::GetInfo() const
 {
     if (_materialShader)
diff --git a/Source/Engine/Content/Assets/Material.h b/Source/Engine/Content/Assets/Material.h
index 4ce47b154..cd2ae8e97 100644
--- a/Source/Engine/Content/Assets/Material.h
+++ b/Source/Engine/Content/Assets/Material.h
@@ -38,6 +38,9 @@ public:
 public:
     // [MaterialBase]
     bool IsMaterialInstance() const override;
+#if USE_EDITOR
+    void GetReferences(Array<Guid>& assets, Array<String>& files) const override;
+#endif
 
     // [IMaterial]
     const MaterialInfo& GetInfo() const override;